From 75d76bd83609e828c3e11a4004e76cd4b768b391 Mon Sep 17 00:00:00 2001 From: Packit Service Date: Dec 08 2020 08:14:31 +0000 Subject: Changes after running %prep ignore: true --- diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt deleted file mode 100644 index 2719005..0000000 --- a/CONTRIBUTORS.txt +++ /dev/null @@ -1,98 +0,0 @@ -The Red Hat VDO Team: - Principal Engineer/Lead Architect: - J. corwin Coburn - - Primary Authors: - Joseph Chapman - Sweet Tea Dorminy - *Thomas Jaskiewicz - Bruce Johnston - Susan McGhee - Ken Raeburn - Michael Sclafani - Matthew Sakai - Joseph Shimkus - John Wiele - - Support, Testing, Documentation, and other things too numerous to mention: - Chung Chung : - Bryan Gurney - *Simon J. Hernandez - Jakub Krysl - Marek Suchanek - - Project Management & Technical Direction: - Jered Floyd - Louis Imershein - Dennis Keefe - Andrew Walsh - - *former team members - -Other Contributors: - Ji-Hyeon Gim : - Updates for FC26/Kernel 4.13 - Vojtech Trefny - Getting correct size of partitions - Achilles Gaikwad - Bash completion for the vdo and vdostats commands - Jin-young Kwon - Adding vdo --version command, and documentation fixes - -VDO was originally created at Permabit Technology Corporation, and was -subsequently acquired and open-sourced by Red Hat. - -Former Members of the Permabit VDO Team: - Engineers: - Mark Amidon - David Buckle - Jacky Chu - Joel Hoff - Dimitri Kountourogianni - Alexis Layton - Michael Lee - Rich Macchi - Dave Paniriti - Karl Ramm - Hooman Vassef - Assar Westurlund - - Support, Testing, Documentation, etc. - Carl Alexander - Mike Chu - Mark Iskra - Farid Jahanmir - Francesca Koulikov - Erik Lattimore - Jennifer Levine - Randy Long - Steve Looby - Uche Onyekwuluje - Catherine Powell - Jeff Pozz - Sarmad Sada - John Schmidt - Omri Schwarz - Jay Splaine - John Welle - Mary-Anne Wolf - Devon Yablonski - Robert Zupko - - Interns: - Ari Entlich - Lori Monteleone - - Project Management & Technical Direction: - Michael Fortson - -Other Past Permabit Contributors (for early work on the index): - James Clough - Dave Golombek - Albert Lin - Edwin Olson - Dave Pinkney - Rich Brennan - -And Very Special Thanks To: - Norman Margolis, who started the whole thing diff --git a/COPYING b/COPYING deleted file mode 100644 index 7d5393a..0000000 --- a/COPYING +++ /dev/null @@ -1,278 +0,0 @@ - GNU GENERAL PUBLIC LICENSE - Version 2, June 1991 - - Copyright (C) 1989, 1991 Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -License is intended to guarantee your freedom to share and change free -software--to make sure the software is free for all its users. This -General Public License applies to most of the Free Software -Foundation's software and to any other program whose authors commit to -using it. (Some other Free Software Foundation software is covered by -the GNU Lesser General Public License instead.) You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. 
Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -this service if you wish), that you receive source code or can get it -if you want it, that you can change the software or use pieces of it -in new free programs; and that you know you can do these things. - - To protect your rights, we need to make restrictions that forbid -anyone to deny you these rights or to ask you to surrender the rights. -These restrictions translate to certain responsibilities for you if you -distribute copies of the software, or if you modify it. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must give the recipients all the rights that -you have. You must make sure that they, too, receive or can get the -source code. And you must show them these terms so they know their -rights. - - We protect your rights with two steps: (1) copyright the software, and -(2) offer you this license which gives you legal permission to copy, -distribute and/or modify the software. - - Also, for each author's protection and ours, we want to make certain -that everyone understands that there is no warranty for this free -software. If the software is modified by someone else and passed on, we -want its recipients to know that what they have is not the original, so -that any problems introduced by others will not reflect on the original -authors' reputations. - - Finally, any free program is threatened constantly by software -patents. We wish to avoid the danger that redistributors of a free -program will individually obtain patent licenses, in effect making the -program proprietary. To prevent this, we have made it clear that any -patent must be licensed for everyone's free use or not licensed at all. - - The precise terms and conditions for copying, distribution and -modification follow. - - GNU GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License applies to any program or other work which contains -a notice placed by the copyright holder saying it may be distributed -under the terms of this General Public License. The "Program", below, -refers to any such program or work, and a "work based on the Program" -means either the Program or any derivative work under copyright law: -that is to say, a work containing the Program or a portion of it, -either verbatim or with modifications and/or translated into another -language. (Hereinafter, translation is included without limitation in -the term "modification".) Each licensee is addressed as "you". - -Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running the Program is not restricted, and the output from the Program -is covered only if its contents constitute a work based on the -Program (independent of having been made by running the Program). -Whether that is true depends on what the Program does. - - 1. You may copy and distribute verbatim copies of the Program's -source code as you receive it, in any medium, provided that you -conspicuously and appropriately publish on each copy an appropriate -copyright notice and disclaimer of warranty; keep intact all the -notices that refer to this License and to the absence of any warranty; -and give any other recipients of the Program a copy of this License -along with the Program. 
- -You may charge a fee for the physical act of transferring a copy, and -you may at your option offer warranty protection in exchange for a fee. - - 2. You may modify your copy or copies of the Program or any portion -of it, thus forming a work based on the Program, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) You must cause the modified files to carry prominent notices - stating that you changed the files and the date of any change. - - b) You must cause any work that you distribute or publish, that in - whole or in part contains or is derived from the Program or any - part thereof, to be licensed as a whole at no charge to all third - parties under the terms of this License. - - c) If the modified program normally reads commands interactively - when run, you must cause it, when started running for such - interactive use in the most ordinary way, to print or display an - announcement including an appropriate copyright notice and a - notice that there is no warranty (or else, saying that you provide - a warranty) and that users may redistribute the program under - these conditions, and telling the user how to view a copy of this - License. (Exception: if the Program itself is interactive but - does not normally print such an announcement, your work based on - the Program is not required to print an announcement.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Program, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Program, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Program. - -In addition, mere aggregation of another work not based on the Program -with the Program (or with a work based on the Program) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may copy and distribute the Program (or a work based on it, -under Section 2) in object code or executable form under the terms of -Sections 1 and 2 above provided that you also do one of the following: - - a) Accompany it with the complete corresponding machine-readable - source code, which must be distributed under the terms of Sections - 1 and 2 above on a medium customarily used for software interchange; or, - - b) Accompany it with a written offer, valid for at least three - years, to give any third party, for a charge no more than your - cost of physically performing source distribution, a complete - machine-readable copy of the corresponding source code, to be - distributed under the terms of Sections 1 and 2 above on a medium - customarily used for software interchange; or, - - c) Accompany it with the information you received as to the offer - to distribute corresponding source code. 
(This alternative is - allowed only for noncommercial distribution and only if you - received the program in object code or executable form with such - an offer, in accord with Subsection b above.) - -The source code for a work means the preferred form of the work for -making modifications to it. For an executable work, complete source -code means all the source code for all modules it contains, plus any -associated interface definition files, plus the scripts used to -control compilation and installation of the executable. However, as a -special exception, the source code distributed need not include -anything that is normally distributed (in either source or binary -form) with the major components (compiler, kernel, and so on) of the -operating system on which the executable runs, unless that component -itself accompanies the executable. - -If distribution of executable or object code is made by offering -access to copy from a designated place, then offering equivalent -access to copy the source code from the same place counts as -distribution of the source code, even though third parties are not -compelled to copy the source along with the object code. - - 4. You may not copy, modify, sublicense, or distribute the Program -except as expressly provided under this License. Any attempt -otherwise to copy, modify, sublicense or distribute the Program is -void, and will automatically terminate your rights under this License. -However, parties who have received copies, or rights, from you under -this License will not have their licenses terminated so long as such -parties remain in full compliance. - - 5. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Program or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Program (or any work based on the -Program), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Program or works based on it. - - 6. Each time you redistribute the Program (or any work based on the -Program), the recipient automatically receives a license from the -original licensor to copy, distribute or modify the Program subject to -these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties to -this License. - - 7. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Program at all. For example, if a patent -license would not permit royalty-free redistribution of the Program by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Program. 
- -If any portion of this section is held invalid or unenforceable under -any particular circumstance, the balance of the section is intended to -apply and the section as a whole is intended to apply in other -circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system, which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 8. If the distribution and/or use of the Program is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Program under this License -may add an explicit geographical distribution limitation excluding -those countries, so that distribution is permitted only in or among -countries not thus excluded. In such case, this License incorporates -the limitation as if written in the body of this License. - - 9. The Free Software Foundation may publish revised and/or new versions -of the General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - -Each version is given a distinguishing version number. If the Program -specifies a version number of this License which applies to it and "any -later version", you have the option of following the terms and conditions -either of that version or of any later version published by the Free -Software Foundation. If the Program does not specify a version number of -this License, you may choose any version ever published by the Free Software -Foundation. - - 10. If you wish to incorporate parts of the Program into other free -programs whose distribution conditions are different, write to the author -to ask for permission. For software which is copyrighted by the Free -Software Foundation, write to the Free Software Foundation; we sometimes -make exceptions for this. Our decision will be guided by the two goals -of preserving the free status of all derivatives of our free software and -of promoting the sharing and reuse of software generally. - - NO WARRANTY - - 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY -FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN -OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES -PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED -OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS -TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE -PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, -REPAIR OR CORRECTION. - - 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR -REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, -INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING -OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED -TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY -YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER -PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE -POSSIBILITY OF SUCH DAMAGES. diff --git a/Makefile b/Makefile deleted file mode 100644 index 4084615..0000000 --- a/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -obj-y += uds/ -obj-y += vdo/ diff --git a/README.md b/README.md deleted file mode 100644 index 528277d..0000000 --- a/README.md +++ /dev/null @@ -1,125 +0,0 @@ -# kvdo - -A pair of kernel modules which provide pools of deduplicated and/or compressed -block storage. - -## Background - -VDO (which includes [kvdo](https://github.com/dm-vdo/kvdo) and -[vdo](https://github.com/dm-vdo/vdo)) is software that provides inline -block-level deduplication, compression, and thin provisioning capabilities for -primary storage. VDO installs within the Linux device mapper framework, where -it takes ownership of existing physical block devices and remaps these to new, -higher-level block devices with data-efficiency capabilities. - -Deduplication is a technique for reducing the consumption of storage resources -by eliminating multiple copies of duplicate blocks. Compression takes the -individual unique blocks and shrinks them with coding algorithms; these reduced -blocks are then efficiently packed together into physical blocks. Thin -provisioning manages the mapping from LBAs presented by VDO to where the data -has actually been stored, and also eliminates any blocks of all zeroes. - -With deduplication, instead of writing the same data more than once each -duplicate block is detected and recorded as a reference to the original -block. VDO maintains a mapping from logical block addresses (used by the -storage layer above VDO) to physical block addresses (used by the storage layer -under VDO). After deduplication, multiple logical block addresses may be mapped -to the same physical block address; these are called shared blocks and are -reference-counted by the software. - -With VDO's compression, multiple blocks (or shared blocks) are compressed with -the fast LZ4 algorithm, and binned together where possible so that multiple -compressed blocks fit within a 4 KB block on the underlying storage. Mapping -from LBA is to a physical block address and index within it for the desired -compressed data. All compressed blocks are individually reference counted for -correctness. - -Block sharing and block compression are invisible to applications using the -storage, which read and write blocks as they would if VDO were not -present. When a shared block is overwritten, a new physical block is allocated -for storing the new block data to ensure that other logical block addresses -that are mapped to the shared physical block are not modified. - -This public source release of VDO includes two kernel modules, and a set of -userspace tools for managing them. The "kvdo" module implements fine-grained -storage virtualization, thin provisioning, block sharing, and compression; the -"uds" module provides memory-efficient duplicate identification. 
The userspace -tools include a pair of python scripts, "vdo" for creating and managing VDO -volumes, and "vdostats" for extracting statistics from those volumes. - -## Documentation - -- [RHEL8 VDO Documentation](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/8/html/deduplicating_and_compressing_storage/index) -- [RHEL7 VDO Integration Guide](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/storage_administration_guide/vdo-integration) -- [RHEL7 VDO Evaluation Guide](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/storage_administration_guide/vdo-evaluation) - -## Releases - -Each branch on this project is intended to work with a specific release of -Enterprise Linux (Red Hat Enterprise Linux, CentOS, etc.). We try to maintain -compatibility with active Fedora releases, but some modifications may be -required. - -Version | Intended Enterprise Linux Release | Supported With Modifications -------- | --------------------------------- | ------------------------------- -6.1.x.x | EL7 (3.10.0-*.el7) | -6.2.x.x | EL8 (4.18.0-*.el8) | Fedora 28, Fedora 29, Fedora 30, Rawhide -* Pre-built versions with the required modifications for the referenced Fedora - releases can be found - [here](https://copr.fedorainfracloud.org/coprs/rhawalsh/dm-vdo) and can be - used by running `dnf copr enable rhawalsh/dm-vdo`. - -## Status - -VDO was originally developed by Permabit Technology Corp. as a proprietary set -of kernel modules and userspace tools. This software and technology has been -acquired by Red Hat, has been relicensed under the GPL (v2 or later), and this -repository begins the process of preparing for integration with the upstream -kernel. - -While this software has been relicensed there are a number of issues that must -still be addressed to be ready for upstream. These include: - -- Conformance with kernel coding standards -- Use of existing EXPORT_SYMBOL_GPL kernel interfaces where appropriate -- Refactoring of primitives (e.g. cryptographic) to appropriate kernel - subsystems -- Support for non-x86-64 platforms -- Refactoring of platform layer abstractions and other changes requested by - upstream maintainers - -We expect addressing these issues to take some time. In the meanwhile, this -project allows interested parties to begin using VDO immediately. The -technology itself is thoroughly tested, mature, and in production use since -2014 in its previous proprietary form. - -## Building - -In order to build the kernel modules, invoke the following command -from the top directory of this tree: - - make -C /usr/src/kernels/`uname -r` M=`pwd` - -* Patched sources that work with the most recent upstream kernels can be found - [here](https://github.com/rhawalsh/kvdo). - -## Communication channels - -Community feedback, participation and patches are welcome to the -vdo-devel@redhat.com mailing list -- subscribe -[here](https://www.redhat.com/mailman/listinfo/vdo-devel). - -## Contributing - -This project is currently a stepping stone towards integration with the Linux -kernel. As such, contributions are welcome via a process similar to that for -Linux kernel development. Patches should be submitted to the -vdo-devel@redhat.com mailing list, where they will be considered for -inclusion. This project does not accept pull requests. - -## Licensing - -[GPL v2.0 or later](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html). 
-All contributions retain ownership by their original author, but must also -be licensed under the GPL 2.0 or later to be merged. - diff --git a/TODO b/TODO deleted file mode 100644 index d2d5cf5..0000000 --- a/TODO +++ /dev/null @@ -1,6 +0,0 @@ -- Conform to kernel coding standards -- Use existing EXPORT_SYMBOL_GPL kernel interfaces where appropriate -- Refactor primitives (e.g. cryptographic) to appropriate kernel subsystems -- Support non-x86-64 platforms -- Refactor platform layer abstractions and other changes requested by upstream - maintainers diff --git a/kvdo.spec b/kvdo.spec deleted file mode 100644 index e340b2c..0000000 --- a/kvdo.spec +++ /dev/null @@ -1,89 +0,0 @@ -%define spec_release 1 -%define kmod_name kvdo -%define kmod_driver_version 6.2.4.26 -%define kmod_rpm_release %{spec_release} -%define kmod_kernel_version 3.10.0-693.el7 - -# Disable the scanning for a debug package -%global debug_package %{nil} - -Source0: kmod-%{kmod_name}-%{kmod_driver_version}.tgz - -Name: kmod-kvdo -Version: %{kmod_driver_version} -Release: %{kmod_rpm_release}%{?dist} -Summary: Kernel Modules for Virtual Data Optimizer -License: GPLv2+ -URL: http://github.com/dm-vdo/kvdo -BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX) -Requires: dkms -Requires: kernel-devel >= %{kmod_kernel_version} -Requires: make -ExclusiveArch: x86_64 -ExcludeArch: s390 -ExcludeArch: s390x -ExcludeArch: ppc -ExcludeArch: ppc64 -ExcludeArch: ppc64le -ExcludeArch: aarch64 -ExcludeArch: i686 - -%description -Virtual Data Optimizer (VDO) is a device mapper target that delivers -block-level deduplication, compression, and thin provisioning. - -This package provides the kernel modules for VDO. - -%post -set -x -/usr/sbin/dkms --rpm_safe_upgrade add -m %{kmod_name} -v %{version}-%{kmod_driver_version} -/usr/sbin/dkms --rpm_safe_upgrade build -m %{kmod_name} -v %{version}-%{kmod_driver_version} -/usr/sbin/dkms --rpm_safe_upgrade install -m %{kmod_name} -v %{version}-%{kmod_driver_version} - -%preun -# Check whether kvdo or uds is loaded, and if so attempt to remove it. A -# failure here means there is still something using the module, which should be -# cleared up before attempting to remove again. -for module in kvdo uds; do - if grep -q "^${module}" /proc/modules; then - modprobe -r ${module} - fi -done -/usr/sbin/dkms --rpm_safe_upgrade remove -m %{kmod_name} -v %{version}-%{kmod_driver_version} --all || : - -%prep -%setup -n kmod-%{kmod_name}-%{kmod_driver_version} - -%build -# Nothing doing here, as we're going to build on whatever kernel we end up -# running inside. - -%install -mkdir -p $RPM_BUILD_ROOT/%{_usr}/src/%{kmod_name}-%{version}-%{kmod_driver_version} -cp -r * $RPM_BUILD_ROOT/%{_usr}/src/%{kmod_name}-%{version}-%{kmod_driver_version}/ -cat > $RPM_BUILD_ROOT/%{_usr}/src/%{kmod_name}-%{version}-%{kmod_driver_version}/dkms.conf < - 6.2.4.26-1 -HASH(0x5645fb62bab0) \ No newline at end of file diff --git a/source/CONTRIBUTORS.txt b/source/CONTRIBUTORS.txt new file mode 100644 index 0000000..2719005 --- /dev/null +++ b/source/CONTRIBUTORS.txt @@ -0,0 +1,98 @@ +The Red Hat VDO Team: + Principal Engineer/Lead Architect: + J. corwin Coburn + + Primary Authors: + Joseph Chapman + Sweet Tea Dorminy + *Thomas Jaskiewicz + Bruce Johnston + Susan McGhee + Ken Raeburn + Michael Sclafani + Matthew Sakai + Joseph Shimkus + John Wiele + + Support, Testing, Documentation, and other things too numerous to mention: + Chung Chung : + Bryan Gurney + *Simon J. 
Hernandez + Jakub Krysl + Marek Suchanek + + Project Management & Technical Direction: + Jered Floyd + Louis Imershein + Dennis Keefe + Andrew Walsh + + *former team members + +Other Contributors: + Ji-Hyeon Gim : + Updates for FC26/Kernel 4.13 + Vojtech Trefny + Getting correct size of partitions + Achilles Gaikwad + Bash completion for the vdo and vdostats commands + Jin-young Kwon + Adding vdo --version command, and documentation fixes + +VDO was originally created at Permabit Technology Corporation, and was +subsequently acquired and open-sourced by Red Hat. + +Former Members of the Permabit VDO Team: + Engineers: + Mark Amidon + David Buckle + Jacky Chu + Joel Hoff + Dimitri Kountourogianni + Alexis Layton + Michael Lee + Rich Macchi + Dave Paniriti + Karl Ramm + Hooman Vassef + Assar Westurlund + + Support, Testing, Documentation, etc. + Carl Alexander + Mike Chu + Mark Iskra + Farid Jahanmir + Francesca Koulikov + Erik Lattimore + Jennifer Levine + Randy Long + Steve Looby + Uche Onyekwuluje + Catherine Powell + Jeff Pozz + Sarmad Sada + John Schmidt + Omri Schwarz + Jay Splaine + John Welle + Mary-Anne Wolf + Devon Yablonski + Robert Zupko + + Interns: + Ari Entlich + Lori Monteleone + + Project Management & Technical Direction: + Michael Fortson + +Other Past Permabit Contributors (for early work on the index): + James Clough + Dave Golombek + Albert Lin + Edwin Olson + Dave Pinkney + Rich Brennan + +And Very Special Thanks To: + Norman Margolis, who started the whole thing diff --git a/source/COPYING b/source/COPYING new file mode 100644 index 0000000..7d5393a --- /dev/null +++ b/source/COPYING @@ -0,0 +1,278 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. 
+ + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. 
+ + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. diff --git a/source/Makefile b/source/Makefile new file mode 100644 index 0000000..4084615 --- /dev/null +++ b/source/Makefile @@ -0,0 +1,2 @@ +obj-y += uds/ +obj-y += vdo/ diff --git a/source/README.md b/source/README.md new file mode 100644 index 0000000..528277d --- /dev/null +++ b/source/README.md @@ -0,0 +1,125 @@ +# kvdo + +A pair of kernel modules which provide pools of deduplicated and/or compressed +block storage. 
+ +## Background + +VDO (which includes [kvdo](https://github.com/dm-vdo/kvdo) and +[vdo](https://github.com/dm-vdo/vdo)) is software that provides inline +block-level deduplication, compression, and thin provisioning capabilities for +primary storage. VDO installs within the Linux device mapper framework, where +it takes ownership of existing physical block devices and remaps these to new, +higher-level block devices with data-efficiency capabilities. + +Deduplication is a technique for reducing the consumption of storage resources +by eliminating multiple copies of duplicate blocks. Compression takes the +individual unique blocks and shrinks them with coding algorithms; these reduced +blocks are then efficiently packed together into physical blocks. Thin +provisioning manages the mapping from LBAs presented by VDO to where the data +has actually been stored, and also eliminates any blocks of all zeroes. + +With deduplication, instead of writing the same data more than once each +duplicate block is detected and recorded as a reference to the original +block. VDO maintains a mapping from logical block addresses (used by the +storage layer above VDO) to physical block addresses (used by the storage layer +under VDO). After deduplication, multiple logical block addresses may be mapped +to the same physical block address; these are called shared blocks and are +reference-counted by the software. + +With VDO's compression, multiple blocks (or shared blocks) are compressed with +the fast LZ4 algorithm, and binned together where possible so that multiple +compressed blocks fit within a 4 KB block on the underlying storage. Mapping +from LBA is to a physical block address and index within it for the desired +compressed data. All compressed blocks are individually reference counted for +correctness. + +Block sharing and block compression are invisible to applications using the +storage, which read and write blocks as they would if VDO were not +present. When a shared block is overwritten, a new physical block is allocated +for storing the new block data to ensure that other logical block addresses +that are mapped to the shared physical block are not modified. + +This public source release of VDO includes two kernel modules, and a set of +userspace tools for managing them. The "kvdo" module implements fine-grained +storage virtualization, thin provisioning, block sharing, and compression; the +"uds" module provides memory-efficient duplicate identification. The userspace +tools include a pair of python scripts, "vdo" for creating and managing VDO +volumes, and "vdostats" for extracting statistics from those volumes. + +## Documentation + +- [RHEL8 VDO Documentation](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/8/html/deduplicating_and_compressing_storage/index) +- [RHEL7 VDO Integration Guide](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/storage_administration_guide/vdo-integration) +- [RHEL7 VDO Evaluation Guide](https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/storage_administration_guide/vdo-evaluation) + +## Releases + +Each branch on this project is intended to work with a specific release of +Enterprise Linux (Red Hat Enterprise Linux, CentOS, etc.). We try to maintain +compatibility with active Fedora releases, but some modifications may be +required. 
+ +Version | Intended Enterprise Linux Release | Supported With Modifications +------- | --------------------------------- | ------------------------------- +6.1.x.x | EL7 (3.10.0-*.el7) | +6.2.x.x | EL8 (4.18.0-*.el8) | Fedora 28, Fedora 29, Fedora 30, Rawhide +* Pre-built versions with the required modifications for the referenced Fedora + releases can be found + [here](https://copr.fedorainfracloud.org/coprs/rhawalsh/dm-vdo) and can be + used by running `dnf copr enable rhawalsh/dm-vdo`. + +## Status + +VDO was originally developed by Permabit Technology Corp. as a proprietary set +of kernel modules and userspace tools. This software and technology has been +acquired by Red Hat, has been relicensed under the GPL (v2 or later), and this +repository begins the process of preparing for integration with the upstream +kernel. + +While this software has been relicensed there are a number of issues that must +still be addressed to be ready for upstream. These include: + +- Conformance with kernel coding standards +- Use of existing EXPORT_SYMBOL_GPL kernel interfaces where appropriate +- Refactoring of primitives (e.g. cryptographic) to appropriate kernel + subsystems +- Support for non-x86-64 platforms +- Refactoring of platform layer abstractions and other changes requested by + upstream maintainers + +We expect addressing these issues to take some time. In the meanwhile, this +project allows interested parties to begin using VDO immediately. The +technology itself is thoroughly tested, mature, and in production use since +2014 in its previous proprietary form. + +## Building + +In order to build the kernel modules, invoke the following command +from the top directory of this tree: + + make -C /usr/src/kernels/`uname -r` M=`pwd` + +* Patched sources that work with the most recent upstream kernels can be found + [here](https://github.com/rhawalsh/kvdo). + +## Communication channels + +Community feedback, participation and patches are welcome to the +vdo-devel@redhat.com mailing list -- subscribe +[here](https://www.redhat.com/mailman/listinfo/vdo-devel). + +## Contributing + +This project is currently a stepping stone towards integration with the Linux +kernel. As such, contributions are welcome via a process similar to that for +Linux kernel development. Patches should be submitted to the +vdo-devel@redhat.com mailing list, where they will be considered for +inclusion. This project does not accept pull requests. + +## Licensing + +[GPL v2.0 or later](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html). +All contributions retain ownership by their original author, but must also +be licensed under the GPL 2.0 or later to be merged. + diff --git a/source/TODO b/source/TODO new file mode 100644 index 0000000..d2d5cf5 --- /dev/null +++ b/source/TODO @@ -0,0 +1,6 @@ +- Conform to kernel coding standards +- Use existing EXPORT_SYMBOL_GPL kernel interfaces where appropriate +- Refactor primitives (e.g. 
cryptographic) to appropriate kernel subsystems +- Support non-x86-64 platforms +- Refactor platform layer abstractions and other changes requested by upstream + maintainers diff --git a/source/kvdo.spec b/source/kvdo.spec new file mode 100644 index 0000000..e340b2c --- /dev/null +++ b/source/kvdo.spec @@ -0,0 +1,89 @@ +%define spec_release 1 +%define kmod_name kvdo +%define kmod_driver_version 6.2.4.26 +%define kmod_rpm_release %{spec_release} +%define kmod_kernel_version 3.10.0-693.el7 + +# Disable the scanning for a debug package +%global debug_package %{nil} + +Source0: kmod-%{kmod_name}-%{kmod_driver_version}.tgz + +Name: kmod-kvdo +Version: %{kmod_driver_version} +Release: %{kmod_rpm_release}%{?dist} +Summary: Kernel Modules for Virtual Data Optimizer +License: GPLv2+ +URL: http://github.com/dm-vdo/kvdo +BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX) +Requires: dkms +Requires: kernel-devel >= %{kmod_kernel_version} +Requires: make +ExclusiveArch: x86_64 +ExcludeArch: s390 +ExcludeArch: s390x +ExcludeArch: ppc +ExcludeArch: ppc64 +ExcludeArch: ppc64le +ExcludeArch: aarch64 +ExcludeArch: i686 + +%description +Virtual Data Optimizer (VDO) is a device mapper target that delivers +block-level deduplication, compression, and thin provisioning. + +This package provides the kernel modules for VDO. + +%post +set -x +/usr/sbin/dkms --rpm_safe_upgrade add -m %{kmod_name} -v %{version}-%{kmod_driver_version} +/usr/sbin/dkms --rpm_safe_upgrade build -m %{kmod_name} -v %{version}-%{kmod_driver_version} +/usr/sbin/dkms --rpm_safe_upgrade install -m %{kmod_name} -v %{version}-%{kmod_driver_version} + +%preun +# Check whether kvdo or uds is loaded, and if so attempt to remove it. A +# failure here means there is still something using the module, which should be +# cleared up before attempting to remove again. +for module in kvdo uds; do + if grep -q "^${module}" /proc/modules; then + modprobe -r ${module} + fi +done +/usr/sbin/dkms --rpm_safe_upgrade remove -m %{kmod_name} -v %{version}-%{kmod_driver_version} --all || : + +%prep +%setup -n kmod-%{kmod_name}-%{kmod_driver_version} + +%build +# Nothing doing here, as we're going to build on whatever kernel we end up +# running inside. + +%install +mkdir -p $RPM_BUILD_ROOT/%{_usr}/src/%{kmod_name}-%{version}-%{kmod_driver_version} +cp -r * $RPM_BUILD_ROOT/%{_usr}/src/%{kmod_name}-%{version}-%{kmod_driver_version}/ +cat > $RPM_BUILD_ROOT/%{_usr}/src/%{kmod_name}-%{version}-%{kmod_driver_version}/dkms.conf < - 6.2.4.26-1 +HASH(0x5645fb62bab0) \ No newline at end of file diff --git a/source/uds/Makefile b/source/uds/Makefile new file mode 100644 index 0000000..5afc64a --- /dev/null +++ b/source/uds/Makefile @@ -0,0 +1,21 @@ +UDS_VERSION = 8.0.2.4 + +SOURCES = $(notdir $(wildcard $(src)/*.c)) murmur/MurmurHash3.c +SOURCES += $(addprefix util/,$(notdir $(wildcard $(src)/util/*.c))) +OBJECTS = $(SOURCES:%.c=%.o) +INCLUDES = -I$(src) + +EXTRA_CFLAGS = -std=gnu99 \ + -fno-builtin-memset \ + -Werror \ + -Wframe-larger-than=400 \ + -Wno-declaration-after-statement \ + -DUDS_VERSION=\"$(UDS_VERSION)\" \ + $(INCLUDES) + +CFLAGS_REMOVE_deltaIndex.o = -std=gnu99 +CFLAGS_REMOVE_masterIndex005.o = -std=gnu99 + +obj-m += uds.o + +uds-objs = $(OBJECTS) diff --git a/source/uds/atomicDefs.h b/source/uds/atomicDefs.h new file mode 100644 index 0000000..0c82bca --- /dev/null +++ b/source/uds/atomicDefs.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/atomicDefs.h#2 $ + */ + +#ifndef LINUX_KERNEL_ATOMIC_DEFS_H +#define LINUX_KERNEL_ATOMIC_DEFS_H + +#include + +#endif /* LINUX_KERNEL_ATOMIC_DEFS_H */ diff --git a/source/uds/bits.c b/source/uds/bits.c new file mode 100644 index 0000000..eea4912 --- /dev/null +++ b/source/uds/bits.c @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/bits.c#1 $ + */ + +#include "bits.h" + +#include "compiler.h" + +/** + * This is the largest field size supported by getBigField & setBigField. + * Any field that is larger is not guaranteed to fit in a single, byte + * aligned uint64_t. 
+ **/ +enum { MAX_BIG_FIELD_BITS = (sizeof(uint64_t) - 1) * CHAR_BIT + 1 }; + +/** + * Get a big bit field from a bit stream + * + * @param memory The base memory byte address + * @param offset The bit offset into the memory for the start of the field + * @param size The number of bits in the field + * + * @return the bit field + **/ +static INLINE uint64_t getBigField(const byte *memory, + uint64_t offset, + int size) +{ + const void *addr = memory + offset / CHAR_BIT; + return (getUInt64LE(addr) >> (offset % CHAR_BIT)) & ((1UL << size) - 1); +} + +/** + * Set a big bit field in a bit stream + * + * @param value The value to put into the field + * @param memory The base memory byte address + * @param offset The bit offset into the memory for the start of the field + * @param size The number of bits in the field + * + * @return the bit field + **/ +static INLINE void setBigField(uint64_t value, byte *memory, uint64_t offset, + int size) +{ + void *addr = memory + offset / CHAR_BIT; + int shift = offset % CHAR_BIT; + uint64_t data = getUInt64LE(addr); + data &= ~(((1UL << size) - 1) << shift); + data |= value << shift; + storeUInt64LE(addr, data); +} + +/***********************************************************************/ +void getBytes(const byte *memory, uint64_t offset, byte *destination, int size) +{ + const byte *addr = memory + offset / CHAR_BIT; + int shift = offset % CHAR_BIT; + while (--size >= 0) { + *destination++ = getUInt16LE(addr++) >> shift; + } +} + +/***********************************************************************/ +void setBytes(byte *memory, uint64_t offset, const byte *source, int size) +{ + byte *addr = memory + offset / CHAR_BIT; + int shift = offset % CHAR_BIT; + uint16_t mask = ~((uint16_t) 0xFF << shift); + while (--size >= 0) { + uint16_t data = (getUInt16LE(addr) & mask) | (*source++ << shift); + storeUInt16LE(addr++, data); + } +} + +/***********************************************************************/ +void moveBits(const byte *sMemory, uint64_t source, byte *dMemory, + uint64_t destination, int size) +{ + enum { UINT32_BIT = sizeof(uint32_t) * CHAR_BIT }; + if (size > MAX_BIG_FIELD_BITS) { + if (source > destination) { + // This is a large move from a higher to a lower address. We move + // the lower addressed bits first. Start by moving one field that + // ends on a destination int boundary + int count + = MAX_BIG_FIELD_BITS - (destination + MAX_BIG_FIELD_BITS) % UINT32_BIT; + uint64_t field = getBigField(sMemory, source, count); + setBigField(field, dMemory, destination, count); + source += count; + destination += count; + size -= count; + // Now do the main loop to copy 32 bit chunks that are int-aligned + // at the destination. + int offset = source % UINT32_BIT; + const byte *src = sMemory + (source - offset) / CHAR_BIT; + byte *dest = dMemory + destination / CHAR_BIT; + while (size > MAX_BIG_FIELD_BITS) { + storeUInt32LE(dest, getUInt64LE(src) >> offset); + src += sizeof(uint32_t); + dest += sizeof(uint32_t); + source += UINT32_BIT; + destination += UINT32_BIT; + size -= UINT32_BIT; + } + } else { + // This is a large move from a lower to a higher address. We move + // the higher addressed bits first. 
Start by moving one field that + // begins on a destination int boundary + int count = (destination + size) % UINT32_BIT; + if (count > 0) { + size -= count; + uint64_t field = getBigField(sMemory, source + size, count); + setBigField(field, dMemory, destination + size, count); + } + // Now do the main loop to copy 32 bit chunks that are int-aligned + // at the destination. + int offset = (source + size) % UINT32_BIT; + const byte *src = sMemory + (source + size - offset) / CHAR_BIT; + byte *dest = dMemory + (destination + size) / CHAR_BIT; + while (size > MAX_BIG_FIELD_BITS) { + src -= sizeof(uint32_t); + dest -= sizeof(uint32_t); + size -= UINT32_BIT; + storeUInt32LE(dest, getUInt64LE(src) >> offset); + } + } + } + // Finish up by doing the last chunk, which can have any arbitrary alignment + if (size > 0) { + uint64_t field = getBigField(sMemory, source, size); + setBigField(field, dMemory, destination, size); + } +} + +/***********************************************************************/ +bool sameBits(const byte *mem1, uint64_t offset1, const byte *mem2, + uint64_t offset2, int size) +{ + while (size >= MAX_FIELD_BITS) { + unsigned int field1 = getField(mem1, offset1, MAX_FIELD_BITS); + unsigned int field2 = getField(mem2, offset2, MAX_FIELD_BITS); + if (field1 != field2) return false; + offset1 += MAX_FIELD_BITS; + offset2 += MAX_FIELD_BITS; + size -= MAX_FIELD_BITS; + } + if (size > 0) { + unsigned int field1 = getField(mem1, offset1, size); + unsigned int field2 = getField(mem2, offset2, size); + if (field1 != field2) return false; + } + return true; +} diff --git a/source/uds/bits.h b/source/uds/bits.h new file mode 100644 index 0000000..2c2d4ea --- /dev/null +++ b/source/uds/bits.h @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/bits.h#1 $ + */ + +#ifndef BITS_H +#define BITS_H 1 + +#include "compiler.h" +#include "numeric.h" +#include "typeDefs.h" + +/* + * These bit stream and bit field utility routines are used for the + * non-byte aligned delta indices. + * + * Bits and bytes are numbered in little endian order. For example: Within + * a byte, bit 0 is the least significant bit (0x1), and bit 7 is the most + * significant bit (0x80). Within a bit stream, bit 7 is the most + * signficant bit of byte 0, and bit 8 is the least significant bit of byte + * 1. Within a byte array, a byte's number corresponds to it's index in + * the array. + * + * The implementation assumes that the native machine is little endian, and + * that performance is very important. These assumptions match our current + * operating environment. + */ + +/** + * This is the largest field size supported by getField & setField. Any + * field that is larger is not guaranteed to fit in a single, byte aligned + * uint32_t. 
+ **/ +enum { MAX_FIELD_BITS = (sizeof(uint32_t) - 1) * CHAR_BIT + 1 }; + +/** + * This is the number of guard bytes needed at the end of the memory byte + * array when using the bit utilities. 3 bytes are needed when getField & + * setField access a field, because they will access some "extra" bytes + * past the end of the field. And 7 bytes are needed when getBigField & + * setBigField access a big field, for the same reason. Note that moveBits + * calls getBigField & setBigField. 7 is rewritten to make it clear how it + * is derived. + **/ +enum { POST_FIELD_GUARD_BYTES = sizeof(uint64_t) - 1 }; + +/** + * Get a bit field from a bit stream + * + * @param memory The base memory byte address + * @param offset The bit offset into the memory for the start of the field + * @param size The number of bits in the field + * + * @return the bit field + **/ +static INLINE unsigned int getField(const byte *memory, uint64_t offset, + int size) +{ + const void *addr = memory + offset / CHAR_BIT; + return (getUInt32LE(addr) >> (offset % CHAR_BIT)) & ((1 << size) - 1); +} + +/** + * Set a bit field in a bit stream + * + * @param value The value to put into the field + * @param memory The base memory byte address + * @param offset The bit offset into the memory for the start of the field + * @param size The number of bits in the field + * + * @return the bit field + **/ +static INLINE void setField(unsigned int value, byte *memory, uint64_t offset, + int size) +{ + void *addr = memory + offset / CHAR_BIT; + int shift = offset % CHAR_BIT; + uint32_t data = getUInt32LE(addr); + data &= ~(((1 << size) - 1) << shift); + data |= value << shift; + storeUInt32LE(addr, data); +} + +/** + * Set a bit field in a bit stream to all ones + * + * @param memory The base memory byte address + * @param offset The bit offset into the memory for the start of the field + * @param size The number of bits in the field + * + * @return the bit field + **/ +static INLINE void setOne(byte *memory, uint64_t offset, int size) +{ + if (size > 0) { + byte *addr = memory + offset / CHAR_BIT; + int shift = offset % CHAR_BIT; + int count = size + shift > CHAR_BIT ? CHAR_BIT - shift : size; + *addr++ |= ((1 << count) - 1) << shift; + for (size -= count; size > CHAR_BIT; size -= CHAR_BIT) { + *addr++ = 0xFF; + } + if (size) { + *addr |= ~(0xFF << size); + } + } +} + +/** + * Set a bit field in a bit stream to all zeros + * + * @param memory The base memory byte address + * @param offset The bit offset into the memory for the start of the field + * @param size The number of bits in the field + * + * @return the bit field + **/ +static INLINE void setZero(byte *memory, uint64_t offset, int size) +{ + if (size > 0) { + byte *addr = memory + offset / CHAR_BIT; + int shift = offset % CHAR_BIT; + int count = size + shift > CHAR_BIT ? CHAR_BIT - shift : size; + *addr++ &= ~(((1 << count) - 1) << shift); + for (size -= count; size > CHAR_BIT; size -= CHAR_BIT) { + *addr++ = 0; + } + if (size) { + *addr &= 0xFF << size; + } + } +} + +/** + * Get a byte stream from a bit stream, reading a whole number of bytes + * from an arbitrary bit boundary. 
+ * + * @param memory The base memory byte address for the bit stream + * @param offset The bit offset of the start of the bit stream + * @param destination Where to store the bytes + * @param size The number of bytes + **/ +void getBytes(const byte *memory, uint64_t offset, byte *destination, int size); + +/** + * Store a byte stream into a bit stream, writing a whole number of bytes + * to an arbitrary bit boundary. + * + * @param memory The base memory byte address for the bit stream + * @param offset The bit offset of the start of the bit stream + * @param source Where to read the bytes + * @param size The number of bytes + **/ +void setBytes(byte *memory, uint64_t offset, const byte *source, int size); + +/** + * Move bits from one field to another. When the fields overlap, behave as + * if we first move all the bits from the source to a temporary value, and + * then move all the bits from the temporary value to the destination. + * + * @param sMemory The base source memory byte address + * @param source Bit offset into memory for the source start + * @param dMemory The base destination memory byte address + * @param destination Bit offset into memory for the destination start + * @param size The number of bits in the field + **/ +void moveBits(const byte *sMemory, uint64_t source, byte *dMemory, + uint64_t destination, int size); + +/** + * Compare bits from one field to another, testing for sameness + * + * @param mem1 The base memory byte address (first field) + * @param offset1 Bit offset into the memory for the start (first field) + * @param mem2 The base memory byte address (second field) + * @param offset2 Bit offset into the memory for the start (second field) + * @param size The number of bits in the field + * + * @return true if fields are the same, false if different + **/ +bool sameBits(const byte *mem1, uint64_t offset1, const byte *mem2, + uint64_t offset2, int size) + __attribute__((warn_unused_result)); + +#endif /* BITS_H */ diff --git a/source/uds/buffer.c b/source/uds/buffer.c new file mode 100644 index 0000000..2bf6d20 --- /dev/null +++ b/source/uds/buffer.c @@ -0,0 +1,596 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/buffer.c#3 $ + */ + +#include "buffer.h" + +#include "bufferPrivate.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" +#include "permassert.h" +#include "typeDefs.h" + +/**********************************************************************/ +int wrapBuffer(byte *bytes, + size_t length, + size_t contentLength, + Buffer **bufferPtr) +{ + int result = ASSERT((contentLength <= length), + "content length, %zu, fits in buffer size, %zu", + length, contentLength); + Buffer *buffer; + result = ALLOCATE(1, Buffer, "buffer", &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + buffer->data = bytes; + buffer->start = 0; + buffer->end = contentLength; + buffer->length = length; + buffer->wrapped = true; + + *bufferPtr = buffer; + return UDS_SUCCESS; +} + +/***********************************************************************/ +int makeBuffer(size_t size, Buffer **newBuffer) +{ + byte *data; + int result = ALLOCATE(size, byte, "buffer data", &data); + if (result != UDS_SUCCESS) { + return result; + } + + Buffer *buffer; + result = wrapBuffer(data, size, 0, &buffer); + if (result != UDS_SUCCESS) { + FREE(data); + return result; + } + + buffer->wrapped = false; + *newBuffer = buffer; + return UDS_SUCCESS; +} + +/***********************************************************************/ +void freeBuffer(Buffer **pBuffer) +{ + Buffer *buffer = *pBuffer; + *pBuffer = NULL; + if (buffer == NULL) { + return; + } + if (!buffer->wrapped) { + FREE(buffer->data); + } + FREE(buffer); +} + +/**********************************************************************/ +size_t bufferLength(Buffer *buffer) +{ + return buffer->length; +} + +/**********************************************************************/ +size_t contentLength(Buffer *buffer) +{ + return buffer->end - buffer->start; +} + +/**********************************************************************/ +size_t uncompactedAmount(Buffer *buffer) +{ + return buffer->start; +} + +/**********************************************************************/ +size_t availableSpace(Buffer *buffer) +{ + return buffer->length - buffer->end; +} + +/**********************************************************************/ +size_t bufferUsed(Buffer *buffer) +{ + return buffer->end; +} + +/***********************************************************************/ +int growBuffer(Buffer *buffer, size_t length) +{ + if (buffer == NULL) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot resize NULL buffer"); + } + + if (buffer->wrapped) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot resize wrapped buffer"); + } + if (buffer->end > length) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot shrink buffer"); + } + + byte *data; + int result = reallocateMemory(buffer->data, buffer->length, length, + "buffer data", &data); + if (result != UDS_SUCCESS) { + return result; + } + + buffer->data = data; + buffer->length = length; + return UDS_SUCCESS; +} + +/***********************************************************************/ +bool ensureAvailableSpace(Buffer *buffer, size_t bytes) +{ + if (availableSpace(buffer) >= bytes) { + return true; + } + compactBuffer(buffer); + return (availableSpace(buffer) >= bytes); +} + +/***********************************************************************/ +void clearBuffer(Buffer *buffer) +{ + buffer->start = 0; + buffer->end = buffer->length; +} + 
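The allocation and sizing routines above compose in a straightforward way. Below is a minimal usage sketch, not part of the patch itself; the function name and the 64/128/32-byte sizes are illustrative assumptions only.

#include "buffer.h"

/* Illustrative only: allocate a small buffer, enlarge it, confirm that
 * space is available, and release it. All sizes here are arbitrary. */
static int exampleBufferLifecycle(void)
{
  Buffer *buffer;
  int result = makeBuffer(64, &buffer);
  if (result != UDS_SUCCESS) {
    return result;
  }

  /* growBuffer() only works on a buffer that owns its memory; a buffer
   * created with wrapBuffer() cannot be resized. */
  result = growBuffer(buffer, 128);
  if (result == UDS_SUCCESS) {
    /* ensureAvailableSpace() compacts the buffer if necessary and reports
     * whether the requested room is now free. */
    if (!ensureAvailableSpace(buffer, 32)) {
      result = UDS_BUFFER_ERROR;
    }
  }

  freeBuffer(&buffer);
  return result;
}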
+/***********************************************************************/ +void compactBuffer(Buffer *buffer) +{ + if ((buffer->start == 0) || (buffer->end == 0)) { + return; + } + size_t bytesToMove = buffer->end - buffer->start; + memmove(buffer->data, buffer->data + buffer->start, bytesToMove); + buffer->start = 0; + buffer->end = bytesToMove; +} + +/**********************************************************************/ +int resetBufferEnd(Buffer *buffer, size_t end) +{ + if (end > buffer->length) { + return UDS_BUFFER_ERROR; + } + buffer->end = end; + if (buffer->start > buffer->end) { + buffer->start = buffer->end; + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int skipForward(Buffer *buffer, size_t bytesToSkip) +{ + if (contentLength(buffer) < bytesToSkip) { + return UDS_BUFFER_ERROR; + } + + buffer->start += bytesToSkip; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int rewindBuffer(Buffer *buffer, size_t bytesToRewind) +{ + if (buffer->start < bytesToRewind) { + return UDS_BUFFER_ERROR; + } + + buffer->start -= bytesToRewind; + return UDS_SUCCESS; +} + +/**********************************************************************/ +bool hasSameBytes(Buffer *buffer, const byte *data, size_t length) +{ + return ((contentLength(buffer) >= length) + && (memcmp(buffer->data + buffer->start, data, length) == 0)); +} + +/**********************************************************************/ +bool equalBuffers(Buffer *buffer1, Buffer *buffer2) +{ + return hasSameBytes(buffer1, buffer2->data + buffer2->start, + contentLength(buffer2)); +} + +/**********************************************************************/ +int getByte(Buffer *buffer, byte *bytePtr) +{ + if (contentLength(buffer) < sizeof(byte)) { + return UDS_BUFFER_ERROR; + } + + *bytePtr = buffer->data[buffer->start++]; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int peekByte(Buffer *buffer, size_t offset, byte *bytePtr) +{ + if (contentLength(buffer) < (offset + sizeof(byte))) { + return UDS_BUFFER_ERROR; + } + + *bytePtr = buffer->data[buffer->start + offset]; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putByte(Buffer *buffer, byte b) +{ + if (!ensureAvailableSpace(buffer, sizeof(byte))) { + return UDS_BUFFER_ERROR; + } + + buffer->data[buffer->end++] = b; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getBytesFromBuffer(Buffer *buffer, size_t length, void *destination) +{ + if (contentLength(buffer) < length) { + return UDS_BUFFER_ERROR; + } + + memcpy(destination, buffer->data + buffer->start, length); + buffer->start += length; + return UDS_SUCCESS; +} + +/**********************************************************************/ +byte *getBufferContents(Buffer *buffer) +{ + return buffer->data + buffer->start; +} + +/**********************************************************************/ +int copyBytes(Buffer *buffer, size_t length, byte **destinationPtr) +{ + byte *destination; + int result = ALLOCATE(length, byte, "copyBytes() buffer", + &destination); + if (result != UDS_SUCCESS) { + return result; + } + + result = getBytesFromBuffer(buffer, length, destination); + if (result != UDS_SUCCESS) { + FREE(destination); + } else { + *destinationPtr = destination; + } + return result; +} + 
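To see how the byte-level accessors above interact, here is a small round-trip sketch. It is illustrative only and not part of the patch: it assumes an empty buffer with at least three bytes of space, and the data values and the choice of UDS_BUFFER_ERROR for a mismatch are assumptions.

/* Illustrative only: append three bytes, verify them without consuming
 * them, then copy them out, which advances the start pointer. */
static int exampleByteRoundTrip(Buffer *buffer)
{
  static const byte expected[] = { 0x55, 0x44, 0x53 };
  byte copy[sizeof(expected)];
  unsigned int i;
  for (i = 0; i < sizeof(expected); i++) {
    int result = putByte(buffer, expected[i]);
    if (result != UDS_SUCCESS) {
      return result;
    }
  }

  /* hasSameBytes() compares against the unconsumed content without
   * moving the start pointer. */
  if (!hasSameBytes(buffer, expected, sizeof(expected))) {
    return UDS_BUFFER_ERROR;
  }

  /* getBytesFromBuffer() copies the data out and advances the start
   * pointer past it. */
  return getBytesFromBuffer(buffer, sizeof(copy), copy);
}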
+/**********************************************************************/ +int putBytes(Buffer *buffer, size_t length, const void *source) +{ + if (!ensureAvailableSpace(buffer, length)) { + return UDS_BUFFER_ERROR; + } + memcpy(buffer->data + buffer->end, source, length); + buffer->end += length; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putBuffer(Buffer *target, Buffer *source, size_t length) +{ + if (contentLength(source) < length) { + return UDS_BUFFER_ERROR; + } + + int result = putBytes(target, length, getBufferContents(source)); + if (result != UDS_SUCCESS) { + return result; + } + + source->start += length; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int zeroBytes(Buffer *buffer, size_t length) +{ + if (!ensureAvailableSpace(buffer, length)) { + return UDS_BUFFER_ERROR; + } + memset(buffer->data + buffer->end, 0, length); + buffer->end += length; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getBoolean(Buffer *buffer, bool *b) +{ + byte by; + int result = getByte(buffer, &by); + if (result == UDS_SUCCESS) { + *b = (by == 1); + } + return result; +} + +/**********************************************************************/ +int putBoolean(Buffer *buffer, bool b) +{ + return putByte(buffer, (byte) (b ? 1 : 0)); +} + +/**********************************************************************/ +int getUInt16BEFromBuffer(Buffer *buffer, uint16_t *ui) +{ + if (contentLength(buffer) < sizeof(uint16_t)) { + return UDS_BUFFER_ERROR; + } + + decodeUInt16BE(buffer->data, &buffer->start, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putUInt16BEIntoBuffer(Buffer *buffer, uint16_t ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint16_t))) { + return UDS_BUFFER_ERROR; + } + + encodeUInt16BE(buffer->data, &buffer->end, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getUInt32BEFromBuffer(Buffer *buffer, uint32_t *ui) +{ + if (contentLength(buffer) < sizeof(uint32_t)) { + return UDS_BUFFER_ERROR; + } + + decodeUInt32BE(buffer->data, &buffer->start, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putUInt32BEIntoBuffer(Buffer *buffer, uint32_t ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint32_t))) { + return UDS_BUFFER_ERROR; + } + + encodeUInt32BE(buffer->data, &buffer->end, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getUInt32BEsFromBuffer(Buffer *buffer, size_t count, uint32_t *ui) +{ + if (contentLength(buffer) < (sizeof(uint32_t) * count)) { + return UDS_BUFFER_ERROR; + } + + unsigned int i; + for (i = 0; i < count; i++) { + decodeUInt32BE(buffer->data, &buffer->start, ui + i); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putUInt32BEsIntoBuffer(Buffer *buffer, size_t count, const uint32_t *ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint32_t) * count)) { + return UDS_BUFFER_ERROR; + } + + unsigned int i; + for (i = 0; i < count; i++) { + encodeUInt32BE(buffer->data, &buffer->end, ui[i]); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getUInt64BEsFromBuffer(Buffer *buffer, size_t count, uint64_t *ui) +{ + if 
(contentLength(buffer) < (sizeof(uint64_t) * count)) { + return UDS_BUFFER_ERROR; + } + + unsigned int i; + for (i = 0; i < count; i++) { + decodeUInt64BE(buffer->data, &buffer->start, ui + i); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putUInt64BEsIntoBuffer(Buffer *buffer, size_t count, const uint64_t *ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint64_t) * count)) { + return UDS_BUFFER_ERROR; + } + + unsigned int i; + for (i = 0; i < count; i++) { + encodeUInt64BE(buffer->data, &buffer->end, ui[i]); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getUInt16LEFromBuffer(Buffer *buffer, uint16_t *ui) +{ + if (contentLength(buffer) < sizeof(uint16_t)) { + return UDS_BUFFER_ERROR; + } + + decodeUInt16LE(buffer->data, &buffer->start, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putUInt16LEIntoBuffer(Buffer *buffer, uint16_t ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint16_t))) { + return UDS_BUFFER_ERROR; + } + + encodeUInt16LE(buffer->data, &buffer->end, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getUInt16LEsFromBuffer(Buffer *buffer, size_t count, uint16_t *ui) +{ + if (contentLength(buffer) < (sizeof(uint16_t) * count)) { + return UDS_BUFFER_ERROR; + } + + unsigned int i; + for (i = 0; i < count; i++) { + decodeUInt16LE(buffer->data, &buffer->start, ui + i); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putUInt16LEsIntoBuffer(Buffer *buffer, size_t count, const uint16_t *ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint16_t) * count)) { + return UDS_BUFFER_ERROR; + } + + unsigned int i; + for (i = 0; i < count; i++) { + encodeUInt16LE(buffer->data, &buffer->end, ui[i]); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getInt32LEFromBuffer(Buffer *buffer, int32_t *i) +{ + if (contentLength(buffer) < sizeof(int32_t)) { + return UDS_BUFFER_ERROR; + } + + decodeInt32LE(buffer->data, &buffer->start, i); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getUInt32LEFromBuffer(Buffer *buffer, uint32_t *ui) +{ + if (contentLength(buffer) < sizeof(uint32_t)) { + return UDS_BUFFER_ERROR; + } + + decodeUInt32LE(buffer->data, &buffer->start, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putUInt32LEIntoBuffer(Buffer *buffer, uint32_t ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint32_t))) { + return UDS_BUFFER_ERROR; + } + + encodeUInt32LE(buffer->data, &buffer->end, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putInt64LEIntoBuffer(Buffer *buffer, int64_t i) +{ + if (!ensureAvailableSpace(buffer, sizeof(int64_t))) { + return UDS_BUFFER_ERROR; + } + + encodeInt64LE(buffer->data, &buffer->end, i); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getUInt64LEFromBuffer(Buffer *buffer, uint64_t *ui) +{ + if (contentLength(buffer) < sizeof(uint64_t)) { + return UDS_BUFFER_ERROR; + } + + decodeUInt64LE(buffer->data, &buffer->start, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int 
putUInt64LEIntoBuffer(Buffer *buffer, uint64_t ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint64_t))) { + return UDS_BUFFER_ERROR; + } + + encodeUInt64LE(buffer->data, &buffer->end, ui); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getUInt64LEsFromBuffer(Buffer *buffer, size_t count, uint64_t *ui) +{ + if (contentLength(buffer) < (sizeof(uint64_t) * count)) { + return UDS_BUFFER_ERROR; + } + + unsigned int i; + for (i = 0; i < count; i++) { + decodeUInt64LE(buffer->data, &buffer->start, ui + i); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int putUInt64LEsIntoBuffer(Buffer *buffer, size_t count, const uint64_t *ui) +{ + if (!ensureAvailableSpace(buffer, sizeof(uint64_t) * count)) { + return UDS_BUFFER_ERROR; + } + + unsigned int i; + for (i = 0; i < count; i++) { + encodeUInt64LE(buffer->data, &buffer->end, ui[i]); + } + return UDS_SUCCESS; +} + diff --git a/source/uds/buffer.h b/source/uds/buffer.h new file mode 100644 index 0000000..22df042 --- /dev/null +++ b/source/uds/buffer.h @@ -0,0 +1,614 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/buffer.h#2 $ + */ + +#ifndef BUFFER_H +#define BUFFER_H + +#include "common.h" + +typedef struct buffer Buffer; + +/** + * Create a buffer which wraps an existing byte array. + * + * @param bytes The bytes to wrap + * @param length The length of the buffer + * @param contentLength The length of the current contents of the buffer + * @param bufferPtr A pointer to hold the buffer + * + * @return UDS_SUCCESS or an error code + **/ +int wrapBuffer(byte *bytes, + size_t length, + size_t contentLength, + Buffer **bufferPtr) + __attribute__((warn_unused_result)); + +/** + * Create a new buffer and allocate its memory. + * + * @param length The length of the buffer + * @param bufferPtr A pointer to hold the buffer + * + * @return UDS_SUCCESS or an error code + **/ +int makeBuffer(size_t length, Buffer **bufferPtr) + __attribute__((warn_unused_result)); + +/** + * Release a buffer and, if not wrapped, free its memory. + * + * @param pBuffer Pointer to the buffer to release + **/ +void freeBuffer(Buffer **pBuffer); + +/** + * Grow a non-wrapped buffer. + * + * @param buffer The buffer to resize + * @param length The new length of the buffer + * + * @return UDS_SUCCESS or an error code + **/ +int growBuffer(Buffer *buffer, size_t length) + __attribute__((warn_unused_result)); + +/** + * Ensure that a buffer has a given amount of space available, compacting the + * buffer if necessary. 
+ * + * @param buffer The buffer + * @param bytes The number of available bytes desired + * + * @return true if the requested number of bytes are now available + **/ +bool ensureAvailableSpace(Buffer *buffer, size_t bytes) + __attribute__((warn_unused_result)); + +/** + * Clear the buffer. The start position is set to zero and the end position + * is set to the buffer length. + **/ +void clearBuffer(Buffer *buffer); + +/** + * Eliminate buffer contents which have been extracted. This function copies + * any data between the start and end pointers to the beginning of the buffer, + * moves the start pointer to the beginning, and the end pointer to the end + * of the copied data. + * + * @param buffer The buffer to compact + **/ +void compactBuffer(Buffer *buffer); + +/** + * Skip forward the specified number of bytes in a buffer (advance the + * start pointer). + * + * @param buffer The buffer + * @param bytesToSkip The number of bytes to skip + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the buffer is not long + * enough to skip forward the requested number of bytes + **/ +int skipForward(Buffer *buffer, size_t bytesToSkip) + __attribute__((warn_unused_result)); + +/** + * Rewind the specified number of bytes in a buffer (back up the start + * pointer). + * + * @param buffer The buffer + * @param bytesToRewind The number of bytes to rewind + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the buffer is not long + * enough to rewind backward the requested number of bytes + **/ +int rewindBuffer(Buffer *buffer, size_t bytesToRewind) + __attribute__((warn_unused_result)); + +/** + * Return the length of the buffer. + * + * @param buffer the buffer + * + * @return the buffer length + **/ +size_t bufferLength(Buffer *buffer); + +/** + * Compute the amount of data current in the buffer. + * + * @param buffer The buffer to examine + * + * @return The number of bytes between the start and end pointers of the buffer + **/ +size_t contentLength(Buffer *buffer); + +/** + * Compute the amount of available space in this buffer. + * + * @param buffer The buffer to examine + * + * @return The number of bytes between the end pointer and the end of the buffer + **/ +size_t availableSpace(Buffer *buffer); + +/** + * Amount of buffer that has already been processed. + * + * @param buffer the buffer to examine + * + * @return The number of bytes between the beginning of the buffer and the + * start pointer. + **/ +size_t uncompactedAmount(Buffer *buffer); + +/** + * Return the amount of the buffer that is currently utilized. + * + * @param buffer the buffer to examine + * + * @return The number of bytes between the beginning of the buffer and + * the end pointer. + **/ +size_t bufferUsed(Buffer *buffer); + +/** + * Reset the end of buffer to a different position. + * + * @param buffer the buffer + * @param end the new end of the buffer + * + * @return UDS_SUCCESS unless the end is larger than can fit + **/ +int resetBufferEnd(Buffer *buffer, size_t end) + __attribute__((warn_unused_result)); + +/** + * Check whether the start of the content of a buffer matches a specified + * array of bytes. + * + * @param buffer The buffer to check + * @param data The desired data + * @param length The length of the desired data + * + * @return true if the first length bytes of the buffer's + * contents match data + **/ +bool hasSameBytes(Buffer *buffer, const byte *data, size_t length) + __attribute__((warn_unused_result)); + +/** + * Check whether two buffers have the same contents. 
+ * + * @param buffer1 The first buffer + * @param buffer2 The second buffer + * + * @return true if the contents of the two buffers are the + * same + **/ +bool equalBuffers(Buffer *buffer1, Buffer *buffer2); + +/** + * Get a single byte from a buffer and advance the start pointer. + * + * @param buffer The buffer + * @param bytePtr A pointer to hold the byte + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are no bytes to + * retrieve + **/ +int getByte(Buffer *buffer, byte *bytePtr) __attribute__((warn_unused_result)); + +/** + * Get a single byte from a buffer without advancing the start pointer. + * + * @param buffer The buffer + * @param offset The offset past the start pointer of the desired byte + * @param bytePtr A pointer to hold the byte + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the offset is past the end + * of the buffer + **/ +int peekByte(Buffer *buffer, size_t offset, byte *bytePtr) + __attribute__((warn_unused_result)); + +/** + * Put a single byte into a buffer and advance the end pointer. + * + * @param buffer The buffer + * @param b The byte to put + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is no space in the buffer + **/ +int putByte(Buffer *buffer, byte b) __attribute__((warn_unused_result)); + +/** + * Get bytes out of a buffer and advance the start of the buffer past the + * copied data. + * + * @param buffer The buffer from which to copy + * @param length The number of bytes to copy + * @param destination A pointer to hold the data + * + * @return UDS_SUCCESS or an error code + **/ +int getBytesFromBuffer(Buffer *buffer, size_t length, void *destination) + __attribute__((warn_unused_result)); + +/** + * Get a pointer to the current contents of the buffer. This will be a pointer + * to the actual memory managed by the buffer. It is the caller's responsibility + * to ensure that the buffer is not modified while this pointer is in use. + * + * @param buffer The buffer from which to get the contents + * + * @return a pointer to the current contents of the buffer + **/ +byte *getBufferContents(Buffer *buffer); + +/** + * Copy bytes out of a buffer and advance the start of the buffer past the + * copied data. Memory will be allocated to hold the copy. + * + * @param buffer The buffer from which to copy + * @param length The number of bytes to copy + * @param destinationPtr A pointer to hold the copied data + * + * @return UDS_SUCCESS or an error code + **/ +int copyBytes(Buffer *buffer, size_t length, byte **destinationPtr) + __attribute__((warn_unused_result)); + +/** + * Copy bytes into a buffer and advance the end of the buffer past the + * copied data. + * + * @param buffer The buffer to copy into + * @param length The length of the data to copy + * @param source The data to copy + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the buffer does not have + * length bytes available + **/ +int putBytes(Buffer *buffer, size_t length, const void *source) + __attribute__((warn_unused_result)); + +/** + * Copy the contents of a source buffer into the target buffer. Advances the + * start of the source buffer and the end of the target buffer past the copied + * data. 
+ * + * @param target The buffer to receive the copy of the data + * @param source The buffer containing the data to copy + * @param length The length of the data to copy + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the target buffer does not have + * length bytes available or if the source buffer does not have length + * bytes of content + **/ +int putBuffer(Buffer *target, Buffer *source, size_t length) + __attribute__((warn_unused_result)); + +/** + * Zero bytes in a buffer starting at the start pointer, and advance the + * end of the buffer past the zeros. + * + * @param buffer The buffer to zero + * @param length The number of bytes to zero + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the buffer does not have + * length bytes available + **/ +int zeroBytes(Buffer *buffer, size_t length) + __attribute__((warn_unused_result)); + +/** + * Get a boolean value from a buffer and advance the start pointer. + * + * @param buffer The buffer + * @param b A pointer to hold the boolean value + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough data + * in the buffer + **/ +int getBoolean(Buffer *buffer, bool *b) __attribute__((warn_unused_result)); + +/** + * Put a boolean value into a buffer and advance the end pointer. + * + * @param buffer The buffer + * @param b The boolean to put + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is no space in the buffer + **/ +int putBoolean(Buffer *buffer, bool b) __attribute__((warn_unused_result)); + +/** + * Get a 2 byte, big endian encoded integer from a buffer and advance the + * start pointer past it. + * + * @param buffer The buffer + * @param ui A pointer to hold the integer + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 2 + * bytes available + **/ +int getUInt16BEFromBuffer(Buffer *buffer, uint16_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put a 2 byte, big endian encoded integer into a buffer and advance the + * end pointer past it. + * + * @param buffer The buffer + * @param ui The integer to put + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 2 + * bytes available + **/ +int putUInt16BEIntoBuffer(Buffer *buffer, uint16_t ui) + __attribute__((warn_unused_result)); + +/** + * Get a 4 byte, big endian encoded integer from a buffer and advance the + * start pointer past it. + * + * @param buffer The buffer + * @param ui A pointer to hold the integer + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 4 + * bytes available + **/ +int getUInt32BEFromBuffer(Buffer *buffer, uint32_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put a 4 byte, big endian encoded integer into a buffer and advance the + * end pointer past it. + * + * @param buffer The buffer + * @param ui The integer to put + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 4 + * bytes available + **/ +int putUInt32BEIntoBuffer(Buffer *buffer, uint32_t ui) + __attribute__((warn_unused_result)); + +/** + * Get a series of 4 byte, big endian encoded integer from a buffer and + * advance the start pointer past them. 
+ * + * @param buffer The buffer + * @param count The number of integers to get + * @param ui A pointer to hold the integers + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough data + * in the buffer + **/ +int getUInt32BEsFromBuffer(Buffer *buffer, size_t count, uint32_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put a series of 4 byte, big endian encoded integers into a buffer and + * advance the end pointer past them. + * + * @param buffer The buffer + * @param count The number of integers to put + * @param ui A pointer to the integers + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough space + * in the buffer + **/ +int putUInt32BEsIntoBuffer(Buffer *buffer, size_t count, const uint32_t *ui) + __attribute__((warn_unused_result)); + +/** + * Get a series of 8 byte, big endian encoded integer from a buffer and + * advance the start pointer past them. + * + * @param buffer The buffer + * @param count The number of integers to get + * @param ui A pointer to hold the integers + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough data + * in the buffer + **/ +int getUInt64BEsFromBuffer(Buffer *buffer, size_t count, uint64_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put a series of 8 byte, big endian encoded integers into a buffer and + * advance the end pointer past them. + * + * @param buffer The buffer + * @param count The number of integers to put + * @param ui A pointer to the integers + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough space + * in the buffer + **/ +int putUInt64BEsIntoBuffer(Buffer *buffer, size_t count, const uint64_t *ui) + __attribute__((warn_unused_result)); + +/** + * Get a 2 byte, little endian encoded integer from a buffer and + * advance the start pointer past it. + * + * @param buffer The buffer + * @param ui A pointer to hold the integer + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 2 + * bytes available + **/ +int getUInt16LEFromBuffer(Buffer *buffer, uint16_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put a 2 byte, little endian encoded integer into a buffer and advance the + * end pointer past it. + * + * @param buffer The buffer + * @param ui The integer to put + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 2 + * bytes available + **/ +int putUInt16LEIntoBuffer(Buffer *buffer, uint16_t ui) + __attribute__((warn_unused_result)); + +/** + * Get a series of 2 byte, little endian encoded integer from a buffer + * and advance the start pointer past them. + * + * @param buffer The buffer + * @param count The number of integers to get + * @param ui A pointer to hold the integers + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough data + * in the buffer + **/ +int getUInt16LEsFromBuffer(Buffer *buffer, size_t count, uint16_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put a series of 2 byte, little endian encoded integers into a + * buffer and advance the end pointer past them. + * + * @param buffer The buffer + * @param count The number of integers to put + * @param ui A pointer to the integers + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough space + * in the buffer + **/ +int putUInt16LEsIntoBuffer(Buffer *buffer, size_t count, const uint16_t *ui) + __attribute__((warn_unused_result)); + +/** + * Get a 4 byte, little endian encoded integer from a buffer and advance the + * start pointer past it. 
+ * + * @param buffer The buffer + * @param i A pointer to hold the integer + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 4 + * bytes available + **/ +int getInt32LEFromBuffer(Buffer *buffer, int32_t *i) + __attribute__((warn_unused_result)); + +/** + * Get a 4 byte, little endian encoded integer from a buffer and advance the + * start pointer past it. + * + * @param buffer The buffer + * @param ui A pointer to hold the integer + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 4 + * bytes available + **/ +int getUInt32LEFromBuffer(Buffer *buffer, uint32_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put a 4 byte, little endian encoded integer into a buffer and advance the + * end pointer past it. + * + * @param buffer The buffer + * @param ui The integer to put + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 4 + * bytes available + **/ +int putUInt32LEIntoBuffer(Buffer *buffer, uint32_t ui) + __attribute__((warn_unused_result)); + +/** + * Get an 8 byte, little endian encoded, unsigned integer from a + * buffer and advance the start pointer past it. + * + * @param buffer The buffer + * @param ui A pointer to hold the integer + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 8 + * bytes available + **/ +int getUInt64LEFromBuffer(Buffer *buffer, uint64_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put an 8 byte, little endian encoded signed integer into a buffer + * and advance the end pointer past it. + * + * @param buffer The buffer + * @param i The integer to put + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 8 + * bytes available + **/ +int putInt64LEIntoBuffer(Buffer *buffer, int64_t i) + __attribute__((warn_unused_result)); + + /** + * Put an 8 byte, little endian encoded integer into a buffer and advance the + * end pointer past it. + * + * @param buffer The buffer + * @param ui The integer to put + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 8 + * bytes available + **/ +int putUInt64LEIntoBuffer(Buffer *buffer, uint64_t ui) + __attribute__((warn_unused_result)); + +/** + * Get a series of 8 byte, little endian encoded integer from a buffer + * and advance the start pointer past them. + * + * @param buffer The buffer + * @param count The number of integers to get + * @param ui A pointer to hold the integers + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough data + * in the buffer + **/ +int getUInt64LEsFromBuffer(Buffer *buffer, size_t count, uint64_t *ui) + __attribute__((warn_unused_result)); + +/** + * Put a series of 8 byte, little endian encoded integers into a buffer and + * advance the end pointer past them. + * + * @param buffer The buffer + * @param count The number of integers to put + * @param ui A pointer to the integers + * + * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough space + * in the buffer + **/ +int putUInt64LEsIntoBuffer(Buffer *buffer, size_t count, const uint64_t *ui) + __attribute__((warn_unused_result)); + +#endif /* BUFFER_H */ diff --git a/source/uds/bufferPrivate.h b/source/uds/bufferPrivate.h new file mode 100644 index 0000000..8a0f46a --- /dev/null +++ b/source/uds/bufferPrivate.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/bufferPrivate.h#1 $ + */ + +#ifndef BUFFER_PRIVATE_H +#define BUFFER_PRIVATE_H + +#include "common.h" + +struct buffer { + size_t start; + size_t end; + size_t length; + byte *data; + bool wrapped; +}; + +#endif /* BUFFER_PRIVATE_H */ diff --git a/source/uds/bufferedReader.c b/source/uds/bufferedReader.c new file mode 100644 index 0000000..b67d33d --- /dev/null +++ b/source/uds/bufferedReader.c @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/bufferedReader.c#5 $ + */ + +#include "bufferedReader.h" + +#include "compiler.h" +#include "ioFactory.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" + +#ifndef __KERNEL__ +/* + * Define sector_t. The kernel really wants us to use it. The code becomes + * ugly if we need to #ifdef every usage of sector_t. Note that the of #define + * means that even if a user mode include typedefs sector_t, it will not affect + * this module. 
+ */ +#define sector_t uint64_t +#endif + +struct bufferedReader { +#ifdef __KERNEL__ + // IOFactory owning the block device + IOFactory *br_factory; + // The dm_bufio_client to read from + struct dm_bufio_client *br_client; + // The current dm_buffer + struct dm_buffer *br_buffer; + // The number of blocks that can be read from + sector_t br_limit; + // Number of the current block + sector_t br_blockNumber; +#else + // Region to read from + IORegion *br_region; + // Number of the current block + uint64_t br_blockNumber; +#endif + // Start of the buffer + byte *br_start; + // End of the data read from the buffer + byte *br_pointer; +}; + +#ifdef __KERNEL__ +/*****************************************************************************/ +static void readAhead(BufferedReader *br, sector_t blockNumber) +{ + if (blockNumber < br->br_limit) { + enum { MAX_READ_AHEAD = 4 }; + size_t readAhead = minSizeT(MAX_READ_AHEAD, br->br_limit - blockNumber); + dm_bufio_prefetch(br->br_client, blockNumber, readAhead); + } +} +#endif + +/*****************************************************************************/ +#ifdef __KERNEL__ +int makeBufferedReader(IOFactory *factory, + struct dm_bufio_client *client, + sector_t blockLimit, + BufferedReader **readerPtr) +{ + BufferedReader *reader = NULL; + int result = ALLOCATE(1, BufferedReader, "buffered reader", &reader); + if (result != UDS_SUCCESS) { + return result; + } + + *reader = (BufferedReader) { + .br_factory = factory, + .br_client = client, + .br_buffer = NULL, + .br_limit = blockLimit, + .br_blockNumber = 0, + .br_start = NULL, + .br_pointer = NULL, + }; + + readAhead(reader,0); + getIOFactory(factory); + *readerPtr = reader; + return UDS_SUCCESS; +} +#else +int makeBufferedReader(IORegion *region, BufferedReader **readerPtr) +{ + byte *data; + int result = ALLOCATE_IO_ALIGNED(UDS_BLOCK_SIZE, byte, + "buffer writer buffer", &data); + if (result != UDS_SUCCESS) { + return result; + } + + BufferedReader *reader = NULL; + result = ALLOCATE(1, BufferedReader, "buffered reader", &reader); + if (result != UDS_SUCCESS) { + FREE(data); + return result; + } + + *reader = (BufferedReader) { + .br_region = region, + .br_blockNumber = 0, + .br_start = data, + .br_pointer = NULL, + }; + + getIORegion(region); + *readerPtr = reader; + return UDS_SUCCESS; +} +#endif + +/*****************************************************************************/ +void freeBufferedReader(BufferedReader *br) +{ + if (br == NULL) { + return; + } +#ifdef __KERNEL__ + if (br->br_buffer != NULL) { + dm_bufio_release(br->br_buffer); + } + dm_bufio_client_destroy(br->br_client); + putIOFactory(br->br_factory); +#else + putIORegion(br->br_region); + FREE(br->br_start); +#endif + FREE(br); +} + +/*****************************************************************************/ +static int positionReader(BufferedReader *br, + sector_t blockNumber, + off_t offset) +{ + if ((br->br_pointer == NULL) || (blockNumber != br->br_blockNumber)) { +#ifdef __KERNEL__ + if (blockNumber >= br->br_limit) { + return UDS_OUT_OF_RANGE; + } + if (br->br_buffer != NULL) { + dm_bufio_release(br->br_buffer); + br->br_buffer = NULL; + } + struct dm_buffer *buffer = NULL; + void *data = dm_bufio_read(br->br_client, blockNumber, &buffer); + if (IS_ERR(data)) { + return -PTR_ERR(data); + } + br->br_buffer = buffer; + br->br_start = data; + if (blockNumber == br->br_blockNumber + 1) { + readAhead(br, blockNumber + 1); + } +#else + int result = readFromRegion(br->br_region, blockNumber * UDS_BLOCK_SIZE, + br->br_start, 
UDS_BLOCK_SIZE, NULL); + if (result != UDS_SUCCESS) { + logWarningWithStringError(result, "%s got readFromRegion error", + __func__); + return result; + } +#endif + } + br->br_blockNumber = blockNumber; + br->br_pointer = br->br_start + offset; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static size_t bytesRemainingInReadBuffer(BufferedReader *br) +{ + return (br->br_pointer == NULL + ? 0 + : br->br_start + UDS_BLOCK_SIZE - br->br_pointer); +} + +/*****************************************************************************/ +int readFromBufferedReader(BufferedReader *br, void *data, size_t length) +{ + byte *dp = data; + int result = UDS_SUCCESS; + while (length > 0) { + if (bytesRemainingInReadBuffer(br) == 0) { + sector_t blockNumber = br->br_blockNumber; + if (br->br_pointer != NULL) { + ++blockNumber; + } + result = positionReader(br, blockNumber, 0); + if (result != UDS_SUCCESS) { + break; + } + } + + size_t avail = bytesRemainingInReadBuffer(br); + size_t chunk = minSizeT(length, avail); + memcpy(dp, br->br_pointer, chunk); + length -= chunk; + dp += chunk; + br->br_pointer += chunk; + } + + if (((result == UDS_OUT_OF_RANGE) || (result == UDS_END_OF_FILE)) + && (dp - (byte *) data > 0)) { + result = UDS_SHORT_READ; + } + return result; +} + +/*****************************************************************************/ +int verifyBufferedData(BufferedReader *br, + const void *value, + size_t length) +{ + const byte *vp = value; + sector_t startingBlockNumber = br->br_blockNumber; + int startingOffset = br->br_pointer - br->br_start; + while (length > 0) { + if (bytesRemainingInReadBuffer(br) == 0) { + sector_t blockNumber = br->br_blockNumber; + if (br->br_pointer != NULL) { + ++blockNumber; + } + int result = positionReader(br, blockNumber, 0); + if (result != UDS_SUCCESS) { + positionReader(br, startingBlockNumber, startingOffset); + return UDS_CORRUPT_FILE; + } + } + + size_t avail = bytesRemainingInReadBuffer(br); + size_t chunk = minSizeT(length, avail); + if (memcmp(vp, br->br_pointer, chunk) != 0) { + positionReader(br, startingBlockNumber, startingOffset); + return UDS_CORRUPT_FILE; + } + length -= chunk; + vp += chunk; + br->br_pointer += chunk; + } + + return UDS_SUCCESS; +} diff --git a/source/uds/bufferedReader.h b/source/uds/bufferedReader.h new file mode 100644 index 0000000..4da8119 --- /dev/null +++ b/source/uds/bufferedReader.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/bufferedReader.h#3 $ + */ + +#ifndef BUFFERED_READER_H +#define BUFFERED_READER_H 1 + +#include "common.h" + +#ifdef __KERNEL__ +struct dm_bufio_client; +struct ioFactory; +#else +struct ioRegion; +#endif + +/** + * The buffered reader allows efficient IO for IORegions, which may be + * file- or block-based. The internal buffer always reads aligned data + * from the underlying region. + **/ +typedef struct bufferedReader BufferedReader; + +#ifdef __KERNEL__ +/** + * Make a new buffered reader. + * + * @param factory The IOFactory creating the buffered reader. + * @param client The dm_bufio_client to read from. + * @param blockLimit The number of blocks that may be read. + * @param readerPtr The pointer to hold the newly allocated buffered reader + * + * @return UDS_SUCCESS or error code. + **/ +int makeBufferedReader(struct ioFactory *factory, + struct dm_bufio_client *client, + sector_t blockLimit, + BufferedReader **readerPtr) + __attribute__((warn_unused_result)); +#else +/** + * Make a new buffered reader. + * + * @param region An IORegion to read from. + * @param readerPtr The pointer to hold the newly allocated buffered reader. + * + * @return UDS_SUCCESS or error code. + **/ +int makeBufferedReader(struct ioRegion *region, BufferedReader **readerPtr) + __attribute__((warn_unused_result)); +#endif + +/** + * Free a buffered reader. + * + * @param reader The buffered reader + **/ +void freeBufferedReader(BufferedReader *reader); + +/** + * Retrieve data from a buffered reader, reading from the region when needed. + * + * @param reader The buffered reader + * @param data The buffer to read data into + * @param length The length of the data to read + * + * @return UDS_SUCCESS or an error code. + **/ +int readFromBufferedReader(BufferedReader *reader, void *data, size_t length) + __attribute__((warn_unused_result)); + +/** + * Verify that the data currently in the buffer matches the required value. + * + * @param reader The buffered reader. + * @param value The value that must match the buffer contents. + * @param length The length of the value that must match. + * + * @return UDS_SUCCESS or an error code, specifically UDS_CORRUPT_FILE + * if the required value fails to match. + * + * @note If the value matches, the matching contents are consumed. However, + * if the match fails, any buffer contents are left as is. + **/ +int verifyBufferedData(BufferedReader *reader, + const void *value, + size_t length) + __attribute__((warn_unused_result)); + +#endif // BUFFERED_READER_H diff --git a/source/uds/bufferedWriter.c b/source/uds/bufferedWriter.c new file mode 100644 index 0000000..abfb9cf --- /dev/null +++ b/source/uds/bufferedWriter.c @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/bufferedWriter.c#6 $ + */ + +#include "bufferedWriter.h" + +#include "compiler.h" +#include "errors.h" +#include "ioFactory.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" + + +struct bufferedWriter { +#ifdef __KERNEL__ + // IOFactory owning the block device + IOFactory *bw_factory; + // The dm_bufio_client to write to + struct dm_bufio_client *bw_client; + // The current dm_buffer + struct dm_buffer *bw_buffer; + // The number of blocks that can be written to + sector_t bw_limit; + // Number of the current block + sector_t bw_blockNumber; +#else + // Region to write to + IORegion *bw_region; + // Number of the current block + uint64_t bw_blockNumber; +#endif + // Start of the buffer + byte *bw_start; + // End of the data written to the buffer + byte *bw_pointer; + // Error code + int bw_error; + // Have writes been done? + bool bw_used; +}; + +#ifdef __KERNEL__ +/*****************************************************************************/ +__attribute__((warn_unused_result)) +int prepareNextBuffer(BufferedWriter *bw) +{ + if (bw->bw_blockNumber >= bw->bw_limit) { + bw->bw_error = UDS_OUT_OF_RANGE; + return UDS_OUT_OF_RANGE; + } + + struct dm_buffer *buffer = NULL; + void *data = dm_bufio_new(bw->bw_client, bw->bw_blockNumber, &buffer); + if (IS_ERR(data)) { + bw->bw_error = -PTR_ERR(data); + return bw->bw_error; + } + bw->bw_buffer = buffer; + bw->bw_start = data; + bw->bw_pointer = data; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int flushPreviousBuffer(BufferedWriter *bw) +{ + if (bw->bw_buffer != NULL) { + if (bw->bw_error == UDS_SUCCESS) { + size_t avail = spaceRemainingInWriteBuffer(bw); + if (avail > 0) { + memset(bw->bw_pointer, 0, avail); + } + dm_bufio_mark_buffer_dirty(bw->bw_buffer); + } + dm_bufio_release(bw->bw_buffer); + bw->bw_buffer = NULL; + bw->bw_start = NULL; + bw->bw_pointer = NULL; + bw->bw_blockNumber++; + } + return bw->bw_error; +} +#endif + +/*****************************************************************************/ +#ifdef __KERNEL__ +int makeBufferedWriter(IOFactory *factory, + struct dm_bufio_client *client, + sector_t blockLimit, + BufferedWriter **writerPtr) +{ + BufferedWriter *writer; + int result = ALLOCATE(1, BufferedWriter, "buffered writer", &writer); + if (result != UDS_SUCCESS) { + return result; + } + + *writer = (BufferedWriter) { + .bw_factory = factory, + .bw_client = client, + .bw_buffer = NULL, + .bw_limit = blockLimit, + .bw_start = NULL, + .bw_pointer = NULL, + .bw_blockNumber = 0, + .bw_error = UDS_SUCCESS, + .bw_used = false, + }; + + getIOFactory(factory); + *writerPtr = writer; + return UDS_SUCCESS; +} +#else +int makeBufferedWriter(IORegion *region, BufferedWriter **writerPtr) +{ + byte *data; + int result = ALLOCATE_IO_ALIGNED(UDS_BLOCK_SIZE, byte, + "buffer writer buffer", &data); + if (result != UDS_SUCCESS) { + return result; + } + + BufferedWriter *writer; + result = ALLOCATE(1, BufferedWriter, "buffered writer", &writer); + if (result != UDS_SUCCESS) { + FREE(data); + return result; + } + + *writer = (BufferedWriter) { + .bw_region = region, + .bw_start = data, + .bw_pointer = data, + .bw_blockNumber = 0, + .bw_error = UDS_SUCCESS, + .bw_used = false, + }; + + getIORegion(region); + *writerPtr = writer; + return UDS_SUCCESS; +} +#endif + +/*****************************************************************************/ +void freeBufferedWriter(BufferedWriter *bw) +{ + if (bw == NULL) { + 
return; + } +#ifdef __KERNEL__ + flushPreviousBuffer(bw); + int result = -dm_bufio_write_dirty_buffers(bw->bw_client); +#else + int result = syncRegionContents(bw->bw_region); +#endif + if (result != UDS_SUCCESS) { + logWarningWithStringError(result, "%s cannot sync storage", __func__); + } +#ifdef __KERNEL__ + dm_bufio_client_destroy(bw->bw_client); + putIOFactory(bw->bw_factory); +#else + putIORegion(bw->bw_region); + FREE(bw->bw_start); +#endif + FREE(bw); +} + +/*****************************************************************************/ +static INLINE size_t spaceUsedInBuffer(BufferedWriter *bw) +{ + return bw->bw_pointer - bw->bw_start; +} + +/*****************************************************************************/ +size_t spaceRemainingInWriteBuffer(BufferedWriter *bw) +{ + return UDS_BLOCK_SIZE - spaceUsedInBuffer(bw); +} + +/*****************************************************************************/ +int writeToBufferedWriter(BufferedWriter *bw, const void *data, size_t len) +{ + if (bw->bw_error != UDS_SUCCESS) { + return bw->bw_error; + } + + const byte *dp = data; + int result = UDS_SUCCESS; + while ((len > 0) && (result == UDS_SUCCESS)) { +#ifdef __KERNEL__ + if (bw->bw_buffer == NULL) { + result = prepareNextBuffer(bw); + continue; + } +#endif + + size_t avail = spaceRemainingInWriteBuffer(bw); + size_t chunk = minSizeT(len, avail); + memcpy(bw->bw_pointer, dp, chunk); + len -= chunk; + dp += chunk; + bw->bw_pointer += chunk; + + if (spaceRemainingInWriteBuffer(bw) == 0) { + result = flushBufferedWriter(bw); + } + } + + bw->bw_used = true; + return result; +} + +/*****************************************************************************/ +int writeZerosToBufferedWriter(BufferedWriter *bw, size_t len) +{ + if (bw->bw_error != UDS_SUCCESS) { + return bw->bw_error; + } + + int result = UDS_SUCCESS; + while ((len > 0) && (result == UDS_SUCCESS)) { +#ifdef __KERNEL__ + if (bw->bw_buffer == NULL) { + result = prepareNextBuffer(bw); + continue; + } +#endif + + size_t avail = spaceRemainingInWriteBuffer(bw); + size_t chunk = minSizeT(len, avail); + memset(bw->bw_pointer, 0, chunk); + len -= chunk; + bw->bw_pointer += chunk; + + if (spaceRemainingInWriteBuffer(bw) == 0) { + result = flushBufferedWriter(bw); + } + } + + bw->bw_used = true; + return result; +} + +/*****************************************************************************/ +int flushBufferedWriter(BufferedWriter *bw) +{ + if (bw->bw_error != UDS_SUCCESS) { + return bw->bw_error; + } + +#ifdef __KERNEL__ + return flushPreviousBuffer(bw); +#else + size_t n = spaceUsedInBuffer(bw); + if (n > 0) { + int result = writeToRegion(bw->bw_region, + bw->bw_blockNumber * UDS_BLOCK_SIZE, + bw->bw_start, UDS_BLOCK_SIZE, n); + if (result != UDS_SUCCESS) { + return bw->bw_error = result; + } else { + bw->bw_pointer = bw->bw_start; + bw->bw_blockNumber++; + } + } + return UDS_SUCCESS; +#endif +} + +/*****************************************************************************/ +bool wasBufferedWriterUsed(const BufferedWriter *bw) +{ + return bw->bw_used; +} + +/*****************************************************************************/ +void noteBufferedWriterUsed(BufferedWriter *bw) +{ + bw->bw_used = true; +} diff --git a/source/uds/bufferedWriter.h b/source/uds/bufferedWriter.h new file mode 100644 index 0000000..8774b5b --- /dev/null +++ b/source/uds/bufferedWriter.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/bufferedWriter.h#5 $ + */ + +#ifndef BUFFERED_WRITER_H +#define BUFFERED_WRITER_H 1 + +#include "common.h" + +#ifdef __KERNEL__ +struct dm_bufio_client; +struct ioFactory; +#else +struct ioRegion; +#endif + +typedef struct bufferedWriter BufferedWriter; + +#ifdef __KERNEL__ +/** + * Make a new buffered writer. + * + * @param factory The IOFactory creating the buffered writer + * @param client The dm_bufio_client to write to. + * @param blockLimit The number of blocks that may be written to. + * @param writerPtr The new buffered writer goes here. + * + * @return UDS_SUCCESS or an error code. + **/ +int makeBufferedWriter(struct ioFactory *factory, + struct dm_bufio_client *client, + sector_t blockLimit, + BufferedWriter **writerPtr) + __attribute__((warn_unused_result)); +#else +/** + * Make a new buffered writer. + * + * @param region The IOregion to write to. + * @param writerPtr The new buffered writer goes here. + * + * @return UDS_SUCCESS or an error code. + **/ +int makeBufferedWriter(struct ioRegion *region, BufferedWriter **writerPtr) + __attribute__((warn_unused_result)); +#endif + +/** + * Free a buffered writer, without flushing. + * + * @param [in] buffer The buffered writer object. + **/ +void freeBufferedWriter(BufferedWriter *buffer); + +/** + * Append data to buffer, writing as needed. + * + * @param buffer The buffered writer object. + * @param data The data to write. + * @param len The length of the data written. + * + * @return UDS_SUCCESS or an error code. + * The error may reflect previous attempts to write + * or flush the buffer. Once a write or flush error + * occurs it is sticky. + **/ +int writeToBufferedWriter(BufferedWriter *buffer, const void *data, size_t len) + __attribute__((warn_unused_result)); + +/** + * Zero data in the buffer, writing as needed. + * + * @param buffer The buffered writer object. + * @param len The number of zero bytes to write. + * + * @return UDS_SUCCESS or an error code. + * The error may reflect previous attempts to write + * or flush the buffer. Once a write or flush error + * occurs it is sticky. + **/ +int writeZerosToBufferedWriter(BufferedWriter *bw, size_t len) + __attribute__((warn_unused_result)); + + +/** + * Flush any partial data from the buffer. + * + * @param buffer The buffered writer object. + * + * @return UDS_SUCCESS or an error code. + * The error may reflect previous attempts to write + * or flush the buffer. Once a write or flush error + * occurs it is sticky. + **/ +int flushBufferedWriter(BufferedWriter *buffer) + __attribute__((warn_unused_result)); + +/** + * Return the size of the remaining space in the buffer (for testing) + * + * @param [in] buffer The buffered writer object. + * + * @return The number of available bytes in the buffer. 
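+ *
+ * An illustrative sketch (not part of the original documentation), assuming
+ * a user-space writer made over some IORegion "region":
+ *
+ *   BufferedWriter *writer;
+ *   if (makeBufferedWriter(region, &writer) == UDS_SUCCESS) {
+ *     // A fresh writer has a whole block of space available.
+ *     size_t avail = spaceRemainingInWriteBuffer(writer); // == UDS_BLOCK_SIZE
+ *     freeBufferedWriter(writer);
+ *   }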
+ **/ +size_t spaceRemainingInWriteBuffer(BufferedWriter *buffer) + __attribute__((warn_unused_result)); + +/** + * Return whether the buffer was ever written to. + * + * @param buffer The buffered writer object. + * + * @return True if at least one call to writeToBufferedWriter + * was made. + **/ +bool wasBufferedWriterUsed(const BufferedWriter *buffer) + __attribute__((warn_unused_result)); + +/** + * Note the buffer has been used. + * + * @param buffer The buffered writer object. + **/ +void noteBufferedWriterUsed(BufferedWriter *buffer); + +#endif // BUFFERED_WRITER_H diff --git a/source/uds/cacheCounters.c b/source/uds/cacheCounters.c new file mode 100644 index 0000000..8bf7ad4 --- /dev/null +++ b/source/uds/cacheCounters.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/cacheCounters.c#1 $ + */ + +#include "cacheCounters.h" + +#include "atomicDefs.h" +#include "compiler.h" +#include "errors.h" +#include "permassert.h" +#include "stringUtils.h" +#include "uds.h" + +/**********************************************************************/ +void incrementCacheCounter(CacheCounters *counters, + int probeType, + CacheResultKind kind) +{ + CacheProbeType basicProbeType = probeType & ~CACHE_PROBE_IGNORE_FAILURE; + int result = ASSERT(basicProbeType <= CACHE_PROBE_RECORD_RETRY, + "invalid cache probe type %#x", probeType); + if (result != UDS_SUCCESS) { + return; + } + result = ASSERT(kind <= CACHE_RESULT_QUEUED, + "invalid cache probe result type %#x", kind); + if (result != UDS_SUCCESS) { + return; + } + + if (((probeType & CACHE_PROBE_IGNORE_FAILURE) != 0) + && (kind != CACHE_RESULT_HIT)) { + return; + } + + CacheCountsByKind *kindCounts; + switch (basicProbeType) { + case CACHE_PROBE_INDEX_FIRST: + kindCounts = &counters->firstTime.indexPage; + break; + case CACHE_PROBE_RECORD_FIRST: + kindCounts = &counters->firstTime.recordPage; + break; + case CACHE_PROBE_INDEX_RETRY: + kindCounts = &counters->retried.indexPage; + break; + case CACHE_PROBE_RECORD_RETRY: + kindCounts = &counters->retried.recordPage; + break; + default: + // Never used but the compiler hasn't figured that out. + return; + } + + uint64_t *myCounter; + switch (kind) { + case CACHE_RESULT_MISS: + myCounter = &kindCounts->misses; + break; + case CACHE_RESULT_QUEUED: + myCounter = &kindCounts->queued; + break; + case CACHE_RESULT_HIT: + myCounter = &kindCounts->hits; + break; + default: + // Never used but the compiler hasn't figured that out. + return; + } + // XXX Vile case makes many assumptions. Counters should be declared atomic. 
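+  // The counters are plain uint64_t fields, so this cast assumes that
+  // atomic64_t has the same size and representation; that is the assumption
+  // the comment above is flagging. atomic64_inc() keeps concurrent
+  // increments from different threads from losing updates.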
+ atomic64_inc((atomic64_t *) myCounter); +} diff --git a/source/uds/cacheCounters.h b/source/uds/cacheCounters.h new file mode 100644 index 0000000..9029453 --- /dev/null +++ b/source/uds/cacheCounters.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/cacheCounters.h#1 $ + */ + +#ifndef CACHE_COUNTERS_H +#define CACHE_COUNTERS_H + +#include "typeDefs.h" + +/** + * Basic counts of hits and misses for a given type of cache probe. + **/ +typedef struct cacheCountsByKind { + /** Number of hits */ + uint64_t hits; + /** Number of misses */ + uint64_t misses; + /** Number of probes for data already queued for read */ + uint64_t queued; +} CacheCountsByKind; + +/** + * The various types of cache probes we care about. + **/ +typedef enum cacheProbeType { + /** First attempt to look up an index page, for a given request. */ + CACHE_PROBE_INDEX_FIRST = 0, + /** First attempt to look up a record page, for a given request. */ + CACHE_PROBE_RECORD_FIRST, + /** Second or later attempt to look up an index page, for a given request. */ + CACHE_PROBE_INDEX_RETRY, + /** Second or later attempt to look up a record page, for a given request. */ + CACHE_PROBE_RECORD_RETRY +} CacheProbeType; + +enum { + /** Flag bit to indicate that failures shouldn't be recorded. */ + CACHE_PROBE_IGNORE_FAILURE = 128 +}; + +/** + * Result-type counts for both kinds of data pages in the page cache. + **/ +typedef struct cacheCountsByPageType { + /** His/miss counts for index pages. */ + CacheCountsByKind indexPage; + /** Hit/miss counts for record pages. */ + CacheCountsByKind recordPage; +} CacheCountsByPageType; + +/** + * All the counters used for an entry cache. + **/ +typedef struct cacheCounters { + // counters for the page cache + /** Hit/miss counts for the first attempt per request */ + CacheCountsByPageType firstTime; + /** Hit/miss counts when a second (or later) attempt is needed */ + CacheCountsByPageType retried; + + /** Number of cache entry invalidations due to single-entry eviction */ + uint64_t evictions; + /** Number of cache entry invalidations due to chapter expiration */ + uint64_t expirations; + + // counters for the sparse chapter index cache + /** Hit/miss counts for the sparse cache chapter probes */ + CacheCountsByKind sparseChapters; + /** Hit/miss counts for the sparce cache name searches */ + CacheCountsByKind sparseSearches; +} CacheCounters; + +/** + * Success/failure assessment of cache probe result. 
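+ *
+ * These values are passed as the "kind" argument to incrementCacheCounter()
+ * declared below.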
+ **/ +typedef enum cacheResultKind { + /** The requested entry was found in the cache */ + CACHE_RESULT_HIT, + /** The requested entry was not found in the cache */ + CACHE_RESULT_MISS, + /** The requested entry wasn't found in the cache but is queued for read */ + CACHE_RESULT_QUEUED +} CacheResultKind; + +/** + * Increment one of the cache counters. + * + * @param counters pointer to the counters + * @param probeType type of access done + * @param kind result of probe + **/ +void incrementCacheCounter(CacheCounters *counters, + int probeType, + CacheResultKind kind); + +#endif /* CACHE_COUNTERS_H */ diff --git a/source/uds/cachedChapterIndex.c b/source/uds/cachedChapterIndex.c new file mode 100644 index 0000000..ae0a22c --- /dev/null +++ b/source/uds/cachedChapterIndex.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/cachedChapterIndex.c#3 $ + */ + +#include "cachedChapterIndex.h" + +#include "memoryAlloc.h" + +/**********************************************************************/ +int initializeCachedChapterIndex(CachedChapterIndex *chapter, + const Geometry *geometry) +{ + chapter->virtualChapter = UINT64_MAX; + chapter->indexPagesCount = geometry->indexPagesPerChapter; + + int result = ALLOCATE(chapter->indexPagesCount, DeltaIndexPage, __func__, + &chapter->indexPages); + if (result != UDS_SUCCESS) { + return result; + } + + result = ALLOCATE(chapter->indexPagesCount, struct volume_page, + "sparse index VolumePages", &chapter->volumePages); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int i; + for (i = 0; i < chapter->indexPagesCount; i++) { + result = initializeVolumePage(geometry, &chapter->volumePages[i]); + if (result != UDS_SUCCESS) { + return result; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +void destroyCachedChapterIndex(CachedChapterIndex *chapter) +{ + if (chapter->volumePages != NULL) { + unsigned int i; + for (i = 0; i < chapter->indexPagesCount; i++) { + destroyVolumePage(&chapter->volumePages[i]); + } + } + FREE(chapter->indexPages); + FREE(chapter->volumePages); +} + +/**********************************************************************/ +int cacheChapterIndex(CachedChapterIndex *chapter, + uint64_t virtualChapter, + const Volume *volume) +{ + // Mark the cached chapter as unused in case the update fails midway. + chapter->virtualChapter = UINT64_MAX; + + // Read all the page data and initialize the entire DeltaIndexPage array. + // (It's not safe for the zone threads to do it lazily--they'll race.) 
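+  // readChapterIndexFromVolume() reads the chapter's index pages from the
+  // volume into chapter->volumePages and initializes the matching entries
+  // in chapter->indexPages.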
+ int result = readChapterIndexFromVolume(volume, virtualChapter, + chapter->volumePages, + chapter->indexPages); + if (result != UDS_SUCCESS) { + return result; + } + + // Reset all chapter counter values to zero. + chapter->counters.searchHits = 0; + chapter->counters.searchMisses = 0; + chapter->counters.consecutiveMisses = 0; + + // Mark the entry as valid--it's now in the cache. + chapter->virtualChapter = virtualChapter; + chapter->skipSearch = false; + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int searchCachedChapterIndex(CachedChapterIndex *chapter, + const Geometry *geometry, + const IndexPageMap *indexPageMap, + const UdsChunkName *name, + int *recordPagePtr) +{ + // Find the indexPageNumber in the chapter that would have the chunk name. + unsigned int physicalChapter + = mapToPhysicalChapter(geometry, chapter->virtualChapter); + unsigned int indexPageNumber; + int result = findIndexPageNumber(indexPageMap, name, physicalChapter, + &indexPageNumber); + if (result != UDS_SUCCESS) { + return result; + } + + return searchChapterIndexPage(&chapter->indexPages[indexPageNumber], + geometry, name, recordPagePtr); +} diff --git a/source/uds/cachedChapterIndex.h b/source/uds/cachedChapterIndex.h new file mode 100644 index 0000000..f759d5d --- /dev/null +++ b/source/uds/cachedChapterIndex.h @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/cachedChapterIndex.h#3 $ + */ + +#ifndef CACHED_CHAPTER_INDEX_H +#define CACHED_CHAPTER_INDEX_H + +#include "chapterIndex.h" +#include "common.h" +#include "compiler.h" +#include "cpu.h" +#include "geometry.h" +#include "indexPageMap.h" +#include "typeDefs.h" +#include "volume.h" +#include "volumeStore.h" + +/** + * These counters are essentially fields of the CachedChapterIndex, but are + * segregated into this structure because they are frequently modified. They + * are grouped and aligned to keep them on different cache lines from the + * chapter fields that are accessed far more often than they are updated. + **/ +struct __attribute__((aligned(CACHE_LINE_BYTES))) cachedIndexCounters { + /** the total number of search hits since this chapter was cached */ + uint64_t searchHits; + + /** the total number of search misses since this chapter was cached */ + uint64_t searchMisses; + + /** the number of consecutive search misses since the last cache hit */ + uint64_t consecutiveMisses; +}; +typedef struct cachedIndexCounters CachedIndexCounters; + +/** + * CachedChapterIndex is the structure for a cache entry, representing a + * single cached chapter index in the sparse chapter index cache. 
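+ * The page contents are only replaced when the cache entry itself is
+ * replaced; the skipSearch flag and the counters change more often, as
+ * described on the individual fields below.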
+ **/ +struct __attribute__((aligned(CACHE_LINE_BYTES))) cachedChapterIndex { + /* + * The virtual chapter number of the cached chapter index. UINT64_MAX means + * this cache entry is unused. Must only be modified in the critical section + * in updateSparseCache(). + */ + uint64_t virtualChapter; + + /* The number of index pages in a chapter */ + unsigned int indexPagesCount; + + /* + * This flag is mutable between cache updates, but it rarely changes and + * is frequently accessed, so it groups with the immutable fields. + * + * If set, skip the chapter when searching the entire cache. This flag is + * just a performance optimization. If we do not see a recent change to it, + * it will be corrected when we pass through a memory barrier while getting + * the next request from the queue. So we may do one extra search of the + * chapter index, or miss one deduplication opportunity. + */ + bool skipSearch; + + // These pointers are immutable during the life of the cache. The contents + // of the arrays change when the cache entry is replaced. + + /* pointer to a cache-aligned array of ChapterIndexPages */ + DeltaIndexPage *indexPages; + + /* pointer to an array of VolumePages containing the index pages */ + struct volume_page *volumePages; + + // The cache-aligned counters change often and are placed at the end of the + // structure to prevent false sharing with the more stable fields above. + + /* counter values updated by the thread servicing zone zero */ + CachedIndexCounters counters; +}; +typedef struct cachedChapterIndex CachedChapterIndex; + +/** + * Initialize a CachedChapterIndex, allocating the memory for the array of + * ChapterIndexPages and the raw index page data. The chapter index will be + * marked as unused (virtualChapter == UINT64_MAX). + * + * @param chapter the chapter index cache entry to initialize + * @param geometry the geometry governing the volume + **/ +int initializeCachedChapterIndex(CachedChapterIndex *chapter, + const Geometry *geometry) + __attribute__((warn_unused_result)); + +/** + * Destroy a CachedChapterIndex, freeing the memory allocated for the + * ChapterIndexPages and raw index page data. + * + * @param chapter the chapter index cache entry to destroy + **/ +void destroyCachedChapterIndex(CachedChapterIndex *chapter); + +/** + * Assign a new value to the skipSearch flag of a cached chapter index. + * + * @param chapter the chapter index cache entry to modify + * @param skipSearch the new value of the skipSearch falg + **/ +static INLINE void setSkipSearch(CachedChapterIndex *chapter, bool skipSearch) +{ + // Explicitly check if the field is set so we don't keep dirtying the memory + // cache line on continued search hits. + if (READ_ONCE(chapter->skipSearch) != skipSearch) { + WRITE_ONCE(chapter->skipSearch, skipSearch); + } +} + +/** + * Check if a cached sparse chapter index should be skipped over in the search + * for a chunk name. Filters out unused, invalid, disabled, and irrelevant + * cache entries. 
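+ * Here "unused" means the entry holds no chapter, "invalid" means the
+ * chapter is older than the zone's oldest virtual chapter, "disabled" means
+ * the entry's skipSearch flag is set, and "irrelevant" means the caller
+ * asked for a specific virtual chapter that this entry does not hold.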
+ * + * @param zone the zone doing the check + * @param chapter the cache entry search candidate + * @param virtualChapter the virtualChapter containing a hook, or UINT64_MAX + * if searching the whole cache for a non-hook + * + * @return true if the provided chapter index should be skipped + **/ +static INLINE bool shouldSkipChapterIndex(const IndexZone *zone, + const CachedChapterIndex *chapter, + uint64_t virtualChapter) +{ + // Don't search unused entries (contents undefined) or invalid entries + // (the chapter is no longer the zone's view of the volume). + if ((chapter->virtualChapter == UINT64_MAX) + || (chapter->virtualChapter < zone->oldestVirtualChapter)) { + return true; + } + + if (virtualChapter != UINT64_MAX) { + // If the caller specified a virtual chapter, only search the cache + // entry containing that chapter. + return (virtualChapter != chapter->virtualChapter); + } else { + // When searching the entire cache, save time by skipping over chapters + // that have had too many consecutive misses. + return READ_ONCE(chapter->skipSearch); + } +} + +/** + * Cache a chapter index, reading all the index pages from the volume and + * initializing the array of ChapterIndexPages in the cache entry to represent + * them. The virtualChapter field of the cache entry will be set to UINT64_MAX + * if there is any error since the remaining mutable fields will be in an + * undefined state. + * + * @param chapter the chapter index cache entry to replace + * @param virtualChapter the virtual chapter number of the index to read + * @param volume the volume containing the chapter index + * + * @return UDS_SUCCESS or an error code + **/ +int cacheChapterIndex(CachedChapterIndex *chapter, + uint64_t virtualChapter, + const Volume *volume) + __attribute__((warn_unused_result)); + +/** + * Search a single cached sparse chapter index for a chunk name, returning the + * record page number that may contain the name. + * + * @param [in] chapter the cache entry for the chapter to search + * @param [in] geometry the geometry governing the volume + * @param [in] indexPageMap the index page number map for the volume + * @param [in] name the chunk name to search for + * @param [out] recordPagePtr the record page number of a match, else + * NO_CHAPTER_INDEX_ENTRY if nothing matched + * + * @return UDS_SUCCESS or an error code + **/ +int searchCachedChapterIndex(CachedChapterIndex *chapter, + const Geometry *geometry, + const IndexPageMap *indexPageMap, + const UdsChunkName *name, + int *recordPagePtr) + __attribute__((warn_unused_result)); + +#endif /* CACHED_CHAPTER_INDEX_H */ diff --git a/source/uds/chapterIndex.c b/source/uds/chapterIndex.c new file mode 100644 index 0000000..5653a41 --- /dev/null +++ b/source/uds/chapterIndex.c @@ -0,0 +1,305 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/chapterIndex.c#5 $ + */ + +#include "chapterIndex.h" + +#include "compiler.h" +#include "errors.h" +#include "hashUtils.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "uds.h" + + +/**********************************************************************/ +int makeOpenChapterIndex(OpenChapterIndex **openChapterIndex, + const Geometry *geometry, + bool chapterIndexHeaderNativeEndian, + uint64_t volumeNonce) +{ + + int result = ALLOCATE(1, OpenChapterIndex, "open chapter index", + openChapterIndex); + if (result != UDS_SUCCESS) { + return result; + } + + // The delta index will rebalance delta lists when memory gets tight, so + // give the chapter index one extra page. + size_t memorySize + = (geometry->indexPagesPerChapter + 1) * geometry->bytesPerPage; + (*openChapterIndex)->geometry = geometry; + (*openChapterIndex)->volumeNonce = volumeNonce; + (*openChapterIndex)->headerNativeEndian = chapterIndexHeaderNativeEndian, + result = initializeDeltaIndex(&(*openChapterIndex)->deltaIndex, 1, + geometry->deltaListsPerChapter, + geometry->chapterMeanDelta, + geometry->chapterPayloadBits, memorySize); + if (result != UDS_SUCCESS) { + FREE(*openChapterIndex); + *openChapterIndex = NULL; + } + return result; +} + +/**********************************************************************/ +void freeOpenChapterIndex(OpenChapterIndex *openChapterIndex) +{ + if (openChapterIndex == NULL) { + return; + } + + + uninitializeDeltaIndex(&openChapterIndex->deltaIndex); + FREE(openChapterIndex); +} + +/**********************************************************************/ +void emptyOpenChapterIndex(OpenChapterIndex *openChapterIndex, + uint64_t virtualChapterNumber) +{ + emptyDeltaIndex(&openChapterIndex->deltaIndex); + openChapterIndex->virtualChapterNumber = virtualChapterNumber; +} + +/** + * Check whether a delta list entry reflects a successful search for a given + * address. 
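+ * Both putOpenChapterIndexRecord() and searchChapterIndexPage() use this to
+ * decide whether getDeltaIndexEntry() actually landed on an entry for the
+ * requested address.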
+ * + * @param entry the delta list entry from the search + * @param address the address of the desired entry + * + * @return true iff the address was found + **/ +static INLINE bool wasEntryFound(const DeltaIndexEntry *entry, + unsigned int address) +{ + return (!entry->atEnd && (entry->key == address)); +} + +/**********************************************************************/ +int putOpenChapterIndexRecord(OpenChapterIndex *openChapterIndex, + const UdsChunkName *name, + unsigned int pageNumber) +{ + const Geometry *geometry = openChapterIndex->geometry; + int result + = ASSERT_WITH_ERROR_CODE(pageNumber < geometry->recordPagesPerChapter, + UDS_INVALID_ARGUMENT, + "Page number within chapter (%u) exceeds" + " the maximum value %u", + pageNumber, geometry->recordPagesPerChapter); + if (result != UDS_SUCCESS) { + return result; + } + + DeltaIndexEntry entry; + unsigned int address = hashToChapterDeltaAddress(name, geometry); + result = getDeltaIndexEntry(&openChapterIndex->deltaIndex, + hashToChapterDeltaList(name, geometry), + address, name->name, false, &entry); + if (result != UDS_SUCCESS) { + return result; + } + bool found = wasEntryFound(&entry, address); + result = ASSERT_WITH_ERROR_CODE(!(found && entry.isCollision), + UDS_BAD_STATE, + "Chunk appears more than once in chapter %" + PRIu64, + openChapterIndex->virtualChapterNumber); + if (result != UDS_SUCCESS) { + return result; + } + return putDeltaIndexEntry(&entry, address, pageNumber, + (found ? name->name : NULL)); +} + +/**********************************************************************/ +int packOpenChapterIndexPage(OpenChapterIndex *openChapterIndex, + byte *memory, + unsigned int firstList, + bool lastPage, + unsigned int *numLists) +{ + DeltaIndex *deltaIndex = &openChapterIndex->deltaIndex; + const Geometry *geometry = openChapterIndex->geometry; + unsigned int removals = 0; + for (;;) { + int result = packDeltaIndexPage(deltaIndex, openChapterIndex->volumeNonce, + openChapterIndex->headerNativeEndian, + memory, geometry->bytesPerPage, + openChapterIndex->virtualChapterNumber, + firstList, numLists); + if (result != UDS_SUCCESS) { + return result; + } + if ((firstList + *numLists) == geometry->deltaListsPerChapter) { + // All lists are packed + break; + } else if (*numLists == 0) { + // The next delta list does not fit on a page. This delta list will + // be removed. + } else if (lastPage) { + /* + * This is the last page and there are lists left unpacked, but all of + * the remaining lists must fit on the page. Find a list that contains + * entries and remove the entire list. Try the first list that does not + * fit. If it is empty, we will select the last list that already fits + * and has any entries. 
+ */ + } else { + // This page is done + break; + } + if (removals == 0) { + DeltaIndexStats stats; + getDeltaIndexStats(deltaIndex, &stats); + logWarning("The chapter index for chapter %" PRIu64 + " contains %ld entries with %ld collisions", + openChapterIndex->virtualChapterNumber, + stats.recordCount, stats.collisionCount); + } + DeltaIndexEntry entry; + int listNumber = *numLists; + do { + if (listNumber < 0) { + return UDS_OVERFLOW; + } + result = startDeltaIndexSearch(deltaIndex, firstList + listNumber--, + 0, false, &entry); + if (result != UDS_SUCCESS) { + return result; + } + result = nextDeltaIndexEntry(&entry); + if (result != UDS_SUCCESS) { + return result; + } + } while (entry.atEnd); + do { + result = removeDeltaIndexEntry(&entry); + if (result != UDS_SUCCESS) { + return result; + } + removals++; + } while (!entry.atEnd); + } + if (removals > 0) { + logWarning("To avoid chapter index page overflow in chapter %" PRIu64 + ", %u entries were removed from the chapter index", + openChapterIndex->virtualChapterNumber, removals); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getOpenChapterIndexSize(OpenChapterIndex *openChapterIndex) +{ + DeltaIndexStats stats; + getDeltaIndexStats(&openChapterIndex->deltaIndex, &stats); + return stats.recordCount; +} + +/**********************************************************************/ +size_t getOpenChapterIndexMemoryAllocated(OpenChapterIndex *openChapterIndex) +{ + DeltaIndexStats stats; + getDeltaIndexStats(&openChapterIndex->deltaIndex, &stats); + return stats.memoryAllocated + sizeof(OpenChapterIndex); +} + +/**********************************************************************/ +int initializeChapterIndexPage(DeltaIndexPage *chapterIndexPage, + const Geometry *geometry, + byte *indexPage, + uint64_t volumeNonce) +{ + return initializeDeltaIndexPage(chapterIndexPage, volumeNonce, + geometry->chapterMeanDelta, + geometry->chapterPayloadBits, + indexPage, geometry->bytesPerPage); +} + +/**********************************************************************/ +int validateChapterIndexPage(const DeltaIndexPage *chapterIndexPage, + const Geometry *geometry) +{ + const DeltaIndex *deltaIndex = &chapterIndexPage->deltaIndex; + unsigned int first = chapterIndexPage->lowestListNumber; + unsigned int last = chapterIndexPage->highestListNumber; + // We walk every delta list from start to finish. + unsigned int listNumber; + for (listNumber = first; listNumber <= last; listNumber++) { + DeltaIndexEntry entry; + int result = startDeltaIndexSearch(deltaIndex, listNumber - first, 0, true, + &entry); + if (result != UDS_SUCCESS) { + return result; + } + for (;;) { + result = nextDeltaIndexEntry(&entry); + if (result != UDS_SUCCESS) { + if (result == UDS_CORRUPT_DATA) { + // A random bit stream is highly likely to arrive here when we go + // past the end of the delta list + return UDS_CORRUPT_COMPONENT; + } + return result; + } + if (entry.atEnd) { + break; + } + // Also make sure that the record page field contains a plausible value + if (getDeltaEntryValue(&entry) >= geometry->recordPagesPerChapter) { + // Do not log this as an error. It happens in normal operation when + // we are doing a rebuild but haven't written the entire volume once. 
+ return UDS_CORRUPT_COMPONENT; + } + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int searchChapterIndexPage(DeltaIndexPage *chapterIndexPage, + const Geometry *geometry, + const UdsChunkName *name, + int *recordPagePtr) +{ + DeltaIndex *deltaIndex = &chapterIndexPage->deltaIndex; + unsigned int address = hashToChapterDeltaAddress(name, geometry); + unsigned int deltaListNumber = hashToChapterDeltaList(name, geometry); + unsigned int subListNumber + = deltaListNumber - chapterIndexPage->lowestListNumber;; + DeltaIndexEntry entry; + int result = getDeltaIndexEntry(deltaIndex, subListNumber, address, + name->name, true, &entry); + if (result != UDS_SUCCESS) { + return result; + } + + if (wasEntryFound(&entry, address)) { + *recordPagePtr = getDeltaEntryValue(&entry); + } else { + *recordPagePtr = NO_CHAPTER_INDEX_ENTRY; + } + return UDS_SUCCESS; +} diff --git a/source/uds/chapterIndex.h b/source/uds/chapterIndex.h new file mode 100644 index 0000000..4dd425b --- /dev/null +++ b/source/uds/chapterIndex.h @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/chapterIndex.h#4 $ + */ + +#ifndef CHAPTER_INDEX_H +#define CHAPTER_INDEX_H 1 + +#include "deltaIndex.h" +#include "geometry.h" + +enum { + // The value returned as the record page number when an entry is not found + // in the chapter index. + NO_CHAPTER_INDEX_ENTRY = -1 +}; + +typedef struct openChapterIndex { + const Geometry *geometry; + DeltaIndex deltaIndex; + uint64_t virtualChapterNumber; + bool headerNativeEndian; + uint64_t volumeNonce; +} OpenChapterIndex; + + +/** + * Make a new open chapter index. + * + * @param openChapterIndex Location to hold new open chapter index pointer + * @param geometry The geometry + * @param chapterIndexHeaderNativeEndian chapter index header format + * @param volumeNonce The volume nonce. + * + * @return error code or UDS_SUCCESS + **/ +int makeOpenChapterIndex(OpenChapterIndex **openChapterIndex, + const Geometry *geometry, + bool chapterIndexHeaderNativeEndian, + uint64_t volumeNonce) + __attribute__((warn_unused_result)); + +/** + * Terminate and clean up an open chapter index. + * + * @param openChapterIndex The open chapter index to terminate + **/ +void freeOpenChapterIndex(OpenChapterIndex *openChapterIndex); + +/** + * Empty an open chapter index, and prepare it for writing a new virtual + * chapter. 
+ * + * @param openChapterIndex The open chapter index to empty + * @param virtualChapterNumber The virtual chapter number + **/ +void emptyOpenChapterIndex(OpenChapterIndex *openChapterIndex, + uint64_t virtualChapterNumber); + +/** + * Create a new record in an open chapter index, associating a chunk name with + * the number of the record page containing the metadata for the chunk. + * + * @param openChapterIndex The open chapter index + * @param name The chunk name + * @param pageNumber The number of the record page containing the name + * + * @return UDS_SUCCESS or an error code + **/ +int putOpenChapterIndexRecord(OpenChapterIndex *openChapterIndex, + const UdsChunkName *name, + unsigned int pageNumber) + __attribute__((warn_unused_result)); + +/** + * Pack a section of an open chapter index into a chapter index page. A + * range of delta lists (starting with a specified list index) is copied + * from the open chapter index into a memory page. The number of lists + * copied onto the page is returned to the caller. + * + * @param openChapterIndex The open chapter index + * @param memory The memory page to use + * @param firstList The first delta list number to be copied + * @param lastPage If true, this is the last page of the chapter + * index and all the remaining lists must be packed + * onto this page + * @param numLists The number of delta lists that were copied + * + * @return error code or UDS_SUCCESS. On UDS_SUCCESS, the numLists + * argument contains the number of lists copied. + **/ +int packOpenChapterIndexPage(OpenChapterIndex *openChapterIndex, + byte *memory, + unsigned int firstList, + bool lastPage, + unsigned int *numLists) + __attribute__((warn_unused_result)); + +/** + * Get the number of records in an open chapter index. + * + * @param openChapterIndex The open chapter index + * + * @return The number of records + **/ +int getOpenChapterIndexSize(OpenChapterIndex *openChapterIndex) + __attribute__((warn_unused_result)); + +/** + * Get the number of bytes allocated for the open chapter index. + * + * @param openChapterIndex The open chapter index + * + * @return the number of bytes allocated + **/ +size_t getOpenChapterIndexMemoryAllocated(OpenChapterIndex *openChapterIndex); + +/** + * Make a new chapter index page, initializing it with the data from the + * given buffer. + * + * @param chapterIndexPage The new chapter index page + * @param geometry The geometry + * @param indexPage The memory page to use + * @param volumeNonce If non-zero, the volume nonce to verify + * + * @return UDS_SUCCESS or an error code + **/ +int initializeChapterIndexPage(DeltaIndexPage *chapterIndexPage, + const Geometry *geometry, + byte *indexPage, + uint64_t volumeNonce) + __attribute__((warn_unused_result)); + +/** + * Validate a chapter index page. This is called at rebuild time to ensure + * that the volume file contains a coherent chapter index. + * + * @param chapterIndexPage The chapter index page + * @param geometry The geometry of the volume + * + * @return The result code: + * UDS_SUCCESS for a good chapter index page + * UDS_CORRUPT_COMPONENT if the chapter index code detects invalid data + * UDS_CORRUPT_DATA if there is a problem in a delta list bit stream + * UDS_BAD_STATE if the code follows an invalid code path + **/ +int validateChapterIndexPage(const DeltaIndexPage *chapterIndexPage, + const Geometry *geometry) + __attribute__((warn_unused_result)); + +/** + * Search a chapter index page for a chunk name, returning the record page + * number that may contain the name. 
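+ *
+ * An illustrative call sequence (a sketch, not a requirement of this
+ * interface), assuming "page" holds bytes read from the volume:
+ *
+ *   DeltaIndexPage indexPage;
+ *   int recordPage;
+ *   int result = initializeChapterIndexPage(&indexPage, geometry, page,
+ *                                           volumeNonce);
+ *   if (result == UDS_SUCCESS) {
+ *     result = searchChapterIndexPage(&indexPage, geometry, name,
+ *                                     &recordPage);
+ *     // recordPage is NO_CHAPTER_INDEX_ENTRY when the name is not present.
+ *   }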
+ * + * @param [in] chapterIndexPage The chapter index page + * @param [in] geometry The geometry of the volume + * @param [in] name The chunk name + * @param [out] recordPagePtr The record page number + * or NO_CHAPTER_INDEX_ENTRY if not found + * + * @return UDS_SUCCESS or an error code + **/ +int searchChapterIndexPage(DeltaIndexPage *chapterIndexPage, + const Geometry *geometry, + const UdsChunkName *name, + int *recordPagePtr) + __attribute__((warn_unused_result)); + +#endif /* CHAPTER_INDEX_H */ diff --git a/source/uds/chapterWriter.c b/source/uds/chapterWriter.c new file mode 100644 index 0000000..3a926ab --- /dev/null +++ b/source/uds/chapterWriter.c @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/chapterWriter.c#2 $ + */ + +#include "chapterWriter.h" + +#include "errors.h" +#include "index.h" +#include "indexCheckpoint.h" +#include "indexComponent.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "openChapter.h" +#include "threads.h" + + +struct chapterWriter { + /* The index to which we belong */ + Index *index; + /* The thread to do the writing */ + Thread thread; + /* lock protecting the following fields */ + Mutex mutex; + /* condition signalled on state changes */ + CondVar cond; + /* Set to true to stop the thread */ + bool stop; + /* The result from the most recent write */ + int result; + /* The number of bytes allocated by the chapter writer */ + size_t memoryAllocated; + /* The number of zones which have submitted a chapter for writing */ + unsigned int zonesToWrite; + /* Open chapter index used by closeOpenChapter() */ + OpenChapterIndex *openChapterIndex; + /* Collated records used by closeOpenChapter() */ + UdsChunkRecord *collatedRecords; + /* The chapters to write (one per zone) */ + OpenChapterZone *chapters[]; +}; + +/** + * This is the driver function for the writer thread. It loops until + * terminated, waiting for a chapter to provided to close. + **/ +static void closeChapters(void *arg) +{ + ChapterWriter *writer = arg; + logDebug("chapter writer starting"); + lockMutex(&writer->mutex); + for (;;) { + while (writer->zonesToWrite < writer->index->zoneCount) { + if (writer->stop && (writer->zonesToWrite == 0)) { + // We've been told to stop, and all of the zones are in the same + // open chapter, so we can exit now. + unlockMutex(&writer->mutex); + logDebug("chapter writer stopping"); + return; + } + waitCond(&writer->cond, &writer->mutex); + } + + /* + * Release the lock while closing a chapter. We probably don't need to do + * this, but it seems safer in principle. It's OK to access the chapter + * and chapterNumber fields without the lock since those aren't allowed to + * change until we're done. 
+ */ + unlockMutex(&writer->mutex); + + if (writer->index->hasSavedOpenChapter) { + writer->index->hasSavedOpenChapter = false; + /* + * Remove the saved open chapter as that chapter is about to be written + * to the volume. This matters the first time we close the open chapter + * after loading from a clean shutdown, or after doing a clean save. + */ + IndexComponent *oc = findIndexComponent(writer->index->state, + &OPEN_CHAPTER_INFO); + int result = discardIndexComponent(oc); + if (result == UDS_SUCCESS) { + logDebug("Discarding saved open chapter"); + } + } + + int result = closeOpenChapter(writer->chapters, + writer->index->zoneCount, + writer->index->volume, + writer->openChapterIndex, + writer->collatedRecords, + writer->index->newestVirtualChapter); + + if (result == UDS_SUCCESS) { + result = processChapterWriterCheckpointSaves(writer->index); + } + + + lockMutex(&writer->mutex); + // Note that the index is totally finished with the writing chapter + advanceActiveChapters(writer->index); + writer->result = result; + writer->zonesToWrite = 0; + broadcastCond(&writer->cond); + } +} + +/**********************************************************************/ +int makeChapterWriter(Index *index, + const struct index_version *indexVersion, + ChapterWriter **writerPtr) +{ + size_t collatedRecordsSize + = (sizeof(UdsChunkRecord) + * (1 + index->volume->geometry->recordsPerChapter)); + ChapterWriter *writer; + int result = ALLOCATE_EXTENDED(ChapterWriter, + index->zoneCount, OpenChapterZone *, + "Chapter Writer", &writer); + if (result != UDS_SUCCESS) { + return result; + } + writer->index = index; + + result = initMutex(&writer->mutex); + if (result != UDS_SUCCESS) { + FREE(writer); + return result; + } + result = initCond(&writer->cond); + if (result != UDS_SUCCESS) { + destroyMutex(&writer->mutex); + FREE(writer); + return result; + } + + // Now that we have the mutex+cond, it is safe to call freeChapterWriter. + result = allocateCacheAligned(collatedRecordsSize, "collated records", + &writer->collatedRecords); + if (result != UDS_SUCCESS) { + freeChapterWriter(writer); + return makeUnrecoverable(result); + } + result = makeOpenChapterIndex(&writer->openChapterIndex, + index->volume->geometry, + indexVersion->chapterIndexHeaderNativeEndian, + index->volume->nonce); + if (result != UDS_SUCCESS) { + freeChapterWriter(writer); + return makeUnrecoverable(result); + } + + size_t openChapterIndexMemoryAllocated + = getOpenChapterIndexMemoryAllocated(writer->openChapterIndex); + writer->memoryAllocated = (sizeof(ChapterWriter) + + index->zoneCount * sizeof(OpenChapterZone *) + + collatedRecordsSize + + openChapterIndexMemoryAllocated); + + // We're initialized, so now it's safe to start the writer thread. 
+ result = createThread(closeChapters, writer, "writer", &writer->thread); + if (result != UDS_SUCCESS) { + freeChapterWriter(writer); + return makeUnrecoverable(result); + } + + *writerPtr = writer; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeChapterWriter(ChapterWriter *writer) +{ + if (writer == NULL) { + return; + } + + int result __attribute__((unused)) = stopChapterWriter(writer); + destroyMutex(&writer->mutex); + destroyCond(&writer->cond); + freeOpenChapterIndex(writer->openChapterIndex); + FREE(writer->collatedRecords); + FREE(writer); +} + +/**********************************************************************/ +unsigned int startClosingChapter(ChapterWriter *writer, + unsigned int zoneNumber, + OpenChapterZone *chapter) +{ + lockMutex(&writer->mutex); + unsigned int finishedZones = ++writer->zonesToWrite; + writer->chapters[zoneNumber] = chapter; + broadcastCond(&writer->cond); + unlockMutex(&writer->mutex); + + return finishedZones; +} + +/**********************************************************************/ +int finishPreviousChapter(ChapterWriter *writer, uint64_t currentChapterNumber) +{ + int result; + lockMutex(&writer->mutex); + while (writer->index->newestVirtualChapter < currentChapterNumber) { + waitCond(&writer->cond, &writer->mutex); + } + result = writer->result; + unlockMutex(&writer->mutex); + + if (result != UDS_SUCCESS) { + return logUnrecoverable(result, "Writing of previous open chapter failed"); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +void waitForIdleChapterWriter(ChapterWriter *writer) +{ + lockMutex(&writer->mutex); + while (writer->zonesToWrite > 0) { + // The chapter writer is probably writing a chapter. If it is not, it will + // soon wake up and write a chapter. + waitCond(&writer->cond, &writer->mutex); + } + unlockMutex(&writer->mutex); +} + +/**********************************************************************/ +int stopChapterWriter(ChapterWriter *writer) +{ + Thread writerThread = 0; + + lockMutex(&writer->mutex); + if (writer->thread != 0) { + writerThread = writer->thread; + writer->thread = 0; + writer->stop = true; + broadcastCond(&writer->cond); + } + int result = writer->result; + unlockMutex(&writer->mutex); + + if (writerThread != 0) { + joinThreads(writerThread); + } + + if (result != UDS_SUCCESS) { + return logUnrecoverable(result, "Writing of previous open chapter failed"); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +size_t getChapterWriterMemoryAllocated(ChapterWriter *writer) +{ + return writer->memoryAllocated; +} diff --git a/source/uds/chapterWriter.h b/source/uds/chapterWriter.h new file mode 100644 index 0000000..85c1f42 --- /dev/null +++ b/source/uds/chapterWriter.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/chapterWriter.h#2 $ + */ + +#ifndef CHAPTER_WRITER_H +#define CHAPTER_WRITER_H + +#include "atomicDefs.h" +#include "indexVersion.h" +#include "openChapterZone.h" + +typedef struct chapterWriter ChapterWriter; + +// This opaque declaration breaks the dependency loop with index.h +struct index; + + +/** + * Create a chapter writer and start its thread. + * + * @param index the index containing the chapters to be written + * @param indexVersion the index version parameters + * @param writerPtr pointer to hold the new writer + * + * @return UDS_SUCCESS or an error code + **/ +int makeChapterWriter(struct index *index, + const struct index_version *indexVersion, + ChapterWriter **writerPtr) + __attribute__((warn_unused_result)); + +/** + * Free a chapter writer, waiting for its thread to finish. + * + * @param writer the chapter writer to destroy + **/ +void freeChapterWriter(ChapterWriter *writer); + +/** + * Asychronously close and write a chapter by passing it to the writer + * thread. Writing won't start until all zones have submitted a chapter. + * + * @param writer the chapter writer + * @param zoneNumber the number of the zone submitting a chapter + * @param chapter the chapter to write + * + * @return The number of zones which have submitted the current chapter + **/ +unsigned int startClosingChapter(ChapterWriter *writer, + unsigned int zoneNumber, + OpenChapterZone *chapter) + __attribute__((warn_unused_result)); + +/** + * Wait for the chapter writer thread to finish closing the chapter previous + * to the one specified. + * + * @param writer the chapter writer + * @param currentChapterNumber the currentChapter number + * + * @return UDS_SUCCESS or an error code from the most recent write + * request + **/ +int finishPreviousChapter(ChapterWriter *writer, uint64_t currentChapterNumber) + __attribute__((warn_unused_result)); + + +/** + * Wait for the chapter writer thread to finish all writes to storage. + * + * @param writer the chapter writer + **/ +void waitForIdleChapterWriter(ChapterWriter *writer); + +/** + * Stop the chapter writer and wait for it to finish. + * + * @param writer the chapter writer to stop + * + * @return UDS_SUCCESS or an error code from the most recent write + * request + **/ +int stopChapterWriter(ChapterWriter *writer) + __attribute__((warn_unused_result)); + +/** + * Get the number of bytes allocated for the chapter writer. + * + * @param writer the chapter writer + * + * @return the number of bytes allocated + **/ +size_t getChapterWriterMemoryAllocated(ChapterWriter *writer); + +#endif /* CHAPTER_WRITER_H */ diff --git a/source/uds/common.h b/source/uds/common.h new file mode 100644 index 0000000..bea27e5 --- /dev/null +++ b/source/uds/common.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/common.h#1 $ + */ + +#ifndef COMMON_H +#define COMMON_H + +#include "stringUtils.h" +#include "typeDefs.h" +#include "uds.h" +#include "uds-block.h" + +enum { + KILOBYTE = 1024, + MEGABYTE = KILOBYTE * KILOBYTE, + GIGABYTE = KILOBYTE * MEGABYTE +}; + +typedef struct udsChunkData UdsChunkData; + +typedef struct { + UdsChunkName name; + UdsChunkData data; +} UdsChunkRecord; + +#endif /* COMMON_H */ diff --git a/source/uds/compiler.h b/source/uds/compiler.h new file mode 100644 index 0000000..cd57590 --- /dev/null +++ b/source/uds/compiler.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/compiler.h#1 $ + */ + +#ifndef COMMON_COMPILER_H +#define COMMON_COMPILER_H + +#include "compilerDefs.h" + +// Count the elements in a static array while attempting to catch some type +// errors. (See http://stackoverflow.com/a/1598827 for an explanation.) +#define COUNT_OF(x) ((sizeof(x) / sizeof(0[x])) \ + / ((size_t) (!(sizeof(x) % sizeof(0[x]))))) + +#define const_container_of(ptr, type, member) \ + __extension__ ({ \ + const __typeof__(((type *)0)->member) *__mptr = (ptr); \ + (const type *)((const char *)__mptr - offsetof(type,member)); \ + }) + +// The "inline" keyword alone takes affect only when the optimization level +// is high enough. Define INLINE to force the gcc to "always inline". +#define INLINE __attribute__((always_inline)) inline + +#endif /* COMMON_COMPILER_H */ diff --git a/source/uds/compilerDefs.h b/source/uds/compilerDefs.h new file mode 100644 index 0000000..cc81ce2 --- /dev/null +++ b/source/uds/compilerDefs.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/compilerDefs.h#1 $ + */ + +#ifndef LINUX_KERNEL_COMPILER_DEFS_H +#define LINUX_KERNEL_COMPILER_DEFS_H + +#include + +#define __STRING(x) #x + +#endif /* LINUX_KERNEL_COMPILER_DEFS_H */ diff --git a/source/uds/config.c b/source/uds/config.c new file mode 100644 index 0000000..a953da3 --- /dev/null +++ b/source/uds/config.c @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/config.c#2 $ + */ + +#include "config.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "stringUtils.h" + +/**********************************************************************/ +void freeIndexLocation(IndexLocation *loc) +{ + if (loc == NULL) { + return; + } + + FREE(loc->host); + FREE(loc->port); + FREE(loc->directory); +} + +/**********************************************************************/ +bool areUdsConfigurationsEqual(UdsConfiguration a, UdsConfiguration b) +{ + bool result = true; + if (a->recordPagesPerChapter != b->recordPagesPerChapter) { + logError("Record pages per chapter (%u) does not match (%u)", + a->recordPagesPerChapter, b->recordPagesPerChapter); + result = false; + } + if (a->chaptersPerVolume != b->chaptersPerVolume) { + logError("Chapter count (%u) does not match (%u)", + a->chaptersPerVolume, b->chaptersPerVolume); + result = false; + } + if (a->sparseChaptersPerVolume != b->sparseChaptersPerVolume) { + logError("Sparse chapter count (%u) does not match (%u)", + a->sparseChaptersPerVolume, b->sparseChaptersPerVolume); + result = false; + } + if (a->cacheChapters != b->cacheChapters) { + logError("Cache size (%u) does not match (%u)", + a->cacheChapters, b->cacheChapters); + result = false; + } + if (a->masterIndexMeanDelta != b->masterIndexMeanDelta) { + logError("Master index mean delta (%u) does not match (%u)", + a->masterIndexMeanDelta, b->masterIndexMeanDelta); + result = false; + } + if (a->bytesPerPage != b->bytesPerPage) { + logError("Bytes per page value (%u) does not match (%u)", + a->bytesPerPage, b->bytesPerPage); + result = false; + } + if (a->sparseSampleRate != b->sparseSampleRate) { + logError("Sparse sample rate (%u) does not match (%u)", + a->sparseSampleRate, b->sparseSampleRate); + result = false; + } + if (a->nonce != b->nonce) { + logError("Nonce (%llu) does not match (%llu)", + a->nonce, b->nonce); + result = false; + } + return result; +} + +/**********************************************************************/ +void logUdsConfiguration(UdsConfiguration conf) +{ + logDebug("Configuration:"); + logDebug(" Record pages per chapter: %10u", conf->recordPagesPerChapter); + logDebug(" Chapters per volume: %10u", conf->chaptersPerVolume); + logDebug(" Sparse chapters per volume: %10u", conf->sparseChaptersPerVolume); + logDebug(" 
Cache size (chapters): %10u", conf->cacheChapters); + logDebug(" Master index mean delta: %10u", conf->masterIndexMeanDelta); + logDebug(" Bytes per page: %10u", conf->bytesPerPage); + logDebug(" Sparse sample rate: %10u", conf->sparseSampleRate); + logDebug(" Nonce: %llu", conf->nonce); +} diff --git a/source/uds/config.h b/source/uds/config.h new file mode 100644 index 0000000..f31efab --- /dev/null +++ b/source/uds/config.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/config.h#2 $ + */ + +#ifndef CONFIG_H +#define CONFIG_H + +#include "bufferedReader.h" +#include "bufferedWriter.h" +#include "geometry.h" +#include "uds.h" + +enum { + DEFAULT_MASTER_INDEX_MEAN_DELTA = 4096, + DEFAULT_CACHE_CHAPTERS = 7, + DEFAULT_SPARSE_SAMPLE_RATE = 0 +}; + +/** + * Data that are used for configuring a new index. + **/ +struct udsConfiguration { + /** Smaller (16), Small (64) or large (256) indices */ + unsigned int recordPagesPerChapter; + /** Total number of chapters per volume */ + unsigned int chaptersPerVolume; + /** Number of sparse chapters per volume */ + unsigned int sparseChaptersPerVolume; + /** Size of the page cache, in chapters */ + unsigned int cacheChapters; + /** Frequency with which to checkpoint */ + // XXX the checkpointFrequency is not used - it is now a runtime parameter + unsigned int checkpointFrequency; + /** The master index mean delta to use */ + unsigned int masterIndexMeanDelta; + /** Size of a page, used for both record pages and index pages */ + unsigned int bytesPerPage; + /** Sampling rate for sparse indexing */ + unsigned int sparseSampleRate; + /** Index Owner's nonce */ + UdsNonce nonce; +}; + +/** + * Data that are used for a 6.01 index. + **/ +struct udsConfiguration6_01 { + /** Smaller (16), Small (64) or large (256) indices */ + unsigned int recordPagesPerChapter; + /** Total number of chapters per volume */ + unsigned int chaptersPerVolume; + /** Number of sparse chapters per volume */ + unsigned int sparseChaptersPerVolume; + /** Size of the page cache, in chapters */ + unsigned int cacheChapters; + /** Frequency with which to checkpoint */ + unsigned int checkpointFrequency; + /** The master index mean delta to use */ + unsigned int masterIndexMeanDelta; + /** Size of a page, used for both record pages and index pages */ + unsigned int bytesPerPage; + /** Sampling rate for sparse indexing */ + unsigned int sparseSampleRate; +}; + +typedef struct indexLocation { + char *host; + char *port; + char *directory; +} IndexLocation; + +/** + * A set of configuration parameters for the indexer. + **/ +typedef struct configuration Configuration; + +/** + * Construct a new indexer configuration. 
+ * + * @param conf UdsConfiguration to use + * @param configPtr The new index configuration + * + * @return UDS_SUCCESS or an error code + **/ +int makeConfiguration(UdsConfiguration conf, + Configuration **configPtr) + __attribute__((warn_unused_result)); + +/** + * Clean up the configuration struct. + **/ +void freeConfiguration(Configuration *config); + +/** + * Read the index configuration from stable storage. + * + * @param reader A buffered reader. + * @param config The index configuration to overwrite. + * + * @return UDS_SUCCESS or an error code. + **/ +int readConfigContents(BufferedReader *reader, + UdsConfiguration config) + __attribute__((warn_unused_result)); + +/** + * Write the index configuration information to stable storage. + * + * @param writer A buffered writer. + * @param config The index configuration. + * + * @return UDS_SUCCESS or an error code. + **/ +int writeConfigContents(BufferedWriter *writer, + UdsConfiguration config) + __attribute__((warn_unused_result)); + +/** + * Free the memory used by an IndexLocation. + * + * @param loc index location to free + **/ +void freeIndexLocation(IndexLocation *loc); + +/** + * Compare two configurations for equality. + * + * @param a The first configuration to compare + * @param b The second configuration to compare + * + * @return true iff they are equal + **/ +bool areUdsConfigurationsEqual(UdsConfiguration a, UdsConfiguration b) + __attribute__((warn_unused_result)); + +/** + * Log a user configuration. + * + * @param conf The configuration + **/ +void logUdsConfiguration(UdsConfiguration conf); + +#endif /* CONFIG_H */ diff --git a/source/uds/cpu.h b/source/uds/cpu.h new file mode 100644 index 0000000..9314985 --- /dev/null +++ b/source/uds/cpu.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/cpu.h#1 $ + */ + +#ifndef CPU_H +#define CPU_H + +#include "compiler.h" +#include "typeDefs.h" + +/** + * The number of bytes in a CPU cache line. In the future, we'll probably need + * to move this to a processor-specific file or discover it at compilation + * time (or runtime, if sufficiently heterogeneous), but this will do for now. + * (Must be a \#define since enums are not proper compile-time constants.) + **/ +#ifdef __PPC__ +// N.B.: Some PPC processors have smaller cache lines. +#define CACHE_LINE_BYTES 128 +#elif defined(__s390x__) +#define CACHE_LINE_BYTES 256 +#elif defined(__x86_64__) || defined(__aarch64__) +#define CACHE_LINE_BYTES 64 +#else +#error "unknown cache line size" +#endif + +/** + * Minimize cache-miss latency by moving data into a CPU cache before it is + * accessed. 
 + * + * @param address the address to fetch (may be invalid) + * @param forWrite must be constant at compile time--false if + * for reading, true if for writing + **/ +static INLINE void prefetchAddress(const void *address, bool forWrite) +{ + // forWrite won't be a constant if we are compiled with optimization + // turned off, in which case prefetching really doesn't matter. + if (__builtin_constant_p(forWrite)) { + __builtin_prefetch(address, forWrite); + } +} + +/** + * Minimize cache-miss latency by moving a range of addresses into a + * CPU cache before they are accessed. + * + * @param start the starting address to fetch (may be invalid) + * @param size the number of bytes in the address range + * @param forWrite must be constant at compile time--false if + * for reading, true if for writing + **/ +static INLINE void prefetchRange(const void *start, + unsigned int size, + bool forWrite) +{ + // Count the number of cache lines to fetch, allowing for the address range + // to span an extra cache line boundary due to address alignment. + const char *address = (const char *) start; + unsigned int offset = ((uintptr_t) address % CACHE_LINE_BYTES); + size += offset; + + unsigned int cacheLines = (1 + (size / CACHE_LINE_BYTES)); + while (cacheLines-- > 0) { + prefetchAddress(address, forWrite); + address += CACHE_LINE_BYTES; + } +} + +#endif /* CPU_H */ diff --git a/source/uds/deltaIndex.c b/source/uds/deltaIndex.c new file mode 100644 index 0000000..0c43e9b --- /dev/null +++ b/source/uds/deltaIndex.c @@ -0,0 +1,1707 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/deltaIndex.c#7 $ + */ +#include "deltaIndex.h" + +#include "bits.h" +#include "buffer.h" +#include "compiler.h" +#include "cpu.h" +#include "errors.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "stringUtils.h" +#include "typeDefs.h" +#include "uds.h" +#include "zone.h" + +/* + * A delta index is a key-value store, where each entry maps an address + * (the key) to a payload (the value). The entries are sorted by address, + * and only the delta between successive addresses is stored in the entry. + * The addresses are assumed to be uniformly distributed, and the deltas are + * therefore exponentially distributed. + * + * The entries could be stored in a single DeltaList, but for efficiency we + * use multiple DeltaLists. These lists are stored in a single chunk of + * memory managed by the DeltaMemory module. The DeltaMemory module can + * move the data around in memory, so we never keep any byte pointers into + * DeltaList memory. We only keep offsets into the memory. + * + * The delta lists are stored as bit streams.
These bit streams are stored + * in little endian order, and all offsets into DeltaMemory are bit + * offsets. + * + * All entries are stored as a fixed length payload (the value) followed by a + * variable length key (the delta). Always strictly in little endian order. + * + * A collision entry is used when two block names have the same delta list + * address. A collision entry is encoded with DELTA==0, and has 256 + * extension bits containing the full block name. + * + * There is a special exception to be noted. The DELTA==0 encoding usually + * indicates a collision with the preceding entry. But for the first entry + * in any delta list there is no preceding entry, so the DELTA==0 encoding + * at the beginning of a delta list indicates a normal entry. + * + * The Huffman code is driven by 3 parameters: + * + * MINBITS This is the number of bits in the smallest code + * + * BASE This is the number of values coded using a code of length MINBITS + * + * INCR This is the number of values coded by using one additional bit. + * + * These parameters are related by: + * + * BASE + INCR == 1 << MINBITS + * + * When we create an index, we need to know the mean delta. From the mean + * delta, we compute these three parameters. The math for the Huffman code + * of an exponential distribution says that we compute: + * + * INCR = log(2) * MEAN_DELTA + * + * Then we find the smallest MINBITS so that + * + * 1 << MINBITS > INCR + * + * And then: + * + * BASE = (1 << MINBITS) - INCR + * + * Now we need a code such that + * + * - The first BASE values code using MINBITS bits + * - The next INCR values code using MINBITS+1 bits. + * - The next INCR values code using MINBITS+2 bits. + * - The next INCR values code using MINBITS+3 bits. + * - (and so on). + * + * ENCODE(DELTA): + * + * if (DELTA < BASE) { + * put DELTA in MINBITS bits; + * } else { + * T1 = (DELTA - BASE) % INCR + BASE; + * T2 = (DELTA - BASE) / INCR; + * put T1 in MINBITS bits; + * put 0 in T2 bits; + * put 1 in 1 bit; + * } + * + * DECODE(BIT_STREAM): + * + * T1 = next MINBITS bits of stream; + * if (T1 < BASE) { + * DELTA = T1; + * } else { + * Scan bits in the stream until reading a 1, + * setting T2 to the number of 0 bits read; + * DELTA = T2 * INCR + T1; + * } + * + * The bit field utilities that we use on the delta lists assume that it is + * possible to read a few bytes beyond the end of the bit field. So we + * make sure to allocates some extra bytes at the end of memory containing + * the delta lists. Look for POST_FIELD_GUARD_BYTES to find the code + * related to this. + * + * And note that the decode bit stream code includes a step that skips over + * 0 bits until the first 1 bit is found. A corrupted delta list could + * cause this step to run off the end of the delta list memory. As an + * extra protection against this happening, the guard bytes at the end + * should be set to all ones. + */ + +/** + * Constants and structures for the saved delta index. "DI" is for + * deltaIndex, and -##### is a number to increment when the format of the + * data changes. 
+ **/ +enum { MAGIC_SIZE = 8 }; +static const char MAGIC_DI_START[] = "DI-00002"; + +struct di_header { + char magic[MAGIC_SIZE]; // MAGIC_DI_START + uint32_t zoneNumber; + uint32_t numZones; + uint32_t firstList; + uint32_t numLists; + uint64_t recordCount; + uint64_t collisionCount; +}; + +//********************************************************************** +// Methods for dealing with mutable delta list headers +//********************************************************************** + +/** + * Move the start of the delta list bit stream without moving the end. + * + * @param deltaList The delta list header + * @param increment The change in the start of the delta list + **/ +static INLINE void moveDeltaListStart(DeltaList *deltaList, int increment) +{ + deltaList->startOffset += increment; + deltaList->size -= increment; +} + +/** + * Move the end of the delta list bit stream without moving the start. + * + * @param deltaList The delta list header + * @param increment The change in the end of the delta list + **/ +static INLINE void moveDeltaListEnd(DeltaList *deltaList, int increment) +{ + deltaList->size += increment; +} + +//********************************************************************** +// Methods for dealing with immutable delta list headers packed +//********************************************************************** + +// Header data used for immutable delta index pages. These data are +// followed by the delta list offset table. +typedef struct __attribute__((packed)) deltaPageHeader { + uint64_t nonce; // Externally-defined nonce + uint64_t virtualChapterNumber; // The virtual chapter number + uint16_t firstList; // Index of the first delta list on the page + uint16_t numLists; // Number of delta lists on the page +} DeltaPageHeader; + +// Immutable delta lists are packed into pages containing a header that +// encodes the delta list information into 19 bits per list (64KB bit offset) + +enum { IMMUTABLE_HEADER_SIZE = 19 }; + +/** + * Get the bit offset to the immutable delta list header + * + * @param listNumber The delta list number + * + * @return the offset of immutable delta list header + **/ +static INLINE unsigned int getImmutableHeaderOffset(unsigned int listNumber) +{ + return (sizeof(DeltaPageHeader) * CHAR_BIT + + listNumber * IMMUTABLE_HEADER_SIZE); +} + +/** + * Get the bit offset to the start of the immutable delta list bit stream + * + * @param memory The memory page containing the delta lists + * @param listNumber The delta list number + * + * @return the start of the delta list + **/ +static INLINE unsigned int getImmutableStart(const byte *memory, + unsigned int listNumber) +{ + return getField(memory, getImmutableHeaderOffset(listNumber), + IMMUTABLE_HEADER_SIZE); +} + +/** + * Set the bit offset to the start of the immutable delta list bit stream + * + * @param memory The memory page containing the delta lists + * @param listNumber The delta list number + * @param startOffset The start of the delta list + **/ +static INLINE void setImmutableStart(byte *memory, unsigned int listNumber, + unsigned int startOffset) +{ + setField(startOffset, memory, getImmutableHeaderOffset(listNumber), + IMMUTABLE_HEADER_SIZE); +} + +//********************************************************************** +// Methods for dealing with Delta List Entries +//********************************************************************** + +/** + * Decode a delta index entry delta value. 
The DeltaIndexEntry basically + * describes the previous list entry, and has had its offset field changed to + * point to the subsequent entry. We decode the bit stream and update the + * DeltaListEntry to describe the entry. + * + * @param deltaEntry The delta index entry + **/ +static INLINE void decodeDelta(DeltaIndexEntry *deltaEntry) +{ + const DeltaMemory *deltaZone = deltaEntry->deltaZone; + const byte *memory = deltaZone->memory; + uint64_t deltaOffset + = getDeltaEntryOffset(deltaEntry) + deltaEntry->valueBits; + const byte *addr = memory + deltaOffset / CHAR_BIT; + int offset = deltaOffset % CHAR_BIT; + uint32_t data = getUInt32LE(addr) >> offset; + addr += sizeof(uint32_t); + int keyBits = deltaZone->minBits; + unsigned int delta = data & ((1 << keyBits) - 1); + if (delta >= deltaZone->minKeys) { + data >>= keyBits; + if (data == 0) { + keyBits = sizeof(uint32_t) * CHAR_BIT - offset; + while ((data = getUInt32LE(addr)) == 0) { + addr += sizeof(uint32_t); + keyBits += sizeof(uint32_t) * CHAR_BIT; + } + } + keyBits += ffs(data); + delta += (keyBits - deltaZone->minBits - 1) * deltaZone->incrKeys; + } + deltaEntry->delta = delta; + deltaEntry->key += delta; + + // Check for a collision, a delta of zero not at the start of the list. + if (unlikely((delta == 0) && (deltaEntry->offset > 0))) { + deltaEntry->isCollision = true; + // The small duplication of this math in the two arms of this if statement + // makes a tiny but measurable difference in performance. + deltaEntry->entryBits = deltaEntry->valueBits + keyBits + COLLISION_BITS; + } else { + deltaEntry->isCollision = false; + deltaEntry->entryBits = deltaEntry->valueBits + keyBits; + } +} + +/** + * Delete bits from a delta list at the offset of the specified delta index + * entry. + * + * @param deltaEntry The delta index entry + * @param size The number of bits to delete + **/ +static void deleteBits(const DeltaIndexEntry *deltaEntry, int size) +{ + DeltaList *deltaList = deltaEntry->deltaList; + byte *memory = deltaEntry->deltaZone->memory; + // Compute how many bits are retained before and after the deleted bits + uint32_t totalSize = getDeltaListSize(deltaList); + uint32_t beforeSize = deltaEntry->offset; + uint32_t afterSize = totalSize - deltaEntry->offset - size; + + // Determine whether to add to the available space either before or after + // the delta list. We prefer to move the least amount of data. If it is + // exactly the same, try to add to the smaller amount of free space. 
+ bool beforeFlag; + if (beforeSize < afterSize) { + beforeFlag = true; + } else if (afterSize < beforeSize) { + beforeFlag = false; + } else { + uint64_t freeBefore + = getDeltaListStart(&deltaList[0]) - getDeltaListEnd(&deltaList[-1]); + uint64_t freeAfter + = getDeltaListStart(&deltaList[1]) - getDeltaListEnd(&deltaList[ 0]); + beforeFlag = freeBefore < freeAfter; + } + + uint64_t source, destination; + uint32_t count; + if (beforeFlag) { + source = getDeltaListStart(deltaList); + destination = source + size; + moveDeltaListStart(deltaList, size); + count = beforeSize; + } else { + moveDeltaListEnd(deltaList, -size); + destination = getDeltaListStart(deltaList) + deltaEntry->offset; + source = destination + size; + count = afterSize; + } + moveBits(memory, source, memory, destination, count); +} + +/** + * Get the offset of the collision field in a DeltaIndexEntry + * + * @param entry The delta index record + * + * @return the offset of the start of the collision name + **/ +static INLINE uint64_t getCollisionOffset(const DeltaIndexEntry *entry) +{ + return (getDeltaEntryOffset(entry) + entry->entryBits - COLLISION_BITS); +} + +/** + * Encode a delta index entry delta. + * + * @param deltaEntry The delta index entry + **/ +static void encodeDelta(const DeltaIndexEntry *deltaEntry) +{ + const DeltaMemory *deltaZone = deltaEntry->deltaZone; + byte *memory = deltaZone->memory; + uint64_t offset = getDeltaEntryOffset(deltaEntry) + deltaEntry->valueBits; + if (deltaEntry->delta < deltaZone->minKeys) { + setField(deltaEntry->delta, memory, offset, deltaZone->minBits); + return; + } + unsigned int temp = deltaEntry->delta - deltaZone->minKeys; + unsigned int t1 = (temp % deltaZone->incrKeys) + deltaZone->minKeys; + unsigned int t2 = temp / deltaZone->incrKeys; + setField(t1, memory, offset, deltaZone->minBits); + setZero(memory, offset + deltaZone->minBits, t2); + setOne(memory, offset + deltaZone->minBits + t2, 1); +} + +/** + * Encode a delta index entry. + * + * @param deltaEntry The delta index entry + * @param value The value associated with the entry + * @param name For collision entries, the 256 bit full name. + **/ +static void encodeEntry(const DeltaIndexEntry *deltaEntry, unsigned int value, + const byte *name) +{ + byte *memory = deltaEntry->deltaZone->memory; + uint64_t offset = getDeltaEntryOffset(deltaEntry); + setField(value, memory, offset, deltaEntry->valueBits); + encodeDelta(deltaEntry); + if (name != NULL) { + setBytes(memory, getCollisionOffset(deltaEntry), name, COLLISION_BYTES); + } +} + +/** + * Insert bits into a delta list at the offset of the specified delta index + * entry. 
+ * + * @param deltaEntry The delta index entry + * @param size The number of bits to insert + * + * @return UDS_SUCCESS or an error code + **/ +static int insertBits(DeltaIndexEntry *deltaEntry, int size) +{ + DeltaMemory *deltaZone = deltaEntry->deltaZone; + DeltaList *deltaList = deltaEntry->deltaList; + // Compute how many bits are in use before and after the inserted bits + uint32_t totalSize = getDeltaListSize(deltaList); + uint32_t beforeSize = deltaEntry->offset; + uint32_t afterSize = totalSize - deltaEntry->offset; + if ((unsigned int) (totalSize + size) > UINT16_MAX) { + deltaEntry->listOverflow = true; + deltaZone->overflowCount++; + return UDS_OVERFLOW; + } + + // Compute how many bits are available before and after the delta list + uint64_t freeBefore + = getDeltaListStart(&deltaList[0]) - getDeltaListEnd(&deltaList[-1]); + uint64_t freeAfter + = getDeltaListStart(&deltaList[1]) - getDeltaListEnd(&deltaList[ 0]); + + bool beforeFlag; + if (((unsigned int) size <= freeBefore) + && ((unsigned int) size <= freeAfter)) { + // We have enough space to use either before or after the list. Prefer + // to move the least amount of data. If it is exactly the same, try to + // take from the larger amount of free space. + if (beforeSize < afterSize) { + beforeFlag = true; + } else if (afterSize < beforeSize) { + beforeFlag = false; + } else { + beforeFlag = freeBefore > freeAfter; + } + } else if ((unsigned int) size <= freeBefore) { + // There is space before but not after + beforeFlag = true; + } else if ((unsigned int) size <= freeAfter) { + // There is space after but not before + beforeFlag = false; + } else { + // Neither of the surrounding spaces is large enough for this request, + // Extend and/or rebalance the delta list memory choosing to move the + // least amount of data. + unsigned int growingIndex = deltaEntry->listNumber + 1; + beforeFlag = beforeSize < afterSize; + if (!beforeFlag) { + growingIndex++; + } + int result = extendDeltaMemory(deltaZone, growingIndex, + (size + CHAR_BIT - 1) / CHAR_BIT, true); + if (result != UDS_SUCCESS) { + return result; + } + } + + uint64_t source, destination; + uint32_t count; + if (beforeFlag) { + source = getDeltaListStart(deltaList); + destination = source - size; + moveDeltaListStart(deltaList, -size); + count = beforeSize; + } else { + moveDeltaListEnd(deltaList, size); + source = getDeltaListStart(deltaList) + deltaEntry->offset; + destination = source + size; + count = afterSize; + } + byte *memory = deltaZone->memory; + moveBits(memory, source, memory, destination, count); + return UDS_SUCCESS; +} + +/** + * Get the amount of memory to allocate for each zone + * + * @param numZones The number of zones in the index + * @param memorySize The number of bytes in memory for the index + * + * @return the number of bytes to allocate for a single zone + **/ +static INLINE size_t getZoneMemorySize(unsigned int numZones, + size_t memorySize) +{ + size_t zoneSize = memorySize / numZones; + // Round the size up so that each zone is a multiple of 64K in size. 
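+  // (ALLOC_BOUNDARY is a power of two; adding ALLOC_BOUNDARY - 1 and then
+  //  masking with -ALLOC_BOUNDARY clears the low 16 bits, which rounds
+  //  zoneSize up to a multiple of 64K.)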
+ enum { ALLOC_BOUNDARY = 64 * KILOBYTE }; + return (zoneSize + ALLOC_BOUNDARY - 1) & -ALLOC_BOUNDARY; +} + +/** + * Validate delta index parameters + * + * @param meanDelta The mean delta value + * @param numPayloadBits The number of bits in the payload or value + **/ +static bool invalidParameters(unsigned int meanDelta, + unsigned int numPayloadBits) +{ + const unsigned int minDelta = 10; + const unsigned int maxDelta = 1 << MAX_FIELD_BITS; + if ((meanDelta < minDelta) || (meanDelta > maxDelta)) { + logWarning("error initializing delta index: " + "meanDelta (%u) is not in the range %u to %u", + meanDelta, minDelta, maxDelta); + return true; + } + if (numPayloadBits > MAX_FIELD_BITS) { + logWarning("error initializing delta index: Too many payload bits (%u)", + numPayloadBits); + return true; + } + return false; +} + +/** + * Set a delta index entry to be a collision + * + * @param deltaEntry The delta index entry + **/ +static void setCollision(DeltaIndexEntry *deltaEntry) +{ + deltaEntry->isCollision = true; + deltaEntry->entryBits += COLLISION_BITS; +} + +/** + * Set the delta in a delta index entry. + * + * @param deltaEntry The delta index entry + * @param delta The new delta + **/ +static void setDelta(DeltaIndexEntry *deltaEntry, unsigned int delta) +{ + const DeltaMemory *deltaZone = deltaEntry->deltaZone; + deltaEntry->delta = delta; + int keyBits = (deltaZone->minBits + + ((deltaZone->incrKeys - deltaZone->minKeys + delta) + / deltaZone->incrKeys)); + deltaEntry->entryBits = deltaEntry->valueBits + keyBits; +} + +//********************************************************************** +// External functions declared in deltaIndex.h +//********************************************************************** + +int initializeDeltaIndex(DeltaIndex *deltaIndex, unsigned int numZones, + unsigned int numLists, unsigned int meanDelta, + unsigned int numPayloadBits, size_t memorySize) +{ + size_t memSize = getZoneMemorySize(numZones, memorySize); + if (invalidParameters(meanDelta, numPayloadBits)) { + return UDS_INVALID_ARGUMENT; + } + + int result = ALLOCATE(numZones, DeltaMemory, "Delta Index Zones", + &deltaIndex->deltaZones); + if (result != UDS_SUCCESS) { + return result; + } + + deltaIndex->numZones = numZones; + deltaIndex->numLists = numLists; + deltaIndex->listsPerZone = (numLists + numZones - 1) / numZones; + deltaIndex->isMutable = true; + deltaIndex->tag = 'm'; + + unsigned int z; + for (z = 0; z < numZones; z++) { + unsigned int firstListInZone = z * deltaIndex->listsPerZone; + unsigned int numListsInZone = deltaIndex->listsPerZone; + if (z == numZones - 1) { + /* + * The last zone gets fewer lists if numZones doesn't evenly divide + * numLists. We'll have an underflow if the assertion below doesn't + * hold. (And it turns out that the assertion is equivalent to + * numZones <= 1 + (numLists / numZones) + (numLists % numZones) + * in the case that numZones doesn't evenly divide numlists. + * If numLists >= numZones * numZones, then the above inequality + * will always hold.) 
+ */ + if (deltaIndex->numLists <= firstListInZone) { + uninitializeDeltaIndex(deltaIndex); + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "%u delta-lists not enough for %u zones", + numLists, numZones); + } + numListsInZone = deltaIndex->numLists - firstListInZone; + } + int result = initializeDeltaMemory(&deltaIndex->deltaZones[z], memSize, + firstListInZone, numListsInZone, + meanDelta, numPayloadBits); + if (result != UDS_SUCCESS) { + uninitializeDeltaIndex(deltaIndex); + return result; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +static bool verifyDeltaIndexPage(uint64_t nonce, + uint16_t numLists, + uint64_t expectedNonce, + byte *memory, + size_t memSize) +{ + // Verify the nonce. A mismatch here happens in normal operation when we are + // doing a rebuild but haven't written the entire volume once. + if (nonce != expectedNonce) { + return false; + } + + // Verify that the number of delta lists can fit in the page. + if (numLists > + (memSize - sizeof(DeltaPageHeader)) * CHAR_BIT / IMMUTABLE_HEADER_SIZE) { + return false; + } + + // Verify that the first delta list is immediately after the last delta list + // header. + if (getImmutableStart(memory, 0) != getImmutableHeaderOffset(numLists + 1)) { + return false; + } + + // Verify that the lists are in the correct order. + unsigned int i; + for (i = 0; i < numLists; i++) { + if (getImmutableStart(memory, i) > getImmutableStart(memory, i + 1)) { + return false; + } + } + + // Verify that the last list ends on the page, and that there is room for the + // post-field guard bits. + if (getImmutableStart(memory, numLists) + > (memSize - POST_FIELD_GUARD_BYTES) * CHAR_BIT) { + return false; + } + + // Verify that the guard bytes are correctly set to all ones. + for (i = 0; i < POST_FIELD_GUARD_BYTES; i++) { + byte guardByte = memory[memSize - POST_FIELD_GUARD_BYTES + i]; + if (guardByte != (byte) ~0) { + return false; + } + } + + // All verifications passed. + return true; +} + +/**********************************************************************/ +int initializeDeltaIndexPage(DeltaIndexPage *deltaIndexPage, + uint64_t expectedNonce, + unsigned int meanDelta, + unsigned int numPayloadBits, + byte *memory, + size_t memSize) +{ + const DeltaPageHeader *header = (const DeltaPageHeader *) memory; + + if (invalidParameters(meanDelta, numPayloadBits)) { + return UDS_INVALID_ARGUMENT; + } + + // First assume that the header is little endian + uint64_t nonce = getUInt64LE((const byte *) &header->nonce); + uint64_t vcn = getUInt64LE((const byte *) &header->virtualChapterNumber); + uint16_t firstList = getUInt16LE((const byte *) &header->firstList); + uint16_t numLists = getUInt16LE((const byte *) &header->numLists); + if (!verifyDeltaIndexPage(nonce, numLists, expectedNonce, memory, memSize)) { + // That failed, so try big endian + nonce = getUInt64BE((const byte *) &header->nonce); + vcn = getUInt64BE((const byte *) &header->virtualChapterNumber); + firstList = getUInt16BE((const byte *) &header->firstList); + numLists = getUInt16BE((const byte *) &header->numLists); + if (!verifyDeltaIndexPage(nonce, numLists, expectedNonce, memory, + memSize)) { + // Also failed. Do not log this as an error. It happens in normal + // operation when we are doing a rebuild but haven't written the entire + // volume once. 
+ return UDS_CORRUPT_COMPONENT; + } + } + + deltaIndexPage->deltaIndex.deltaZones = &deltaIndexPage->deltaMemory; + deltaIndexPage->deltaIndex.numZones = 1; + deltaIndexPage->deltaIndex.numLists = numLists; + deltaIndexPage->deltaIndex.listsPerZone = numLists; + deltaIndexPage->deltaIndex.isMutable = false; + deltaIndexPage->deltaIndex.tag = 'p'; + deltaIndexPage->virtualChapterNumber = vcn; + deltaIndexPage->lowestListNumber = firstList; + deltaIndexPage->highestListNumber = firstList + numLists - 1; + + initializeDeltaMemoryPage(&deltaIndexPage->deltaMemory, (byte *) memory, + memSize, numLists, meanDelta, numPayloadBits); + return UDS_SUCCESS; +} + +/**********************************************************************/ +void uninitializeDeltaIndex(DeltaIndex *deltaIndex) +{ + if (deltaIndex != NULL) { + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + uninitializeDeltaMemory(&deltaIndex->deltaZones[z]); + } + FREE(deltaIndex->deltaZones); + memset(deltaIndex, 0, sizeof(DeltaIndex)); + } +} + +/**********************************************************************/ +void emptyDeltaIndex(const DeltaIndex *deltaIndex) +{ + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + emptyDeltaLists(&deltaIndex->deltaZones[z]); + } +} + +/**********************************************************************/ +void emptyDeltaIndexZone(const DeltaIndex *deltaIndex, unsigned int zoneNumber) +{ + emptyDeltaLists(&deltaIndex->deltaZones[zoneNumber]); +} + +/**********************************************************************/ +int packDeltaIndexPage(const DeltaIndex *deltaIndex, + uint64_t headerNonce, + bool headerNativeEndian, + byte *memory, + size_t memSize, + uint64_t virtualChapterNumber, + unsigned int firstList, + unsigned int *numLists) +{ + if (!deltaIndex->isMutable) { + return logErrorWithStringError(UDS_BAD_STATE, + "Cannot pack an immutable index"); + } + if (deltaIndex->numZones != 1) { + return logErrorWithStringError(UDS_BAD_STATE, + "Cannot pack a delta index page when the" + " index has %u zones", + deltaIndex->numZones); + } + if (firstList > deltaIndex->numLists) { + return logErrorWithStringError(UDS_BAD_STATE, + "Cannot pack a delta index page when the" + " first list (%u) is larger than the number" + " of lists (%u)", + firstList, deltaIndex->numLists); + } + + const DeltaMemory *deltaZone = &deltaIndex->deltaZones[0]; + DeltaList *deltaLists = &deltaZone->deltaLists[firstList + 1]; + unsigned int maxLists = deltaIndex->numLists - firstList; + + // Compute how many lists will fit on the page + int numBits = memSize * CHAR_BIT; + // Subtract the size of the fixed header and 1 delta list offset + numBits -= getImmutableHeaderOffset(1); + // Subtract the guard bytes of memory so that allow us to freely read a + // short distance past the end of any byte we are interested in. 
+ numBits -= POST_FIELD_GUARD_BYTES * CHAR_BIT; + if (numBits < IMMUTABLE_HEADER_SIZE) { + // This page is too small to contain even one empty delta list + return logErrorWithStringError(UDS_OVERFLOW, + "Chapter Index Page of %zu bytes is too" + " small", + memSize); + } + + unsigned int nLists = 0; + while (nLists < maxLists) { + // Each list requires 1 delta list offset and the list data + int bits = IMMUTABLE_HEADER_SIZE + getDeltaListSize(&deltaLists[nLists]); + if (bits > numBits) { + break; + } + nLists++; + numBits -= bits; + } + *numLists = nLists; + + // Construct the page header + DeltaPageHeader *header = (DeltaPageHeader *) memory; + if (headerNativeEndian) { + header->nonce = headerNonce; + header->virtualChapterNumber = virtualChapterNumber; + header->firstList = firstList; + header->numLists = nLists; + } else { + storeUInt64LE((byte *) &header->nonce, headerNonce); + storeUInt64LE((byte *) &header->virtualChapterNumber, + virtualChapterNumber); + storeUInt16LE((byte *) &header->firstList, firstList); + storeUInt16LE((byte *) &header->numLists, nLists); + } + + // Construct the delta list offset table, making sure that the memory + // page is large enough. + unsigned int offset = getImmutableHeaderOffset(nLists + 1); + setImmutableStart(memory, 0, offset); + unsigned int i; + for (i = 0; i < nLists; i++) { + offset += getDeltaListSize(&deltaLists[i]); + setImmutableStart(memory, i + 1, offset); + } + + // Copy the delta list data onto the memory page + for (i = 0; i < nLists; i++) { + DeltaList *deltaList = &deltaLists[i]; + moveBits(deltaZone->memory, getDeltaListStart(deltaList), memory, + getImmutableStart(memory, i), getDeltaListSize(deltaList)); + } + + // Set all the bits in the guard bytes. Do not use the bit field + // utilities. 
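+  // (Writing ~0 makes every guard byte 0xFF, so the decoder's scan for a
+  //  terminating one bit always stops within the guard bytes even if a
+  //  delta list on the page is corrupted.)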
+ memset(memory + memSize - POST_FIELD_GUARD_BYTES, ~0, + POST_FIELD_GUARD_BYTES); + return UDS_SUCCESS; +} + + +/**********************************************************************/ +void setDeltaIndexTag(DeltaIndex *deltaIndex, byte tag) +{ + deltaIndex->tag = tag; + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + deltaIndex->deltaZones[z].tag = tag; + } +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int decodeDeltaIndexHeader(Buffer *buffer, struct di_header *header) +{ + int result = getBytesFromBuffer(buffer, MAGIC_SIZE, &header->magic); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &header->zoneNumber); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &header->numZones); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &header->firstList); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &header->numLists); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &header->recordCount); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &header->collisionCount); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, + "%zu bytes decoded of %zu expected", + bufferLength(buffer) - contentLength(buffer), + bufferLength(buffer)); + return result; +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int readDeltaIndexHeader(BufferedReader *reader, + struct di_header *header) +{ + Buffer *buffer; + + int result = makeBuffer(sizeof(*header), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(reader, getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return logWarningWithStringError(result, + "failed to read delta index header"); + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = decodeDeltaIndexHeader(buffer, header); + freeBuffer(&buffer); + return result; +} + +/**********************************************************************/ +int startRestoringDeltaIndex(const DeltaIndex *deltaIndex, + BufferedReader **bufferedReaders, + int numReaders) +{ + if (!deltaIndex->isMutable) { + return logErrorWithStringError(UDS_BAD_STATE, + "Cannot restore to an immutable index"); + } + if (numReaders <= 0) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "No delta index files"); + } + + unsigned int numZones = numReaders; + if (numZones > MAX_ZONES) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "zone count %u must not exceed MAX_ZONES", + numZones); + } + + unsigned long recordCount = 0; + unsigned long collisionCount = 0; + unsigned int firstList[MAX_ZONES]; + unsigned int numLists[MAX_ZONES]; + BufferedReader *reader[MAX_ZONES]; + bool zoneFlags[MAX_ZONES] = { false, }; + + // Read the header from each file, and make sure we have a matching set + unsigned int z; + for (z = 0; z < numZones; z++) { + struct di_header header; + int result = readDeltaIndexHeader(bufferedReaders[z], &header); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to read delta index header"); + } + if 
(memcmp(header.magic, MAGIC_DI_START, MAGIC_SIZE) != 0) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "delta index file has bad magic" + " number"); + } + if (numZones != header.numZones) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "delta index files contain mismatched" + " zone counts (%u,%u)", + numZones, header.numZones); + } + if (header.zoneNumber >= numZones) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "delta index files contains zone %u of" + " %u zones", + header.zoneNumber, numZones); + } + if (zoneFlags[header.zoneNumber]) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "delta index files contain two of zone" + " %u", + header.zoneNumber); + } + reader[header.zoneNumber] = bufferedReaders[z]; + firstList[header.zoneNumber] = header.firstList; + numLists[header.zoneNumber] = header.numLists; + zoneFlags[header.zoneNumber] = true; + recordCount += header.recordCount; + collisionCount += header.collisionCount; + } + unsigned int listNext = 0; + for (z = 0; z < numZones; z++) { + if (firstList[z] != listNext) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "delta index file for zone %u starts" + " with list %u instead of list %u", + z, firstList[z], listNext); + } + listNext += numLists[z]; + } + if (listNext != deltaIndex->numLists) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "delta index files contain %u delta lists" + " instead of %u delta lists", + listNext, deltaIndex->numLists); + } + if (collisionCount > recordCount) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "delta index files contain %ld collisions" + " and %ld records", + collisionCount, recordCount); + } + + emptyDeltaIndex(deltaIndex); + deltaIndex->deltaZones[0].recordCount = recordCount; + deltaIndex->deltaZones[0].collisionCount = collisionCount; + + // Read the delta list sizes from the files, and distribute each of them + // to proper zone + for (z = 0; z < numZones; z++) { + unsigned int i; + for (i = 0; i < numLists[z]; i++) { + byte deltaListSizeData[sizeof(uint16_t)]; + int result = readFromBufferedReader(reader[z], deltaListSizeData, + sizeof(deltaListSizeData)); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to read delta index size"); + } + uint16_t deltaListSize = getUInt16LE(deltaListSizeData); + unsigned int listNumber = firstList[z] + i; + unsigned int zoneNumber = getDeltaIndexZone(deltaIndex, listNumber); + const DeltaMemory *deltaZone = &deltaIndex->deltaZones[zoneNumber]; + listNumber -= deltaZone->firstList; + deltaZone->deltaLists[listNumber + 1].size = deltaListSize; + } + } + + // Prepare each zone to start receiving the delta list data + for (z = 0; z < deltaIndex->numZones; z++) { + int result = startRestoringDeltaMemory(&deltaIndex->deltaZones[z]); + if (result != UDS_SUCCESS) { + return result; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +bool isRestoringDeltaIndexDone(const DeltaIndex *deltaIndex) +{ + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + if (!areDeltaMemoryTransfersDone(&deltaIndex->deltaZones[z])) { + return false; + } + } + return true; +} + +/**********************************************************************/ +int restoreDeltaListToDeltaIndex(const DeltaIndex *deltaIndex, + const DeltaListSaveInfo *dlsi, + const byte data[DELTA_LIST_MAX_BYTE_COUNT]) +{ + // Make sure the data are intended for this delta list. 
Do not + // log an error, as this may be valid data for another delta index. + if (dlsi->tag != deltaIndex->tag) { + return UDS_CORRUPT_COMPONENT; + } + + if (dlsi->index >= deltaIndex->numLists) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "invalid delta list number %u of %u", + dlsi->index, deltaIndex->numLists); + } + + unsigned int zoneNumber = getDeltaIndexZone(deltaIndex, dlsi->index); + return restoreDeltaList(&deltaIndex->deltaZones[zoneNumber], dlsi, data); +} + +/**********************************************************************/ +void abortRestoringDeltaIndex(const DeltaIndex *deltaIndex) +{ + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + abortRestoringDeltaMemory(&deltaIndex->deltaZones[z]); + } +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int encodeDeltaIndexHeader(Buffer *buffer, struct di_header *header) +{ + int result = putBytes(buffer, MAGIC_SIZE, MAGIC_DI_START); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, header->zoneNumber); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, header->numZones); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, header->firstList); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, header->numLists); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, header->recordCount); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, header->collisionCount); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == sizeof(*header), + "%zu bytes encoded of %zu expected", + contentLength(buffer), sizeof(*header)); + + return result; +} + +/**********************************************************************/ +int startSavingDeltaIndex(const DeltaIndex *deltaIndex, + unsigned int zoneNumber, + BufferedWriter *bufferedWriter) +{ + DeltaMemory *deltaZone = &deltaIndex->deltaZones[zoneNumber]; + struct di_header header; + memcpy(header.magic, MAGIC_DI_START, MAGIC_SIZE); + header.zoneNumber = zoneNumber; + header.numZones = deltaIndex->numZones; + header.firstList = deltaZone->firstList; + header.numLists = deltaZone->numLists; + header.recordCount = deltaZone->recordCount; + header.collisionCount = deltaZone->collisionCount; + + Buffer *buffer; + int result = makeBuffer(sizeof(struct di_header), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = encodeDeltaIndexHeader(buffer, &header); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = writeToBufferedWriter(bufferedWriter, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to write delta index header"); + } + + unsigned int i; + for (i = 0; i < deltaZone->numLists; i++) { + uint16_t deltaListSize = getDeltaListSize(&deltaZone->deltaLists[i + 1]); + byte data[2]; + storeUInt16LE(data, deltaListSize); + result = writeToBufferedWriter(bufferedWriter, data, sizeof(data)); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to write delta list size"); + } + } + + startSavingDeltaMemory(deltaZone, bufferedWriter); + return UDS_SUCCESS; +} + 
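A note on the per-zone save stream produced above: startSavingDeltaIndex() writes the encoded di_header, then one little-endian uint16_t size per delta list in the zone, and then hands the list data itself to startSavingDeltaMemory(). The sketch below shows only the 40-byte header layout implied by encodeDeltaIndexHeader(); it is illustrative, and store_le32(), store_le64(), and sketch_encode_header() are hypothetical helper names, not part of the UDS sources.

/* Layout sketch: the per-zone save header, fields in struct di_header order,
 * all multi-byte values little-endian. */
#include <stdint.h>
#include <string.h>

enum { DI_MAGIC_SIZE = 8, DI_HEADER_SIZE = 8 + 4 * 4 + 2 * 8 }; /* 40 bytes */

static void store_le32(uint8_t *out, uint32_t v)
{
  out[0] = (uint8_t) v;
  out[1] = (uint8_t) (v >> 8);
  out[2] = (uint8_t) (v >> 16);
  out[3] = (uint8_t) (v >> 24);
}

static void store_le64(uint8_t *out, uint64_t v)
{
  store_le32(out, (uint32_t) v);
  store_le32(out + 4, (uint32_t) (v >> 32));
}

static void sketch_encode_header(uint8_t out[DI_HEADER_SIZE],
                                 uint32_t zoneNumber, uint32_t numZones,
                                 uint32_t firstList, uint32_t numLists,
                                 uint64_t recordCount, uint64_t collisionCount)
{
  memcpy(out, "DI-00002", DI_MAGIC_SIZE);  /* MAGIC_DI_START */
  store_le32(out + 8,  zoneNumber);
  store_le32(out + 12, numZones);
  store_le32(out + 16, firstList);
  store_le32(out + 20, numLists);
  store_le64(out + 24, recordCount);
  store_le64(out + 32, collisionCount);
}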
+/**********************************************************************/ +bool isSavingDeltaIndexDone(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) +{ + return areDeltaMemoryTransfersDone(&deltaIndex->deltaZones[zoneNumber]); +} + +/**********************************************************************/ +int finishSavingDeltaIndex(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) +{ + return finishSavingDeltaMemory(&deltaIndex->deltaZones[zoneNumber]); +} + +/**********************************************************************/ +int abortSavingDeltaIndex(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) +{ + abortSavingDeltaMemory(&deltaIndex->deltaZones[zoneNumber]); + return UDS_SUCCESS; +} + +/**********************************************************************/ +size_t computeDeltaIndexSaveBytes(unsigned int numLists, size_t memorySize) +{ + // The exact amount of memory used depends upon the number of zones. + // Compute the maximum potential memory size. + size_t maxMemSize = memorySize; + unsigned int numZones; + for (numZones = 1; numZones <= MAX_ZONES; numZones++) { + size_t memSize = getZoneMemorySize(numZones, memorySize); + if (memSize > maxMemSize) { + maxMemSize = memSize; + } + } + // Saving a delta index requires a header ... + return (sizeof(struct di_header) + // ... plus a DeltaListSaveInfo per delta list + // plus an extra byte per delta list ... + + numLists * (sizeof(DeltaListSaveInfo) + 1) + // ... plus the delta list memory + + maxMemSize); +} + +/**********************************************************************/ +int validateDeltaIndex(const DeltaIndex *deltaIndex) +{ + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + int result = validateDeltaLists(&deltaIndex->deltaZones[z]); + if (result != UDS_SUCCESS) { + return result; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +static int assertNotAtEnd(const DeltaIndexEntry *deltaEntry, int errorCode) +{ + return ASSERT_WITH_ERROR_CODE(!deltaEntry->atEnd, errorCode, + "operation is invalid because the list entry " + "is at the end of the delta list"); +} + +/**********************************************************************/ +static void prefetchDeltaList(const DeltaMemory *deltaZone, + const DeltaList *deltaList) +{ + const byte *memory = deltaZone->memory; + const byte *addr = &memory[getDeltaListStart(deltaList) / CHAR_BIT]; + unsigned int size = getDeltaListSize(deltaList) / CHAR_BIT; + prefetchRange(addr, size, false); +} + +/**********************************************************************/ +int startDeltaIndexSearch(const DeltaIndex *deltaIndex, + unsigned int listNumber, unsigned int key, + bool readOnly, DeltaIndexEntry *deltaEntry) +{ + int result + = ASSERT_WITH_ERROR_CODE((listNumber < deltaIndex->numLists), + UDS_CORRUPT_DATA, + "Delta list number (%u) is out of range (%u)", + listNumber, deltaIndex->numLists); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int zoneNumber = getDeltaIndexZone(deltaIndex, listNumber); + DeltaMemory *deltaZone = &deltaIndex->deltaZones[zoneNumber]; + listNumber -= deltaZone->firstList; + result = ASSERT_WITH_ERROR_CODE((listNumber < deltaZone->numLists), + UDS_CORRUPT_DATA, + "Delta list number (%u)" + " is out of range (%u) for zone (%u)", + listNumber, deltaZone->numLists, zoneNumber); + if (result != UDS_SUCCESS) { + return result; + } + + DeltaList *deltaList; + if (deltaIndex->isMutable) { + deltaList = 
&deltaZone->deltaLists[listNumber + 1]; + if (!readOnly) { + // Here is the lazy writing of the index for a checkpoint + lazyFlushDeltaList(deltaZone, listNumber); + } + } else { + // Translate the immutable delta list header into a temporary full + // delta list header + deltaList = &deltaEntry->tempDeltaList; + deltaList->startOffset = getImmutableStart(deltaZone->memory, listNumber); + unsigned int endOffset = getImmutableStart(deltaZone->memory, + listNumber + 1); + deltaList->size = endOffset - deltaList->startOffset; + deltaList->saveKey = 0; + deltaList->saveOffset = 0; + } + + if (key > deltaList->saveKey) { + deltaEntry->key = deltaList->saveKey; + deltaEntry->offset = deltaList->saveOffset; + } else { + deltaEntry->key = 0; + deltaEntry->offset = 0; + if (key == 0) { + // This usually means we're about to walk the entire delta list, so get + // all of it into the CPU cache. + prefetchDeltaList(deltaZone, deltaList); + } + } + + deltaEntry->atEnd = false; + deltaEntry->deltaZone = deltaZone; + deltaEntry->deltaList = deltaList; + deltaEntry->entryBits = 0; + deltaEntry->isCollision = false; + deltaEntry->listNumber = listNumber; + deltaEntry->listOverflow = false; + deltaEntry->valueBits = deltaZone->valueBits; + return UDS_SUCCESS; +} + +/**********************************************************************/ +__attribute__((__noinline__)) +int nextDeltaIndexEntry(DeltaIndexEntry *deltaEntry) +{ + int result = assertNotAtEnd(deltaEntry, UDS_BAD_STATE); + if (result != UDS_SUCCESS) { + return result; + } + + const DeltaList *deltaList = deltaEntry->deltaList; + deltaEntry->offset += deltaEntry->entryBits; + unsigned int size = getDeltaListSize(deltaList); + if (unlikely(deltaEntry->offset >= size)) { + deltaEntry->atEnd = true; + deltaEntry->delta = 0; + deltaEntry->isCollision = false; + return ASSERT_WITH_ERROR_CODE((deltaEntry->offset == size), + UDS_CORRUPT_DATA, + "next offset past end of delta list"); + } + + decodeDelta(deltaEntry); + + unsigned int nextOffset = deltaEntry->offset + deltaEntry->entryBits; + if (nextOffset > size) { + // This is not an assertion because validateChapterIndexPage() wants to + // handle this error. 
+ logWarning("Decoded past the end of the delta list"); + return UDS_CORRUPT_DATA; + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int rememberDeltaIndexOffset(const DeltaIndexEntry *deltaEntry) +{ + int result = ASSERT(!deltaEntry->isCollision, "entry is not a collision"); + if (result != UDS_SUCCESS) { + return result; + } + + DeltaList *deltaList = deltaEntry->deltaList; + deltaList->saveKey = deltaEntry->key - deltaEntry->delta; + deltaList->saveOffset = deltaEntry->offset; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getDeltaIndexEntry(const DeltaIndex *deltaIndex, unsigned int listNumber, + unsigned int key, const byte *name, bool readOnly, + DeltaIndexEntry *deltaEntry) +{ + int result = startDeltaIndexSearch(deltaIndex, listNumber, key, readOnly, + deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + do { + result = nextDeltaIndexEntry(deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + } while (!deltaEntry->atEnd && (key > deltaEntry->key)); + + result = rememberDeltaIndexOffset(deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + + if (!deltaEntry->atEnd && (key == deltaEntry->key)) { + DeltaIndexEntry collisionEntry; + collisionEntry = *deltaEntry; + for (;;) { + result = nextDeltaIndexEntry(&collisionEntry); + if (result != UDS_SUCCESS) { + return result; + } + if (collisionEntry.atEnd || !collisionEntry.isCollision) { + break; + } + byte collisionName[COLLISION_BYTES]; + getBytes(deltaEntry->deltaZone->memory, + getCollisionOffset(&collisionEntry), collisionName, + COLLISION_BYTES); + if (memcmp(collisionName, name, COLLISION_BYTES) == 0) { + *deltaEntry = collisionEntry; + break; + } + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getDeltaEntryCollision(const DeltaIndexEntry *deltaEntry, byte *name) +{ + int result = assertNotAtEnd(deltaEntry, UDS_BAD_STATE); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_WITH_ERROR_CODE(deltaEntry->isCollision, UDS_BAD_STATE, + "Cannot get full block name from a" + " non-collision delta index entry"); + if (result != UDS_SUCCESS) { + return result; + } + + getBytes(deltaEntry->deltaZone->memory, getCollisionOffset(deltaEntry), + name, COLLISION_BYTES); + return UDS_SUCCESS; +} + +/**********************************************************************/ +static int assertMutableEntry(const DeltaIndexEntry *deltaEntry) +{ + return ASSERT_WITH_ERROR_CODE(deltaEntry->deltaList + != &deltaEntry->tempDeltaList, + UDS_BAD_STATE, + "delta index is mutable"); +} + +/**********************************************************************/ +int setDeltaEntryValue(const DeltaIndexEntry *deltaEntry, unsigned int value) +{ + int result = assertMutableEntry(deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + result = assertNotAtEnd(deltaEntry, UDS_BAD_STATE); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT_WITH_ERROR_CODE(((value & ((1 << deltaEntry->valueBits) - 1)) + == value), UDS_INVALID_ARGUMENT, + "Value (%u) being set in a delta index is " + "too large (must fit in %u bits)", + value, deltaEntry->valueBits); + if (result != UDS_SUCCESS) { + return result; + } + + setField(value, deltaEntry->deltaZone->memory, + getDeltaEntryOffset(deltaEntry), deltaEntry->valueBits); + return UDS_SUCCESS; +} + +/**********************************************************************/ 
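An aside before putDeltaIndexEntry(): the MINBITS/BASE/INCR delta code described in the header comment of this file can be exercised with a small standalone program. The sketch below mirrors the ENCODE/DECODE pseudocode from that comment, but packs each code into a single 64-bit word instead of using the real bit-stream routines (setField(), moveBits(), and so on); make_code(), encode_delta(), and decode_delta() are illustrative names only, and the derived constants may differ slightly from what the real initialization computes because of rounding.

/* Standalone sketch of the exponential-Golomb-style delta code described in
 * the deltaIndex.c header comment, restricted to codes that fit in 64 bits. */
#include <stdint.h>
#include <stdio.h>

struct delta_code { unsigned int min_bits, base, incr; };

/* Derive MINBITS, BASE, and INCR from the mean delta; 0.6931472 stands in
 * for log(2). */
static struct delta_code make_code(unsigned int mean_delta)
{
  struct delta_code c;
  c.incr = (unsigned int) (0.6931472 * mean_delta);
  c.min_bits = 1;
  while ((1u << c.min_bits) <= c.incr) {
    c.min_bits++;
  }
  c.base = (1u << c.min_bits) - c.incr;
  return c;
}

/* Number of bits the code uses for a given delta. */
static unsigned int code_length(const struct delta_code *c, unsigned int delta)
{
  if (delta < c->base) {
    return c->min_bits;
  }
  return c->min_bits + (delta - c->base) / c->incr + 1;
}

/* Encode a delta into the low bits of *word (LSB first); return the number
 * of bits written: T1 in MINBITS bits, then T2 zero bits, then a one bit. */
static unsigned int encode_delta(const struct delta_code *c,
                                 unsigned int delta, uint64_t *word)
{
  if (delta < c->base) {
    *word = delta;
    return c->min_bits;
  }
  unsigned int t1 = (delta - c->base) % c->incr + c->base;
  unsigned int t2 = (delta - c->base) / c->incr;
  *word = t1 | ((uint64_t) 1 << (c->min_bits + t2));
  return c->min_bits + t2 + 1;
}

/* Decode a delta from the low bits of word. */
static unsigned int decode_delta(const struct delta_code *c, uint64_t word)
{
  unsigned int t1 = word & ((1u << c->min_bits) - 1);
  if (t1 < c->base) {
    return t1;
  }
  unsigned int t2 = 0;
  word >>= c->min_bits;
  while ((word & 1) == 0) {
    t2++;
    word >>= 1;
  }
  return t2 * c->incr + t1;
}

int main(void)
{
  /* 4096 matches DEFAULT_MASTER_INDEX_MEAN_DELTA from config.h. */
  struct delta_code c = make_code(4096);
  unsigned int delta;
  for (delta = 0; delta < 20000; delta += 977) {
    uint64_t word;
    unsigned int bits = encode_delta(&c, delta, &word);
    if (decode_delta(&c, word) != delta || bits != code_length(&c, delta)) {
      printf("mismatch at %u\n", delta);
      return 1;
    }
  }
  printf("minBits=%u base=%u incr=%u\n", c.min_bits, c.base, c.incr);
  return 0;
}

With a mean delta of 4096 this sketch derives minBits = 12, so small deltas cost 12 key bits and each additional multiple of INCR costs one more bit, which is the shape the header comment describes for exponentially distributed deltas.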
+int putDeltaIndexEntry(DeltaIndexEntry *deltaEntry, unsigned int key, + unsigned int value, const byte *name) +{ + int result = assertMutableEntry(deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + if (deltaEntry->isCollision) { + /* + * The caller wants us to insert a collision entry onto a collision + * entry. This happens when we find a collision and attempt to add the + * name again to the index. This is normally a fatal error unless we + * are replaying a closed chapter while we are rebuilding a master + * index. + */ + return UDS_DUPLICATE_NAME; + } + + if (deltaEntry->offset < deltaEntry->deltaList->saveOffset) { + // The saved entry offset is after the new entry and will no longer be + // valid, so replace it with the insertion point. + result = rememberDeltaIndexOffset(deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + } + + if (name != NULL) { + // We are inserting a collision entry which is placed after this entry + result = assertNotAtEnd(deltaEntry, UDS_BAD_STATE); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT((key == deltaEntry->key), + "incorrect key for collision entry"); + if (result != UDS_SUCCESS) { + return result; + } + + deltaEntry->offset += deltaEntry->entryBits; + setDelta(deltaEntry, 0); + setCollision(deltaEntry); + result = insertBits(deltaEntry, deltaEntry->entryBits); + } else if (deltaEntry->atEnd) { + // We are inserting a new entry at the end of the delta list + result = ASSERT((key >= deltaEntry->key), "key past end of list"); + if (result != UDS_SUCCESS) { + return result; + } + + setDelta(deltaEntry, key - deltaEntry->key); + deltaEntry->key = key; + deltaEntry->atEnd = false; + result = insertBits(deltaEntry, deltaEntry->entryBits); + } else { + // We are inserting a new entry which requires the delta in the + // following entry to be updated. + result = ASSERT((key < deltaEntry->key), "key precedes following entry"); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT((key >= deltaEntry->key - deltaEntry->delta), + "key effects following entry's delta"); + if (result != UDS_SUCCESS) { + return result; + } + + int oldEntrySize = deltaEntry->entryBits; + DeltaIndexEntry nextEntry = *deltaEntry; + unsigned int nextValue = getDeltaEntryValue(&nextEntry); + setDelta(deltaEntry, key - (deltaEntry->key - deltaEntry->delta)); + deltaEntry->key = key; + setDelta(&nextEntry, nextEntry.key - key); + nextEntry.offset += deltaEntry->entryBits; + // The 2 new entries are always bigger than the 1 entry we are replacing + int additionalSize + = deltaEntry->entryBits + nextEntry.entryBits - oldEntrySize; + result = insertBits(deltaEntry, additionalSize); + if (result != UDS_SUCCESS) { + return result; + } + encodeEntry(&nextEntry, nextValue, NULL); + } + if (result != UDS_SUCCESS) { + return result; + } + encodeEntry(deltaEntry, value, name); + + DeltaMemory *deltaZone = deltaEntry->deltaZone; + deltaZone->recordCount++; + deltaZone->collisionCount += deltaEntry->isCollision ? 
1 : 0; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int removeDeltaIndexEntry(DeltaIndexEntry *deltaEntry) +{ + int result = assertMutableEntry(deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + + DeltaIndexEntry nextEntry = *deltaEntry; + result = nextDeltaIndexEntry(&nextEntry); + if (result != UDS_SUCCESS) { + return result; + } + + DeltaMemory *deltaZone = deltaEntry->deltaZone; + + if (deltaEntry->isCollision) { + // This is a collision entry, so just remove it + deleteBits(deltaEntry, deltaEntry->entryBits); + nextEntry.offset = deltaEntry->offset; + deltaZone->collisionCount -= 1; + } else if (nextEntry.atEnd) { + // This entry is at the end of the list, so just remove it + deleteBits(deltaEntry, deltaEntry->entryBits); + nextEntry.key -= deltaEntry->delta; + nextEntry.offset = deltaEntry->offset; + } else { + // The delta in the next entry needs to be updated. + unsigned int nextValue = getDeltaEntryValue(&nextEntry); + int oldSize = deltaEntry->entryBits + nextEntry.entryBits; + if (nextEntry.isCollision) { + // The next record is a collision. It needs to be rewritten as a + // non-collision with a larger delta. + nextEntry.isCollision = false; + deltaZone->collisionCount -= 1; + } + setDelta(&nextEntry, deltaEntry->delta + nextEntry.delta); + nextEntry.offset = deltaEntry->offset; + // The 1 new entry is always smaller than the 2 entries we are replacing + deleteBits(deltaEntry, oldSize - nextEntry.entryBits); + encodeEntry(&nextEntry, nextValue, NULL); + } + deltaZone->recordCount--; + deltaZone->discardCount++; + *deltaEntry = nextEntry; + + DeltaList *deltaList = deltaEntry->deltaList; + if (deltaEntry->offset < deltaList->saveOffset) { + // The saved entry offset is after the entry we just removed and it + // will no longer be valid. We must force the next search to start at + // the beginning. 
+ deltaList->saveKey = 0; + deltaList->saveOffset = 0; + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +unsigned int getDeltaIndexZoneFirstList(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) +{ + return deltaIndex->deltaZones[zoneNumber].firstList; +} + +/**********************************************************************/ +unsigned int getDeltaIndexZoneNumLists(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) +{ + return deltaIndex->deltaZones[zoneNumber].numLists; +} + +/**********************************************************************/ +uint64_t getDeltaIndexZoneDlistBitsUsed(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) +{ + uint64_t bitCount = 0; + const DeltaMemory *deltaZone = &deltaIndex->deltaZones[zoneNumber]; + unsigned int i; + for (i = 0; i < deltaZone->numLists; i++) { + bitCount += getDeltaListSize(&deltaZone->deltaLists[i + 1]); + } + return bitCount; +} + +/**********************************************************************/ +uint64_t getDeltaIndexDlistBitsUsed(const DeltaIndex *deltaIndex) +{ + uint64_t bitCount = 0; + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + bitCount += getDeltaIndexZoneDlistBitsUsed(deltaIndex, z); + } + return bitCount; +} + +/**********************************************************************/ +uint64_t getDeltaIndexDlistBitsAllocated(const DeltaIndex *deltaIndex) +{ + uint64_t byteCount = 0; + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + const DeltaMemory *deltaZone = &deltaIndex->deltaZones[z]; + byteCount += deltaZone->size; + } + return byteCount * CHAR_BIT; +} + +/**********************************************************************/ +void getDeltaIndexStats(const DeltaIndex *deltaIndex, DeltaIndexStats *stats) +{ + memset(stats, 0, sizeof(DeltaIndexStats)); + stats->memoryAllocated = deltaIndex->numZones * sizeof(DeltaMemory); + unsigned int z; + for (z = 0; z < deltaIndex->numZones; z++) { + const DeltaMemory *deltaZone = &deltaIndex->deltaZones[z]; + stats->memoryAllocated += getDeltaMemoryAllocated(deltaZone); + stats->rebalanceTime += deltaZone->rebalanceTime; + stats->rebalanceCount += deltaZone->rebalanceCount; + stats->recordCount += deltaZone->recordCount; + stats->collisionCount += deltaZone->collisionCount; + stats->discardCount += deltaZone->discardCount; + stats->overflowCount += deltaZone->overflowCount; + stats->numLists += deltaZone->numLists; + } +} + +/**********************************************************************/ +unsigned int getDeltaIndexPageCount(unsigned int numEntries, + unsigned int numLists, + unsigned int meanDelta, + unsigned int numPayloadBits, + size_t bytesPerPage) +{ + // Compute the number of bits needed for all the entries + size_t bitsPerIndex + = getDeltaMemorySize(numEntries, meanDelta, numPayloadBits); + // Compute the number of bits needed for a single delta list + unsigned int bitsPerDeltaList = bitsPerIndex / numLists; + // Adjust the bits per index, adding the immutable delta list headers + bitsPerIndex += numLists * IMMUTABLE_HEADER_SIZE; + // Compute the number of usable bits on an immutable index page + unsigned int bitsPerPage + = (bytesPerPage - sizeof(DeltaPageHeader)) * CHAR_BIT; + // Adjust the bits per page, taking away one immutable delta list header + // and one delta list representing internal fragmentation + bitsPerPage -= IMMUTABLE_HEADER_SIZE + bitsPerDeltaList; + // Now compute the number of pages needed + return (bitsPerIndex + 
bitsPerPage - 1) / bitsPerPage; +} + +/**********************************************************************/ +void logDeltaIndexEntry(DeltaIndexEntry *deltaEntry) +{ + logRatelimit(logInfo, "List 0x%X Key 0x%X Offset 0x%X%s%s ListSize 0x%X%s", + deltaEntry->listNumber, deltaEntry->key, deltaEntry->offset, + deltaEntry->atEnd ? " end" : "", + deltaEntry->isCollision ? " collision" : "", + getDeltaListSize(deltaEntry->deltaList), + deltaEntry->listOverflow ? " overflow" : ""); + deltaEntry->listOverflow = false; +} diff --git a/source/uds/deltaIndex.h b/source/uds/deltaIndex.h new file mode 100644 index 0000000..af2d762 --- /dev/null +++ b/source/uds/deltaIndex.h @@ -0,0 +1,595 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/deltaIndex.h#4 $ + */ + +#ifndef DELTAINDEX_H +#define DELTAINDEX_H 1 + +#include "compiler.h" +#include "deltaMemory.h" + +enum { + // the number of extra bytes and bits needed to store a collision entry + COLLISION_BYTES = UDS_CHUNK_NAME_SIZE, + COLLISION_BITS = COLLISION_BYTES * CHAR_BIT +}; + +typedef struct deltaIndex { + DeltaMemory *deltaZones; // The zones + unsigned int numZones; // The number of zones + unsigned int numLists; // The number of delta lists + unsigned int listsPerZone; // Lists per zone (last zone can be smaller) + bool isMutable; // True if this index is mutable + byte tag; // Tag belonging to this delta index +} DeltaIndex; + +/* + * A DeltaIndexPage describes a single page of a chapter index. The deltaIndex + * field allows the page to be treated as an immutable DeltaIndex. We use the + * deltaMemory field to treat the chapter index page as a single zone index, + * and without the need to do an additional memory allocation. + */ + +typedef struct deltaIndexPage { + DeltaIndex deltaIndex; + // These values are loaded from the DeltaPageHeader + unsigned int lowestListNumber; + unsigned int highestListNumber; + uint64_t virtualChapterNumber; + // This structure describes the single zone of a delta index page. + DeltaMemory deltaMemory; +} DeltaIndexPage; + +/* + * Notes on the DeltaIndexEntries: + * + * The fields documented as "public" can be read by any code that uses a + * DeltaIndex. The fields documented as "private" carry information + * between DeltaIndex method calls and should not be used outside the + * DeltaIndex module. + * + * (1) The DeltaIndexEntry is used like an iterator when searching a delta + * list. + * + * (2) And it is also the result of a successful search and can be used to + * refer to the element found by the search. + * + * (3) And it is also the result of an unsuccessful search and can be used + * to refer to the insertion point for a new record. 
+ * + * (4) If atEnd==true, the DeltaListEntry can only be used as the insertion + * point for a new record at the end of the list. + * + * (5) If atEnd==false and isCollision==true, the DeltaListEntry fields + * refer to a collision entry in the list, and the DeltaListEntry can + * be used a a reference to this entry. + * + * (6) If atEnd==false and isCollision==false, the DeltaListEntry fields + * refer to a non-collision entry in the list. Such DeltaListEntries + * can be used as a reference to a found entry, or an insertion point + * for a non-collision entry before this entry, or an insertion point + * for a collision entry that collides with this entry. + */ + +typedef struct deltaIndexEntry { + // Public fields + unsigned int key; // The key for this entry + bool atEnd; // We are after the last entry in the list + bool isCollision; // This record is a collision + // Private fields (but DeltaIndex_t1 cheats and looks at them) + bool listOverflow; // This delta list overflowed + unsigned short valueBits; // The number of bits used for the value + unsigned short entryBits; // The number of bits used for the entire entry + DeltaMemory *deltaZone; // The delta index zone + DeltaList *deltaList; // The delta list containing the entry, + unsigned int listNumber; // The delta list number + uint32_t offset; // Bit offset of this entry within the list + unsigned int delta; // The delta between this and previous entry + DeltaList tempDeltaList; // Temporary delta list for immutable indices +} DeltaIndexEntry; + +typedef struct { + size_t memoryAllocated; // Number of bytes allocated + RelTime rebalanceTime; // The time spent rebalancing + int rebalanceCount; // Number of memory rebalances + long recordCount; // The number of records in the index + long collisionCount; // The number of collision records + long discardCount; // The number of records removed + long overflowCount; // The number of UDS_OVERFLOWs detected + unsigned int numLists; // The number of delta lists +} DeltaIndexStats; + +/** + * Initialize a delta index. + * + * @param deltaIndex The delta index to initialize + * @param numZones The number of zones in the index + * @param numLists The number of delta lists in the index + * @param meanDelta The mean delta value + * @param numPayloadBits The number of bits in the payload or value + * @param memorySize The number of bytes in memory for the index + * + * @return error code or UDS_SUCCESS + **/ +int initializeDeltaIndex(DeltaIndex *deltaIndex, unsigned int numZones, + unsigned int numLists, unsigned int meanDelta, + unsigned int numPayloadBits, size_t memorySize) + __attribute__((warn_unused_result)); + +/** + * Initialize an immutable delta index page. + * + * @param deltaIndexPage The delta index page to initialize + * @param expectedNonce If non-zero, the expected nonce. + * @param meanDelta The mean delta value + * @param numPayloadBits The number of bits in the payload or value + * @param memory The memory page + * @param memSize The size of the memory page + * + * @return error code or UDS_SUCCESS + **/ +int initializeDeltaIndexPage(DeltaIndexPage *deltaIndexPage, + uint64_t expectedNonce, + unsigned int meanDelta, + unsigned int numPayloadBits, + byte *memory, + size_t memSize) + __attribute__((warn_unused_result)); + +/** + * Uninitialize a delta index. + * + * @param deltaIndex The delta index to uninitialize + **/ +void uninitializeDeltaIndex(DeltaIndex *deltaIndex); + +/** + * Empty the delta index. + * + * @param deltaIndex The delta index being emptied. 
+ **/ +void emptyDeltaIndex(const DeltaIndex *deltaIndex); + +/** + * Empty a zone of the delta index. + * + * @param deltaIndex The delta index + * @param zoneNumber The zone being emptied + **/ +void emptyDeltaIndexZone(const DeltaIndex *deltaIndex, + unsigned int zoneNumber); + +/** + * Pack delta lists from a mutable delta index into an immutable delta index + * page. A range of delta lists (starting with a specified list index) is + * copied from the mutable delta index into a memory page used in the immutable + * index. The number of lists copied onto the page is returned to the caller. + * + * @param deltaIndex The delta index being converted + * @param headerNonce The header nonce to store + * @param headerNativeEndian If true, write native endian header + * @param memory The memory page to use + * @param memSize The size of the memory page + * @param virtualChapterNumber The virtual chapter number + * @param firstList The first delta list number to be copied + * @param numLists The number of delta lists that were copied + * + * @return error code or UDS_SUCCESS. On UDS_SUCCESS, the numLists + * argument contains the number of lists copied. + **/ +int packDeltaIndexPage(const DeltaIndex *deltaIndex, + uint64_t headerNonce, + bool headerNativeEndian, + byte *memory, + size_t memSize, + uint64_t virtualChapterNumber, + unsigned int firstList, + unsigned int *numLists) + __attribute__((warn_unused_result)); + + +/** + * Set the tag value used when saving and/or restoring a delta index. + * + * @param deltaIndex The delta index + * @param tag The tag value + **/ +void setDeltaIndexTag(DeltaIndex *deltaIndex, byte tag); + +/** + * Start restoring a delta index from an input stream. + * + * @param deltaIndex The delta index to read into + * @param bufferedReaders The buffered readers to read the delta index from + * @param numReaders The number of buffered readers + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int startRestoringDeltaIndex(const DeltaIndex *deltaIndex, + BufferedReader **bufferedReaders, int numReaders) + __attribute__((warn_unused_result)); + +/** + * Have all the data been read while restoring a delta index from an + * input stream? + * + * @param deltaIndex The delta index + * + * @return true if all the data are read + **/ +bool isRestoringDeltaIndexDone(const DeltaIndex *deltaIndex); + +/** + * Restore a saved delta list + * + * @param deltaIndex The delta index + * @param dlsi The DeltaListSaveInfo describing the delta list + * @param data The saved delta list bit stream + * + * @return error code or UDS_SUCCESS + **/ +int restoreDeltaListToDeltaIndex(const DeltaIndex *deltaIndex, + const DeltaListSaveInfo *dlsi, + const byte data[DELTA_LIST_MAX_BYTE_COUNT]) + __attribute__((warn_unused_result)); + +/** + * Abort restoring a delta index from an input stream. + * + * @param deltaIndex The delta index + **/ +void abortRestoringDeltaIndex(const DeltaIndex *deltaIndex); + +/** + * Start saving a delta index zone to a buffered output stream. + * + * @param deltaIndex The delta index + * @param zoneNumber The zone number + * @param bufferedWriter The index state component being written + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int startSavingDeltaIndex(const DeltaIndex *deltaIndex, + unsigned int zoneNumber, + BufferedWriter *bufferedWriter) + __attribute__((warn_unused_result)); + +/** + * Have all the data been written while saving a delta index zone to an + * output stream? 
If the answer is yes, it is still necessary to call + * finishSavingDeltaIndex(), which will return quickly. + * + * @param deltaIndex The delta index + * @param zoneNumber The zone number + * + * @return true if all the data are written + **/ +bool isSavingDeltaIndexDone(const DeltaIndex *deltaIndex, + unsigned int zoneNumber); + +/** + * Finish saving a delta index zone to an output stream. Force the writing + * of all of the remaining data. If an error occurred asynchronously + * during the save operation, it will be returned here. + * + * @param deltaIndex The delta index + * @param zoneNumber The zone number + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int finishSavingDeltaIndex(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) + __attribute__((warn_unused_result)); + +/** + * Abort saving a delta index zone to an output stream. If an error + * occurred asynchronously during the save operation, it will be dropped. + * + * @param deltaIndex The delta index + * @param zoneNumber The zone number + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int abortSavingDeltaIndex(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) + __attribute__((warn_unused_result)); + +/** + * Compute the number of bytes required to save a delta index + * + * @param numLists The number of delta lists in the index + * @param memorySize The number of bytes in memory for the index + * + * @return numBytes The number of bytes required to save the master index + **/ +size_t computeDeltaIndexSaveBytes(unsigned int numLists, size_t memorySize) + __attribute__((warn_unused_result)); + +/** + * Validate the delta index + * + * @param deltaIndex The delta index + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int validateDeltaIndex(const DeltaIndex *deltaIndex) + __attribute__((warn_unused_result)); + +/** + * Prepare to search for an entry in the specified delta list. + * + *
This is always the first routine to be called when dealing with delta + * index entries. It is always followed by calls to nextDeltaIndexEntry to + * iterate through a delta list. The fields of the DeltaIndexEntry argument + * will be set up for iteration, but will not contain an entry from the list. + * + * @param deltaIndex The delta index to search + * @param listNumber The delta list number + * @param key First delta list key that the caller is interested in + * @param readOnly True if this is a read-only operation + * @param iterator The index entry being used to search through the list + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int startDeltaIndexSearch(const DeltaIndex *deltaIndex, + unsigned int listNumber, unsigned int key, + bool readOnly, DeltaIndexEntry *iterator) + __attribute__((warn_unused_result)); + +/** + * Find the next entry in the specified delta list + * + * @param deltaEntry Info about an entry, which is updated to describe the + * following entry + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int nextDeltaIndexEntry(DeltaIndexEntry *deltaEntry) + __attribute__((warn_unused_result)); + +/** + * Remember the position of a delta index entry, so that we can use it when + * starting the next search. + * + * @param deltaEntry Info about an entry found during a search. This should + * be the first entry that matches the key exactly (i.e. + * not a collision entry), or the first entry with a key + * greater than the entry sought for. + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int rememberDeltaIndexOffset(const DeltaIndexEntry *deltaEntry) + __attribute__((warn_unused_result)); + +/** + * Find the delta index entry, or the insertion point for a delta index + * entry. + * + * @param deltaIndex The delta index to search + * @param listNumber The delta list number + * @param key The key field being looked for + * @param name The 256 bit full name + * @param readOnly True if this is a read-only index search + * @param deltaEntry Updated to describe the entry being looked for + * + * @return UDS_SUCCESS or an error code + **/ +int getDeltaIndexEntry(const DeltaIndex *deltaIndex, unsigned int listNumber, + unsigned int key, const byte *name, bool readOnly, + DeltaIndexEntry *deltaEntry) + __attribute__((warn_unused_result)); + +/** + * Get the full name from a collision DeltaIndexEntry + * + * @param deltaEntry The delta index record + * @param name The 256 bit full name + * + * @return UDS_SUCCESS or an error code + **/ +int getDeltaEntryCollision(const DeltaIndexEntry *deltaEntry, byte *name) + __attribute__((warn_unused_result)); + +/** + * Get the bit offset into delta memory of a delta index entry. + * + * @param deltaEntry The delta index entry + * + * @return the bit offset into delta memory + **/ +static INLINE uint64_t getDeltaEntryOffset(const DeltaIndexEntry *deltaEntry) +{ + return getDeltaListStart(deltaEntry->deltaList) + deltaEntry->offset; +} + +/** + * Get the number of bits used to encode the entry key (the delta). + * + * @param entry The delta index record + * + * @return the number of bits used to encode the key + **/ +static INLINE unsigned int getDeltaEntryKeyBits(const DeltaIndexEntry *entry) +{ + /* + * Derive keyBits by subtracting the sizes of the other two fields from the + * total. We don't actually use this for encoding/decoding, so it doesn't + * need to be super-fast. We save time where it matters by not storing it. 
+ */ + return (entry->entryBits - entry->valueBits + - (entry->isCollision ? COLLISION_BITS : 0)); +} + +/** + * Get the value field of the DeltaIndexEntry + * + * @param deltaEntry The delta index record + * + * @return the value + **/ +static INLINE unsigned int getDeltaEntryValue(const DeltaIndexEntry *deltaEntry) +{ + return getField(deltaEntry->deltaZone->memory, + getDeltaEntryOffset(deltaEntry), deltaEntry->valueBits); +} + +/** + * Set the value field of the DeltaIndexEntry + * + * @param deltaEntry The delta index record + * @param value The new value + * + * @return UDS_SUCCESS or an error code + **/ +int setDeltaEntryValue(const DeltaIndexEntry *deltaEntry, unsigned int value) + __attribute__((warn_unused_result)); + +/** + * Create a new entry in the delta index + * + * @param deltaEntry The delta index entry that indicates the insertion point + * for the new record. For a collision entry, this is the + * non-collision entry that the new entry collides with. + * For a non-collision entry, this new entry is inserted + * before the specified entry. + * @param key The key field + * @param value The value field + * @param name For collision entries, the 256 bit full name; + * Otherwise null + * + * @return UDS_SUCCESS or an error code + **/ +int putDeltaIndexEntry(DeltaIndexEntry *deltaEntry, unsigned int key, + unsigned int value, const byte *name) + __attribute__((warn_unused_result)); + +/** + * Remove an existing delta index entry, and advance to the next entry in + * the delta list. + * + * @param deltaEntry On call the delta index record to remove. After + * returning, the following entry in the delta list. + * + * @return UDS_SUCCESS or an error code + **/ +int removeDeltaIndexEntry(DeltaIndexEntry *deltaEntry) + __attribute__((warn_unused_result)); + +/** + * Map a delta list number to a delta zone number + * + * @param deltaIndex The delta index + * @param listNumber The delta list number + * + * @return the zone number containing the delta list + **/ +static INLINE unsigned int getDeltaIndexZone(const DeltaIndex *deltaIndex, + unsigned int listNumber) +{ + return listNumber / deltaIndex->listsPerZone; +} + +/** + * Get the first delta list number in a zone + * + * @param deltaIndex The delta index + * @param zoneNumber The zone number + * + * @return the first delta list index in the zone + **/ +unsigned int getDeltaIndexZoneFirstList(const DeltaIndex *deltaIndex, + unsigned int zoneNumber); + +/** + * Get the number of delta lists in a zone + * + * @param deltaIndex The delta index + * @param zoneNumber The zone number + * + * @return the number of delta lists in the zone + **/ +unsigned int getDeltaIndexZoneNumLists(const DeltaIndex *deltaIndex, + unsigned int zoneNumber); + +/** + * Get the number of bytes used for master index entries in a zone + * + * @param deltaIndex The delta index + * @param zoneNumber The zone number + * + * @return The number of bits in use + **/ +uint64_t getDeltaIndexZoneDlistBitsUsed(const DeltaIndex *deltaIndex, + unsigned int zoneNumber) + __attribute__((warn_unused_result)); + +/** + * Get the number of bytes used for master index entries. + * + * @param deltaIndex The delta index + * + * @return The number of bits in use + **/ +uint64_t getDeltaIndexDlistBitsUsed(const DeltaIndex *deltaIndex) + __attribute__((warn_unused_result)); + +/** + * Get the number of bytes allocated for master index entries. 
+ * + * @param deltaIndex The delta index + * + * @return The number of bits allocated + **/ +uint64_t getDeltaIndexDlistBitsAllocated(const DeltaIndex *deltaIndex) + __attribute__((warn_unused_result)); + +/** + * Get the delta index statistics. + * + * @param deltaIndex The delta index + * @param stats The statistics + **/ +void getDeltaIndexStats(const DeltaIndex *deltaIndex, DeltaIndexStats *stats); + +/** + * Get the number of pages needed for an immutable delta index. + * + * @param numEntries The number of entries in the index + * @param numLists The number of delta lists + * @param meanDelta The mean delta value + * @param numPayloadBits The number of bits in the payload or value + * @param bytesPerPage The number of bytes in a page + * + * @return the number of pages needed for the index + **/ +unsigned int getDeltaIndexPageCount(unsigned int numEntries, + unsigned int numLists, + unsigned int meanDelta, + unsigned int numPayloadBits, + size_t bytesPerPage); + +/** + * Log a delta index entry, and any error conditions related to the entry. + * + * @param deltaEntry The delta index entry. + **/ +void logDeltaIndexEntry(DeltaIndexEntry *deltaEntry); + +#endif /* DELTAINDEX_H */ diff --git a/source/uds/deltaMemory.c b/source/uds/deltaMemory.c new file mode 100644 index 0000000..2b30714 --- /dev/null +++ b/source/uds/deltaMemory.c @@ -0,0 +1,720 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/deltaMemory.c#3 $ + */ +#include "deltaMemory.h" + +#include "bits.h" +#include "buffer.h" +#include "compiler.h" +#include "errors.h" +#include "hashUtils.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "timeUtils.h" +#include "typeDefs.h" +#include "uds.h" + +/* + * The DeltaMemory structure manages the memory that stores delta lists. + * + * The "mutable" form of DeltaMemory is used for the master index and for + * an open chapter index. The "immutable" form of DeltaMemory is used for + * regular chapter indices. 
+ */ + +// This is the number of guard bits that are needed in the tail guard list +enum { GUARD_BITS = POST_FIELD_GUARD_BYTES * CHAR_BIT }; + +/** + * Get the offset of the first byte that a delta list bit stream resides in + * + * @param deltaList The delta list + * + * @return the number byte offset + **/ +static INLINE uint64_t getDeltaListByteStart(const DeltaList *deltaList) +{ + return getDeltaListStart(deltaList) / CHAR_BIT; +} + +/** + * Get the actual number of bytes that a delta list bit stream resides in + * + * @param deltaList The delta list + * + * @return the number of bytes + **/ +static INLINE uint16_t getDeltaListByteSize(const DeltaList *deltaList) +{ + uint16_t startBitOffset = getDeltaListStart(deltaList) % CHAR_BIT; + uint16_t bitSize = getDeltaListSize(deltaList); + return ((unsigned int) startBitOffset + bitSize + CHAR_BIT - 1) / CHAR_BIT; +} + +/** + * Get the number of bytes in the delta lists headers. + * + * @param numLists The number of delta lists + * + * @return the number of bytes in the delta lists headers + **/ +static INLINE size_t getSizeOfDeltaLists(unsigned int numLists) +{ + return (numLists + 2) * sizeof(DeltaList); +} + +/** + * Get the size of the flags array (in bytes) + * + * @param numLists The number of delta lists + * + * @return the number of bytes for an array that has one bit per delta + * list, plus the necessary guard bytes. + **/ +static INLINE size_t getSizeOfFlags(unsigned int numLists) +{ + return (numLists + CHAR_BIT - 1) / CHAR_BIT + POST_FIELD_GUARD_BYTES; +} + +/** + * Get the number of bytes of scratch memory for the delta lists. + * + * @param numLists The number of delta lists + * + * @return the number of bytes of scratch memory for the delta lists + **/ +static INLINE size_t getSizeOfTempOffsets(unsigned int numLists) +{ + return (numLists + 2) * sizeof(uint64_t); +} + +/**********************************************************************/ + +/** + * Clear the transfers flags. + * + * @param deltaMemory The delta memory + **/ +static void clearTransferFlags(DeltaMemory *deltaMemory) +{ + memset(deltaMemory->flags, 0, getSizeOfFlags(deltaMemory->numLists)); + deltaMemory->numTransfers = 0; + deltaMemory->transferStatus = UDS_SUCCESS; +} + +/**********************************************************************/ + +/** + * Set the transfer flags for delta lists that are not empty, and count how + * many there are. + * + * @param deltaMemory The delta memory + **/ +static void flagNonEmptyDeltaLists(DeltaMemory *deltaMemory) +{ + clearTransferFlags(deltaMemory); + unsigned int i; + for (i = 0; i < deltaMemory->numLists; i++) { + if (getDeltaListSize(&deltaMemory->deltaLists[i + 1]) > 0) { + setOne(deltaMemory->flags, i, 1); + deltaMemory->numTransfers++; + } + } +} + +/**********************************************************************/ +void emptyDeltaLists(DeltaMemory *deltaMemory) +{ + // Zero all the delta list headers + DeltaList *deltaLists = deltaMemory->deltaLists; + memset(deltaLists, 0, getSizeOfDeltaLists(deltaMemory->numLists)); + + /* + * Initialize delta lists to be empty. We keep 2 extra delta list + * descriptors, one before the first real entry and one after so that we + * don't need to bounds check the array access when calculating + * preceeding and following gap sizes. + * + * Because the delta list headers were zeroed, the head guard list is + * already at offset zero and size zero. 
+ * + * The end guard list contains guard bytes so that the bit field + * utilities can safely read past the end of any byte we are interested + * in. + */ + uint64_t numBits = (uint64_t) deltaMemory->size * CHAR_BIT; + deltaLists[deltaMemory->numLists + 1].startOffset = numBits - GUARD_BITS; + deltaLists[deltaMemory->numLists + 1].size = GUARD_BITS; + + // Set all the bits in the end guard list. Do not use the bit field + // utilities. + memset(deltaMemory->memory + deltaMemory->size - POST_FIELD_GUARD_BYTES, + ~0, POST_FIELD_GUARD_BYTES); + + // Evenly space out the real delta lists. The sizes are already zero, so + // we just need to set the starting offsets. + uint64_t spacing = (numBits - GUARD_BITS) / deltaMemory->numLists; + uint64_t offset = spacing / 2; + unsigned int i; + for (i = 1; i <= deltaMemory->numLists; i++) { + deltaLists[i].startOffset = offset; + offset += spacing; + } + + // Update the statistics + deltaMemory->discardCount += deltaMemory->recordCount; + deltaMemory->recordCount = 0; + deltaMemory->collisionCount = 0; +} + +/**********************************************************************/ +/** + * Compute the Huffman coding parameters for the given mean delta + * + * @param meanDelta The mean delta value + * @param minBits The number of bits in the minimal key code + * @param minKeys The number of keys used in a minimal code + * @param incrKeys The number of keys used for another code bit + **/ +static void computeCodingConstants(unsigned int meanDelta, + unsigned short *minBits, + unsigned int *minKeys, + unsigned int *incrKeys) +{ + // We want to compute the rounded value of log(2) * meanDelta. Since we + // cannot always use floating point, use a really good integer approximation. + *incrKeys = (836158UL * meanDelta + 603160UL) / 1206321UL; + *minBits = computeBits(*incrKeys + 1); + *minKeys = (1 << *minBits) - *incrKeys; +} + +/**********************************************************************/ +/** + * Rebalance a range of delta lists within memory. + * + * @param deltaMemory A delta memory structure + * @param first The first delta list index + * @param last The last delta list index + **/ +static void rebalanceDeltaMemory(const DeltaMemory *deltaMemory, + unsigned int first, unsigned int last) +{ + if (first == last) { + DeltaList *deltaList = &deltaMemory->deltaLists[first]; + uint64_t newStart = deltaMemory->tempOffsets[first]; + // We need to move only one list, and we know it is safe to do so + if (getDeltaListStart(deltaList) != newStart) { + // Compute the first source byte + uint64_t source = getDeltaListByteStart(deltaList); + // Update the delta list location + deltaList->startOffset = newStart; + // Now use the same computation to locate the first destination byte + uint64_t destination = getDeltaListByteStart(deltaList); + memmove(deltaMemory->memory + destination, deltaMemory->memory + source, + getDeltaListByteSize(deltaList)); + } + } else { + // There is more than one list. Divide the problem in half, and use + // recursive calls to process each half. Note that after this + // computation, first <= middle, and middle < last. + unsigned int middle = (first + last) / 2; + const DeltaList *deltaList = &deltaMemory->deltaLists[middle]; + uint64_t newStart = deltaMemory->tempOffsets[middle]; + // The direction that our middle list is moving determines which half + // of the problem must be processed first. 
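+    // (Handling the half that lies in the direction of travel first means
+    // each memmove's destination has already been vacated, so no list is
+    // overwritten before it has itself been moved.)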
+ if (newStart > getDeltaListStart(deltaList)) { + rebalanceDeltaMemory(deltaMemory, middle + 1, last); + rebalanceDeltaMemory(deltaMemory, first, middle); + } else { + rebalanceDeltaMemory(deltaMemory, first, middle); + rebalanceDeltaMemory(deltaMemory, middle + 1, last); + } + } +} + +/**********************************************************************/ +int initializeDeltaMemory(DeltaMemory *deltaMemory, size_t size, + unsigned int firstList, unsigned int numLists, + unsigned int meanDelta, unsigned int numPayloadBits) +{ + if (numLists == 0) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot initialize delta memory with 0 " + "delta lists"); + } + byte *memory = NULL; + int result = ALLOCATE(size, byte, "delta list", &memory); + if (result != UDS_SUCCESS) { + return result; + } + uint64_t *tempOffsets = NULL; + result = ALLOCATE(numLists + 2, uint64_t, "delta list temp", + &tempOffsets); + if (result != UDS_SUCCESS) { + FREE(memory); + return result; + } + byte *flags = NULL; + result = ALLOCATE(getSizeOfFlags(numLists), byte, "delta list flags", + &flags); + if (result != UDS_SUCCESS) { + FREE(memory); + FREE(tempOffsets); + return result; + } + + computeCodingConstants(meanDelta, &deltaMemory->minBits, + &deltaMemory->minKeys, &deltaMemory->incrKeys); + deltaMemory->valueBits = numPayloadBits; + deltaMemory->memory = memory; + deltaMemory->deltaLists = NULL; + deltaMemory->tempOffsets = tempOffsets; + deltaMemory->flags = flags; + deltaMemory->bufferedWriter = NULL; + deltaMemory->size = size; + deltaMemory->rebalanceTime = 0; + deltaMemory->rebalanceCount = 0; + deltaMemory->recordCount = 0; + deltaMemory->collisionCount = 0; + deltaMemory->discardCount = 0; + deltaMemory->overflowCount = 0; + deltaMemory->firstList = firstList; + deltaMemory->numLists = numLists; + deltaMemory->numTransfers = 0; + deltaMemory->transferStatus = UDS_SUCCESS; + deltaMemory->tag = 'm'; + + // Allocate the delta lists. 
+ result = ALLOCATE(deltaMemory->numLists + 2, DeltaList, + "delta lists", &deltaMemory->deltaLists); + if (result != UDS_SUCCESS) { + uninitializeDeltaMemory(deltaMemory); + return result; + } + + emptyDeltaLists(deltaMemory); + return UDS_SUCCESS; +} + +/**********************************************************************/ +void uninitializeDeltaMemory(DeltaMemory *deltaMemory) +{ + FREE(deltaMemory->flags); + deltaMemory->flags = NULL; + FREE(deltaMemory->tempOffsets); + deltaMemory->tempOffsets = NULL; + FREE(deltaMemory->deltaLists); + deltaMemory->deltaLists = NULL; + FREE(deltaMemory->memory); + deltaMemory->memory = NULL; +} + +/**********************************************************************/ +void initializeDeltaMemoryPage(DeltaMemory *deltaMemory, byte *memory, + size_t size, unsigned int numLists, + unsigned int meanDelta, + unsigned int numPayloadBits) +{ + computeCodingConstants(meanDelta, &deltaMemory->minBits, + &deltaMemory->minKeys, &deltaMemory->incrKeys); + deltaMemory->valueBits = numPayloadBits; + deltaMemory->memory = memory; + deltaMemory->deltaLists = NULL; + deltaMemory->tempOffsets = NULL; + deltaMemory->flags = NULL; + deltaMemory->bufferedWriter = NULL; + deltaMemory->size = size; + deltaMemory->rebalanceTime = 0; + deltaMemory->rebalanceCount = 0; + deltaMemory->recordCount = 0; + deltaMemory->collisionCount = 0; + deltaMemory->discardCount = 0; + deltaMemory->overflowCount = 0; + deltaMemory->firstList = 0; + deltaMemory->numLists = numLists; + deltaMemory->numTransfers = 0; + deltaMemory->transferStatus = UDS_SUCCESS; + deltaMemory->tag = 'p'; +} + +/**********************************************************************/ +bool areDeltaMemoryTransfersDone(const DeltaMemory *deltaMemory) +{ + return deltaMemory->numTransfers == 0; +} + +/**********************************************************************/ +int startRestoringDeltaMemory(DeltaMemory *deltaMemory) +{ + // Extend and balance memory to receive the delta lists + int result = extendDeltaMemory(deltaMemory, 0, 0, false); + if (result != UDS_SUCCESS) { + return UDS_SUCCESS; + } + + // The tail guard list needs to be set to ones + DeltaList *deltaList = &deltaMemory->deltaLists[deltaMemory->numLists + 1]; + setOne(deltaMemory->memory, getDeltaListStart(deltaList), + getDeltaListSize(deltaList)); + + flagNonEmptyDeltaLists(deltaMemory); + return UDS_SUCCESS; +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int readDeltaListSaveInfo(BufferedReader *reader, + DeltaListSaveInfo *dlsi) +{ + byte buffer[sizeof(DeltaListSaveInfo)]; + int result = readFromBufferedReader(reader, buffer, sizeof(buffer)); + if (result != UDS_SUCCESS) { + return result; + } + dlsi->tag = buffer[0]; + dlsi->bitOffset = buffer[1]; + dlsi->byteCount = getUInt16LE(&buffer[2]); + dlsi->index = getUInt32LE(&buffer[4]); + return result; +} + +/**********************************************************************/ +int readSavedDeltaList(DeltaListSaveInfo *dlsi, + byte data[DELTA_LIST_MAX_BYTE_COUNT], + BufferedReader *bufferedReader) +{ + int result = readDeltaListSaveInfo(bufferedReader, dlsi); + if (result == UDS_END_OF_FILE) { + return UDS_END_OF_FILE; + } + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, "failed to read delta list data"); + } + if ((dlsi->bitOffset >= CHAR_BIT) + || (dlsi->byteCount > DELTA_LIST_MAX_BYTE_COUNT)) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "corrupt delta list data"); + } + if 
(dlsi->tag == 'z') { + return UDS_END_OF_FILE; + } + result = readFromBufferedReader(bufferedReader, data, dlsi->byteCount); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, "failed to read delta list data"); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int restoreDeltaList(DeltaMemory *deltaMemory, const DeltaListSaveInfo *dlsi, + const byte data[DELTA_LIST_MAX_BYTE_COUNT]) +{ + unsigned int listNumber = dlsi->index - deltaMemory->firstList; + if (listNumber >= deltaMemory->numLists) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "invalid delta list number %u not in" + " range [%u,%u)", + dlsi->index, deltaMemory->firstList, + deltaMemory->firstList + + deltaMemory->numLists); + } + + if (getField(deltaMemory->flags, listNumber, 1) == 0) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "unexpected delta list number %u", + dlsi->index); + } + + DeltaList *deltaList = &deltaMemory->deltaLists[listNumber + 1]; + uint16_t bitSize = getDeltaListSize(deltaList); + unsigned int byteCount + = ((unsigned int) dlsi->bitOffset + bitSize + CHAR_BIT - 1) / CHAR_BIT; + if (dlsi->byteCount != byteCount) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "unexpected delta list size %u != %u", + dlsi->byteCount, byteCount); + } + + moveBits(data, dlsi->bitOffset, deltaMemory->memory, + getDeltaListStart(deltaList), bitSize); + setZero(deltaMemory->flags, listNumber, 1); + deltaMemory->numTransfers--; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void abortRestoringDeltaMemory(DeltaMemory *deltaMemory) +{ + clearTransferFlags(deltaMemory); + emptyDeltaLists(deltaMemory); +} + +/**********************************************************************/ +void startSavingDeltaMemory(DeltaMemory *deltaMemory, + BufferedWriter *bufferedWriter) +{ + flagNonEmptyDeltaLists(deltaMemory); + deltaMemory->bufferedWriter = bufferedWriter; +} + +/**********************************************************************/ +int finishSavingDeltaMemory(DeltaMemory *deltaMemory) +{ + unsigned int i; + for (i = 0; + !areDeltaMemoryTransfersDone(deltaMemory) + && (i < deltaMemory->numLists); + i++) { + lazyFlushDeltaList(deltaMemory, i); + } + if (deltaMemory->numTransfers > 0) { + deltaMemory->transferStatus + = logWarningWithStringError(UDS_CORRUPT_DATA, + "Not all delta lists written"); + } + deltaMemory->bufferedWriter = NULL; + return deltaMemory->transferStatus; +} + +/**********************************************************************/ +void abortSavingDeltaMemory(DeltaMemory *deltaMemory) +{ + clearTransferFlags(deltaMemory); + deltaMemory->bufferedWriter = NULL; +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int writeDeltaListSaveInfo(BufferedWriter *bufferedWriter, + DeltaListSaveInfo *dlsi) +{ + byte buffer[sizeof(DeltaListSaveInfo)]; + buffer[0] = dlsi->tag; + buffer[1] = dlsi->bitOffset; + storeUInt16LE(&buffer[2], dlsi->byteCount); + storeUInt32LE(&buffer[4], dlsi->index); + return writeToBufferedWriter(bufferedWriter, buffer, sizeof(buffer)); +} + +/**********************************************************************/ +void flushDeltaList(DeltaMemory *deltaMemory, unsigned int flushIndex) +{ + ASSERT_LOG_ONLY((getField(deltaMemory->flags, flushIndex, 1) != 0), + "flush bit is set"); + setZero(deltaMemory->flags, flushIndex, 1); + deltaMemory->numTransfers--; 
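+  // The remainder of this routine writes a DeltaListSaveInfo header for the
+  // list and then the raw bytes of its bit stream; the first failure is
+  // recorded in transferStatus.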
+ + DeltaList *deltaList = &deltaMemory->deltaLists[flushIndex + 1]; + DeltaListSaveInfo dlsi; + dlsi.tag = deltaMemory->tag; + dlsi.bitOffset = getDeltaListStart(deltaList) % CHAR_BIT; + dlsi.byteCount = getDeltaListByteSize(deltaList); + dlsi.index = deltaMemory->firstList + flushIndex; + + int result = writeDeltaListSaveInfo(deltaMemory->bufferedWriter, &dlsi); + if (result != UDS_SUCCESS) { + if (deltaMemory->transferStatus == UDS_SUCCESS) { + logWarningWithStringError(result, "failed to write delta list memory"); + deltaMemory->transferStatus = result; + } + } + result = writeToBufferedWriter(deltaMemory->bufferedWriter, + deltaMemory->memory + + getDeltaListByteStart(deltaList), + dlsi.byteCount); + if (result != UDS_SUCCESS) { + if (deltaMemory->transferStatus == UDS_SUCCESS) { + logWarningWithStringError(result, "failed to write delta list memory"); + deltaMemory->transferStatus = result; + } + } +} + +/**********************************************************************/ +int writeGuardDeltaList(BufferedWriter *bufferedWriter) +{ + DeltaListSaveInfo dlsi; + dlsi.tag = 'z'; + dlsi.bitOffset = 0; + dlsi.byteCount = 0; + dlsi.index = 0; + int result = writeToBufferedWriter(bufferedWriter, (const byte *) &dlsi, + sizeof(DeltaListSaveInfo)); + if (result != UDS_SUCCESS) { + logWarningWithStringError(result, "failed to write guard delta list"); + } + return result; +} + +/**********************************************************************/ +int extendDeltaMemory(DeltaMemory *deltaMemory, unsigned int growingIndex, + size_t growingSize, bool doCopy) +{ + if (!isMutable(deltaMemory)) { + return logErrorWithStringError(UDS_BAD_STATE, + "Attempt to read into an immutable delta" + " list memory"); + } + + AbsTime startTime = currentTime(CLOCK_MONOTONIC); + + // Calculate the amount of space that is in use. Include the space that + // has a planned use. + DeltaList *deltaLists = deltaMemory->deltaLists; + size_t usedSpace = growingSize; + unsigned int i; + for (i = 0; i <= deltaMemory->numLists + 1; i++) { + usedSpace += getDeltaListByteSize(&deltaLists[i]); + } + + if (deltaMemory->size < usedSpace) { + return UDS_OVERFLOW; + } + + // Compute the new offsets of the delta lists + size_t spacing = (deltaMemory->size - usedSpace) / deltaMemory->numLists; + deltaMemory->tempOffsets[0] = 0; + for (i = 0; i <= deltaMemory->numLists; i++) { + deltaMemory->tempOffsets[i + 1] = (deltaMemory->tempOffsets[i] + + getDeltaListByteSize(&deltaLists[i]) + + spacing); + deltaMemory->tempOffsets[i] *= CHAR_BIT; + deltaMemory->tempOffsets[i] + += getDeltaListStart(&deltaLists[i]) % CHAR_BIT; + if (i == 0) { + deltaMemory->tempOffsets[i + 1] -= spacing / 2; + } + if (i + 1 == growingIndex) { + deltaMemory->tempOffsets[i + 1] += growingSize; + } + } + deltaMemory->tempOffsets[deltaMemory->numLists + 1] + = (deltaMemory->size * CHAR_BIT + - getDeltaListSize(&deltaLists[deltaMemory->numLists + 1])); + // When we rebalance the delta list, we will include the end guard list + // in the rebalancing. It contains the end guard data, which must be + // copied. 
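+  // (When doCopy is false, nothing is moved here; only the new start
+  // offsets are recorded, since the caller is about to reload every list's
+  // contents, as the restore path does.)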
+ if (doCopy) { + rebalanceDeltaMemory(deltaMemory, 1, deltaMemory->numLists + 1); + AbsTime endTime = currentTime(CLOCK_MONOTONIC); + deltaMemory->rebalanceCount++; + deltaMemory->rebalanceTime += timeDifference(endTime, startTime); + } else { + for (i = 1; i <= deltaMemory->numLists + 1; i++) { + deltaLists[i].startOffset = deltaMemory->tempOffsets[i]; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int validateDeltaLists(const DeltaMemory *deltaMemory) +{ + // Validate the delta index fields set by restoring a delta index + if (deltaMemory->collisionCount > deltaMemory->recordCount) { + return logWarningWithStringError(UDS_BAD_STATE, + "delta index contains more collisions" + " (%ld) than records (%ld)", + deltaMemory->collisionCount, + deltaMemory->recordCount); + } + + // Validate the delta lists + DeltaList *deltaLists = deltaMemory->deltaLists; + if (getDeltaListStart(&deltaLists[0]) != 0) { + return logWarningWithStringError(UDS_BAD_STATE, + "the head guard delta list does not start" + " at 0: %llu", + getDeltaListStart(&deltaLists[0])); + } + uint64_t numBits = getDeltaListEnd(&deltaLists[deltaMemory->numLists + 1]); + if (numBits != deltaMemory->size * CHAR_BIT) { + return logWarningWithStringError(UDS_BAD_STATE, + "the tail guard delta list does not end " + "at end of allocated memory: %" PRIu64 + " != %zd", + numBits, deltaMemory->size * CHAR_BIT); + } + int numGuardBits = getDeltaListSize(&deltaLists[deltaMemory->numLists + 1]); + if (numGuardBits < GUARD_BITS) { + return logWarningWithStringError(UDS_BAD_STATE, + "the tail guard delta list does not " + "contain sufficient guard bits: %d < %d", + numGuardBits, GUARD_BITS); + } + unsigned int i; + for (i = 0; i <= deltaMemory->numLists + 1; i++) { + if (getDeltaListStart(&deltaLists[i]) > getDeltaListEnd(&deltaLists[i])) { + return logWarningWithStringError(UDS_BAD_STATE, + "invalid delta list %u: [%" PRIu64 + ", %llu)", + i, + getDeltaListStart(&deltaLists[i]), + getDeltaListEnd(&deltaLists[i])); + } + if (i > deltaMemory->numLists) { + // The rest of the checks do not apply to the tail guard list + continue; + } + if (getDeltaListEnd(&deltaLists[i]) + > getDeltaListStart(&deltaLists[i + 1])) { + return logWarningWithStringError(UDS_BAD_STATE, + "delta lists %u and %u overlap: %" + PRIu64 " > %llu", + i, i + 1, + getDeltaListEnd(&deltaLists[i]), + getDeltaListStart(&deltaLists[i + 1])); + } + if (i == 0) { + // The rest of the checks do not apply to the head guard list + continue; + } + if (deltaLists[i].saveOffset > getDeltaListSize(&deltaLists[i])) { + return logWarningWithStringError(UDS_BAD_STATE, + "delta lists %u saved offset is larger" + " than the list: %u > %u", + i, deltaLists[i].saveOffset, + getDeltaListSize(&deltaLists[i])); + } + } + + return UDS_SUCCESS; +} + +/**********************************************************************/ +size_t getDeltaMemoryAllocated(const DeltaMemory *deltaMemory) +{ + return (deltaMemory->size + + getSizeOfDeltaLists(deltaMemory->numLists) + + getSizeOfFlags(deltaMemory->numLists) + + getSizeOfTempOffsets(deltaMemory->numLists)); +} + +/**********************************************************************/ +size_t getDeltaMemorySize(unsigned long numEntries, unsigned int meanDelta, + unsigned int numPayloadBits) +{ + unsigned short minBits; + unsigned int incrKeys, minKeys; + computeCodingConstants(meanDelta, &minBits, &minKeys, &incrKeys); + // On average, each delta is encoded into about minBits+1.5 bits. 
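+  // (For example, a mean delta of 4096 makes incrKeys roughly 2839 and
+  // minBits 12, so a 21-bit payload costs about 12 + 21 + 1.5 = 34.5 bits
+  // per entry; the exact figures depend on computeCodingConstants().)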
+ return (numEntries * (numPayloadBits + minBits + 1) + numEntries / 2); +} diff --git a/source/uds/deltaMemory.h b/source/uds/deltaMemory.h new file mode 100644 index 0000000..1ffb3fd --- /dev/null +++ b/source/uds/deltaMemory.h @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/deltaMemory.h#1 $ + */ + +#ifndef DELTAMEMORY_H +#define DELTAMEMORY_H 1 + +#include "bits.h" +#include "bufferedReader.h" +#include "bufferedWriter.h" +#include "compiler.h" +#include "cpu.h" +#include "timeUtils.h" + +/* + * We encode the delta list information into 16 bytes per list. + * + * Because the master index has 1 million delta lists, each byte of header + * information ends up costing us 1MB. We have an incentive to keep the + * size down. + * + * The master index delta list memory is currently about 780MB in size, + * which is more than 6 gigabits. Therefore we need at least 33 bits to + * address the master index memory and we use the uint64_t type. + * + * The master index delta lists have 256 entries of about 24 bits each, + * which is 6K bits. The index needs 13 bits to represent the size of a + * delta list and we use the uint16_t type. + */ + +typedef struct deltaList { + uint64_t startOffset; // The offset of the delta list start within memory + uint16_t size; // The number of bits in the delta list + uint16_t saveOffset; // Where the last search "found" the key + unsigned int saveKey; // The key for the record just before saveOffset. 
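+  // (8 + 2 + 2 + 4 bytes; on the usual 64-bit targets this is the
+  // 16 bytes per list mentioned above.)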
+} DeltaList; + +typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) deltaMemory { + byte *memory; // The delta list memory + DeltaList *deltaLists; // The delta list headers + uint64_t *tempOffsets; // Temporary starts of delta lists + byte *flags; // Transfer flags + BufferedWriter *bufferedWriter; // Buffered writer for saving an index + size_t size; // The size of delta list memory + RelTime rebalanceTime; // The time spent rebalancing + int rebalanceCount; // Number of memory rebalances + unsigned short valueBits; // The number of bits of value + unsigned short minBits; // The number of bits in the minimal key code + unsigned int minKeys; // The number of keys used in a minimal code + unsigned int incrKeys; // The number of keys used for another code bit + long recordCount; // The number of records in the index + long collisionCount; // The number of collision records + long discardCount; // The number of records removed + long overflowCount; // The number of UDS_OVERFLOWs detected + unsigned int firstList; // The index of the first delta list + unsigned int numLists; // The number of delta lists + unsigned int numTransfers; // Number of transfer flags that are set + int transferStatus; // Status of the transfers in progress + byte tag; // Tag belonging to this delta index +} DeltaMemory; + +typedef struct deltaListSaveInfo { + uint8_t tag; // Tag identifying which delta index this list is in + uint8_t bitOffset; // Bit offset of the start of the list data + uint16_t byteCount; // Number of bytes of list data + uint32_t index; // The delta list number within the delta index +} DeltaListSaveInfo; + +// The maximum size of a single delta list (in bytes). We add guard bytes +// to this because such a buffer can be used with moveBits. +enum { DELTA_LIST_MAX_BYTE_COUNT = ((UINT16_MAX + CHAR_BIT) / CHAR_BIT + + POST_FIELD_GUARD_BYTES) }; + +/** + * Initialize delta list memory. + * + * @param deltaMemory A delta memory structure + * @param size The initial size of the memory array + * @param firstList The index of the first delta list + * @param numLists The number of delta lists + * @param meanDelta The mean delta + * @param numPayloadBits The number of payload bits + * + * @return error code or UDS_SUCCESS + **/ +int initializeDeltaMemory(DeltaMemory *deltaMemory, size_t size, + unsigned int firstList, unsigned int numLists, + unsigned int meanDelta, unsigned int numPayloadBits) + __attribute__((warn_unused_result)); + +/** + * Uninitialize delta list memory. + * + * @param deltaMemory A delta memory structure + **/ +void uninitializeDeltaMemory(DeltaMemory *deltaMemory); + +/** + * Initialize delta list memory to refer to a cached page. + * + * @param deltaMemory A delta memory structure + * @param memory The memory page + * @param size The size of the memory page + * @param numLists The number of delta lists + * @param meanDelta The mean delta + * @param numPayloadBits The number of payload bits + **/ +void initializeDeltaMemoryPage(DeltaMemory *deltaMemory, byte *memory, + size_t size, unsigned int numLists, + unsigned int meanDelta, + unsigned int numPayloadBits); + +/** + * Empty the delta lists. + * + * @param deltaMemory The delta memory + **/ +void emptyDeltaLists(DeltaMemory *deltaMemory); + +/** + * Is there a delta list memory save or restore in progress? 
+ * + * @param deltaMemory A delta memory structure + * + * @return true if there are no delta lists that need to be saved or + * restored + **/ +bool areDeltaMemoryTransfersDone(const DeltaMemory *deltaMemory); + +/** + * Start restoring delta list memory from a file descriptor + * + * @param deltaMemory A delta memory structure + * + * @return error code or UDS_SUCCESS + **/ +int startRestoringDeltaMemory(DeltaMemory *deltaMemory) + __attribute__((warn_unused_result)); + +/** + * Read a saved delta list from a file descriptor + * + * @param dlsi The DeltaListSaveInfo describing the delta list + * @param data The saved delta list bit stream + * @param bufferedReader The buffered reader to read the delta list from + * + * @return error code or UDS_SUCCESS + * or UDS_END_OF_FILE at end of the data stream + **/ +int readSavedDeltaList(DeltaListSaveInfo *dlsi, + byte data[DELTA_LIST_MAX_BYTE_COUNT], + BufferedReader *bufferedReader) + __attribute__((warn_unused_result)); + +/** + * Restore a saved delta list + * + * @param deltaMemory A delta memory structure + * @param dlsi The DeltaListSaveInfo describing the delta list + * @param data The saved delta list bit stream + * + * @return error code or UDS_SUCCESS + **/ +int restoreDeltaList(DeltaMemory *deltaMemory, const DeltaListSaveInfo *dlsi, + const byte data[DELTA_LIST_MAX_BYTE_COUNT]) + __attribute__((warn_unused_result)); + +/** + * Abort restoring delta list memory from an input stream. + * + * @param deltaMemory A delta memory structure + **/ +void abortRestoringDeltaMemory(DeltaMemory *deltaMemory); + +/** + * Start saving delta list memory to a buffered output stream + * + * @param deltaMemory A delta memory structure + * @param bufferedWriter The index state component being written + **/ +void startSavingDeltaMemory(DeltaMemory *deltaMemory, + BufferedWriter *bufferedWriter); + +/** + * Finish saving delta list memory to an output stream. Force the writing + * of all of the remaining data. If an error occurred asynchronously + * during the save operation, it will be returned here. + * + * @param deltaMemory A delta memory structure + * + * @return error code or UDS_SUCCESS + **/ +int finishSavingDeltaMemory(DeltaMemory *deltaMemory) + __attribute__((warn_unused_result)); + +/** + * Abort saving delta list memory to an output stream. If an error + * occurred asynchronously during the save operation, it will be dropped. + * + * @param deltaMemory A delta memory structure + **/ +void abortSavingDeltaMemory(DeltaMemory *deltaMemory); + +/** + * Flush a delta list to an output stream + * + * @param deltaMemory A delta memory structure + * @param flushIndex Index of the delta list that may need to be flushed. + **/ +void flushDeltaList(DeltaMemory *deltaMemory, unsigned int flushIndex); + +/** + * Write a guard delta list to mark the end of the saved data + * + * @param bufferedWriter The buffered writer to write the guard delta list to + * + * @return error code or UDS_SUCCESS + **/ +int writeGuardDeltaList(BufferedWriter *bufferedWriter) + __attribute__((warn_unused_result)); + +/** + * Extend the memory used by the delta lists and rebalance the lists in the + * new chunk. + * + *

+ * The delta memory contains N delta lists, which are guarded by two
+ * empty delta lists. The valid delta lists are numbered 1 to N, and the
+ * guards are numbered 0 and (N+1).
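+ * (For example, with N == 4 the usable lists are numbered 1 through 4,
+ * while lists 0 and 5 are the two empty guards.)
+ *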

When the delta lista are bit streams, it is possible that the tail + * of list J and the head of list (J+1) are in the same byte. In this case + * oldOffsets[j]+sizes[j]==oldOffset[j]-1. We handle this correctly. + * + * @param deltaMemory A delta memory structure + * @param growingIndex Index of the delta list that needs additional space + * left before it (from 1 to N+1). + * @param growingSize Number of additional bytes needed before growingIndex + * @param doCopy True to copy the data, False to just balance the space + * + * @return UDS_SUCCESS or an error code + **/ +int extendDeltaMemory(DeltaMemory *deltaMemory, unsigned int growingIndex, + size_t growingSize, bool doCopy) + __attribute__((warn_unused_result)); + +/** + * Validate the delta list headers. + * + * @param deltaMemory A delta memory structure + * + * @return UDS_SUCCESS or an error code + **/ +int validateDeltaLists(const DeltaMemory *deltaMemory) + __attribute__((warn_unused_result)); + +/** + * Get the number of bytes allocated for delta index entries and any + * associated overhead. + * + * @param deltaMemory A delta memory structure + * + * @return The number of bytes allocated + **/ +size_t getDeltaMemoryAllocated(const DeltaMemory *deltaMemory); + +/** + * Get the expected number of bits used in a delta index + * + * @param numEntries The number of index entries + * @param meanDelta The mean delta value + * @param numPayloadBits The number of bits in the payload or value + * + * @return The expected size of a delta index in bits + **/ +size_t getDeltaMemorySize(unsigned long numEntries, unsigned int meanDelta, + unsigned int numPayloadBits) + __attribute__((warn_unused_result)); + +/** + * Get the bit offset to the start of the delta list bit stream + * + * @param deltaList The delta list header + * + * @return the start of the delta list + **/ +static INLINE uint64_t getDeltaListStart(const DeltaList *deltaList) +{ + return deltaList->startOffset; +} + +/** + * Get the number of bits in a delta list bit stream + * + * @param deltaList The delta list header + * + * @return the size of the delta list + **/ +static INLINE uint16_t getDeltaListSize(const DeltaList *deltaList) +{ + return deltaList->size; +} + +/** + * Get the bit offset to the end of the delta list bit stream + * + * @param deltaList The delta list header + * + * @return the end of the delta list + **/ +static INLINE uint64_t getDeltaListEnd(const DeltaList *deltaList) +{ + return getDeltaListStart(deltaList) + getDeltaListSize(deltaList); +} + +/** + * Identify mutable vs. immutable delta memory + * + * Mutable delta memory contains delta lists that can be modified, and is + * initialized using initializeDeltaMemory(). + * + * Immutable delta memory contains packed delta lists, cannot be modified, + * and is initialized using initializeDeltaMemoryPage(). + * + * For mutable delta memory, all of the following expressions are true. + * And for immutable delta memory, all of the following expressions are + * false. + * deltaLists != NULL + * tempOffsets != NULL + * flags != NULL + * + * @param deltaMemory A delta memory structure + * + * @return true if the delta memory is mutable + **/ +static INLINE bool isMutable(const DeltaMemory *deltaMemory) +{ + return deltaMemory->deltaLists != NULL; +} + +/** + * Lazily flush a delta list to an output stream + * + * @param deltaMemory A delta memory structure + * @param flushIndex Index of the delta list that may need to be flushed. 
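+ *
+ * (As the body below shows, flushDeltaList() is called only when the
+ * per-list transfer flag, read with getField(), is still set; otherwise
+ * the call is a no-op.)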
+ **/ +static INLINE void lazyFlushDeltaList(DeltaMemory *deltaMemory, + unsigned int flushIndex) +{ + if (getField(deltaMemory->flags, flushIndex, 1) != 0) { + flushDeltaList(deltaMemory, flushIndex); + } +} +#endif /* DELTAMEMORY_H */ diff --git a/source/uds/errors.c b/source/uds/errors.c new file mode 100644 index 0000000..5aab19e --- /dev/null +++ b/source/uds/errors.c @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/errors.c#11 $ + */ + +#include "errors.h" + +#include "common.h" +#include "permassert.h" +#include "stringUtils.h" + +#ifdef __KERNEL__ +#include +#endif + +static const struct errorInfo successful = { "UDS_SUCCESS", "Success" }; + +#ifdef __KERNEL__ +static const char *const messageTable[] = { + [EPERM] = "Operation not permitted", + [ENOENT] = "No such file or directory", + [ESRCH] = "No such process", + [EINTR] = "Interrupted system call", + [EIO] = "Input/output error", + [ENXIO] = "No such device or address", + [E2BIG] = "Argument list too long", + [ENOEXEC] = "Exec format error", + [EBADF] = "Bad file descriptor", + [ECHILD] = "No child processes", + [EAGAIN] = "Resource temporarily unavailable", + [ENOMEM] = "Cannot allocate memory", + [EACCES] = "Permission denied", + [EFAULT] = "Bad address", + [ENOTBLK] = "Block device required", + [EBUSY] = "Device or resource busy", + [EEXIST] = "File exists", + [EXDEV] = "Invalid cross-device link", + [ENODEV] = "No such device", + [ENOTDIR] = "Not a directory", + [EISDIR] = "Is a directory", + [EINVAL] = "Invalid argument", + [ENFILE] = "Too many open files in system", + [EMFILE] = "Too many open files", + [ENOTTY] = "Inappropriate ioctl for device", + [ETXTBSY] = "Text file busy", + [EFBIG] = "File too large", + [ENOSPC] = "No space left on device", + [ESPIPE] = "Illegal seek", + [EROFS] = "Read-only file system", + [EMLINK] = "Too many links", + [EPIPE] = "Broken pipe", + [EDOM] = "Numerical argument out of domain", + [ERANGE] = "Numerical result out of range" +}; +#endif + +static const struct errorInfo errorList[] = { + { "UDS_UNINITIALIZED", "UDS library is not initialized" }, + { "UDS_SHUTTINGDOWN", "UDS library is shutting down" }, + { "UDS_EMODULE_LOAD", "Could not load modules" }, + { "UDS_ENOTHREADS", "Could not create a new thread" }, + { "UDS_NOCONTEXT", "Could not find the requested library context" }, + { "UDS_DISABLED", "UDS library context is disabled" }, + { "UDS_CORRUPT_COMPONENT", "Corrupt saved component" }, + { "UDS_UNKNOWN_ERROR", "Unknown error" }, + { "UDS_UNUSED_CODE_8", "Unused error code 8" }, + { "UDS_UNUSED_CODE_9", "Unused error code 9" }, + { "UDS_UNSUPPORTED_VERSION", "Unsupported version" }, + { "UDS_NO_INDEXSESSION", "Index session not known" }, + { "UDS_CORRUPT_DATA", "Index data in memory is corrupt" }, + { 
"UDS_SHORT_READ", "Could not read requested number of bytes" }, + { "UDS_UNUSED_CODE_14", "Unused error code 14" }, + { "UDS_RESOURCE_LIMIT_EXCEEDED", "Internal resource limits exceeded" }, + { "UDS_VOLUME_OVERFLOW", "Memory overflow due to storage failure" }, + { "UDS_UNUSED_CODE_17", "Unused error code 17" }, + { "UDS_UNUSED_CODE_18", "Unused error code 18" }, + { "UDS_UNUSED_CODE_19", "Unused error code 19" }, + { "UDS_CONF_PTR_REQUIRED", "A configuration pointer is required" }, + { "UDS_INDEX_STATS_PTR_REQUIRED", "An index stats pointer is required" }, + { "UDS_CONTEXT_STATS_PTR_REQUIRED", "A context stats pointer is required" }, + { "UDS_UNUSED_CODE_23", "Unused error code 23" }, + { "UDS_UNUSED_CODE_24", "Unused error code 24" }, + { "UDS_UNUSED_CODE_25", "Unused error code 25" }, + { "UDS_UNUSED_CODE_26", "Unused error code 26" }, + { "UDS_UNUSED_CODE_27", "Unused error code 27" }, + { "UDS_INVALID_MEMORY_SIZE", + "Configured memory too small or unsupported size" }, + { "UDS_UNUSED_CODE_29", "Unused error code 29" }, + { "UDS_INDEX_NAME_REQUIRED", "An index name is required" }, + { "UDS_CONF_REQUIRED", "A configuration is required" }, + { "UDS_UNUSED_CODE_32", "Unused error code 32" }, + { "UDS_UNUSED_CODE_33", "Unused error code 33" }, + { "UDS_UNUSED_CODE_34", "Unused error code 34" }, + { "UDS_UNUSED_CODE_35", "Unused error code 35" }, + { "UDS_UNUSED_CODE_36", "Unused error code 36" }, + { "UDS_NO_INDEX", "No index found" }, + { "UDS_BAD_CHECKPOINT_FREQUENCY", "Checkpoint frequency out of range" }, + { "UDS_WRONG_INDEX_CONFIG", "Wrong type of index configuration" }, + { "UDS_UNUSED_CODE_40", "Unused error code 40" }, + { "UDS_UNUSED_CODE_41", "Unused error code 41" }, + { "UDS_UNUSED_CODE_42", "Unused error code 42" }, + { "UDS_UNUSED_CODE_43", "Unused error code 43" }, + { "UDS_END_OF_FILE", "Unexpected end of file" }, + { "UDS_INDEX_NOT_SAVED_CLEANLY", "Index not saved cleanly" }, + { "UDS_UNUSED_CODE_46", "Unused error code 46" }, + { "UDS_INSUFFICIENT_INDEX_SPACE", "Insufficient index space" }, + { "UDS_UNUSED_CODE_48", "Unused error code 48" }, + { "UDS_UNUSED_CODE_49", "Unused error code 49" }, + { "UDS_SUSPENDED", "Index suspended"}, + { "UDS_UNUSED_CODE_51", "Unused error code 51" }, + { "UDS_INDEXSESSION_IN_USE", "Index session in use"}, + { "UDS_CALLBACK_REQUIRED", "A callback function is required"}, + { "UDS_INVALID_OPERATION_TYPE", "Invalid type of request operation"}, +}; + +static const struct errorInfo internalErrorList[] = { + { "UDS_INTERNAL_UNUSED_0", "Unused internal error 0" }, + { "UDS_OVERFLOW", "Index overflow" }, + { "UDS_INTERNAL_UNUSED_2", "Unused internal error 2" }, + { "UDS_INVALID_ARGUMENT", "Invalid argument passed to internal routine" }, + { "UDS_BAD_STATE", "UDS data structures are in an invalid state" }, + { "UDS_DUPLICATE_NAME", + "Attempt to enter the same name into a delta index twice" }, + { "UDS_UNEXPECTED_RESULT", "Unexpected result from internal routine" }, + { "UDS_INJECTED_ERROR", "Injected error" }, + { "UDS_ASSERTION_FAILED", "Assertion failed" }, + { "UDS_INTERNAL_UNUSED_9", "Unused internal error 9" }, + { "UDS_QUEUED", "Request queued" }, + { "UDS_INTERNAL_UNUSED_11", "Unused internal error 11" }, + { "UDS_INTERNAL_UNUSED_12", "Unused internal error 12" }, + { "UDS_BUFFER_ERROR", "Buffer error" }, + { "UDS_INTERNAL_UNUSED_14", "Unused internal error 14" }, + { "UDS_INTERNAL_UNUSED_15", "Unused internal error 15" }, + { "UDS_NO_DIRECTORY", "Expected directory is missing" }, + { "UDS_CHECKPOINT_INCOMPLETE", "Checkpoint not completed" 
}, + { "UDS_INTERNAL_UNUSED_18", "Unused internal error 18" }, + { "UDS_INTERNAL_UNUSED_19", "Unused internal error 19" }, + { "UDS_ALREADY_REGISTERED", "Error range already registered" }, + { "UDS_BAD_IO_DIRECTION", "Bad I/O direction" }, + { "UDS_INCORRECT_ALIGNMENT", "Offset not at block alignment" }, + { "UDS_OUT_OF_RANGE", "Cannot access data outside specified limits" }, +}; + +typedef struct errorBlock { + const char *name; + int base; + int last; + int max; + const ErrorInfo *infos; +} ErrorBlock; + +enum { + MAX_ERROR_BLOCKS = 6 // needed for testing +}; + +static struct errorInformation { + int allocated; + int count; + ErrorBlock blocks[MAX_ERROR_BLOCKS]; +} registeredErrors = { + .allocated = MAX_ERROR_BLOCKS, + .count = 2, + .blocks = { + { + .name = "UDS Error", + .base = UDS_ERROR_CODE_BASE, + .last = UDS_ERROR_CODE_LAST, + .max = UDS_ERROR_CODE_BLOCK_END, + .infos = errorList, + }, + { + .name = "UDS Internal Error", + .base = UDS_INTERNAL_ERROR_CODE_BASE, + .last = UDS_INTERNAL_ERROR_CODE_LAST, + .max = UDS_INTERNAL_ERROR_CODE_BLOCK_END, + .infos = internalErrorList, + } + } +}; + +/** + * Fetch the error info (if any) for the error number. + * + * @param errnum the error number + * @param infoPtr the place to store the info for this error (if known), + * otherwise set to NULL + * + * @return the name of the error block (if known), NULL othersise + **/ +static const char *getErrorInfo(int errnum, const ErrorInfo **infoPtr) +{ + + if (errnum == UDS_SUCCESS) { + if (infoPtr != NULL) { + *infoPtr = &successful; + } + return NULL; + } + + ErrorBlock *block; + for (block = registeredErrors.blocks; + block < registeredErrors.blocks + registeredErrors.count; + ++block) { + if ((errnum >= block->base) && (errnum < block->last)) { + if (infoPtr != NULL) { + *infoPtr = block->infos + (errnum - block->base); + } + return block->name; + } else if ((errnum >= block->last) && (errnum < block->max)) { + if (infoPtr != NULL) { + *infoPtr = NULL; + } + return block->name; + } + } + if (infoPtr != NULL) { + *infoPtr = NULL; + } + return NULL; +} + +/** + * Return string describing a system error message + * + * @param errnum System error number + * @param buf Buffer that can be used to contain the return value + * @param buflen Length of the buffer + * + * @return The error string, which may be a string constant or may be + * returned in the buf argument + **/ +#ifdef __KERNEL__ +static const char *systemStringError(int errnum, char *buf, size_t buflen) +{ + const char *errorString = NULL; + if ((errnum > 0) && (errnum < COUNT_OF(messageTable))) { + errorString = messageTable[errnum]; + } + + size_t len = ((errorString == NULL) + ? 
snprintf(buf, buflen, "Unknown error %d", errnum) + : snprintf(buf, buflen, "%s", errorString)); + if (len < buflen) { + return buf; + } + + buf[0] = '\0'; + return "System error"; +} +#else +static INLINE const char *systemStringError(int errnum, char *buf, + size_t buflen) +{ + return strerror_r(errnum, buf, buflen); +} +#endif + +/*****************************************************************************/ +const char *stringError(int errnum, char *buf, size_t buflen) +{ + if (buf == NULL) { + return NULL; + } + + char *buffer = buf; + char *bufEnd = buf + buflen; + + if (isUnrecoverable(errnum)) { + buffer = appendToBuffer(buffer, bufEnd, "Unrecoverable error: "); + errnum = sansUnrecoverable(errnum); + } + + const ErrorInfo *info = NULL; + const char *blockName = getErrorInfo(errnum, &info); + + if (blockName != NULL) { + if (info != NULL) { + buffer = appendToBuffer(buffer, bufEnd, + "%s: %s", blockName, info->message); + } else { + buffer = appendToBuffer(buffer, bufEnd, + "Unknown %s %d", blockName, errnum); + } + } else if (info != NULL) { + buffer = appendToBuffer(buffer, bufEnd, "%s", info->message); + } else { + const char *tmp = systemStringError(errnum, buffer, bufEnd - buffer); + if (tmp != buffer) { + buffer = appendToBuffer(buffer, bufEnd, "%s", tmp); + } else { + buffer += strlen(tmp); + } + } + return buf; +} + +/*****************************************************************************/ +const char *stringErrorName(int errnum, char *buf, size_t buflen) +{ + errnum = sansUnrecoverable(errnum); + + char *buffer = buf; + char *bufEnd = buf + buflen; + + const ErrorInfo *info = NULL; + const char *blockName = getErrorInfo(errnum, &info); + + if (blockName != NULL) { + if (info != NULL) { + buffer = appendToBuffer(buffer, bufEnd, "%s", info->name); + } else { + buffer = appendToBuffer(buffer, bufEnd, "%s %d", blockName, errnum); + } + } else if (info != NULL) { + buffer = appendToBuffer(buffer, bufEnd, "%s", info->name); + } else { + const char *tmp = systemStringError(errnum, buffer, bufEnd - buffer); + if (tmp != buffer) { + buffer = appendToBuffer(buffer, bufEnd, "%s", tmp); + } else { + buffer += strlen(tmp); + } + } + return buf; +} + +/*****************************************************************************/ +int registerErrorBlock(const char *blockName, + int firstError, + int lastReservedError, + const ErrorInfo *infos, + size_t infoSize) +{ + int result = ASSERT(firstError < lastReservedError, + "bad error block range"); + if (result != UDS_SUCCESS) { + return result; + } + + if (registeredErrors.count == registeredErrors.allocated) { + // could reallocate and grow, but should never happen + return UDS_OVERFLOW; + } + + ErrorBlock *block; + for (block = registeredErrors.blocks; + block < registeredErrors.blocks + registeredErrors.count; + ++block) { + if (strcmp(blockName, block->name) == 0) { + return UDS_DUPLICATE_NAME; + } + // check for overlap in error ranges + if ((firstError < block->max) && (lastReservedError > block->base)) { + return UDS_ALREADY_REGISTERED; + } + } + + registeredErrors.blocks[registeredErrors.count++] = (ErrorBlock) { + .name = blockName, + .base = firstError, + .last = firstError + (infoSize / sizeof(ErrorInfo)), + .max = lastReservedError, + .infos = infos + }; + + return UDS_SUCCESS; +} diff --git a/source/uds/errors.h b/source/uds/errors.h new file mode 100644 index 0000000..faccd5a --- /dev/null +++ b/source/uds/errors.h @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/errors.h#4 $ + */ + +#ifndef ERRORS_H +#define ERRORS_H + +#include "compiler.h" +#include "typeDefs.h" +#include "uds-error.h" + +enum udsInternalErrorCodes { + /** Used as a base value for reporting internal errors */ + UDS_INTERNAL_ERROR_CODE_BASE = 66560, + /** Unused */ + UDS_INTERNAL_UNUSED_0 = UDS_INTERNAL_ERROR_CODE_BASE + 0, + /** Index overflow */ + UDS_OVERFLOW = UDS_INTERNAL_ERROR_CODE_BASE + 1, + /** Unused */ + UDS_INTERNAL_UNUSED_2 = UDS_INTERNAL_ERROR_CODE_BASE + 2, + /** Invalid argument passed to internal routine */ + UDS_INVALID_ARGUMENT = UDS_INTERNAL_ERROR_CODE_BASE + 3, + /** UDS data structures are in an invalid state */ + UDS_BAD_STATE = UDS_INTERNAL_ERROR_CODE_BASE + 4, + /** Attempt to enter the same name into an internal structure twice */ + UDS_DUPLICATE_NAME = UDS_INTERNAL_ERROR_CODE_BASE + 5, + /** An internal protocol violation between system components */ + UDS_UNEXPECTED_RESULT = UDS_INTERNAL_ERROR_CODE_BASE + 6, + /** An error created by test case processing */ + UDS_INJECTED_ERROR = UDS_INTERNAL_ERROR_CODE_BASE + 7, + /** An assertion failed */ + UDS_ASSERTION_FAILED = UDS_INTERNAL_ERROR_CODE_BASE + 8, + /** Unused */ + UDS_INTERNAL_UNUSED_9 = UDS_INTERNAL_ERROR_CODE_BASE + 9, + /** Not an actual error, but reporting that the result will be delayed */ + UDS_QUEUED = UDS_INTERNAL_ERROR_CODE_BASE + 10, + /** Unused */ + UDS_INTERNAL_UNUSED_11 = UDS_INTERNAL_ERROR_CODE_BASE + 11, + /** Unused */ + UDS_INTERNAL_UNUSED_12 = UDS_INTERNAL_ERROR_CODE_BASE + 12, + /** A problem has occured with a Buffer */ + UDS_BUFFER_ERROR = UDS_INTERNAL_ERROR_CODE_BASE + 13, + /** Unused */ + UDS_INTERNAL_UNUSED_14 = UDS_INTERNAL_ERROR_CODE_BASE + 14, + /** Unused */ + UDS_INTERNAL_UNUSED_15 = UDS_INTERNAL_ERROR_CODE_BASE + 15, + /** No directory was found where one was expected */ + UDS_NO_DIRECTORY = UDS_INTERNAL_ERROR_CODE_BASE + 16, + /** Checkpoint not completed */ + UDS_CHECKPOINT_INCOMPLETE = UDS_INTERNAL_ERROR_CODE_BASE + 17, + /** Unused */ + UDS_INTERNAL_UNUSED_18 = UDS_INTERNAL_ERROR_CODE_BASE + 18, + /** Unused */ + UDS_INTERNAL_UNUSED_19 = UDS_INTERNAL_ERROR_CODE_BASE + 19, + /** This error range has already been registered */ + UDS_ALREADY_REGISTERED = UDS_INTERNAL_ERROR_CODE_BASE + 20, + /** Either read-only or write-only */ + UDS_BAD_IO_DIRECTION = UDS_INTERNAL_ERROR_CODE_BASE + 21, + /** Cannot do I/O at this offset */ + UDS_INCORRECT_ALIGNMENT = UDS_INTERNAL_ERROR_CODE_BASE + 22, + /** Attempt to read or write data outside the bounds established for it */ + UDS_OUT_OF_RANGE = UDS_INTERNAL_ERROR_CODE_BASE + 23, + /** One more than the last UDS_INTERNAL error code */ + UDS_INTERNAL_ERROR_CODE_LAST, + /** One more than the last error this block will ever use */ + 
UDS_INTERNAL_ERROR_CODE_BLOCK_END = UDS_INTERNAL_ERROR_CODE_BASE + 440 +}; + +enum { + ERRBUF_SIZE = 128 // default size for buffer passed to stringError +}; + +// Error attributes - or into top half of error code +enum { UDS_UNRECOVERABLE = (1 << 17) }; + +const char *stringError(int errnum, char *buf, size_t buflen); +const char *stringErrorName(int errnum, char *buf, size_t buflen); + +/* + * Identify that an result code is a successful result. + * + * @param result A result code + * + * @return true if the result represents a success. + */ +__attribute__((warn_unused_result)) +static INLINE bool isSuccessful(int result) +{ + return (result == UDS_SUCCESS) || (result == UDS_QUEUED); +} + +/* + * Identify that an result code has been marked unrecoverable. + * + * @param result A result code + * + * @return true if the result has been marked unrecoverable. + */ +__attribute__((warn_unused_result)) +static INLINE bool isUnrecoverable(int result) +{ + return (result & UDS_UNRECOVERABLE) != 0; +} + +/* + * Mark a result code as unrecoverable. + * + * @param result A result code + * + * @return the result code with the unrecoverable marker added + */ +__attribute__((warn_unused_result)) +static INLINE int makeUnrecoverable(int result) +{ + return isSuccessful(result) ? result : (result | UDS_UNRECOVERABLE); +} + +/* + * Remove the unrecoverable marker from a result code. + * + * @param result A result code + * + * @return the result code with the unrecoverable marker removed + */ +__attribute__((warn_unused_result)) +static INLINE int sansUnrecoverable(int result) +{ + return result & ~UDS_UNRECOVERABLE; +} + +typedef struct errorInfo { + const char *name; + const char *message; +} ErrorInfo; + +/** + * Register an error code block for stringError and stringErrorName. + * + * @param blockName the name of the block of error codes + * @param firstError the first error code in the block + * @param lastReservedError one past the highest possible error in the bloc + * @param infos a pointer to the error info array for the block + * @param infoSize the size of the error info array, which + * determines the last actual error for which + * information is available + * + * @return a success or error code, particularly UDS_DUPLICATE_NAME if the + * block name is already present, or UDS_ALREADY_REGISTERED if a + * block with the specified error code is present + **/ +int registerErrorBlock(const char *blockName, + int firstError, + int lastReservedError, + const ErrorInfo *infos, + size_t infoSize); + +/** + * Return the first error between result1 and result2. + * + * @param result1 A success or error code. + * @param result2 A success or error code. + * + * @return result1 if that is an error, else result2 + **/ +static INLINE int firstError(int result1, int result2) +{ + return result1 == UDS_SUCCESS ? result2 : result1; +} + +#endif /* ERRORS_H */ diff --git a/source/uds/geometry.c b/source/uds/geometry.c new file mode 100644 index 0000000..6d8cfa6 --- /dev/null +++ b/source/uds/geometry.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/geometry.c#3 $ + */ + +#include "geometry.h" + +#include "deltaIndex.h" +#include "errors.h" +#include "hashUtils.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "uds.h" + +/**********************************************************************/ +static int initializeGeometry(Geometry *geometry, + size_t bytesPerPage, + unsigned int recordPagesPerChapter, + unsigned int chaptersPerVolume, + unsigned int sparseChaptersPerVolume) +{ + int result = ASSERT_WITH_ERROR_CODE(bytesPerPage >= BYTES_PER_RECORD, + UDS_BAD_STATE, + "page is smaller than a record: %zu", + bytesPerPage); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT_WITH_ERROR_CODE(chaptersPerVolume > sparseChaptersPerVolume, + UDS_INVALID_ARGUMENT, + "sparse chapters per volume (%u) must be less" + " than chapters per volume (%u)", + sparseChaptersPerVolume, + chaptersPerVolume); + if (result != UDS_SUCCESS) { + return result; + } + + geometry->bytesPerPage = bytesPerPage; + geometry->recordPagesPerChapter = recordPagesPerChapter; + geometry->chaptersPerVolume = chaptersPerVolume; + geometry->sparseChaptersPerVolume = sparseChaptersPerVolume; + geometry->denseChaptersPerVolume = + chaptersPerVolume - sparseChaptersPerVolume; + + // Calculate the number of records in a page, chapter, and volume. + geometry->recordsPerPage = bytesPerPage / BYTES_PER_RECORD; + geometry->recordsPerChapter + = geometry->recordsPerPage * recordPagesPerChapter; + geometry->recordsPerVolume + = (unsigned long) geometry->recordsPerChapter * chaptersPerVolume; + geometry->openChapterLoadRatio = DEFAULT_OPEN_CHAPTER_LOAD_RATIO; + + // Initialize values for delta chapter indexes. + geometry->chapterMeanDelta = 1 << DEFAULT_CHAPTER_MEAN_DELTA_BITS; + geometry->chapterPayloadBits = computeBits(recordPagesPerChapter - 1); + // We want 1 delta list for every 64 records in the chapter. The "| 077" + // ensures that the chapterDeltaListBits computation does not underflow. + geometry->chapterDeltaListBits + = computeBits((geometry->recordsPerChapter - 1) | 077) - 6; + geometry->deltaListsPerChapter = 1 << geometry->chapterDeltaListBits; + // We need enough address bits to achieve the desired mean delta. + geometry->chapterAddressBits + = (DEFAULT_CHAPTER_MEAN_DELTA_BITS - geometry->chapterDeltaListBits + + computeBits(geometry->recordsPerChapter - 1)); + // Let the delta index code determine how many pages are needed for the index + geometry->indexPagesPerChapter + = getDeltaIndexPageCount(geometry->recordsPerChapter, + geometry->deltaListsPerChapter, + geometry->chapterMeanDelta, + geometry->chapterPayloadBits, + bytesPerPage); + + // Now that we have the size of a chapter index, we can calculate the + // space used by chapters and volumes. 
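+  // (Illustrative numbers for the default geometry described in
+  //  geometry.h: 14 index pages + 256 record pages gives 270 pages per
+  //  chapter, or just under 17 MiB per chapter with 64 KByte pages.)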
+ geometry->pagesPerChapter + = geometry->indexPagesPerChapter + recordPagesPerChapter; + geometry->pagesPerVolume = geometry->pagesPerChapter * chaptersPerVolume; + geometry->headerPagesPerVolume = 1; + geometry->bytesPerVolume = bytesPerPage * + (geometry->pagesPerVolume + geometry->headerPagesPerVolume); + geometry->bytesPerChapter = bytesPerPage * geometry->pagesPerChapter; + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int makeGeometry(size_t bytesPerPage, + unsigned int recordPagesPerChapter, + unsigned int chaptersPerVolume, + unsigned int sparseChaptersPerVolume, + Geometry **geometryPtr) +{ + Geometry *geometry; + int result = ALLOCATE(1, Geometry, "geometry", &geometry); + if (result != UDS_SUCCESS) { + return result; + } + result = initializeGeometry(geometry, bytesPerPage, recordPagesPerChapter, + chaptersPerVolume, sparseChaptersPerVolume); + if (result != UDS_SUCCESS) { + freeGeometry(geometry); + return result; + } + + *geometryPtr = geometry; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int copyGeometry(Geometry *source, Geometry **geometryPtr) +{ + return makeGeometry(source->bytesPerPage, + source->recordPagesPerChapter, + source->chaptersPerVolume, + source->sparseChaptersPerVolume, + geometryPtr); +} + +/**********************************************************************/ +void freeGeometry(Geometry *geometry) +{ + FREE(geometry); +} + +/**********************************************************************/ +uint64_t mapToVirtualChapterNumber(Geometry *geometry, + uint64_t newestVirtualChapter, + unsigned int physicalChapter) +{ + unsigned int newestPhysicalChapter + = mapToPhysicalChapter(geometry, newestVirtualChapter); + uint64_t virtualChapter + = newestVirtualChapter - newestPhysicalChapter + physicalChapter; + if (physicalChapter > newestPhysicalChapter) { + virtualChapter -= geometry->chaptersPerVolume; + } + return virtualChapter; +} + +/**********************************************************************/ +bool hasSparseChapters(const Geometry *geometry, + uint64_t oldestVirtualChapter, + uint64_t newestVirtualChapter) +{ + return (isSparse(geometry) + && ((newestVirtualChapter - oldestVirtualChapter + 1) + > geometry->denseChaptersPerVolume)); +} + +/**********************************************************************/ +bool isChapterSparse(const Geometry *geometry, + uint64_t oldestVirtualChapter, + uint64_t newestVirtualChapter, + uint64_t virtualChapterNumber) +{ + return (hasSparseChapters(geometry, oldestVirtualChapter, + newestVirtualChapter) + && ((virtualChapterNumber + geometry->denseChaptersPerVolume) + <= newestVirtualChapter)); +} + +/**********************************************************************/ +bool areSamePhysicalChapter(const Geometry *geometry, + uint64_t chapter1, + uint64_t chapter2) +{ + return ((chapter1 % geometry->chaptersPerVolume) + == (chapter2 % geometry->chaptersPerVolume)); +} diff --git a/source/uds/geometry.h b/source/uds/geometry.h new file mode 100644 index 0000000..47f771d --- /dev/null +++ b/source/uds/geometry.h @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/geometry.h#3 $ + */ + +#ifndef GEOMETRY_H +#define GEOMETRY_H 1 + +#include "compiler.h" +#include "typeDefs.h" +#include "uds.h" +#include "uds-block.h" + +/** + * Geometry defines constants and a record that parameterize the layout of an + * Albireo index volume. + * + *

+ * An index volume is divided into a fixed number of fixed-size
+ * chapters, each consisting of a fixed number of fixed-size
+ * pages. The volume layout is defined by two assumptions and four
+ * parameters. The assumptions (constants) are that index records are
+ * 64 bytes (32-byte block name plus 32-byte metadata) and that open
+ * chapter index hash slots are one byte long. The four parameters are
+ * the number of bytes in a page, the number of chapters in a volume,
+ * the number of record pages in a chapter, and the number of chapters
+ * that are sparse. From these parameters, we derive the rest of the
+ * layout and derived properties, ranging from the number of pages in
+ * a chapter to the number of records in the volume.
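+ *
+ * (Concretely, using the default values below: with 64 KByte pages and
+ * 64-byte records, a page holds 1024 records, a chapter with 256 record
+ * pages holds 2^18 records, and a volume with 1024 such chapters holds
+ * 2^28 records.)
+ *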

The default geometry is 64 KByte pages, 1024 chapters, 256 + * record pages in a chapter, and zero sparse chapters. This will + * allow us to store 2^28 entries (indexing 1TB of 4K blocks) in an + * approximately 16.5 MByte volume using fourteen index pages in each + * chapter. + **/ +typedef struct geometry { + /** Length of a page in a chapter, in bytes */ + size_t bytesPerPage; + /** Number of record pages in a chapter */ + unsigned int recordPagesPerChapter; + /** Number of (total) chapters in a volume */ + unsigned int chaptersPerVolume; + /** Number of sparsely-indexed chapters in a volume */ + unsigned int sparseChaptersPerVolume; + /** Number of bits used to determine delta list numbers */ + unsigned int chapterDeltaListBits; + + // These are derived properties, expressed as fields for convenience. + /** Total number of pages in a volume, excluding header */ + unsigned int pagesPerVolume; + /** Total number of header pages per volume */ + unsigned int headerPagesPerVolume; + /** Total number of bytes in a volume, including header */ + size_t bytesPerVolume; + /** Total number of bytes in a chapter */ + size_t bytesPerChapter; + /** Number of pages in a chapter */ + unsigned int pagesPerChapter; + /** Number of index pages in a chapter index */ + unsigned int indexPagesPerChapter; + /** The minimum ratio of hash slots to records in an open chapter */ + unsigned int openChapterLoadRatio; + /** Number of records that fit on a page */ + unsigned int recordsPerPage; + /** Number of records that fit in a chapter */ + unsigned int recordsPerChapter; + /** Number of records that fit in a volume */ + uint64_t recordsPerVolume; + /** Number of deltaLists per chapter index */ + unsigned int deltaListsPerChapter; + /** Mean delta in chapter indexes */ + unsigned int chapterMeanDelta; + /** Number of bits needed for record page numbers */ + unsigned int chapterPayloadBits; + /** Number of bits used to compute addresses for chapter delta lists */ + unsigned int chapterAddressBits; + /** Number of densely-indexed chapters in a volume */ + unsigned int denseChaptersPerVolume; +} Geometry; + +enum { + /* The number of bytes in a record (name + metadata) */ + BYTES_PER_RECORD = (UDS_CHUNK_NAME_SIZE + UDS_MAX_BLOCK_DATA_SIZE), + + /* The default length of a page in a chapter, in bytes */ + DEFAULT_BYTES_PER_PAGE = 1024 * BYTES_PER_RECORD, + + /* The default maximum number of records per page */ + DEFAULT_RECORDS_PER_PAGE = DEFAULT_BYTES_PER_PAGE / BYTES_PER_RECORD, + + /** The default number of record pages in a chapter */ + DEFAULT_RECORD_PAGES_PER_CHAPTER = 256, + + /** The default number of record pages in a chapter for a small index */ + SMALL_RECORD_PAGES_PER_CHAPTER = 64, + + /** The default number of chapters in a volume */ + DEFAULT_CHAPTERS_PER_VOLUME = 1024, + + /** The default number of sparsely-indexed chapters in a volume */ + DEFAULT_SPARSE_CHAPTERS_PER_VOLUME = 0, + + /** The log2 of the default mean delta */ + DEFAULT_CHAPTER_MEAN_DELTA_BITS = 16, + + /** The log2 of the number of delta lists in a large chapter */ + DEFAULT_CHAPTER_DELTA_LIST_BITS = 12, + + /** The log2 of the number of delta lists in a small chapter */ + SMALL_CHAPTER_DELTA_LIST_BITS = 10, + + /** The default min ratio of slots to records in an open chapter */ + DEFAULT_OPEN_CHAPTER_LOAD_RATIO = 2, + + /** Checkpoint every n chapters written. 
Default is to not checkpoint */ + DEFAULT_CHECKPOINT_FREQUENCY = 0 +}; + +/** + * Allocate and initialize all fields of a volume geometry using the + * specified layout parameters. + * + * @param bytesPerPage The length of a page in a chapter, in bytes + * @param recordPagesPerChapter The number of pages in a chapter + * @param chaptersPerVolume The number of chapters in a volume + * @param sparseChaptersPerVolume The number of sparse chapters in a volume + * @param geometryPtr A pointer to hold the new geometry + * + * @return UDS_SUCCESS or an error code + **/ +int makeGeometry(size_t bytesPerPage, + unsigned int recordPagesPerChapter, + unsigned int chaptersPerVolume, + unsigned int sparseChaptersPerVolume, + Geometry **geometryPtr) + __attribute__((warn_unused_result)); + +/** + * Allocate a new geometry and initialize it with the same parameters as an + * existing geometry. + * + * @param source The geometry record to copy + * @param geometryPtr A pointer to hold the new geometry + * + * @return UDS_SUCCESS or an error code + **/ +int copyGeometry(Geometry *source, + Geometry **geometryPtr) + __attribute__((warn_unused_result)); + +/** + * Clean up a geometry and its memory. + * + * @param geometry The geometry record to free + **/ +void freeGeometry(Geometry *geometry); + +/** + * Map a virtual chapter number to a physical chapter number + * + * @param geometry The geometry + * @param virtualChapter The virtual chapter number + * + * @return the corresponding physical chapter number + **/ +__attribute__((warn_unused_result)) +static INLINE unsigned int mapToPhysicalChapter(const Geometry *geometry, + uint64_t virtualChapter) +{ + return (virtualChapter % geometry->chaptersPerVolume); +} + +/** + * Convert a physical chapter number to its current virtual chapter number. + * + * @param geometry The geometry + * @param newestVirtualChapter The number of the newest virtual chapter + * @param physicalChapter The physical chapter number to convert + * + * @return The current virtual chapter number of the physical chapter + * in question + **/ +uint64_t mapToVirtualChapterNumber(Geometry *geometry, + uint64_t newestVirtualChapter, + unsigned int physicalChapter); + +/** + * Check whether this geometry is for a sparse index. + * + * @param geometry The geometry to check + * + * @return true if this geometry has sparse chapters + **/ +__attribute__((warn_unused_result)) +static INLINE bool isSparse(const Geometry *geometry) +{ + return (geometry->sparseChaptersPerVolume > 0); +} + +/** + * Check whether any sparse chapters have been filled. + * + * @param geometry The geometry of the index + * @param oldestVirtualChapter The number of the oldest chapter in the + * index + * @param newestVirtualChapter The number of the newest chapter in the + * index + * + * @return true if the index has filled at least one sparse chapter + **/ +bool hasSparseChapters(const Geometry *geometry, + uint64_t oldestVirtualChapter, + uint64_t newestVirtualChapter) + __attribute__((warn_unused_result)); + +/** + * Check whether a chapter is sparse or dense. 
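+ *
+ * (Illustrative numbers: if denseChaptersPerVolume is 990 and the newest
+ * virtual chapter is 1000, then chapters numbered 10 or below are treated
+ * as sparse, provided the index has filled at least one sparse chapter.)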
+ * + * @param geometry The geometry of the index containing the chapter + * @param oldestVirtualChapter The number of the oldest chapter in the index + * @param newestVirtualChapter The number of the newest chapter in the index + * @param virtualChapterNumber The number of the chapter to check + * + * @return true if the chapter is sparse + **/ +bool isChapterSparse(const Geometry *geometry, + uint64_t oldestVirtualChapter, + uint64_t newestVirtualChapter, + uint64_t virtualChapterNumber) + __attribute__((warn_unused_result)); + +/** + * Check whether two virtual chapter numbers correspond to the same + * physical chapter. + * + * @param geometry The geometry of the index + * @param chapter1 The first chapter to compare + * @param chapter2 The second chapter to compare + * + * @return true if both chapters correspond to the same + * physical chapter + **/ +bool areSamePhysicalChapter(const Geometry *geometry, + uint64_t chapter1, + uint64_t chapter2) + __attribute__((warn_unused_result)); + +#endif /* GEOMETRY_H */ diff --git a/source/uds/hashUtils.c b/source/uds/hashUtils.c new file mode 100644 index 0000000..45b2c81 --- /dev/null +++ b/source/uds/hashUtils.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/hashUtils.c#2 $ + */ + +#include "hashUtils.h" + +#include "errors.h" +#include "logger.h" +#include "permassert.h" +#include "stringUtils.h" +#include "uds.h" + +/** + * Convert a byte string to the hex representation. + * + * @param data binary data to convert + * @param dataLen length of binary data + * @param hex target to write hex string into + * @param hexLen capacity of target string + * + * @return UDS_SUCCESS, + * or UDS_INVALID_ARGUMENT if hexLen + * is too short. 
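+ *
+ * (For example, converting 4 bytes of data produces 8 hex characters and
+ * so needs hexLen of at least 2 * 4 + 1 = 9, counting the terminating
+ * NUL.)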
+ **/ +static int dataToHex(const unsigned char *data, size_t dataLen, + char *hex, size_t hexLen) +{ + if (hexLen < 2 * dataLen + 1) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "hex data incorrect size"); + } + size_t i; + for (i = 0; i < dataLen; ++i) { + int rc = fixedSprintf(__func__, &hex[2 * i], hexLen - (2 * i), + UDS_INVALID_ARGUMENT, "%02X", data[i]); + + if (rc != UDS_SUCCESS) { + return rc; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int chunkNameToHex(const UdsChunkName *chunkName, + char *hexData, size_t hexDataLen) +{ + return dataToHex(chunkName->name, UDS_CHUNK_NAME_SIZE, + hexData, hexDataLen); +} + +/**********************************************************************/ +int chunkDataToHex(const UdsChunkData *chunkData, + char *hexData, size_t hexDataLen) +{ + return dataToHex(chunkData->data, UDS_MAX_BLOCK_DATA_SIZE, + hexData, hexDataLen); +} + +/**********************************************************************/ +unsigned int computeBits(unsigned int maxValue) +{ + // __builtin_clz() counts leading (high-order) zero bits, so if + // we ever need this to be fast, under GCC we can do: + // return ((maxValue == 0) ? 0 : (32 - __builtin_clz(maxValue))); + + unsigned int bits = 0; + while (maxValue > 0) { + maxValue >>= 1; + bits++; + } + return bits; +} + +/**********************************************************************/ +void hashUtilsCompileTimeAssertions(void) +{ + STATIC_ASSERT((UDS_CHUNK_NAME_SIZE % sizeof(uint64_t)) == 0); + STATIC_ASSERT(UDS_CHUNK_NAME_SIZE == 16); +} diff --git a/source/uds/hashUtils.h b/source/uds/hashUtils.h new file mode 100644 index 0000000..2d6d0a8 --- /dev/null +++ b/source/uds/hashUtils.h @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/hashUtils.h#1 $ + */ + +#ifndef HASH_UTILS_H +#define HASH_UTILS_H 1 + +#include "compiler.h" +#include "common.h" +#include "geometry.h" +#include "numeric.h" +#include "uds.h" + +// How various portions of a hash are apportioned. Size dependent. +enum { + MASTER_INDEX_BYTES_OFFSET = 0, // size 8 + CHAPTER_INDEX_BYTES_OFFSET = 8, // size 6 + SAMPLE_BYTES_OFFSET = 14, // size 2 + MASTER_INDEX_BYTES_COUNT = 8, + CHAPTER_INDEX_BYTES_COUNT = 6, + SAMPLE_BYTES_COUNT = 2, +}; + +/** + * Extract the portion of a block name used by the chapter index. 
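+ *
+ * (These are the six bytes starting at CHAPTER_INDEX_BYTES_OFFSET: the
+ * first two become the high-order 16 bits of the result and the next four
+ * become the low-order 32 bits, as the function body shows.)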
+ * + * @param name The block name + * + * @return The chapter index bytes + **/ +static INLINE uint64_t extractChapterIndexBytes(const UdsChunkName *name) +{ + // Get the high order 16 bits, then the low order 32 bits + uint64_t bytes + = (uint64_t) getUInt16BE(&name->name[CHAPTER_INDEX_BYTES_OFFSET]) << 32; + bytes |= getUInt32BE(&name->name[CHAPTER_INDEX_BYTES_OFFSET + 2]); + return bytes; +} + +/** + * Extract the portion of a block name used by the master index. + * + * @param name The block name + * + * @return The master index portion of the block name + **/ +static INLINE uint64_t extractMasterIndexBytes(const UdsChunkName *name) +{ + return getUInt64BE(&name->name[MASTER_INDEX_BYTES_OFFSET]); +} + +/** + * Extract the portion of a block name used for sparse sampling. + * + * @param name The block name + * + * @return The sparse sample portion of the block name + **/ +static INLINE uint32_t extractSamplingBytes(const UdsChunkName *name) +{ + return getUInt16BE(&name->name[SAMPLE_BYTES_OFFSET]); +} + +/** + * For a given block, find the chapter delta list to use + * + * @param name The block name to hash + * @param geometry The geometry to use + * + * @return The chapter delta list where we expect to find the given blockname + **/ +static INLINE unsigned int hashToChapterDeltaList(const UdsChunkName *name, + const Geometry *geometry) +{ + return (unsigned int) ((extractChapterIndexBytes(name) + >> geometry->chapterAddressBits) + & ((1 << geometry->chapterDeltaListBits) - 1)); +} + +/** + * For a given block, find the chapter delta address to use + * + * @param name The block name to hash + * @param geometry The geometry to use + * + * @return The chapter delta address to use + **/ +static INLINE unsigned int hashToChapterDeltaAddress(const UdsChunkName *name, + const Geometry *geometry) +{ + return (unsigned int) (extractChapterIndexBytes(name) + & ((1 << geometry->chapterAddressBits) - 1)); +} + +/** + * For a given block name, find the slot in the open chapter hash table + * where it is expected to reside. + * + * @param name The block name to hash + * @param slotCount The size of the hash table + * + * @return the record number in the index page where we expect to find + # the given blockname + **/ +static INLINE unsigned int nameToHashSlot(const UdsChunkName *name, + unsigned int slotCount) +{ + return (unsigned int) (extractChapterIndexBytes(name) % slotCount); +} + +/** + * Convert a chunk name to hex to make it more readable. + * + * @param chunkName The chunk name + * @param hexData The resulting hexdata from the given chunk name + * @param hexDataLen The capacity of hexData + * + * @return UDS_SUCCESS, + * or UDS_INVALID_ARGUMENT if hexDataLen + * is too short. + **/ +int chunkNameToHex(const UdsChunkName *chunkName, + char *hexData, + size_t hexDataLen) + __attribute__((warn_unused_result)); + +/** + * Convert chunk data to hex to make it more readable. + * + * @param chunkData The chunk data + * @param hexData The resulting hexdata from the given chunk data + * @param hexDataLen The capacity of hexData + * + * @return UDS_SUCCESS, + * or UDS_INVALID_ARGUMENT if hexDataLen + * is too short. + **/ +int chunkDataToHex(const UdsChunkData *chunkData, + char *hexData, + size_t hexDataLen) + __attribute__((warn_unused_result)); + +/** + * Compute the number of bits required to store a field with the given + * maximum value. 
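+ *
+ * (For example, computeBits(255) returns 8 and computeBits(256) returns 9.)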
+ * + * @param maxValue The maximum value of the field + * + * @return the number of bits required + **/ +unsigned int computeBits(unsigned int maxValue) + __attribute__((warn_unused_result)); + +/** + * FOR TESTING. Set the portion of a block name used by the chapter index. + * + * @param name The block name + * @param value The value to store + **/ +static INLINE void setChapterIndexBytes(UdsChunkName *name, uint64_t value) +{ + // Store the high order bytes, then the low-order bytes + storeUInt16BE(&name->name[CHAPTER_INDEX_BYTES_OFFSET], + (uint16_t)(value >> 32)); + storeUInt32BE(&name->name[CHAPTER_INDEX_BYTES_OFFSET + 2], + (uint32_t)value); +} + +/** + * FOR TESTING. Set the bits used to find a chapter delta list + * + * @param name The block name + * @param geometry The geometry to use + * @param value The value to store + **/ +static INLINE void setChapterDeltaListBits(UdsChunkName *name, + const Geometry *geometry, + uint64_t value) +{ + uint64_t deltaAddress = hashToChapterDeltaAddress(name, geometry); + deltaAddress |= value << geometry->chapterAddressBits; + setChapterIndexBytes(name, deltaAddress); +} + +/** + * FOR TESTING. Set the portion of a block name used by the master index. + * + * @param name The block name + * @param val The value to store + **/ +static INLINE void setMasterIndexBytes(UdsChunkName *name, uint64_t val) +{ + storeUInt64BE(&name->name[MASTER_INDEX_BYTES_OFFSET], val); +} + +/** + * Set the portion of a block name used for sparse sampling. + * + * @param name The block name + * @param value The value to store + **/ +static INLINE void setSamplingBytes(UdsChunkName *name, uint32_t value) +{ + storeUInt16BE(&name->name[SAMPLE_BYTES_OFFSET], (uint16_t)value); +} + +/** + * Special function wrapper required for compile-time assertions. This + * function will fail to compile if UDS_CHUNK_NAME_SIZE is not an integer + * multiple of 8. + **/ +void hashUtilsCompileTimeAssertions(void); + +#endif /* HASH_UTILS_H */ diff --git a/source/uds/index.c b/source/uds/index.c new file mode 100644 index 0000000..a84d50f --- /dev/null +++ b/source/uds/index.c @@ -0,0 +1,908 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/index.c#15 $ + */ + +#include "index.h" + +#include "hashUtils.h" +#include "indexCheckpoint.h" +#include "indexInternals.h" +#include "logger.h" + +static const uint64_t NO_LAST_CHECKPOINT = UINT_MAX; + + +/** + * Replay an index which was loaded from a checkpoint. + * + * @param index The index to replay + * @param lastCheckpointChapter The number of the chapter where the + * last checkpoint was made + * + * @return UDS_SUCCESS or an error code. 
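+ *
+ * (As the body below shows, replay begins at the later of the last
+ * checkpoint chapter and the oldest valid virtual chapter.)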
+ **/ +static int replayIndexFromCheckpoint(Index *index, + uint64_t lastCheckpointChapter) +{ + // Find the volume chapter boundaries + uint64_t lowestVCN, highestVCN; + bool isEmpty = false; + IndexLookupMode oldLookupMode = index->volume->lookupMode; + index->volume->lookupMode = LOOKUP_FOR_REBUILD; + int result = findVolumeChapterBoundaries(index->volume, &lowestVCN, + &highestVCN, &isEmpty); + index->volume->lookupMode = oldLookupMode; + if (result != UDS_SUCCESS) { + return logFatalWithStringError(result, + "cannot replay index: " + "unknown volume chapter boundaries"); + } + if (lowestVCN > highestVCN) { + logFatal("cannot replay index: no valid chapters exist"); + return UDS_CORRUPT_COMPONENT; + } + + if (isEmpty) { + // The volume is empty, so the index should also be empty + if (index->newestVirtualChapter != 0) { + logFatal("cannot replay index from empty volume"); + return UDS_CORRUPT_COMPONENT; + } + return UDS_SUCCESS; + } + + unsigned int chaptersPerVolume = index->volume->geometry->chaptersPerVolume; + index->oldestVirtualChapter = lowestVCN; + index->newestVirtualChapter = highestVCN + 1; + if (index->newestVirtualChapter == lowestVCN + chaptersPerVolume) { + // skip the chapter shadowed by the open chapter + index->oldestVirtualChapter++; + } + + uint64_t firstReplayChapter = lastCheckpointChapter; + if (firstReplayChapter < index->oldestVirtualChapter) { + firstReplayChapter = index->oldestVirtualChapter; + } + return replayVolume(index, firstReplayChapter); +} + +/**********************************************************************/ +static int loadIndex(Index *index, bool allowReplay) +{ + bool replayRequired = false; + + int result = loadIndexState(index->state, &replayRequired); + if (result != UDS_SUCCESS) { + return result; + } + + if (replayRequired && !allowReplay) { + return logErrorWithStringError( + UDS_INDEX_NOT_SAVED_CLEANLY, + "index not saved cleanly: open chapter missing"); + } + + uint64_t lastCheckpointChapter + = ((index->lastCheckpoint != NO_LAST_CHECKPOINT) + ? index->lastCheckpoint : 0); + + logInfo("loaded index from chapter %llu through chapter %llu", + index->oldestVirtualChapter, lastCheckpointChapter); + + if (replayRequired) { + result = replayIndexFromCheckpoint(index, lastCheckpointChapter); + if (result != UDS_SUCCESS) { + return result; + } + } + + unsigned int i; + for (i = 0; i < index->zoneCount; i++) { + setActiveChapters(index->zones[i]); + } + + index->loadedType = replayRequired ? 
LOAD_REPLAY : LOAD_LOAD; + return UDS_SUCCESS; +} + +/**********************************************************************/ +static int rebuildIndex(Index *index) +{ + // Find the volume chapter boundaries + uint64_t lowestVCN, highestVCN; + bool isEmpty = false; + IndexLookupMode oldLookupMode = index->volume->lookupMode; + index->volume->lookupMode = LOOKUP_FOR_REBUILD; + int result = findVolumeChapterBoundaries(index->volume, &lowestVCN, + &highestVCN, &isEmpty); + index->volume->lookupMode = oldLookupMode; + if (result != UDS_SUCCESS) { + return logFatalWithStringError(result, + "cannot rebuild index: " + "unknown volume chapter boundaries"); + } + if (lowestVCN > highestVCN) { + logFatal("cannot rebuild index: no valid chapters exist"); + return UDS_CORRUPT_COMPONENT; + } + + if (isEmpty) { + index->newestVirtualChapter = index->oldestVirtualChapter = 0; + } else { + unsigned int numChapters = index->volume->geometry->chaptersPerVolume; + index->newestVirtualChapter = highestVCN + 1; + index->oldestVirtualChapter = lowestVCN; + if (index->newestVirtualChapter + == (index->oldestVirtualChapter + numChapters)) { + // skip the chapter shadowed by the open chapter + index->oldestVirtualChapter++; + } + } + + if ((index->newestVirtualChapter - index->oldestVirtualChapter) > + index->volume->geometry->chaptersPerVolume) { + return logFatalWithStringError(UDS_CORRUPT_COMPONENT, + "cannot rebuild index: " + "volume chapter boundaries too large"); + } + + setMasterIndexOpenChapter(index->masterIndex, 0); + if (isEmpty) { + index->loadedType = LOAD_EMPTY; + return UDS_SUCCESS; + } + + result = replayVolume(index, index->oldestVirtualChapter); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int i; + for (i = 0; i < index->zoneCount; i++) { + setActiveChapters(index->zones[i]); + } + + index->loadedType = LOAD_REBUILD; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int makeIndex(IndexLayout *layout, + const Configuration *config, + const struct uds_parameters *userParams, + unsigned int zoneCount, + LoadType loadType, + IndexLoadContext *loadContext, + Index **newIndex) +{ + Index *index; + int result = allocateIndex(layout, config, userParams, zoneCount, loadType, + &index); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "could not allocate index"); + } + + index->loadContext = loadContext; + + uint64_t nonce = getVolumeNonce(layout); + result = makeMasterIndex(config, zoneCount, nonce, &index->masterIndex); + if (result != UDS_SUCCESS) { + freeIndex(index); + return logErrorWithStringError(result, "could not make master index"); + } + + result = addIndexStateComponent(index->state, MASTER_INDEX_INFO, NULL, + index->masterIndex); + if (result != UDS_SUCCESS) { + freeIndex(index); + return result; + } + + result = addIndexStateComponent(index->state, &INDEX_PAGE_MAP_INFO, + index->volume->indexPageMap, NULL); + if (result != UDS_SUCCESS) { + freeIndex(index); + return result; + } + + result = makeChapterWriter(index, getIndexVersion(layout), + &index->chapterWriter); + if (result != UDS_SUCCESS) { + freeIndex(index); + return result; + } + + if ((loadType == LOAD_LOAD) || (loadType == LOAD_REBUILD)) { + if (!index->existed) { + freeIndex(index); + return UDS_NO_INDEX; + } + result = loadIndex(index, loadType == LOAD_REBUILD); + switch (result) { + case UDS_SUCCESS: + break; + case ENOMEM: + // We should not try a rebuild for this error. 
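+      // (Rationale: ENOMEM reflects a lack of memory rather than damaged
+      // saved state, so attempting a rebuild would not be expected to
+      // help.)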
+ logErrorWithStringError(result, "index could not be loaded"); + break; + default: + logErrorWithStringError(result, "index could not be loaded"); + if (loadType == LOAD_REBUILD) { + result = rebuildIndex(index); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "index could not be rebuilt"); + } + } + break; + } + } else { + index->loadedType = LOAD_CREATE; + discardIndexStateData(index->state); + } + + if (result != UDS_SUCCESS) { + freeIndex(index); + return logUnrecoverable(result, "fatal error in makeIndex"); + } + + if (index->loadContext != NULL) { + lockMutex(&index->loadContext->mutex); + index->loadContext->status = INDEX_READY; + // If we get here, suspend is meaningless, but notify any thread trying + // to suspend us so it doesn't hang. + broadcastCond(&index->loadContext->cond); + unlockMutex(&index->loadContext->mutex); + } + + index->hasSavedOpenChapter = index->loadedType == LOAD_LOAD; + *newIndex = index; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeIndex(Index *index) +{ + if (index == NULL) { + return; + } + freeChapterWriter(index->chapterWriter); + + if (index->masterIndex != NULL) { + freeMasterIndex(index->masterIndex); + } + releaseIndex(index); +} + +/**********************************************************************/ +int saveIndex(Index *index) +{ + waitForIdleChapterWriter(index->chapterWriter); + int result = finishCheckpointing(index); + if (result != UDS_SUCCESS) { + logInfo("save index failed"); + return result; + } + beginSave(index, false, index->newestVirtualChapter); + + result = saveIndexState(index->state); + if (result != UDS_SUCCESS) { + logInfo("save index failed"); + index->lastCheckpoint = index->prevCheckpoint; + } else { + index->hasSavedOpenChapter = true; + logInfo("finished save (vcn %llu)", index->lastCheckpoint); + } + return result; +} + +/** + * Get the zone for a request. + * + * @param index The index + * @param request The request + * + * @return The zone for the request + **/ +static IndexZone *getRequestZone(Index *index, Request *request) +{ + return index->zones[request->zoneNumber]; +} + +/** + * Search an index zone. This function is only correct for LRU. + * + * @param zone The index zone to query. + * @param request The request originating the query. + * + * @return UDS_SUCCESS or an error code + **/ +static int searchIndexZone(IndexZone *zone, Request *request) +{ + MasterIndexRecord record; + int result = getMasterIndexRecord(zone->index->masterIndex, + &request->chunkName, &record); + if (result != UDS_SUCCESS) { + return result; + } + + bool found = false; + if (record.isFound) { + result = getRecordFromZone(zone, request, &found, record.virtualChapter); + if (result != UDS_SUCCESS) { + return result; + } + if (found) { + request->location = computeIndexRegion(zone, record.virtualChapter); + } + } + + /* + * If a record has overflowed a chapter index in more than one chapter + * (or overflowed in one chapter and collided with an existing record), + * it will exist as a collision record in the master index, but we won't + * find it in the volume. This case needs special handling. 
+ */ + bool overflowRecord = (record.isFound && record.isCollision && !found); + uint64_t chapter = zone->newestVirtualChapter; + if (found || overflowRecord) { + if ((request->action == REQUEST_QUERY) + && (!request->update || overflowRecord)) { + /* This is a query without update, or with nothing to update */ + return UDS_SUCCESS; + } + + if (record.virtualChapter != chapter) { + /* + * Update the master index to reference the new chapter for the block. + * If the record had been deleted or dropped from the chapter index, it + * will be back. + */ + result = setMasterIndexRecordChapter(&record, chapter); + } else if (request->action != REQUEST_UPDATE) { + /* The record is already in the open chapter, so we're done */ + return UDS_SUCCESS; + } + } else { + // The record wasn't in the master index, so check whether the name + // is in a cached sparse chapter. + if (!isMasterIndexSample(zone->index->masterIndex, &request->chunkName) + && isSparse(zone->index->volume->geometry)) { + // Passing UINT64_MAX triggers a search of the entire sparse cache. + result = searchSparseCacheInZone(zone, request, UINT64_MAX, &found); + if (result != UDS_SUCCESS) { + return result; + } + + if (found) { + request->location = LOC_IN_SPARSE; + } + } + + if (request->action == REQUEST_QUERY) { + if (!found || !request->update) { + // This is a query without update or for a new record, so we're done. + return UDS_SUCCESS; + } + } + + /* + * Add a new entry to the master index referencing the open chapter. + * This needs to be done both for new records, and for records from + * cached sparse chapters. + */ + result = putMasterIndexRecord(&record, chapter); + } + + if (result == UDS_OVERFLOW) { + /* + * The master index encountered a delta list overflow. The condition + * was already logged. We will go on without adding the chunk to the + * open chapter. + */ + return UDS_SUCCESS; + } + + if (result != UDS_SUCCESS) { + return result; + } + + UdsChunkData *metadata; + if (!found || (request->action == REQUEST_UPDATE)) { + // This is a new record or we're updating an existing record. + metadata = &request->newMetadata; + } else { + // This is a duplicate, so move the record to the open chapter (for LRU). + metadata = &request->oldMetadata; + } + return putRecordInZone(zone, request, metadata); +} + +/**********************************************************************/ +static int removeFromIndexZone(IndexZone *zone, Request *request) +{ + MasterIndexRecord record; + int result = getMasterIndexRecord(zone->index->masterIndex, + &request->chunkName, &record); + if (result != UDS_SUCCESS) { + return result; + } + + if (!record.isFound) { + // The name does not exist in master index, so there is nothing to remove. + return UDS_SUCCESS; + } + + if (!record.isCollision) { + // Non-collision records are hints, so resolve the name in the chapter. + bool found; + int result = getRecordFromZone(zone, request, &found, + record.virtualChapter); + if (result != UDS_SUCCESS) { + return result; + } + + if (!found) { + // The name does not exist in the chapter, so there is nothing to remove. + return UDS_SUCCESS; + } + } + + request->location = computeIndexRegion(zone, record.virtualChapter); + + /* + * Delete the master index entry for the named record only. Note that a + * later search might later return stale advice if there is a colliding name + * in the same chapter, but it's a very rare case (1 in 2^21). 
+ */ + result = removeMasterIndexRecord(&record); + if (result != UDS_SUCCESS) { + return result; + } + + // If the record is in the open chapter, we must remove it or mark it + // deleted to avoid trouble if the record is added again later. + if (request->location == LOC_IN_OPEN_CHAPTER) { + bool hashExists = false; + removeFromOpenChapter(zone->openChapter, &request->chunkName, &hashExists); + result = ASSERT(hashExists, "removing record not found in open chapter"); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/** + * Simulate the creation of a sparse cache barrier message by the triage + * queue, and the later execution of that message in an index zone. + * + * If the index receiving the request is multi-zone or dense, this function + * does nothing. This simulation is an optimization for single-zone sparse + * indexes. It also supports unit testing of indexes without routers and + * queues. + * + * @param zone the index zone responsible for the index request + * @param request the index request about to be executed + * + * @return UDS_SUCCESS always + **/ +static int simulateIndexZoneBarrierMessage(IndexZone *zone, Request *request) +{ + // Do nothing unless this is a single-zone sparse index. + if ((zone->index->zoneCount > 1) + || !isSparse(zone->index->volume->geometry)) { + return UDS_SUCCESS; + } + + // Check if the index request is for a sampled name in a sparse chapter. + uint64_t sparseVirtualChapter = triageIndexRequest(zone->index, request); + if (sparseVirtualChapter == UINT64_MAX) { + // Not indexed, not a hook, or in a chapter that is still dense, which + // means there should be no change to the sparse chapter index cache. + return UDS_SUCCESS; + } + + /* + * The triage queue would have generated and enqueued a barrier message + * preceding this request, which we simulate by directly invoking the + * execution hook for an equivalent message. + */ + BarrierMessageData barrier = { .virtualChapter = sparseVirtualChapter }; + return executeSparseCacheBarrierMessage(zone, &barrier); +} + +/**********************************************************************/ +static int dispatchIndexZoneRequest(IndexZone *zone, Request *request) +{ + if (!request->requeued) { + // Single-zone sparse indexes don't have a triage queue to generate cache + // barrier requests, so see if we need to synthesize a barrier. + int result = simulateIndexZoneBarrierMessage(zone, request); + if (result != UDS_SUCCESS) { + return result; + } + } + + // Set the default location. It will be overwritten if we find the chunk. 
+ request->location = LOC_UNAVAILABLE; + + int result; + switch (request->action) { + case REQUEST_INDEX: + case REQUEST_UPDATE: + case REQUEST_QUERY: + result = makeUnrecoverable(searchIndexZone(zone, request)); + break; + + case REQUEST_DELETE: + result = makeUnrecoverable(removeFromIndexZone(zone, request)); + break; + + default: + result = logWarningWithStringError(UDS_INVALID_ARGUMENT, + "attempted to execute invalid action:" + " %d", + request->action); + break; + } + + return result; +} + +/**********************************************************************/ +int dispatchIndexRequest(Index *index, Request *request) +{ + return dispatchIndexZoneRequest(getRequestZone(index, request), request); +} + +/**********************************************************************/ +static int rebuildIndexPageMap(Index *index, uint64_t vcn) +{ + Geometry *geometry = index->volume->geometry; + unsigned int chapter = mapToPhysicalChapter(geometry, vcn); + unsigned int expectedListNumber = 0; + unsigned int indexPageNumber; + for (indexPageNumber = 0; + indexPageNumber < geometry->indexPagesPerChapter; + indexPageNumber++) { + DeltaIndexPage *chapterIndexPage; + int result = getPage(index->volume, chapter, indexPageNumber, + CACHE_PROBE_INDEX_FIRST, NULL, &chapterIndexPage); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "failed to read index page %u" + " in chapter %u", + indexPageNumber, chapter); + } + unsigned int lowestDeltaList = chapterIndexPage->lowestListNumber; + unsigned int highestDeltaList = chapterIndexPage->highestListNumber; + if (lowestDeltaList != expectedListNumber) { + return logErrorWithStringError(UDS_CORRUPT_DATA, + "chapter %u index page %u is corrupt", + chapter, indexPageNumber); + } + result = updateIndexPageMap(index->volume->indexPageMap, vcn, chapter, + indexPageNumber, highestDeltaList); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "failed to update chapter %u index page" + " %u", + chapter, indexPageNumber); + } + expectedListNumber = highestDeltaList + 1; + } + return UDS_SUCCESS; +} + +/** + * Add an entry to the master index when rebuilding. + * + * @param index The index to query. + * @param name The block name of interest. + * @param virtualChapter The virtual chapter number to write to the + * master index + * @param willBeSparseChapter True if this entry will be in the sparse portion + * of the index at the end of rebuilding + * + * @return UDS_SUCCESS or an error code + **/ +static int replayRecord(Index *index, + const UdsChunkName *name, + uint64_t virtualChapter, + bool willBeSparseChapter) +{ + if (willBeSparseChapter && !isMasterIndexSample(index->masterIndex, name)) { + // This entry will be in a sparse chapter after the rebuild completes, + // and it is not a sample, so just skip over it. + return UDS_SUCCESS; + } + + MasterIndexRecord record; + int result = getMasterIndexRecord(index->masterIndex, name, &record); + if (result != UDS_SUCCESS) { + return result; + } + + bool updateRecord; + if (record.isFound) { + if (record.isCollision) { + if (record.virtualChapter == virtualChapter) { + /* The record is already correct, so we don't need to do anything */ + return UDS_SUCCESS; + } + updateRecord = true; + } else if (record.virtualChapter == virtualChapter) { + /* + * There is a master index entry pointing to the current + * chapter, but we don't know if it is for the same name as the + * one we are currently working on or not. For now, we're just + * going to assume that it isn't. 
This will create one extra + * collision record if there was a deleted record in the current + * chapter. + */ + updateRecord = false; + } else { + /* + * If we're rebuilding, we don't normally want to go to disk to see if + * the record exists, since we will likely have just read the record from + * disk (i.e. we know it's there). The exception to this is when we + * already find an entry in the master index that has a different chapter. + * In this case, we need to search that chapter to determine if the + * master index entry was for the same record or a different one. + */ + result = searchVolumePageCache(index->volume, NULL, name, + record.virtualChapter, NULL, + &updateRecord); + if (result != UDS_SUCCESS) { + return result; + } + } + } else { + updateRecord = false; + } + + if (updateRecord) { + /* + * Update the master index to reference the new chapter for the block. + * If the record had been deleted or dropped from the chapter index, it + * will be back. + */ + result = setMasterIndexRecordChapter(&record, virtualChapter); + } else { + /* + * Add a new entry to the master index referencing the open + * chapter. This should be done regardless of whether we are a brand + * new record or a sparse record, i.e. one that doesn't exist in the + * index but does on disk, since for a sparse record, we would want to + * un-sparsify if it did exist. + */ + result = putMasterIndexRecord(&record, virtualChapter); + } + + if ((result == UDS_DUPLICATE_NAME) || (result == UDS_OVERFLOW)) { + /* Ignore duplicate record and delta list overflow errors */ + return UDS_SUCCESS; + } + + return result; +} + +/**********************************************************************/ +void beginSave(Index *index, bool checkpoint, uint64_t openChapterNumber) +{ + index->prevCheckpoint = index->lastCheckpoint; + index->lastCheckpoint = ((openChapterNumber == 0) + ? NO_LAST_CHECKPOINT + : openChapterNumber - 1); + + const char *what = (checkpoint ? "checkpoint" : "save"); + logInfo("beginning %s (vcn %llu)", what, index->lastCheckpoint); +} + +/** + * Suspend the index if necessary and wait for a signal to resume. + * + * @param index The index to replay + * + * @return true if the replay should terminate + **/ +static bool checkForSuspend(Index *index) +{ + if (index->loadContext == NULL) { + return false; + } + + lockMutex(&index->loadContext->mutex); + if (index->loadContext->status != INDEX_SUSPENDING) { + unlockMutex(&index->loadContext->mutex); + return false; + } + + // Notify that we are suspended and wait for the resume. + index->loadContext->status = INDEX_SUSPENDED; + broadcastCond(&index->loadContext->cond); + + while ((index->loadContext->status != INDEX_OPENING) + && (index->loadContext->status != INDEX_FREEING)) { + waitCond(&index->loadContext->cond, &index->loadContext->mutex); + } + + bool retVal = (index->loadContext->status == INDEX_FREEING); + unlockMutex(&index->loadContext->mutex); + return retVal; +} + +/**********************************************************************/ +int replayVolume(Index *index, uint64_t fromVCN) +{ + int result; + uint64_t uptoVCN = index->newestVirtualChapter; + logInfo("Replaying volume from chapter %llu through chapter %" + PRIu64, + fromVCN, uptoVCN); + setMasterIndexOpenChapter(index->masterIndex, uptoVCN); + setMasterIndexOpenChapter(index->masterIndex, fromVCN); + + /* + * At least two cases to deal with here! 
+ * - index loaded but replaying from lastCheckpoint; maybe full, maybe not + * - index failed to load, full rebuild + * Starts empty, then dense-only, then dense-plus-sparse. + * Need to sparsify while processing individual chapters. + */ + IndexLookupMode oldLookupMode = index->volume->lookupMode; + index->volume->lookupMode = LOOKUP_FOR_REBUILD; + /* + * Go through each record page of each chapter and add the records back to + * the master index. This should not cause anything to be written to either + * the open chapter or on disk volume. Also skip the on disk chapter + * corresponding to upto, as this would have already been + * purged from the master index when the chapter was opened. + * + * Also, go through each index page for each chapter and rebuild the + * index page map. + */ + const Geometry *geometry = index->volume->geometry; + uint64_t oldIPMupdate = getLastUpdate(index->volume->indexPageMap); + uint64_t vcn; + for (vcn = fromVCN; vcn < uptoVCN; ++vcn) { + if (checkForSuspend(index)) { + logInfo("Replay interrupted by index shutdown at chapter %llu", vcn); + return UDS_SHUTTINGDOWN; + } + + bool willBeSparseChapter = isChapterSparse(geometry, fromVCN, uptoVCN, + vcn); + unsigned int chapter = mapToPhysicalChapter(geometry, vcn); + prefetchVolumePages(&index->volume->volumeStore, + mapToPhysicalPage(geometry, chapter, 0), + geometry->pagesPerChapter); + setMasterIndexOpenChapter(index->masterIndex, vcn); + result = rebuildIndexPageMap(index, vcn); + if (result != UDS_SUCCESS) { + index->volume->lookupMode = oldLookupMode; + return logErrorWithStringError(result, + "could not rebuild index page map for" + " chapter %u", + chapter); + } + + unsigned int j; + for (j = 0; j < geometry->recordPagesPerChapter; j++) { + unsigned int recordPageNumber = geometry->indexPagesPerChapter + j; + byte *recordPage; + result = getPage(index->volume, chapter, recordPageNumber, + CACHE_PROBE_RECORD_FIRST, &recordPage, NULL); + if (result != UDS_SUCCESS) { + index->volume->lookupMode = oldLookupMode; + return logUnrecoverable(result, "could not get page %d", + recordPageNumber); + } + unsigned int k; + for (k = 0; k < geometry->recordsPerPage; k++) { + const byte *nameBytes = recordPage + (k * BYTES_PER_RECORD); + + UdsChunkName name; + memcpy(&name.name, nameBytes, UDS_CHUNK_NAME_SIZE); + + result = replayRecord(index, &name, vcn, willBeSparseChapter); + if (result != UDS_SUCCESS) { + char hexName[(2 * UDS_CHUNK_NAME_SIZE) + 1]; + if (chunkNameToHex(&name, hexName, sizeof(hexName)) != UDS_SUCCESS) { + strncpy(hexName, "", sizeof(hexName)); + } + index->volume->lookupMode = oldLookupMode; + return logUnrecoverable(result, + "could not find block %s during rebuild", + hexName); + } + } + } + } + index->volume->lookupMode = oldLookupMode; + + // We also need to reap the chapter being replaced by the open chapter + setMasterIndexOpenChapter(index->masterIndex, uptoVCN); + + uint64_t newIPMupdate = getLastUpdate(index->volume->indexPageMap); + + if (newIPMupdate != oldIPMupdate) { + logInfo("replay changed index page map update from %llu to %llu", + oldIPMupdate, newIPMupdate); + } + + return UDS_SUCCESS; +} + +/**********************************************************************/ +void getIndexStats(Index *index, UdsIndexStats *counters) +{ + uint64_t cwAllocated = getChapterWriterMemoryAllocated(index->chapterWriter); + // We're accessing the master index while not on a zone thread, but that's + // safe to do when acquiring statistics. 
+ MasterIndexStats denseStats, sparseStats; + getMasterIndexStats(index->masterIndex, &denseStats, &sparseStats); + + counters->entriesIndexed = (denseStats.recordCount + + sparseStats.recordCount); + counters->memoryUsed = ((uint64_t) denseStats.memoryAllocated + + (uint64_t) sparseStats.memoryAllocated + + (uint64_t) getCacheSize(index->volume) + + cwAllocated); + counters->collisions = (denseStats.collisionCount + + sparseStats.collisionCount); + counters->entriesDiscarded = (denseStats.discardCount + + sparseStats.discardCount); + counters->checkpoints = getCheckpointCount(index->checkpoint); +} + +/**********************************************************************/ +void advanceActiveChapters(Index *index) +{ + index->newestVirtualChapter++; + if (areSamePhysicalChapter(index->volume->geometry, + index->newestVirtualChapter, + index->oldestVirtualChapter)) { + index->oldestVirtualChapter++; + } +} + +/**********************************************************************/ +uint64_t triageIndexRequest(Index *index, Request *request) +{ + MasterIndexTriage triage; + lookupMasterIndexName(index->masterIndex, &request->chunkName, &triage); + if (!triage.inSampledChapter) { + // Not indexed or not a hook. + return UINT64_MAX; + } + + IndexZone *zone = getRequestZone(index, request); + if (!isZoneChapterSparse(zone, triage.virtualChapter)) { + return UINT64_MAX; + } + + // XXX Optimize for a common case by remembering the chapter from the most + // recent barrier message and skipping this chapter if is it the same. + + // Return the sparse chapter number to trigger the barrier messages. + return triage.virtualChapter; +} diff --git a/source/uds/index.h b/source/uds/index.h new file mode 100644 index 0000000..d2bc805 --- /dev/null +++ b/source/uds/index.h @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/index.h#3 $ + */ + +#ifndef INDEX_H +#define INDEX_H + +#include "chapterWriter.h" +#include "indexLayout.h" +#include "indexSession.h" +#include "indexZone.h" +#include "loadType.h" +#include "masterIndexOps.h" +#include "volume.h" + + +/** + * Index checkpoint state private to indexCheckpoint.c. + **/ +typedef struct indexCheckpoint IndexCheckpoint; + +typedef struct index { + bool existed; + bool hasSavedOpenChapter; + LoadType loadedType; + IndexLoadContext *loadContext; + IndexLayout *layout; + IndexState *state; + MasterIndex *masterIndex; + Volume *volume; + unsigned int zoneCount; + IndexZone **zones; + + /* + * ATTENTION!!! + * The meaning of the next two fields has changed. + * + * They now represent the oldest and newest chapters only at load time, + * and when the index is quiescent. 
At other times, they may lag individual + * zones' views of the index depending upon the progress made by the chapter + * writer. + */ + uint64_t oldestVirtualChapter; + uint64_t newestVirtualChapter; + + uint64_t lastCheckpoint; + uint64_t prevCheckpoint; + ChapterWriter *chapterWriter; + + // checkpoint state used by indexCheckpoint.c + IndexCheckpoint *checkpoint; +} Index; + +/** + * Construct a new index from the given configuration. + * + * @param layout The index layout + * @param config The configuration to use + * @param userParams The index session parameters. If NULL, the default + * session parameters will be used. + * @param zoneCount The number of zones for this index to use + * @param loadType How to create the index: it can be create only, allow + * loading from files, and allow rebuilding from the volume + * @param loadContext The load context to use + * @param newIndex A pointer to hold a pointer to the new index + * + * @return UDS_SUCCESS or an error code + **/ +int makeIndex(IndexLayout *layout, + const Configuration *config, + const struct uds_parameters *userParams, + unsigned int zoneCount, + LoadType loadType, + IndexLoadContext *loadContext, + Index **newIndex) + __attribute__((warn_unused_result)); + +/** + * Save an index. + * + * Before saving an index and while saving an index, the caller must ensure + * that there are no index requests in progress. + * + * Some users follow saveIndex immediately with a freeIndex. But some tests + * use the IndexLayout to modify the saved index. The Index will then have + * some cached information that does not reflect these updates. + * + * @param index The index to save + * + * @return UDS_SUCCESS if successful + **/ +int saveIndex(Index *index) __attribute__((warn_unused_result)); + +/** + * Clean up the index and its memory. + * + * @param index The index to destroy. + **/ +void freeIndex(Index *index); + +/** + * Perform the index operation specified by the action field of a UDS request. + * + * For UDS API requests, this searches the index for the chunk name in the + * request. If the chunk name is already present in the index, the location + * field of the request will be set to the IndexRegion where it was found. If + * the action is not DELETE, the oldMetadata field of the request will also be + * filled in with the prior metadata for the name. + * + * If the API request action is: + * + * REQUEST_INDEX, a record will be added to the open chapter with the + * metadata in the request for new records, and the existing metadata for + * existing records + * + * REQUEST_UPDATE, a record will be added to the open chapter with the + * metadata in the request + * + * REQUEST_QUERY, if the update flag is set in the request, any record + * found will be moved to the open chapter. In all other cases the contents + * of the index will remain unchanged. + * + * REQUEST_REMOVE, the any entry with the name will removed from the index + * + * For non-API requests, no chunk name search is involved. + * + * @param index The index + * @param request The originating request + * + * @return UDS_SUCCESS, UDS_QUEUED, or an error code + **/ +int dispatchIndexRequest(Index *index, Request *request) + __attribute__((warn_unused_result)); + +/** + * Internal helper to prepare the index for saving. 
+ * + * @param index the index + * @param checkpoint whether the save is a checkpoint + * @param openChapterNumber the virtual chapter number of the open chapter + **/ +void beginSave(Index *index, bool checkpoint, uint64_t openChapterNumber); + +/** + * Replay the volume file to repopulate the master index. + * + * @param index The index + * @param fromVCN The virtual chapter to start replaying + * + * @return UDS_SUCCESS if successful + **/ +int replayVolume(Index *index, uint64_t fromVCN) + __attribute__((warn_unused_result)); + +/** + * Gather statistics from the master index, volume, and cache. + * + * @param index The index + * @param counters the statistic counters for the index + **/ +void getIndexStats(Index *index, UdsIndexStats *counters); + +/** + * Set lookup state for this index. Disabling lookups means assume + * all records queried are new (intended for debugging uses, e.g., + * albfill). + * + * @param index The index + * @param enabled The new lookup state + **/ +void setIndexLookupState(Index *index, bool enabled); + +/** + * Advance the newest virtual chapter. If this will overwrite the oldest + * virtual chapter, advance that also. + * + * @param index The index to advance + **/ +void advanceActiveChapters(Index *index); + +/** + * Triage an index request, deciding whether it requires that a sparse cache + * barrier message precede it. + * + * This resolves the chunk name in the request in the master index, + * determining if it is a hook or not, and if a hook, what virtual chapter (if + * any) it might be found in. If a virtual chapter is found, it checks whether + * that chapter appears in the sparse region of the index. If all these + * conditions are met, the (sparse) virtual chapter number is returned. In all + * other cases it returns UINT64_MAX. + * + * @param index the index that will process the request + * @param request the index request containing the chunk name to triage + * + * @return the sparse chapter number for the sparse cache barrier message, or + * UINT64_MAX if the request does not require a barrier + **/ +uint64_t triageIndexRequest(Index *index, Request *request) + __attribute__((warn_unused_result)); + +#endif /* INDEX_H */ diff --git a/source/uds/indexCheckpoint.c b/source/uds/indexCheckpoint.c new file mode 100644 index 0000000..9c803b6 --- /dev/null +++ b/source/uds/indexCheckpoint.c @@ -0,0 +1,377 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
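The index.h interface above (makeIndex, dispatchIndexRequest, saveIndex, freeIndex) is the public surface of the index. Purely as an editorial illustration, and not part of the patch, a minimal caller might drive it roughly as follows; error handling is abbreviated, and the layout, configuration, and request are assumed to have been prepared elsewhere:

static int exampleIndexLifecycle(IndexLayout *layout,
                                 const Configuration *config,
                                 Request *request)
{
  // Load a saved index if possible; LOAD_REBUILD permits falling back to
  // rebuilding the master index from the volume when the save is unusable.
  Index *index;
  int result = makeIndex(layout, config, NULL, 1, LOAD_REBUILD, NULL, &index);
  if (result != UDS_SUCCESS) {
    return result;
  }

  // Requests normally arrive on zone queues; a direct call like this is
  // only safe when no other thread is using the index and the request's
  // zoneNumber refers to a valid zone.
  result = dispatchIndexRequest(index, request);

  // Persist the open chapter and all component state, then tear down.
  if (result == UDS_SUCCESS) {
    result = saveIndex(index);
  }
  freeIndex(index);
  return result;
}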
+ * + * $Id: //eng/uds-releases/jasper/src/uds/indexCheckpoint.c#2 $ + */ + +#include "indexCheckpoint.h" + +#include "errors.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "threads.h" +#include "typeDefs.h" + +/** + * index checkpointState values + * + * @note The order of these values is significant, + * see indexState.c doIndexStateCheckpointInZone(). + **/ +typedef enum checkpointState { + NOT_CHECKPOINTING, + CHECKPOINT_IN_PROGRESS, + CHECKPOINT_ABORTING +} CheckpointState; + +/** + * Private structure which tracks checkpointing. + **/ +struct indexCheckpoint { + Mutex mutex; // covers this group of fields + uint64_t chapter; // vcn of the starting chapter + CheckpointState state; // is checkpoint in progress or aborting + unsigned int zonesBusy; // count of zones not yet done + unsigned int frequency; // number of chapters between checkpoints + uint64_t checkpoints; // number of checkpoints this session +}; + +/** + * Enum return value of indexCheckpointTrigger function. + **/ +typedef enum indexCheckpointTriggerValue { + ICTV_IDLE, //< no checkpointing right now + ICTV_START, //< start a new checkpoint now + ICTV_CONTINUE, //< continue checkpointing if needed + ICTV_FINISH, //< finish checkpointing, next time will start new cycle + ICTV_ABORT //< immediately abort checkpointing +} IndexCheckpointTriggerValue; + +typedef int CheckpointFunction(Index *index, unsigned int zone); + +// These functions are called while holding the checkpoint->mutex but are +// expected to release it. +// +static CheckpointFunction doCheckpointStart; +static CheckpointFunction doCheckpointProcess; +static CheckpointFunction doCheckpointFinish; +static CheckpointFunction doCheckpointAbort; + +CheckpointFunction *const checkpointFuncs[] = { + NULL, + doCheckpointStart, + doCheckpointProcess, + doCheckpointFinish, + doCheckpointAbort +}; + +/**********************************************************************/ +int makeIndexCheckpoint(Index *index) +{ + IndexCheckpoint *checkpoint; + int result + = ALLOCATE(1, IndexCheckpoint, "IndexCheckpoint", &checkpoint); + if (result != UDS_SUCCESS) { + return result; + } + + result = initMutex(&checkpoint->mutex); + if (result != UDS_SUCCESS) { + FREE(checkpoint); + return result; + } + + checkpoint->checkpoints = 0; + + index->checkpoint = checkpoint; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeIndexCheckpoint(IndexCheckpoint *checkpoint) +{ + if (checkpoint != NULL) { + destroyMutex(&checkpoint->mutex); + FREE(checkpoint); + } +} + +/**********************************************************************/ +unsigned int getIndexCheckpointFrequency(IndexCheckpoint *checkpoint) +{ + lockMutex(&checkpoint->mutex); + unsigned int frequency = checkpoint->frequency; + unlockMutex(&checkpoint->mutex); + return frequency; +} + +/**********************************************************************/ +unsigned int setIndexCheckpointFrequency(IndexCheckpoint *checkpoint, + unsigned int frequency) +{ + lockMutex(&checkpoint->mutex); + unsigned int oldFrequency = checkpoint->frequency; + checkpoint->frequency = frequency; + unlockMutex(&checkpoint->mutex); + return oldFrequency; +} + +/**********************************************************************/ +uint64_t getCheckpointCount(IndexCheckpoint *checkpoint) +{ + return checkpoint->checkpoints; +} + +/**********************************************************************/ +static IndexCheckpointTriggerValue 
+getCheckpointAction(IndexCheckpoint *checkpoint, + uint64_t virtualChapter) +{ + if (checkpoint->frequency == 0) { + return ICTV_IDLE; + } + unsigned int value = virtualChapter % checkpoint->frequency; + if (checkpoint->state == CHECKPOINT_ABORTING) { + return ICTV_ABORT; + } else if (checkpoint->state == CHECKPOINT_IN_PROGRESS) { + if (value == checkpoint->frequency - 1) { + return ICTV_FINISH; + } else { + return ICTV_CONTINUE; + } + } else { + if (value == 0) { + return ICTV_START; + } else { + return ICTV_IDLE; + } + } +} + +/**********************************************************************/ +int processCheckpointing(Index *index, + unsigned int zone, + uint64_t newVirtualChapter) +{ + IndexCheckpoint *checkpoint = index->checkpoint; + lockMutex(&checkpoint->mutex); + + IndexCheckpointTriggerValue ictv + = getCheckpointAction(checkpoint, newVirtualChapter); + + if (ictv == ICTV_START) { + checkpoint->chapter = newVirtualChapter; + } + + CheckpointFunction *func = checkpointFuncs[ictv]; + if (func == NULL) { + // nothing to do in idle state + unlockMutex(&checkpoint->mutex); + return UDS_SUCCESS; + } + + return (*func)(index, zone); +} + +/**********************************************************************/ +int processChapterWriterCheckpointSaves(Index *index) +{ + IndexCheckpoint *checkpoint = index->checkpoint; + + int result = UDS_SUCCESS; + + lockMutex(&checkpoint->mutex); + if (checkpoint->state == CHECKPOINT_IN_PROGRESS) { + result = + performIndexStateCheckpointChapterSynchronizedSaves(index->state); + + if (result != UDS_SUCCESS) { + checkpoint->state = CHECKPOINT_ABORTING; + logInfo("checkpoint failed"); + index->lastCheckpoint = index->prevCheckpoint; + } + } + + unlockMutex(&checkpoint->mutex); + return result; +} + +/** + * Helper function used to abort checkpoint if an error has occurred. + * + * @param index the index + * @param result the error result + * + * @return result + **/ +static int abortCheckpointing(Index *index, int result) +{ + if (index->checkpoint->state != NOT_CHECKPOINTING) { + index->checkpoint->state = CHECKPOINT_ABORTING; + logInfo("checkpoint failed"); + index->lastCheckpoint = index->prevCheckpoint; + } + return result; +} + +/**********************************************************************/ +int finishCheckpointing(Index *index) +{ + IndexCheckpoint *checkpoint = index->checkpoint; + + int result = processChapterWriterCheckpointSaves(index); + if (result != UDS_SUCCESS) { + return result; + } + + lockMutex(&checkpoint->mutex); + + unsigned int z; + for (z = 0; z < index->zoneCount; ++z) { + if (checkpoint->state != CHECKPOINT_IN_PROGRESS) { + break; + } + result = doCheckpointFinish(index, z); + // reacquire mutex released by doCheckpointFinish + lockMutex(&checkpoint->mutex); + if (result != UDS_SUCCESS) { + break; + } + } + + if ((result == UDS_SUCCESS) && + (checkpoint->state == CHECKPOINT_IN_PROGRESS)) { + result = finishIndexStateCheckpoint(index->state); + if (result == UDS_SUCCESS) { + checkpoint->state = NOT_CHECKPOINTING; + } + } + + unlockMutex(&checkpoint->mutex); + return result; +} + +/** + * Starts an incremental checkpoint. + * + * Called by the first zone to finish a chapter which starts a checkpoint. 
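To make the trigger arithmetic in getCheckpointAction() above concrete, here is an illustrative trace; the frequency and chapter numbers are invented for the example:

/*
 * Illustrative only: with checkpoint->frequency == 4 and no checkpoint in
 * progress, successive open-chapter numbers map to trigger values as:
 *
 *   vcn  8: 8 % 4 == 0               -> ICTV_START    (begin a checkpoint)
 *   vcn  9: in progress, 9 % 4 == 1  -> ICTV_CONTINUE
 *   vcn 10: in progress, 10 % 4 == 2 -> ICTV_CONTINUE
 *   vcn 11: in progress, 11 % 4 == 3 -> ICTV_FINISH   (frequency - 1)
 *   vcn 12: 12 % 4 == 0              -> ICTV_START    (next cycle)
 *
 * A frequency of zero disables checkpointing entirely (ICTV_IDLE), and the
 * CHECKPOINT_ABORTING state always yields ICTV_ABORT.
 */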
+ * + * @param index the index + * @param zone the zone number + * + * @return UDS_SUCCESS or an error code + **/ +static int doCheckpointStart(Index *index, unsigned int zone) +{ + IndexCheckpoint *checkpoint = index->checkpoint; + beginSave(index, true, checkpoint->chapter); + int result = startIndexStateCheckpoint(index->state); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "cannot start index checkpoint"); + index->lastCheckpoint = index->prevCheckpoint; + unlockMutex(&checkpoint->mutex); + return result; + } + + checkpoint->state = CHECKPOINT_IN_PROGRESS; + checkpoint->zonesBusy = index->zoneCount; + + return doCheckpointProcess(index, zone); +} + +/**********************************************************************/ +static int doCheckpointProcess(Index *index, unsigned int zone) +{ + IndexCheckpoint *checkpoint = index->checkpoint; + unlockMutex(&checkpoint->mutex); + CompletionStatus status = CS_NOT_COMPLETED; + int result = performIndexStateCheckpointInZone(index->state, zone, &status); + if (result != UDS_SUCCESS) { + lockMutex(&checkpoint->mutex); + logErrorWithStringError(result, "cannot continue index checkpoint"); + result = abortCheckpointing(index, result); + unlockMutex(&checkpoint->mutex); + } else if (status == CS_JUST_COMPLETED) { + lockMutex(&checkpoint->mutex); + if (--checkpoint->zonesBusy == 0) { + checkpoint->checkpoints += 1; + logInfo("finished checkpoint"); + result = finishIndexStateCheckpoint(index->state); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "%s checkpoint finish failed", + __func__); + } + checkpoint->state = NOT_CHECKPOINTING; + } + unlockMutex(&checkpoint->mutex); + } + return result; +} + +/**********************************************************************/ +static int doCheckpointAbort(Index *index, unsigned int zone) +{ + IndexCheckpoint *checkpoint = index->checkpoint; + CompletionStatus status = CS_NOT_COMPLETED; + int result = abortIndexStateCheckpointInZone(index->state, zone, &status); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "cannot abort index checkpoint"); + } else if (status == CS_JUST_COMPLETED) { + if (--checkpoint->zonesBusy == 0) { + logInfo("aborted checkpoint"); + result = abortIndexStateCheckpoint(index->state); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "checkpoint abort failed"); + } + checkpoint->state = NOT_CHECKPOINTING; + } + } + unlockMutex(&checkpoint->mutex); + + return result; +} + +/**********************************************************************/ +static int doCheckpointFinish(Index *index, unsigned int zone) +{ + IndexCheckpoint *checkpoint = index->checkpoint; + CompletionStatus status = CS_NOT_COMPLETED; + unlockMutex(&checkpoint->mutex); + int result = finishIndexStateCheckpointInZone(index->state, zone, &status); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "cannot finish index checkpoint"); + lockMutex(&checkpoint->mutex); + result = abortCheckpointing(index, result); + unlockMutex(&checkpoint->mutex); + } else if (status == CS_JUST_COMPLETED) { + lockMutex(&checkpoint->mutex); + if (--checkpoint->zonesBusy == 0) { + checkpoint->checkpoints += 1; + logInfo("finished checkpoint"); + result = finishIndexStateCheckpoint(index->state); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "%s checkpoint finish failed", + __func__); + } + checkpoint->state = NOT_CHECKPOINTING; + } + unlockMutex(&checkpoint->mutex); + } + return result; +} diff --git a/source/uds/indexCheckpoint.h 
b/source/uds/indexCheckpoint.h new file mode 100644 index 0000000..02d2936 --- /dev/null +++ b/source/uds/indexCheckpoint.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexCheckpoint.h#1 $ + */ + +#ifndef INDEX_CHECKPOINT_H +#define INDEX_CHECKPOINT_H + +#include "index.h" + +/** + * Construct and initialize the checkpoint sub-structure of an index. + * + * @param index the index receive the new checkpoint structure. + * + * @return UDS_SUCCESS or an error code + **/ +int makeIndexCheckpoint(Index *index) __attribute__((warn_unused_result)); + +/** + * Free the checkpoint sub-structure of an index. + * + * @param checkpoint the structure to free + **/ +void freeIndexCheckpoint(IndexCheckpoint *checkpoint); + +/** + * Get the current checkpointing frequency of an index. + * + * @param checkpoint the checkpoint state of the index + * + * @return the number of chapters between checkpoints + **/ +unsigned int getIndexCheckpointFrequency(IndexCheckpoint *checkpoint) + __attribute__((warn_unused_result)); + +/** + * Set checkpointing frequency for the index. + * + * @param checkpoint the checkpoint state of the index + * @param frequency The new checkpointing frequency + * + * @return the old checkpointing frequency + **/ +unsigned int setIndexCheckpointFrequency(IndexCheckpoint *checkpoint, + unsigned int frequency); + +/** + * Gets the number of checkpoints completed during the lifetime of this index + * + * @param checkpoint the checkpoint state of the index + * + * @return the number of checkpoints completed + **/ +uint64_t getCheckpointCount(IndexCheckpoint *checkpoint) + __attribute__((warn_unused_result)); + +/** + * If incremental checkpointing is in progress, finish it. + * + * @param index The index + * + * @return UDS_SUCCESS or an error code + * + * @note This function is called automatically during normal operation; + * its presence here is for tests that expect checkpointing to + * have completed at some point in their logic. It is not an + * error to call this function if checkpointing is not in + * progress, it silently returns success. + **/ +int finishCheckpointing(Index *index) __attribute__((warn_unused_result)); + +/** + * Process one zone's incremental checkpoint operation. Automatically + * starts, processes, and finishes a checkpoint over multiple invocations + * as successive chapters are closed and written. + * + * Uses its own mutex to serialize the starting and finishing or aborting, + * but allows parallel execution of the incremental progress. + * + * @param index The index to checkpoint + * @param zone The current zone number + * @param newVirtualChapter The number of the chapter which the calling + * zone has just opened + * + * @return UDS_SUCCESS or an error code. 
+ **/ +int processCheckpointing(Index *index, + unsigned int zone, + uint64_t newVirtualChapter) + __attribute__((warn_unused_result)); + +/** + * Process saves done outside any zone by the chapter writer. + * + * Grabs the mutex associated with processCheckpointing(). + * + * @param index The index to process. + * + * @return UDS_SUCCESS or an error code. + **/ +int processChapterWriterCheckpointSaves(Index *index) + __attribute__((warn_unused_result)); + +#endif // INDEX_CHECKPOINT_H diff --git a/source/uds/indexComponent.c b/source/uds/indexComponent.c new file mode 100644 index 0000000..c932b8d --- /dev/null +++ b/source/uds/indexComponent.c @@ -0,0 +1,745 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexComponent.c#8 $ + */ + +#include "indexComponent.h" + +#include "compiler.h" +#include "errors.h" +#include "indexLayout.h" +#include "indexState.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "typeDefs.h" + +/*****************************************************************************/ +int makeIndexComponent(IndexState *state, + const IndexComponentInfo *info, + unsigned int zoneCount, + void *data, + void *context, + IndexComponent **componentPtr) +{ + if ((info == NULL) || (info->name == NULL)) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "invalid component or directory specified"); + } + if (info->loader == NULL) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "no .loader function specified " + "for component %s", + info->name); + } + if ((info->saver == NULL) && (info->incremental == NULL)) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "neither .saver function nor .incremental " + "function specified for component %s", + info->name); + } + + IndexComponent *component = NULL; + int result = ALLOCATE(1, IndexComponent, "index component", &component); + if (result != UDS_SUCCESS) { + return result; + } + + component->componentData = data; + component->context = context; + component->info = info; + component->numZones = info->multiZone ? 
zoneCount : 1; + component->state = state; + component->writeZones = NULL; + *componentPtr = component; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static void freeWriteZones(IndexComponent *component) +{ + if (component->writeZones != NULL) { + unsigned int z; + for (z = 0; z < component->numZones; ++z) { + WriteZone *wz = component->writeZones[z]; + if (wz == NULL) { + continue; + } + freeBufferedWriter(wz->writer); + FREE(wz); + } + FREE(component->writeZones); + component->writeZones = NULL; + } +} + +/*****************************************************************************/ +void freeIndexComponent(IndexComponent **componentPtr) +{ + if (componentPtr == NULL) { + return; + } + IndexComponent *component = *componentPtr; + if (component == NULL) { + return; + } + *componentPtr = NULL; + + freeWriteZones(component); + FREE(component); +} + +/** + * Destroy, deallocate, and expunge a read portal. + * + * @param readPortal the readzone array + **/ +static void freeReadPortal(ReadPortal *readPortal) +{ + if (readPortal == NULL) { + return; + } + unsigned int z; + for (z = 0; z < readPortal->zones; ++z) { + if (readPortal->readers[z] != NULL) { + freeBufferedReader(readPortal->readers[z]); + } + } + FREE(readPortal->readers); + FREE(readPortal); +} + +/*****************************************************************************/ +int getBufferedReaderForPortal(ReadPortal *portal, + unsigned int part, + BufferedReader **readerPtr) +{ + if (part >= portal->zones) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "%s: cannot access zone %u of %u", + __func__, part, portal->zones); + } + IndexComponent *component = portal->component; + if (component->info->ioStorage && (portal->readers[part] == NULL)) { + int result = openStateBufferedReader(component->state, + component->info->kind, part, + &portal->readers[part]); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "%s: cannot make buffered reader " + "for zone %u", __func__, part); + } + } + *readerPtr = portal->readers[part]; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int readIndexComponent(IndexComponent *component) +{ + ReadPortal *portal; + int result = ALLOCATE(1, ReadPortal, "index component read portal", &portal); + if (result != UDS_SUCCESS) { + return result; + } + int readZones = component->state->loadZones; + result = ALLOCATE(readZones, BufferedReader *, "read zone buffered readers", + &portal->readers); + if (result != UDS_SUCCESS) { + FREE(portal); + return result; + } + + portal->component = component; + portal->zones = readZones; + result = (*component->info->loader)(portal); + freeReadPortal(portal); + return result; +} + +/** + * Determine the writeZone structure for the specified component and zone. 
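makeIndexComponent() above requires a .loader plus either a .saver or an .incremental writer. As a purely hypothetical illustration (the component, its callbacks, and the chosen field values are invented; only field names used elsewhere in this file are assumed to exist), a simple non-incremental component might be described like this:

// Hypothetical reader/writer callbacks for a tiny component; their bodies
// would use the ReadPortal and BufferedWriter helpers shown in this file.
static int exampleLoader(ReadPortal *portal);
static int exampleSaver(IndexComponent *component,
                        BufferedWriter *writer,
                        unsigned int zone);

static const IndexComponentInfo EXAMPLE_COMPONENT_INFO = {
  .name      = "example component",
  .multiZone = false,          // zone 0 stands in for the whole component
  .ioStorage = true,           // state lives in the index layout's storage
  // A real definition would also set .kind to the component's on-disk
  // region; the appropriate value is layout-specific and omitted here.
  .loader    = exampleLoader,  // called once per load zone via a ReadPortal
  .saver     = exampleSaver,   // wrapped by the incremental machinery on save
};

A component described this way would then be attached to the index state with addIndexStateComponent(), as makeIndex() does above for the master index and the index page map.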
+ * + * @param [in] component the index component + * @param [in] zone the zone number + * @param [out] writeZonePtr the resulting write zone instance + * + * @return UDS_SUCCESS or an error code + **/ +static int resolveWriteZone(const IndexComponent *component, + unsigned int zone, + WriteZone **writeZonePtr) +{ + int result = ASSERT(writeZonePtr != NULL, + "output parameter is null"); + if (result != UDS_SUCCESS) { + return result; + } + + if (component->writeZones == NULL) { + return logErrorWithStringError(UDS_BAD_STATE, + "cannot resolve index component write zone:" + " not allocated"); + } + + if (zone >= component->numZones) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "cannot resolve index component write zone:" + " zone out of range"); + } + *writeZonePtr = component->writeZones[zone]; + return UDS_SUCCESS; +} + +/** + * Non-incremental save function used to emulate a regular save + * using an incremental save function as a basis. + * + * @param component the index component + * @param writer the buffered writer + * @param zone the zone number + * + * @return UDS_SUCCESS or an error code + **/ +static int indexComponentSaverIncrementalWrapper(IndexComponent *component, + BufferedWriter *writer, + unsigned int zone) +{ + IncrementalWriter incrFunc = component->info->incremental; + bool completed = false; + + int result = (*incrFunc)(component, writer, zone, IWC_START, &completed); + if (result != UDS_SUCCESS) { + return result; + } + + if (!completed) { + result = (*incrFunc)(component, writer, zone, IWC_FINISH, &completed); + if (result != UDS_SUCCESS) { + return result; + } + } + + result = flushBufferedWriter(writer); + if (result != UDS_SUCCESS) { + return result; + } + + return UDS_SUCCESS; +} + +/** + * Specify that writing to a specific zone file has finished. + * + * If a syncer has been registered with the index component, the file + * descriptor will be enqueued upon it for fsyncing and closing. + * If not, or if the enqueue fails, the file will be fsynced and closed + * immediately. + * + * @param writeZone the index component write zone + * + * @return UDS_SUCCESS or an error code + **/ +static int doneWithZone(WriteZone *writeZone) +{ + const IndexComponent *component = writeZone->component; + if (writeZone->writer != NULL) { + int result = flushBufferedWriter(writeZone->writer); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "cannot flush buffered writer for " + "%s component (zone %u)", + component->info->name, writeZone->zone); + } + } + return UDS_SUCCESS; +} + +/** + * Construct the array of WriteZone instances for this component. + * + * @param component the index component + * + * @return UDS_SUCCESS or an error code + * + * If this is a multizone component, each zone will be fully defined, + * otherwise zone 0 stands in for the single state file. 
+ **/ +static int makeWriteZones(IndexComponent *component) +{ + unsigned int z; + if (component->writeZones != NULL) { + // just reinitialize states + for (z = 0; z < component->numZones; ++z) { + WriteZone *wz = component->writeZones[z]; + wz->phase = IWC_IDLE; + } + return UDS_SUCCESS; + } + + int result = ALLOCATE(component->numZones, WriteZone *, + "index component write zones", &component->writeZones); + if (result != UDS_SUCCESS) { + return result; + } + + for (z = 0; z < component->numZones; ++z) { + result = ALLOCATE(1, WriteZone, "plain write zone", + &component->writeZones[z]); + if (result != UDS_SUCCESS) { + freeWriteZones(component); + return result; + } + *component->writeZones[z] = (WriteZone) { + .component = component, + .phase = IWC_IDLE, + .zone = z, + }; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static int openBufferedWriters(IndexComponent *component) +{ + int result = UDS_SUCCESS; + WriteZone **wzp; + for (wzp = component->writeZones; + wzp < component->writeZones + component->numZones; + ++wzp) { + WriteZone *wz = *wzp; + wz->phase = IWC_START; + + result = ASSERT(wz->writer == NULL, "write zone writer already exists"); + if (result != UDS_SUCCESS) { + return result; + } + + if (component->info->ioStorage) { + int result = openStateBufferedWriter(component->state, + component->info->kind, wz->zone, + &wz->writer); + if (result != UDS_SUCCESS) { + return result; + } + } + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static int startIndexComponentSave(IndexComponent *component) +{ + int result = makeWriteZones(component); + if (result != UDS_SUCCESS) { + return result; + } + + result = openBufferedWriters(component); + if (result != UDS_SUCCESS) { + return result; + } + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int startIndexComponentIncrementalSave(IndexComponent *component) +{ + return startIndexComponentSave(component); +} + +/*****************************************************************************/ +int writeIndexComponent(IndexComponent *component) +{ + Saver saver = component->info->saver; + if ((saver == NULL) && (component->info->incremental != NULL)) { + saver = indexComponentSaverIncrementalWrapper; + } + + int result = startIndexComponentSave(component); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int z; + for (z = 0; z < component->numZones; ++z) { + WriteZone *writeZone = component->writeZones[z]; + + result = (*saver)(component, writeZone->writer, z); + if (result != UDS_SUCCESS) { + break; + } + + result = doneWithZone(writeZone); + if (result != UDS_SUCCESS) { + break; + } + + freeBufferedWriter(writeZone->writer); + writeZone->writer = NULL; + } + + if (result != UDS_SUCCESS) { + freeWriteZones(component); + return logErrorWithStringError(result, "index component write failed"); + } + + return UDS_SUCCESS; +} + +/** + * Close a specific buffered writer in a component write zone. 
+ * + * @param writeZone the write zone + * + * @return UDS_SUCCESS or an error code + * + * @note closing a buffered writer causes its file descriptor to be + * passed to doneWithZone + **/ +static int closeBufferedWriter(WriteZone *writeZone) +{ + if (writeZone->writer == NULL) { + return UDS_SUCCESS; + } + + int result = doneWithZone(writeZone); + freeBufferedWriter(writeZone->writer); + writeZone->writer = NULL; + + return result; +} + +/** + * Faux incremental saver function for index components which only define + * a simple saver. Conforms to IncrementalWriter signature. + * + * @param [in] component the index component + * @param [in] writer the buffered writer that does the output + * @param [in] zone the zone number + * @param [in] command the incremental writer command + * @param [out] completed if non-NULL, set to whether the save is complete + * + * @return UDS_SUCCESS or an error code + * + * @note This wrapper always calls the non-incremental saver when + * the IWC_START command is issued, and always reports that + * the save is complete unless the saver failed. + **/ +static int wrapSaverAsIncremental(IndexComponent *component, + BufferedWriter *writer, + unsigned int zone, + IncrementalWriterCommand command, + bool *completed) +{ + int result = UDS_SUCCESS; + + if ((command >= IWC_START) && (command <= IWC_FINISH)) { + result = (*component->info->saver)(component, writer, zone); + if ((result == UDS_SUCCESS) && (writer != NULL)) { + noteBufferedWriterUsed(writer); + } + } + if ((result == UDS_SUCCESS) && (completed != NULL)) { + *completed = true; + } + return result; +} + +/** + * Return the appropriate incremental writer function depending on + * the component's type and whether this is the first zone. + * + * @param component the index component + * + * @return the correct IncrementalWriter function to use, or + * NULL signifying no progress can be made at this time. 
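The incremental save machinery in this file drives each write zone through the IWC_* phases. An illustrative summary of that life cycle (editorial, not part of the patch):

/*
 * Illustrative only: the phases of one write zone during an incremental save.
 *
 *   IWC_IDLE     -> IWC_START     startIndexComponentIncrementalSave()
 *   IWC_START    -> IWC_CONTINUE  first performIndexComponentZoneSave() call
 *                                 that does not complete the save
 *   IWC_CONTINUE -> IWC_IDLE      once the writer reports completion, or when
 *                                 finishIndexComponentZoneSave() forces the
 *                                 remaining work with IWC_FINISH
 *   any phase    -> IWC_ABORT     on error, cleared back to IWC_IDLE by the
 *                                 abort entry points
 *
 * The chapter writer's variant parks zone 0 in IWC_DONE until the regular
 * zone save collects it.  Components that define only a plain .saver are
 * adapted by wrapSaverAsIncremental(), which runs the whole save on the
 * first command it receives and immediately reports completion.
 */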
+ **/ +static IncrementalWriter getIncrementalWriter(IndexComponent *component) +{ + IncrementalWriter incrFunc = component->info->incremental; + + if (incrFunc == NULL) { + incrFunc = &wrapSaverAsIncremental; + } + + return incrFunc; +} + +/*****************************************************************************/ +int performIndexComponentZoneSave(IndexComponent *component, + unsigned int zone, + CompletionStatus *completed) +{ + CompletionStatus comp = CS_NOT_COMPLETED; + + WriteZone *wz = NULL; + int result = resolveWriteZone(component, zone, &wz); + if (result != UDS_SUCCESS) { + return result; + } + + if (wz->phase == IWC_IDLE) { + comp = CS_COMPLETED_PREVIOUSLY; + } else if (wz->phase == IWC_DONE) { + comp = CS_JUST_COMPLETED; + wz->phase = IWC_IDLE; + } else if (!component->info->chapterSync) { + bool done = false; + IncrementalWriter incrFunc = getIncrementalWriter(component); + int result = (*incrFunc)(component, wz->writer, zone, wz->phase, &done); + if (result != UDS_SUCCESS) { + if (wz->phase == IWC_ABORT) { + wz->phase = IWC_IDLE; + } else { + wz->phase = IWC_ABORT; + } + return result; + } + if (done) { + comp = CS_JUST_COMPLETED; + wz->phase = IWC_IDLE; + } else if (wz->phase == IWC_START) { + wz->phase = IWC_CONTINUE; + } + } + + if (completed != NULL) { + *completed = comp; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int performIndexComponentChapterWriterSave(IndexComponent *component) +{ + WriteZone *wz = NULL; + int result = resolveWriteZone(component, 0, &wz); + if (result != UDS_SUCCESS) { + return result; + } + + if ((wz->phase != IWC_IDLE) && (wz->phase != IWC_DONE)) { + bool done = false; + IncrementalWriter incrFunc = getIncrementalWriter(component); + int result = ASSERT(incrFunc != NULL, "no writer function"); + if (result != UDS_SUCCESS) { + return result; + } + result = (*incrFunc)(component, wz->writer, 0, wz->phase, &done); + if (result != UDS_SUCCESS) { + if (wz->phase == IWC_ABORT) { + wz->phase = IWC_IDLE; + } else { + wz->phase = IWC_ABORT; + } + return result; + } + if (done) { + wz->phase = IWC_DONE; + } else if (wz->phase == IWC_START) { + wz->phase = IWC_CONTINUE; + } + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int finishIndexComponentZoneSave(IndexComponent *component, + unsigned int zone, + CompletionStatus *completed) +{ + WriteZone *wz = NULL; + int result = resolveWriteZone(component, zone, &wz); + if (result != UDS_SUCCESS) { + return result; + } + + CompletionStatus comp; + switch (wz->phase) { + case IWC_IDLE: + comp = CS_COMPLETED_PREVIOUSLY; + break; + + case IWC_DONE: + comp = CS_JUST_COMPLETED; + break; + + default: + comp = CS_NOT_COMPLETED; + } + + IncrementalWriter incrFunc = getIncrementalWriter(component); + if ((wz->phase >= IWC_START) && (wz->phase < IWC_ABORT)) { + bool done = false; + int result = (*incrFunc)(component, wz->writer, zone, IWC_FINISH, &done); + if (result != UDS_SUCCESS) { + wz->phase = IWC_ABORT; + return result; + } + if (!done) { + logWarning("finish incremental save did not complete for %s zone %u", + component->info->name, zone); + return UDS_CHECKPOINT_INCOMPLETE; + } + wz->phase = IWC_IDLE; + comp = CS_JUST_COMPLETED; + } + + if (completed != NULL) { + *completed = comp; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int finishIndexComponentIncrementalSave(IndexComponent *component) +{ + unsigned int 
zone; + for (zone = 0; zone < component->numZones; ++zone) { + WriteZone *wz = component->writeZones[zone]; + IncrementalWriter incrFunc = getIncrementalWriter(component); + if ((wz->phase != IWC_IDLE) && (wz->phase != IWC_DONE)) { + // Note: this is only safe if no other threads are currently processing + // this particular index + bool done = false; + int result = (*incrFunc)(component, wz->writer, zone, IWC_FINISH, &done); + if (result != UDS_SUCCESS) { + return result; + } + if (!done) { + logWarning("finishing incremental save did not complete for %s zone %u", + component->info->name, zone); + return UDS_UNEXPECTED_RESULT; + } + wz->phase = IWC_IDLE; + } + + if ((wz->writer != NULL) && !wasBufferedWriterUsed(wz->writer)) { + return logErrorWithStringError(UDS_CHECKPOINT_INCOMPLETE, + "component %s zone %u did not get written", + component->info->name, zone); + } + + int result = closeBufferedWriter(wz); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int abortIndexComponentZoneSave(IndexComponent *component, + unsigned int zone, + CompletionStatus *status) +{ + WriteZone *wz = NULL; + int result = resolveWriteZone(component, zone, &wz); + if (result != UDS_SUCCESS) { + return result; + } + + CompletionStatus comp = CS_COMPLETED_PREVIOUSLY; + + IncrementalWriter incrFunc = getIncrementalWriter(component); + if ((wz->phase != IWC_IDLE) && (wz->phase != IWC_DONE)) { + result = (*incrFunc)(component, wz->writer, zone, IWC_ABORT, NULL); + wz->phase = IWC_IDLE; + if (result != UDS_SUCCESS) { + return result; + } + comp = CS_JUST_COMPLETED; + } + + if (status != NULL) { + *status = comp; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int abortIndexComponentIncrementalSave(IndexComponent *component) +{ + int result = UDS_SUCCESS; + unsigned int zone; + for (zone = 0; zone < component->numZones; ++zone) { + WriteZone *wz = component->writeZones[zone]; + IncrementalWriter incrFunc = getIncrementalWriter(component); + if ((wz->phase != IWC_IDLE) && (wz->phase != IWC_DONE)) { + // Note: this is only safe if no other threads are currently processing + // this particular index + result = (*incrFunc)(component, wz->writer, zone, IWC_ABORT, NULL); + wz->phase = IWC_IDLE; + if (result != UDS_SUCCESS) { + return result; + } + } + + int result = closeBufferedWriter(wz); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int discardIndexComponent(IndexComponent *component) +{ + if (!component->info->ioStorage) { + return UDS_INVALID_ARGUMENT; + } + + unsigned int numZones = 0; + unsigned int saveSlot = 0; + int result = findLatestIndexSaveSlot(component->state->layout, &numZones, + &saveSlot); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int oldSaveSlot = component->state->saveSlot; + component->state->saveSlot = saveSlot; + + unsigned int z; + for (z = 0; z < numZones; ++z) { + BufferedWriter *writer; + int result = openStateBufferedWriter(component->state, + component->info->kind, z, &writer); + if (result != UDS_SUCCESS) { + break; + } + result = writeZerosToBufferedWriter(writer, UDS_BLOCK_SIZE); + if (result != UDS_SUCCESS) { + break; + } + result = flushBufferedWriter(writer); + if (result != UDS_SUCCESS) { + break; + } + freeBufferedWriter(writer); + } + + component->state->saveSlot 
= oldSaveSlot; + return result; +} diff --git a/source/uds/indexComponent.h b/source/uds/indexComponent.h new file mode 100644 index 0000000..22066b1 --- /dev/null +++ b/source/uds/indexComponent.h @@ -0,0 +1,363 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexComponent.h#5 $ + */ + +#ifndef INDEX_COMPONENT_H +#define INDEX_COMPONENT_H 1 + +#include "common.h" + +#include "bufferedReader.h" +#include "bufferedWriter.h" +#include "compiler.h" +#include "regionIdentifiers.h" + +typedef enum completionStatus { + CS_NOT_COMPLETED, // operation has not completed + CS_JUST_COMPLETED, // operation just completed + CS_COMPLETED_PREVIOUSLY // operation completed previously +} CompletionStatus; + +typedef struct readPortal { + struct indexComponent *component; + BufferedReader **readers; + unsigned int zones; +} ReadPortal; + +/** + * Prototype for functions which can load an index component from its + * saved state. + * + * @param portal A component portal which can be used to load the + * specified component. + * @return UDS_SUCCESS or an error code + **/ +typedef int (*Loader)(ReadPortal *portal); + +/** + * Prototype for functions which can save an index component. + * + * @param component The index component. + * @param writer A buffered writer. + * @param zone The zone number. + * + * @return UDS_SUCCESS or an error code + **/ +typedef int (*Saver)(struct indexComponent *component, + BufferedWriter *writer, + unsigned int zone); + +/** + * Command code used by IncrementalWriter function protocol. + **/ +typedef enum incrementalWriterCommand { + IWC_START, //< start an incremental save + IWC_CONTINUE, //< continue an incremental save + IWC_FINISH, //< force finish of incremental save + IWC_ABORT, //< abort incremental save + IWC_IDLE = -1,//< not a command, used internally to signify not in progress + IWC_DONE = -2 //< not a command, used internally to signify async completion +} IncrementalWriterCommand; + +typedef struct writeZone { + struct indexComponent *component; + IncrementalWriterCommand phase; + BufferedWriter *writer; + unsigned int zone; +} WriteZone; + +/** + * @param [in] component The index component. + * @param [in] writer A buffered writer. + * @param [in] zone The zone number (0 for non-multi-zone). + * @param [in] command The incremental writer command. + * @param [out] completed If non-NULL, set to whether save is done. + * + * @return UDS_SUCCESS or an error code + **/ +typedef int (*IncrementalWriter)(struct indexComponent *component, + BufferedWriter *writer, + unsigned int zone, + IncrementalWriterCommand command, + bool *completed); + +/** + * The structure describing how to load or save an index component. + * At least one of saver or incremental must be specified. 
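+ *
+ * As a rough illustration only (the component name and the callbacks
+ * myLoader and mySaver below are hypothetical, not taken from this code),
+ * a simple component that only defines a plain saver might be described as:
+ *
+ *   static const IndexComponentInfo MY_COMPONENT_INFO = {
+ *     .kind        = RL_KIND_OPEN_CHAPTER, // any appropriate RegionKind
+ *     .name        = "my component",
+ *     .saveOnly    = false,
+ *     .chapterSync = false,
+ *     .multiZone   = false,
+ *     .ioStorage   = true,
+ *     .loader      = myLoader,  // Loader callback
+ *     .saver       = mySaver,   // Saver callback
+ *     .incremental = NULL,      // a plain saver is wrapped automatically
+ *   };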
+ **/
+typedef struct indexComponentInfo {
+  RegionKind         kind;        // Region kind
+  const char        *name;        // The name of the component (for logging)
+  bool               saveOnly;    // Used for saves but not checkpoints
+  bool               chapterSync; // Saved by the chapter writer
+  bool               multiZone;   // Does this component have multiple zones?
+  bool               ioStorage;   // Do we do I/O directly to storage?
+  Loader             loader;      // The function to load this component
+  Saver              saver;       // The function to store this component
+  IncrementalWriter  incremental; // The function for incremental writing
+} IndexComponentInfo;
+
+/**
+ * The structure representing a savable (and loadable) part of an index.
+ **/
+typedef struct indexComponent {
+  const IndexComponentInfo *info;          // IndexComponentInfo specification
+  void                     *componentData; // The object to load or save
+  void                     *context;       // The context used to load or save
+  struct indexState        *state;         // The index state
+  unsigned int              numZones;      // Number of zones in write portal
+  WriteZone               **writeZones;    // State for writing component
+} IndexComponent;
+
+/**
+ * Make an index component.
+ *
+ * @param state         The index state in which this component instance
+ *                        shall reside.
+ * @param info          The component info specification for this component.
+ * @param zoneCount     How many active zones are in use.
+ * @param data          Component-specific data.
+ * @param context       Component-specific context.
+ * @param componentPtr  Where to store the resulting component.
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int makeIndexComponent(struct indexState        *state,
+                       const IndexComponentInfo *info,
+                       unsigned int              zoneCount,
+                       void                     *data,
+                       void                     *context,
+                       IndexComponent          **componentPtr)
+  __attribute__((warn_unused_result));
+
+/**
+ * Destroy an index component.
+ *
+ * @param componentPtr  A pointer to the component to be freed.
+ **/
+void freeIndexComponent(IndexComponent **componentPtr);
+
+/**
+ * Return the index component name for this component.
+ **/
+static INLINE const char *indexComponentName(IndexComponent *component)
+{
+  return component->info->name;
+}
+
+/**
+ * Return the index component data for this component.
+ **/
+static INLINE void *indexComponentData(IndexComponent *component)
+{
+  return component->componentData;
+}
+
+/**
+ * Return the index component context for this component.
+ **/
+static INLINE void *indexComponentContext(IndexComponent *component)
+{
+  return component->context;
+}
+
+/**
+ * Determine whether this component may be skipped for a checkpoint.
+ *
+ * @param component  the component
+ *
+ * @return whether the component may be skipped
+ **/
+static INLINE bool skipIndexComponentOnCheckpoint(IndexComponent *component)
+{
+  return component->info->saveOnly;
+}
+
+/**
+ * Determine whether actual saving during a checkpoint should be
+ * invoked by the chapter writer thread.
+ **/
+static INLINE bool
+deferIndexComponentCheckpointToChapterWriter(IndexComponent *component)
+{
+  return component->info->chapterSync;
+}
+
+/**
+ * Determine whether a replay is required if the component is missing.
+ *
+ * @param component  the component
+ *
+ * @return whether the component is final (that is, contains shutdown state)
+ **/
+static INLINE bool
+missingIndexComponentRequiresReplay(IndexComponent *component)
+{
+  return component->info->saveOnly;
+}
+
+/**
+ * Read a component's state.
+ *
+ * @param component  The component to read.
+ *
+ * @return UDS_SUCCESS, an error code from reading, or UDS_INVALID_ARGUMENT
+ *         if the component is NULL.
+ **/
+int readIndexComponent(IndexComponent *component)
+  __attribute__((warn_unused_result));
+
+/**
+ * Write a state file.
+ *
+ * @param component  The component to write
+ *
+ * @return UDS_SUCCESS, an error code from writing, or UDS_INVALID_ARGUMENT
+ *         if the component is NULL.
+ **/
+int writeIndexComponent(IndexComponent *component)
+  __attribute__((warn_unused_result));
+
+/**
+ * Start an incremental save for this component (all zones).
+ *
+ * @param [in] component  The index component.
+ *
+ * @return UDS_SUCCESS or an error code.
+ **/
+int startIndexComponentIncrementalSave(IndexComponent *component)
+  __attribute__((warn_unused_result));
+
+/**
+ * Perform an incremental save for a component in a particular zone.
+ *
+ * @param [in]  component  The index component.
+ * @param [in]  zone       The zone number.
+ * @param [out] completed  Pointer to hold completion status result.
+ *
+ * @return UDS_SUCCESS or an error code.
+ *
+ * @note If an incremental save is not supported, a regular
+ *       save will be performed if this is the first call in zone 0.
+ **/
+int performIndexComponentZoneSave(IndexComponent   *component,
+                                  unsigned int      zone,
+                                  CompletionStatus *completed)
+  __attribute__((warn_unused_result));
+
+/**
+ * Perform an incremental save for a non-multizone component synchronized
+ * with the chapter writer.
+ *
+ * @param component  The index component.
+ **/
+int performIndexComponentChapterWriterSave(IndexComponent *component)
+  __attribute__((warn_unused_result));
+
+/**
+ * Force the completion of an incremental save currently in progress in
+ * a particular zone.
+ *
+ * @param [in]  component  The index component.
+ * @param [in]  zone       The zone number.
+ * @param [out] completed  Pointer to hold completion status result.
+ *
+ * @return UDS_SUCCESS or an error code.
+ **/
+int finishIndexComponentZoneSave(IndexComponent   *component,
+                                 unsigned int      zone,
+                                 CompletionStatus *completed)
+  __attribute__((warn_unused_result));
+
+/**
+ * Force the completion of an incremental save in all zones and complete
+ * the overall save.
+ *
+ * @param [in] component  The index component.
+ *
+ * @return UDS_SUCCESS or an error code.
+ *
+ * @note If all zones call finishIndexComponentZoneSave first, only
+ *       the common non-index-related completion code is required,
+ *       which protects access to the index data structures from the
+ *       invoking thread.
+ **/
+int finishIndexComponentIncrementalSave(IndexComponent *component)
+  __attribute__((warn_unused_result));
+
+/**
+ * Abort the incremental save currently in progress in a particular zone.
+ *
+ * @param [in]  component  The index component.
+ * @param [in]  zone       The zone number.
+ * @param [out] completed  Pointer to hold completion status result.
+ *
+ * @return UDS_SUCCESS or an error code.
+ *
+ * @note "Completed" in this case means completed or aborted.
+ *       Once any zone calls this function the entire save is
+ *       useless unless every zone indicates CS_COMPLETED_PREVIOUSLY.
+ **/
+int abortIndexComponentZoneSave(IndexComponent   *component,
+                                unsigned int      zone,
+                                CompletionStatus *completed)
+  __attribute__((warn_unused_result));
+
+/**
+ * Abort an incremental save currently in progress.
+ *
+ * @param [in] component  The index component.
+ *
+ * @return UDS_SUCCESS or an error code.
+ *
+ * @note If all zones call abortIndexComponentZoneSave first, only
+ *       the common non-index-related completion code is required,
+ *       which protects access to the index data structures from the
+ *       invoking thread.
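+ *
+ * @note As a rough usage sketch only (assuming a single thread drives
+ *       every zone; this outline is illustrative and not part of the
+ *       implementation), a full incremental save normally runs:
+ *
+ *         startIndexComponentIncrementalSave(component);
+ *         // ...performIndexComponentZoneSave() per zone until completed...
+ *         finishIndexComponentIncrementalSave(component);
+ *
+ *       with this function taking the place of the finish step when the
+ *       save must be abandoned.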
+ **/ +int abortIndexComponentIncrementalSave(IndexComponent *component) + __attribute__((warn_unused_result)); + +/** + * Remove or invalidate component state. + * + * @param component The component whose file is to be removed. If NULL + * no action is taken. + **/ +__attribute__((warn_unused_result)) +int discardIndexComponent(IndexComponent *component); + +/** + * Get a buffered reader for the specified component part. + * + * @param [in] portal The component portal. + * @param [in] part The component ordinal number. + * @param [out] readerPtr Where to put the buffered reader. + * + * @return UDS_SUCCESS or an error code. + * + * @note the reader is managed by the component portal + **/ +__attribute__((warn_unused_result)) +int getBufferedReaderForPortal(ReadPortal *portal, + unsigned int part, + BufferedReader **readerPtr); + +#endif /* INDEX_COMPONENT_H */ diff --git a/source/uds/indexConfig.c b/source/uds/indexConfig.c new file mode 100644 index 0000000..7ef86f2 --- /dev/null +++ b/source/uds/indexConfig.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/indexConfig.c#2 $ + */ + +#include "indexConfig.h" + +#include "buffer.h" +#include "logger.h" +#include "memoryAlloc.h" + +static const byte INDEX_CONFIG_MAGIC[] = "ALBIC"; +static const byte INDEX_CONFIG_VERSION[] = "06.02"; +static const byte INDEX_CONFIG_VERSION_6_01[] = "06.01"; + +enum { + INDEX_CONFIG_MAGIC_LENGTH = sizeof(INDEX_CONFIG_MAGIC) - 1, + INDEX_CONFIG_VERSION_LENGTH = sizeof(INDEX_CONFIG_VERSION) - 1 +}; + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int decodeIndexConfig(Buffer *buffer, UdsConfiguration config) +{ + int result = getUInt32LEFromBuffer(buffer, &config->recordPagesPerChapter); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &config->chaptersPerVolume); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &config->sparseChaptersPerVolume); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &config->cacheChapters); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &config->checkpointFrequency); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &config->masterIndexMeanDelta); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &config->bytesPerPage); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &config->sparseSampleRate); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &config->nonce); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, + "%zu bytes decoded of %zu expected", + bufferLength(buffer) - contentLength(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + result = UDS_CORRUPT_COMPONENT; + } + return result; +} + +/**********************************************************************/ +static int readVersion(BufferedReader *reader, + UdsConfiguration conf, + const char **versionPtr) +{ + byte buffer[INDEX_CONFIG_VERSION_LENGTH]; + int result = readFromBufferedReader(reader, buffer, + INDEX_CONFIG_VERSION_LENGTH); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot read index config version"); + } + if (memcmp(INDEX_CONFIG_VERSION, buffer, INDEX_CONFIG_VERSION_LENGTH) == 0) { + Buffer *buffer; + result = makeBuffer(sizeof(*conf), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(reader, getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return logErrorWithStringError(result, "cannot read config data"); + } + clearBuffer(buffer); + result = decodeIndexConfig(buffer, conf); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + if (versionPtr != NULL) { + *versionPtr = "current"; + } + return result; + } else if (memcmp(INDEX_CONFIG_VERSION_6_01, buffer, + INDEX_CONFIG_VERSION_LENGTH) == 0) { + struct udsConfiguration6_01 oldConf; + result = readFromBufferedReader(reader, &oldConf, sizeof(oldConf)); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, + "failed to read version 6.01 config file"); + return result; + } + conf->recordPagesPerChapter = oldConf.recordPagesPerChapter; + conf->chaptersPerVolume = oldConf.chaptersPerVolume; + conf->sparseChaptersPerVolume 
= oldConf.sparseChaptersPerVolume; + conf->cacheChapters = oldConf.cacheChapters; + conf->checkpointFrequency = oldConf.checkpointFrequency; + conf->masterIndexMeanDelta = oldConf.masterIndexMeanDelta; + conf->bytesPerPage = oldConf.bytesPerPage; + conf->sparseSampleRate = oldConf.sparseSampleRate; + conf->nonce = 0; + if (versionPtr != NULL) { + *versionPtr = "6.01"; + } + return UDS_UNSUPPORTED_VERSION; + } + + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "unsupported configuration version: '%.*s'", + INDEX_CONFIG_VERSION_LENGTH, buffer); +} + +/**********************************************************************/ +int readConfigContents(BufferedReader *reader, + UdsConfiguration config) +{ + int result = verifyBufferedData(reader, INDEX_CONFIG_MAGIC, + INDEX_CONFIG_MAGIC_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + + const char *version = NULL; + result = readVersion(reader, config, &version); + if (result != UDS_SUCCESS) { + if (result == UDS_UNSUPPORTED_VERSION) { + logNoticeWithStringError(result, "Found index config version %s", + version); + } else { + logErrorWithStringError(result, "Failed to read index config"); + } + } + return result; +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int encodeIndexConfig(Buffer *buffer, UdsConfiguration config) +{ + int result = putUInt32LEIntoBuffer(buffer, config->recordPagesPerChapter); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, config->chaptersPerVolume); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, config->sparseChaptersPerVolume); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, config->cacheChapters); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, config-> checkpointFrequency); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, config->masterIndexMeanDelta); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, config->bytesPerPage); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, config->sparseSampleRate); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, config->nonce); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == sizeof(*config), + "%zu bytes encoded, of %zu expected", + contentLength(buffer), sizeof(*config)); + return result; +} + +/**********************************************************************/ +int writeConfigContents(BufferedWriter *writer, + UdsConfiguration config) +{ + int result = writeToBufferedWriter(writer, INDEX_CONFIG_MAGIC, + INDEX_CONFIG_MAGIC_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + result = writeToBufferedWriter(writer, INDEX_CONFIG_VERSION, + INDEX_CONFIG_VERSION_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + Buffer *buffer; + result = makeBuffer(sizeof(*config), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = encodeIndexConfig(buffer, config); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = writeToBufferedWriter(writer, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + return result; +} + +/**********************************************************************/ +int 
makeConfiguration(UdsConfiguration conf, Configuration **configPtr) +{ + *configPtr = NULL; + if (conf == NULL) { + return logErrorWithStringError(UDS_CONF_REQUIRED, + "received an invalid config"); + } + + Configuration *config; + int result = ALLOCATE(1, Configuration, "configuration", &config); + if (result != UDS_SUCCESS) { + return result; + } + + result = makeGeometry(conf->bytesPerPage, + conf->recordPagesPerChapter, + conf->chaptersPerVolume, + conf->sparseChaptersPerVolume, + &config->geometry); + if (result != UDS_SUCCESS) { + freeConfiguration(config); + return result; + } + + config->sparseSampleRate = conf->sparseSampleRate; + config->cacheChapters = conf->cacheChapters; + config->masterIndexMeanDelta = conf->masterIndexMeanDelta; + + *configPtr = config; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeConfiguration(Configuration *config) +{ + if (config != NULL) { + freeGeometry(config->geometry); + FREE(config); + } +} diff --git a/source/uds/indexConfig.h b/source/uds/indexConfig.h new file mode 100644 index 0000000..dab3d6a --- /dev/null +++ b/source/uds/indexConfig.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexConfig.h#2 $ + */ + +#ifndef INDEX_CONFIG_H +#define INDEX_CONFIG_H 1 + +#include "config.h" +#include "geometry.h" + +/** + * A set of configuration parameters for the indexer. + **/ +struct configuration { + /* Parameters for the volume */ + + /* The volume layout */ + Geometry *geometry; + + /* Size of the page cache and sparse chapter index cache, in chapters */ + unsigned int cacheChapters; + + /** Parameters for the master index */ + + /* The mean delta for the master index */ + unsigned int masterIndexMeanDelta; + + /* Sampling rate for sparse indexing */ + unsigned int sparseSampleRate; +}; + +#endif /* INDEX_CONFIG_H */ diff --git a/source/uds/indexInternals.c b/source/uds/indexInternals.c new file mode 100644 index 0000000..48268c7 --- /dev/null +++ b/source/uds/indexInternals.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexInternals.c#7 $ + */ + +#include "indexInternals.h" + +#include "errors.h" +#include "indexCheckpoint.h" +#include "indexStateData.h" +#include "indexZone.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "openChapter.h" +#include "request.h" +#include "stringUtils.h" +#include "threads.h" +#include "typeDefs.h" +#include "volume.h" +#include "zone.h" + +static const unsigned int MAX_COMPONENT_COUNT = 4; + +/**********************************************************************/ +int allocateIndex(IndexLayout *layout, + const Configuration *config, + const struct uds_parameters *userParams, + unsigned int zoneCount, + LoadType loadType, + Index **newIndex) +{ + unsigned int checkpoint_frequency + = userParams == NULL ? 0 : userParams->checkpoint_frequency; + if (checkpoint_frequency >= config->geometry->chaptersPerVolume) { + return UDS_BAD_CHECKPOINT_FREQUENCY; + } + + Index *index; + int result = ALLOCATE(1, Index, "index", &index); + if (result != UDS_SUCCESS) { + return result; + } + + index->existed = (loadType != LOAD_CREATE); + index->hasSavedOpenChapter = true; + index->loadedType = LOAD_UNDEFINED; + + result = makeIndexCheckpoint(index); + if (result != UDS_SUCCESS) { + freeIndex(index); + return result; + } + setIndexCheckpointFrequency(index->checkpoint, checkpoint_frequency); + + getIndexLayout(layout, &index->layout); + index->zoneCount = zoneCount; + + result = ALLOCATE(index->zoneCount, IndexZone *, "zones", + &index->zones); + if (result != UDS_SUCCESS) { + freeIndex(index); + return result; + } + + result = makeIndexState(layout, index->zoneCount, MAX_COMPONENT_COUNT, + &index->state); + if (result != UDS_SUCCESS) { + freeIndex(index); + return result; + } + + result = addIndexStateComponent(index->state, &INDEX_STATE_INFO, index, + NULL); + if (result != UDS_SUCCESS) { + freeIndex(index); + return result; + } + + result = makeVolume(config, index->layout, userParams, + VOLUME_CACHE_DEFAULT_MAX_QUEUED_READS, index->zoneCount, + &index->volume); + if (result != UDS_SUCCESS) { + freeIndex(index); + return result; + } + index->volume->lookupMode = LOOKUP_NORMAL; + + unsigned int i; + for (i = 0; i < index->zoneCount; i++) { + result = makeIndexZone(index, i); + if (result != UDS_SUCCESS) { + freeIndex(index); + return logErrorWithStringError(result, "Could not create index zone"); + } + } + + result = addIndexStateComponent(index->state, &OPEN_CHAPTER_INFO, index, + NULL); + if (result != UDS_SUCCESS) { + freeIndex(index); + return logErrorWithStringError(result, "Could not create open chapter"); + } + + *newIndex = index; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void releaseIndex(Index *index) +{ + if (index == NULL) { + return; + } + + if (index->zones != NULL) { + unsigned int i; + for (i = 0; i < index->zoneCount; i++) { + freeIndexZone(index->zones[i]); + } + FREE(index->zones); + } + + freeVolume(index->volume); + + freeIndexState(&index->state); + freeIndexCheckpoint(index->checkpoint); + putIndexLayout(&index->layout); + FREE(index); +} diff --git a/source/uds/indexInternals.h b/source/uds/indexInternals.h new file mode 100644 index 0000000..16cb56a --- /dev/null +++ b/source/uds/indexInternals.h @@ -0,0 +1,58 @@ +/* + * 
Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexInternals.h#3 $ + */ + +#ifndef INDEX_INTERNALS_H +#define INDEX_INTERNALS_H + +#include "index.h" +#include "loadType.h" +#include "request.h" + +/** + * Construct a new index from the given configuration. + * + * @param layout The index layout to use + * @param config The configuration to use + * @param userParams The index session parameters. If NULL, the default + * session parameters will be used. + * @param zoneCount The number of zones for this index to use + * @param loadType How to create the index: it can be create only, allow + * loading from files, and allow rebuilding from the volume + * @param newIndex A pointer to hold a pointer to the new index + * + * @return UDS_SUCCESS or an error code + **/ +int allocateIndex(IndexLayout *layout, + const Configuration *config, + const struct uds_parameters *userParams, + unsigned int zoneCount, + LoadType loadType, + Index **newIndex) + __attribute__((warn_unused_result)); + +/** + * Clean up the index and its memory. + * + * @param index The index to destroy. + **/ +void releaseIndex(Index *index); + +#endif /* INDEX_INTERNALS_H */ diff --git a/source/uds/indexLayout.c b/source/uds/indexLayout.c new file mode 100644 index 0000000..cb019ff --- /dev/null +++ b/source/uds/indexLayout.c @@ -0,0 +1,2409 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexLayout.c#19 $ + */ + +#include "indexLayout.h" + +#include "buffer.h" +#include "compiler.h" +#include "config.h" +#include "indexConfig.h" +#include "layoutRegion.h" +#include "logger.h" +#include "masterIndexOps.h" +#include "memoryAlloc.h" +#include "nonce.h" +#include "openChapter.h" + +/* + * Overall layout of an index on disk: + * + * The layout is divided into a number of fixed-size regions, the sizes of + * which are computed when the index is created. Every header and region + * begins on 4K block boundary. Save regions are further sub-divided into + * regions of their own. 
+ * + * Each region has a kind and an instance number. Some kinds only have one + * instance and therefore use RL_SOLE_INSTANCE (-1) as the instance number. + * The RL_KIND_INDEX uses instances to represent sub-indices, where used. + * A save region can either hold a checkpoint or a clean shutdown (determined + * by the type). The instances determine which available save slot is used. + * The RL_KIND_MASTER_INDEX uses instances to record which zone is being saved. + * + * +-+-+--------+--------+--------+-----+--- -+-+ + * | | | I N D E X 0 101, 0 | ... | | + * |H|C+--------+--------+--------+-----+--- -+S| + * |D|f| Volume | Save | Save | | |e| + * |R|g| Region | Region | Region | ... | ... |a| + * | | | 201 -1 | 202 0 | 202 1 | | |l| + * +-+-+--------+--------+--------+-----+--- -+-+ + * + * The header contains the encoded regional layout table as well as + * the saved index configuration record. The sub-index regions and their + * subdivisions are maintained in the same table. + * + * There are at least two save regions per sub-index to preserve the old + * state should the saving of a state be incomplete. They are used in + * a round-robin fashion. + * + * Anatomy of a save region: + * + * +-+-----+------+------+-----+ -+-----+ + * |H| IPM | MI | MI | | | OC | + * |D| | zone | zone | ... | | | + * |R| 301 | 302 | 302 | | | 303 | + * | | -1 | 0 | 1 | | | -1 | + * +-+-----+------+------+-----+ -+-----+ + * + * Every region header has a type (and version). In save regions, + * the open chapter only appears in RL_TYPE_SAVE not RL_TYPE_CHECKPOINT, + * although the same space is reserved for both. + * + * The header contains the encoded regional layout table as well as the + * index state record for that save or checkpoint. Each save or checkpoint + * has a unique generation number and nonce which is used to seed the + * checksums of those regions. + */ + +typedef struct indexSaveData_v1 { + uint64_t timestamp; // ms since epoch... + uint64_t nonce; + uint32_t version; // 1 + uint32_t unused__; +} IndexSaveData; + +typedef struct indexSaveLayout { + LayoutRegion indexSave; + LayoutRegion header; + unsigned int numZones; + LayoutRegion indexPageMap; + LayoutRegion freeSpace; + LayoutRegion *masterIndexZones; + LayoutRegion *openChapter; + IndexSaveType saveType; + IndexSaveData saveData; + Buffer *indexStateBuffer; + bool read; + bool written; +} IndexSaveLayout; + +typedef struct subIndexLayout { + LayoutRegion subIndex; + uint64_t nonce; + LayoutRegion volume; + IndexSaveLayout *saves; +} SubIndexLayout; + +typedef struct superBlockData_v1 { + byte magicLabel[32]; + byte nonceInfo[32]; + uint64_t nonce; + uint32_t version; // 2 + uint32_t blockSize; // for verification + uint16_t numIndexes; // 1 + uint16_t maxSaves; + uint64_t openChapterBlocks; + uint64_t pageMapBlocks; +} SuperBlockData; + +struct indexLayout { + IOFactory *factory; + off_t offset; + struct index_version indexVersion; + SuperBlockData super; + LayoutRegion header; + LayoutRegion config; + SubIndexLayout index; + LayoutRegion seal; + uint64_t totalBlocks; + int refCount; +}; + +/** + * Structure used to compute single file layout sizes. + * + * Note that the masterIndexBlocks represent all zones and are sized for + * the maximum number of blocks that would be needed regardless of the number + * of zones (up to the maximum value) that are used at run time. + * + * Similarly, the number of saves is sized for the minimum safe value + * assuming checkpointing is enabled, since that is also a run-time parameter. 
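+ *
+ * For reference, computeSizes() below derives these totals as:
+ *
+ *   numSaves       = 2 + numCheckpoints
+ *   saveBlocks     = 1 + masterIndexBlocks + pageMapBlocks + openChapterBlocks
+ *   subIndexBlocks = volumeBlocks + numSaves * saveBlocks
+ *   totalBlocks    = 3 + subIndexBlocks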
+ **/ +typedef struct saveLayoutSizes { + Configuration config; // this is a captive copy + Geometry geometry; // this is a captive copy + unsigned int numSaves; // per sub-index + size_t blockSize; // in bytes + uint64_t volumeBlocks; // per sub-index + uint64_t masterIndexBlocks; // per save + uint64_t pageMapBlocks; // per save + uint64_t openChapterBlocks; // per save + uint64_t saveBlocks; // per sub-index + uint64_t subIndexBlocks; // per sub-index + uint64_t totalBlocks; // for whole layout +} SaveLayoutSizes; + +enum { + INDEX_STATE_BUFFER_SIZE = 512, + MAX_SAVES = 5, +}; + +static const byte SINGLE_FILE_MAGIC_1[32] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*"; +enum { + SINGLE_FILE_MAGIC_1_LENGTH = sizeof(SINGLE_FILE_MAGIC_1), +}; + +static int reconstituteSingleFileLayout(IndexLayout *layout, + SuperBlockData *super, + RegionTable *table, + uint64_t firstBlock) + __attribute__((warn_unused_result)); +static int writeIndexSaveLayout(IndexLayout *layout, IndexSaveLayout *isl) + __attribute__((warn_unused_result)); + +/*****************************************************************************/ +static INLINE uint64_t blockCount(uint64_t bytes, uint32_t blockSize) +{ + uint64_t blocks = bytes / blockSize; + if (bytes % blockSize > 0) { + ++blocks; + } + return blocks; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int computeSizes(SaveLayoutSizes *sls, + const UdsConfiguration config, + size_t blockSize, + unsigned int numCheckpoints) +{ + if (config->bytesPerPage % blockSize != 0) { + return logErrorWithStringError(UDS_INCORRECT_ALIGNMENT, + "page size not a multiple of block size"); + } + + Configuration *cfg = NULL; + int result = makeConfiguration(config, &cfg); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot compute layout size"); + } + + memset(sls, 0, sizeof(*sls)); + + // internalize the configuration and geometry... 
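+  // The copies are captive: sls->config.geometry is re-pointed at the
+  // embedded geometry copy below, so the temporary Configuration built
+  // above can be freed right away.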
+ + sls->geometry = *cfg->geometry; + sls->config = *cfg; + sls->config.geometry = &sls->geometry; + + freeConfiguration(cfg); + + sls->numSaves = 2 + numCheckpoints; + sls->blockSize = blockSize; + sls->volumeBlocks = sls->geometry.bytesPerVolume / blockSize; + + result = computeMasterIndexSaveBlocks(&sls->config, blockSize, + &sls->masterIndexBlocks); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot compute index save size"); + } + + sls->pageMapBlocks = + blockCount(computeIndexPageMapSaveSize(&sls->geometry), blockSize); + sls->openChapterBlocks = + blockCount(computeSavedOpenChapterSize(&sls->geometry), blockSize); + sls->saveBlocks = 1 + (sls->masterIndexBlocks + + sls->pageMapBlocks + sls->openChapterBlocks); + sls->subIndexBlocks = sls->volumeBlocks + (sls->numSaves * sls->saveBlocks); + sls->totalBlocks = 3 + sls->subIndexBlocks; + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int udsComputeIndexSize(const UdsConfiguration config, + unsigned int numCheckpoints, + uint64_t *indexSize) +{ + SaveLayoutSizes sizes; + int result = computeSizes(&sizes, config, UDS_BLOCK_SIZE, numCheckpoints); + if (result != UDS_SUCCESS) { + return result; + } + + if (indexSize != NULL) { + *indexSize = sizes.totalBlocks * sizes.blockSize; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int openLayoutReader(IndexLayout *layout, + LayoutRegion *lr, + BufferedReader **readerPtr) +{ + off_t start = lr->startBlock * layout->super.blockSize; + size_t size = lr->numBlocks * layout->super.blockSize; + return openBufferedReader(layout->factory, start, size, readerPtr); +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int openLayoutWriter(IndexLayout *layout, + LayoutRegion *lr, + BufferedWriter **writerPtr) +{ + off_t start = lr->startBlock * layout->super.blockSize; + size_t size = lr->numBlocks * layout->super.blockSize; + return openBufferedWriter(layout->factory, start, size, writerPtr); +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int decodeIndexSaveData(Buffer *buffer, IndexSaveData *saveData) +{ + int result = getUInt64LEFromBuffer(buffer, &saveData->timestamp); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &saveData->nonce); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &saveData->version); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &saveData->unused__); + if (result != UDS_SUCCESS) { + return result; + } + // The unused padding has to be zeroed for correct nonce calculation + if (saveData->unused__ != 0) { + return UDS_CORRUPT_COMPONENT; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, + "%zu bytes decoded of %zu expected", + bufferLength(buffer), sizeof(*saveData)); + if (result != UDS_SUCCESS) { + return UDS_CORRUPT_COMPONENT; + } + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int decodeRegionHeader(Buffer *buffer, RegionHeader *header) +{ + int result = getUInt64LEFromBuffer(buffer, &header->magic); + if (result != UDS_SUCCESS) { + return result; + } + result = 
getUInt64LEFromBuffer(buffer, &header->regionBlocks); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEFromBuffer(buffer, &header->type); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEFromBuffer(buffer, &header->version); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEFromBuffer(buffer, &header->numRegions); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEFromBuffer(buffer, &header->payload); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, + "%zu bytes decoded of %zu expected", + bufferLength(buffer), sizeof(*header)); + if (result != UDS_SUCCESS) { + return UDS_CORRUPT_COMPONENT; + } + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int decodeLayoutRegion(Buffer *buffer, LayoutRegion *region) +{ + size_t cl1 = contentLength(buffer); + + int result = getUInt64LEFromBuffer(buffer, ®ion->startBlock); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, ®ion->numBlocks); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, ®ion->checksum); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEFromBuffer(buffer, ®ion->kind); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEFromBuffer(buffer, ®ion->instance); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(cl1 - contentLength(buffer) == sizeof(*region), + "%zu bytes decoded, of %zu expected", + cl1 - contentLength(buffer), sizeof(*region)); + if (result != UDS_SUCCESS) { + return UDS_CORRUPT_COMPONENT; + } + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int loadRegionTable(BufferedReader *reader, RegionTable **tablePtr) +{ + Buffer *buffer; + int result = makeBuffer(sizeof(RegionHeader), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(reader, getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return logErrorWithStringError(result, "cannot read region table header"); + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + RegionHeader header; + result = decodeRegionHeader(buffer, &header); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + if (header.magic != REGION_MAGIC) { + return UDS_NO_INDEX; + } + if (header.version != 1) { + return logErrorWithStringError(UDS_UNSUPPORTED_VERSION, + "unknown region table version %" PRIu16, + header.version); + } + + RegionTable *table; + result = ALLOCATE_EXTENDED(RegionTable, header.numRegions, LayoutRegion, + "single file layout region table", &table); + if (result != UDS_SUCCESS) { + return result; + } + + table->header = header; + result = makeBuffer(header.numRegions * sizeof(LayoutRegion), &buffer); + if (result != UDS_SUCCESS) { + FREE(table); + return result; + } + result = readFromBufferedReader(reader, getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + FREE(table); + freeBuffer(&buffer); + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "cannot read region table layouts"); + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if 
(result != UDS_SUCCESS) { + FREE(table); + freeBuffer(&buffer); + return result; + } + unsigned int i; + for (i = 0; i < header.numRegions; i++){ + result = decodeLayoutRegion(buffer, &table->regions[i]); + if (result != UDS_SUCCESS) { + FREE(table); + freeBuffer(&buffer); + return result; + } + } + freeBuffer(&buffer); + *tablePtr = table; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int decodeSuperBlockData(Buffer *buffer, SuperBlockData *super) +{ + int result = getBytesFromBuffer(buffer, 32, super->magicLabel); + if (result != UDS_SUCCESS) { + return result; + } + result = getBytesFromBuffer(buffer, 32, super->nonceInfo); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &super->nonce); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &super->version); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &super->blockSize); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEFromBuffer(buffer, &super->numIndexes); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEFromBuffer(buffer, &super->maxSaves); + if (result != UDS_SUCCESS) { + return result; + } + result = skipForward(buffer, 4); // aligment + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &super->openChapterBlocks); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &super->pageMapBlocks); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, + "%zu bytes decoded of %zu expected", + bufferLength(buffer), sizeof(*super)); + if (result != UDS_SUCCESS) { + return UDS_CORRUPT_COMPONENT; + } + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int readSuperBlockData(BufferedReader *reader, + SuperBlockData *super, + size_t savedSize) +{ + if (savedSize != sizeof(SuperBlockData)) { + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "unexpected super block data size %zu", + savedSize); + } + + if (sizeof(super->magicLabel) != SINGLE_FILE_MAGIC_1_LENGTH) { + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "super block magic label size incorrect"); + } + + Buffer *buffer; + int result = makeBuffer(savedSize, &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(reader, getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return logErrorWithStringError(result, "cannot read region table header"); + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = decodeSuperBlockData(buffer, super); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot read super block data"); + } + + if (memcmp(super->magicLabel, SINGLE_FILE_MAGIC_1, + SINGLE_FILE_MAGIC_1_LENGTH) != 0) { + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "unknown superblock magic label"); + } + + if ((super->version < SUPER_VERSION_MINIMUM) + || (super->version > SUPER_VERSION_MAXIMUM)) { + return logErrorWithStringError(UDS_UNSUPPORTED_VERSION, + "unknown superblock version number %" + PRIu32, + super->version); + } + + // 
We dropped the usage of multiple subindices before we ever ran UDS code in + // the kernel. We do not have code that will handle multiple subindices. + if (super->numIndexes != 1) { + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "invalid subindex count %" PRIu32, + super->numIndexes); + } + + if (generateMasterNonce(super->nonceInfo, sizeof(super->nonceInfo)) != + super->nonce) + { + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "inconsistent superblock nonce"); + } + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int allocateSingleFileParts(IndexLayout *layout, + SuperBlockData *super) +{ + int result = ALLOCATE(super->maxSaves, IndexSaveLayout, __func__, + &layout->index.saves); + if (result != UDS_SUCCESS) { + return result; + } + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int loadSuperBlock(IndexLayout *layout, + size_t blockSize, + uint64_t firstBlock, + BufferedReader *reader) +{ + RegionTable *table = NULL; + int result = loadRegionTable(reader, &table); + if (result != UDS_SUCCESS) { + return result; + } + + if (table->header.type != RH_TYPE_SUPER) { + FREE(table); + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "not a superblock region table"); + } + + SuperBlockData superBlockData; + result = readSuperBlockData(reader, &superBlockData, table->header.payload); + if (result != UDS_SUCCESS) { + FREE(table); + return logErrorWithStringError(result, "unknown superblock format"); + } + + if (superBlockData.blockSize != blockSize) { + FREE(table); + return logErrorWithStringError(UDS_WRONG_INDEX_CONFIG, + "superblock saved blockSize %" PRIu32 + " differs from supplied blockSize %zu", + superBlockData.blockSize, blockSize); + } + initializeIndexVersion(&layout->indexVersion, superBlockData.version); + + result = allocateSingleFileParts(layout, &superBlockData); + if (result != UDS_SUCCESS) { + FREE(table); + return result; + } + + result = reconstituteSingleFileLayout(layout, &superBlockData, table, + firstBlock); + FREE(table); + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int readIndexSaveData(BufferedReader *reader, + IndexSaveData *saveData, + size_t savedSize, + Buffer **bufferPtr) +{ + int result = UDS_SUCCESS; + if (savedSize == 0) { + memset(saveData, 0, sizeof(*saveData)); + } else { + if (savedSize < sizeof(IndexSaveData)) { + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "unexpected index save data size %zu", + savedSize); + } + + Buffer *buffer; + result = makeBuffer(sizeof(*saveData), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(reader, getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return logErrorWithStringError(result, "cannot read index save data"); + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + + result = decodeIndexSaveData(buffer, saveData); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + + savedSize -= sizeof(IndexSaveData); + + if (saveData->version > 1) { + return logErrorWithStringError(UDS_UNSUPPORTED_VERSION, + "unkown index save verion number %" + PRIu32, + saveData->version); + } + 
+ if (savedSize > INDEX_STATE_BUFFER_SIZE) { + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "unexpected index state buffer size %zu", + savedSize); + } + } + + Buffer *buffer = NULL; + + if (saveData->version != 0) { + result = makeBuffer(INDEX_STATE_BUFFER_SIZE, &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + if (savedSize > 0) { + result = readFromBufferedReader(reader, getBufferContents(buffer), + savedSize); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = resetBufferEnd(buffer, savedSize); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + } + } + + *bufferPtr = buffer; + return UDS_SUCCESS; +} + +/*****************************************************************************/ + +typedef struct { + LayoutRegion *nextRegion; + LayoutRegion *lastRegion; + uint64_t nextBlock; + int result; +} RegionIterator; + +/*****************************************************************************/ +__attribute__((format(printf, 2, 3))) +static void iterError(RegionIterator *iter, const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + int r = vLogWithStringError(LOG_ERR, UDS_UNEXPECTED_RESULT, fmt, args); + va_end(args); + if (iter->result == UDS_SUCCESS) { + iter->result = r; + } +} + +/** + * Set the next layout region in the layout according to a region table + * iterator, unless the iterator already contains an error + * + * @param expect whether to record an error or return false + * @param lr the layout region field to set + * @param iter the region iterator, which also holds the cumulative + * result + * @param numBlocks if non-zero, the expected number of blocks + * @param kind the expected kind of the region + * @param instance the expected instance number of the region + * + * @return true if we meet expectations, false if we do not + **/ +static bool expectLayout(bool expect, + LayoutRegion *lr, + RegionIterator *iter, + uint64_t numBlocks, + RegionKind kind, + unsigned int instance) +{ + if (iter->result != UDS_SUCCESS) { + return false; + } + + if (iter->nextRegion == iter->lastRegion) { + if (expect) { + iterError(iter, "ran out of layout regions in region table"); + } + return false; + } + + if (iter->nextRegion->startBlock != iter->nextBlock) { + iterError(iter, "layout region not at expected offset"); + return false; + } + + if (iter->nextRegion->kind != kind) { + if (expect) { + iterError(iter, "layout region has incorrect kind"); + } + return false; + } + + if (iter->nextRegion->instance != instance) { + iterError(iter, "layout region has incorrect instance"); + return false; + } + + if (numBlocks > 0 && iter->nextRegion->numBlocks != numBlocks) { + iterError(iter, "layout region size is incorrect"); + return false; + } + + if (lr != NULL) { + *lr = *iter->nextRegion; + } + + iter->nextBlock += iter->nextRegion->numBlocks; + iter->nextRegion++; + return true; +} + +/*****************************************************************************/ +static void setupLayout(LayoutRegion *lr, + uint64_t *nextAddrPtr, + uint64_t regionSize, + unsigned int kind, + unsigned int instance) +{ + *lr = (LayoutRegion) { + .startBlock = *nextAddrPtr, + .numBlocks = regionSize, + .checksum = 0, + .kind = kind, + .instance = instance, + }; + *nextAddrPtr += regionSize; +} + +/*****************************************************************************/ +static void populateIndexSaveLayout(IndexSaveLayout *isl, + SuperBlockData *super, + unsigned int numZones, + IndexSaveType saveType) +{ + 
uint64_t nextBlock = isl->indexSave.startBlock; + + setupLayout(&isl->header, &nextBlock, 1, RL_KIND_HEADER, RL_SOLE_INSTANCE); + setupLayout(&isl->indexPageMap, &nextBlock, super->pageMapBlocks, + RL_KIND_INDEX_PAGE_MAP, RL_SOLE_INSTANCE); + + uint64_t blocksAvail = (isl->indexSave.numBlocks - + (nextBlock - isl->indexSave.startBlock) - + super->openChapterBlocks); + + if (numZones > 0) { + uint64_t miBlockCount = blocksAvail / numZones; + unsigned int z; + for (z = 0; z < numZones; ++z) { + LayoutRegion *miz = &isl->masterIndexZones[z]; + setupLayout(miz, &nextBlock, miBlockCount, RL_KIND_MASTER_INDEX, z); + } + } + if (saveType == IS_SAVE && isl->openChapter != NULL) { + setupLayout(isl->openChapter, &nextBlock, super->openChapterBlocks, + RL_KIND_OPEN_CHAPTER, RL_SOLE_INSTANCE); + } + setupLayout(&isl->freeSpace, &nextBlock, + (isl->indexSave.numBlocks - + (nextBlock - isl->indexSave.startBlock)), + RL_KIND_SCRATCH, RL_SOLE_INSTANCE); +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int reconstructIndexSave(IndexSaveLayout *isl, + IndexSaveData *saveData, + SuperBlockData *super, + RegionTable *table) +{ + isl->numZones = 0; + isl->saveData = *saveData; + isl->read = false; + isl->written = false; + + if (table->header.type == RH_TYPE_SAVE) { + isl->saveType = IS_SAVE; + } else if (table->header.type == RH_TYPE_CHECKPOINT) { + isl->saveType = IS_CHECKPOINT; + } else { + isl->saveType = NO_SAVE; + } + + if ((table->header.numRegions == 0) || + ((table->header.numRegions == 1) && + (table->regions[0].kind == RL_KIND_SCRATCH))) + { + populateIndexSaveLayout(isl, super, 0, NO_SAVE); + return UDS_SUCCESS; + } + + RegionIterator iter = { + .nextRegion = table->regions, + .lastRegion = table->regions + table->header.numRegions, + .nextBlock = isl->indexSave.startBlock, + .result = UDS_SUCCESS, + }; + + expectLayout(true, &isl->header, &iter, 1, RL_KIND_HEADER, RL_SOLE_INSTANCE); + expectLayout(true, &isl->indexPageMap, &iter, 0, + RL_KIND_INDEX_PAGE_MAP, RL_SOLE_INSTANCE); + unsigned int n = 0; + RegionIterator tmpIter; + for (tmpIter = iter; + expectLayout(false, NULL, &tmpIter, 0, RL_KIND_MASTER_INDEX, n); + ++n) + ; + isl->numZones = n; + + int result = UDS_SUCCESS; + + if (isl->numZones > 0) { + result = ALLOCATE(n, LayoutRegion, "master index layout regions", + &isl->masterIndexZones); + if (result != UDS_SUCCESS) { + return result; + } + } + + if (isl->saveType == IS_SAVE) { + result = ALLOCATE(1, LayoutRegion, "open chapter layout region", + &isl->openChapter); + if (result != UDS_SUCCESS) { + FREE(isl->masterIndexZones); + return result; + } + } + + unsigned int z; + for (z = 0; z < isl->numZones; ++z) { + expectLayout(true, &isl->masterIndexZones[z], &iter, 0, + RL_KIND_MASTER_INDEX, z); + } + if (isl->saveType == IS_SAVE) { + expectLayout(true, isl->openChapter, &iter, 0, + RL_KIND_OPEN_CHAPTER, RL_SOLE_INSTANCE); + } + if (!expectLayout(false, &isl->freeSpace, &iter, 0, + RL_KIND_SCRATCH, RL_SOLE_INSTANCE)) + { + isl->freeSpace = (LayoutRegion) { + .startBlock = iter.nextBlock, + .numBlocks = (isl->indexSave.startBlock + + isl->indexSave.numBlocks) - iter.nextBlock, + .checksum = 0, + .kind = RL_KIND_SCRATCH, + .instance = RL_SOLE_INSTANCE, + }; + iter.nextBlock = isl->freeSpace.startBlock + isl->freeSpace.numBlocks; + } + + if (iter.result != UDS_SUCCESS) { + return iter.result; + } + if (iter.nextRegion != iter.lastRegion) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "expected %ld 
additional regions", + iter.lastRegion - iter.nextRegion); + } + if (iter.nextBlock != isl->indexSave.startBlock + isl->indexSave.numBlocks) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "index save layout table incomplete"); + } + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int loadIndexSave(IndexSaveLayout *isl, + SuperBlockData *super, + BufferedReader *reader, + unsigned int saveId) +{ + RegionTable *table = NULL; + int result = loadRegionTable(reader, &table); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "cannot read index 0 save %u header", + saveId); + } + + if (table->header.regionBlocks != isl->indexSave.numBlocks) { + uint64_t regionBlocks = table->header.regionBlocks; + FREE(table); + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "unexpected index 0 save %u " + "region block count %llu", + saveId, regionBlocks); + } + + if (table->header.type != RH_TYPE_SAVE && + table->header.type != RH_TYPE_CHECKPOINT && + table->header.type != RH_TYPE_UNSAVED) + { + unsigned int type = table->header.type; + FREE(table); + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, "unexpected" + " index 0 save %u header type %u", + saveId, type); + } + + IndexSaveData indexSaveData; + result = readIndexSaveData(reader, &indexSaveData, table->header.payload, + &isl->indexStateBuffer); + if (result != UDS_SUCCESS) { + FREE(table); + return logErrorWithStringError(result, + "unknown index 0 save %u data format", + saveId); + } + + result = reconstructIndexSave(isl, &indexSaveData, super, table); + FREE(table); + + if (result != UDS_SUCCESS) { + freeBuffer(&isl->indexStateBuffer); + return logErrorWithStringError(result, + "cannot reconstruct index 0 save %u", + saveId); + } + isl->read = true; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int loadSubIndexRegions(IndexLayout *layout) +{ + unsigned int j; + for (j = 0; j < layout->super.maxSaves; ++j) { + IndexSaveLayout *isl = &layout->index.saves[j]; + + BufferedReader *reader; + int result = openLayoutReader(layout, &isl->indexSave, &reader); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "cannot get reader for index 0 save %u", + j); + while (j-- > 0) { + IndexSaveLayout *isl = &layout->index.saves[j]; + FREE(isl->masterIndexZones); + FREE(isl->openChapter); + freeBuffer(&isl->indexStateBuffer); + } + return result; + } + + result = loadIndexSave(isl, &layout->super, reader, j); + freeBufferedReader(reader); + if (result != UDS_SUCCESS) { + while (j-- > 0) { + IndexSaveLayout *isl = &layout->index.saves[j]; + FREE(isl->masterIndexZones); + FREE(isl->openChapter); + freeBuffer(&isl->indexStateBuffer); + } + return result; + } + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static int loadIndexLayout(IndexLayout *layout) +{ + BufferedReader *reader; + int result = openBufferedReader(layout->factory, layout->offset, + UDS_BLOCK_SIZE, &reader); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "unable to read superblock"); + } + + result = loadSuperBlock(layout, UDS_BLOCK_SIZE, + layout->offset / UDS_BLOCK_SIZE, reader); + freeBufferedReader(reader); + if (result != UDS_SUCCESS) { + FREE(layout->index.saves); + layout->index.saves = NULL; + return result; + } + + result = 
loadSubIndexRegions(layout); + if (result != UDS_SUCCESS) { + FREE(layout->index.saves); + layout->index.saves = NULL; + return result; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static void generateSuperBlockData(size_t blockSize, + unsigned int maxSaves, + uint64_t openChapterBlocks, + uint64_t pageMapBlocks, + SuperBlockData *super) +{ + memset(super, 0, sizeof(*super)); + memcpy(super->magicLabel, SINGLE_FILE_MAGIC_1, SINGLE_FILE_MAGIC_1_LENGTH); + createUniqueNonceData(super->nonceInfo, sizeof(super->nonceInfo)); + + super->nonce = generateMasterNonce(super->nonceInfo, + sizeof(super->nonceInfo)); + super->version = SUPER_VERSION_CURRENT; + super->blockSize = blockSize; + super->numIndexes = 1; + super->maxSaves = maxSaves; + super->openChapterBlocks = openChapterBlocks; + super->pageMapBlocks = pageMapBlocks; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int resetIndexSaveLayout(IndexSaveLayout *isl, + uint64_t *nextBlockPtr, + uint64_t saveBlocks, + uint64_t pageMapBlocks, + unsigned int instance) +{ + uint64_t startBlock = *nextBlockPtr; + + if (isl->masterIndexZones) { + FREE(isl->masterIndexZones); + } + if (isl->openChapter) { + FREE(isl->openChapter); + } + if (isl->indexStateBuffer) { + freeBuffer(&isl->indexStateBuffer); + } + memset(isl, 0, sizeof(*isl)); + isl->saveType = NO_SAVE; + setupLayout(&isl->indexSave, &startBlock, saveBlocks, RL_KIND_SAVE, + instance); + setupLayout(&isl->header, nextBlockPtr, 1, RL_KIND_HEADER, + RL_SOLE_INSTANCE); + setupLayout(&isl->indexPageMap, nextBlockPtr, pageMapBlocks, + RL_KIND_INDEX_PAGE_MAP, RL_SOLE_INSTANCE); + uint64_t remaining = startBlock - *nextBlockPtr; + setupLayout(&isl->freeSpace, nextBlockPtr, remaining, RL_KIND_SCRATCH, + RL_SOLE_INSTANCE); + // number of zones is a save-time parameter + // presence of open chapter is a save-time parameter + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static void defineSubIndexNonce(SubIndexLayout *sil, + uint64_t masterNonce, + unsigned int indexId) +{ + struct subIndexNonceData { + uint64_t offset; + uint16_t indexId; + }; + byte buffer[sizeof(struct subIndexNonceData)] = { 0 }; + size_t offset = 0; + encodeUInt64LE(buffer, &offset, sil->subIndex.startBlock); + encodeUInt16LE(buffer, &offset, indexId); + sil->nonce = generateSecondaryNonce(masterNonce, buffer, sizeof(buffer)); + if (sil->nonce == 0) { + sil->nonce = generateSecondaryNonce(~masterNonce + 1, + buffer, sizeof(buffer)); + } +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int setupSubIndex(SubIndexLayout *sil, + uint64_t *nextBlockPtr, + SaveLayoutSizes *sls, + unsigned int instance, + uint64_t masterNonce) +{ + uint64_t startBlock = *nextBlockPtr; + + setupLayout(&sil->subIndex, &startBlock, sls->subIndexBlocks, + RL_KIND_INDEX, instance); + setupLayout(&sil->volume, nextBlockPtr, sls->volumeBlocks, + RL_KIND_VOLUME, RL_SOLE_INSTANCE); + unsigned int i; + for (i = 0; i < sls->numSaves; ++i) { + int result = resetIndexSaveLayout(&sil->saves[i], nextBlockPtr, + sls->saveBlocks, sls->pageMapBlocks, i); + if (result != UDS_SUCCESS) { + return result; + } + } + + if (startBlock != *nextBlockPtr) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "sub index layout regions don't agree"); + } + + defineSubIndexNonce(sil, 
masterNonce, instance); + return UDS_SUCCESS; +} + +/*****************************************************************************/ +/** + * Initialize a single file layout using the save layout sizes specified. + * + * @param layout the layout to initialize + * @param offset the offset in bytes from the start of the backing storage + * @param size the size in bytes of the backing storage + * @param sls a populated SaveLayoutSizes object + * + * @return UDS_SUCCESS or an error code, potentially + * UDS_INSUFFICIENT_INDEX_SPACE if the size of the backing store + * is not sufficient for the index configuration, + * UDS_BAD_INDEX_ALIGNMENT if the offset specified does not + * align properly with the index block and page sizes] + * various other errors + **/ +__attribute__((warn_unused_result)) +static int initSingleFileLayout(IndexLayout *layout, + uint64_t offset, + uint64_t size, + SaveLayoutSizes *sls) +{ + layout->totalBlocks = sls->totalBlocks; + + if (size < sls->totalBlocks * sls->blockSize) { + return logErrorWithStringError(UDS_INSUFFICIENT_INDEX_SPACE, + "not enough space for index as configured"); + } + + generateSuperBlockData(sls->blockSize, sls->numSaves, sls->openChapterBlocks, + sls->pageMapBlocks, &layout->super); + initializeIndexVersion(&layout->indexVersion, SUPER_VERSION_CURRENT); + + int result = allocateSingleFileParts(layout, &layout->super); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t nextBlock = offset / sls->blockSize; + + setupLayout(&layout->header, &nextBlock, 1, RL_KIND_HEADER, + RL_SOLE_INSTANCE); + setupLayout(&layout->config, &nextBlock, 1, RL_KIND_CONFIG, + RL_SOLE_INSTANCE); + result = setupSubIndex(&layout->index, &nextBlock, sls, 0, + layout->super.nonce); + if (result != UDS_SUCCESS) { + return result; + } + setupLayout(&layout->seal, &nextBlock, 1, RL_KIND_SEAL, RL_SOLE_INSTANCE); + if (nextBlock * sls->blockSize > offset + size) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "layout does not fit as expected"); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static void expectSubIndex(SubIndexLayout *sil, + RegionIterator *iter, + SuperBlockData *super, + unsigned int instance) +{ + if (iter->result != UDS_SUCCESS) { + return; + } + + uint64_t startBlock = iter->nextBlock; + + expectLayout(true, &sil->subIndex, iter, 0, RL_KIND_INDEX, instance); + + uint64_t endBlock = iter->nextBlock; + iter->nextBlock = startBlock; + + expectLayout(true, &sil->volume, iter, 0, RL_KIND_VOLUME, RL_SOLE_INSTANCE); + + unsigned int i; + for (i = 0; i < super->maxSaves; ++i) { + IndexSaveLayout *isl = &sil->saves[i]; + expectLayout(true, &isl->indexSave, iter, 0, RL_KIND_SAVE, i); + } + + if (iter->nextBlock != endBlock) { + iterError(iter, "sub index region does not span all saves"); + } + + defineSubIndexNonce(sil, super->nonce, instance); +} + +/*****************************************************************************/ + +/** + * Initialize a single file layout from the region table and super block data + * stored in stable storage. 
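+ * The region table must describe, in order, the header, configuration,
+ * sub-index, and seal regions; any deviation is reported as an error.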
+ * + * @param layout the layout to initialize + * @param region the IO region for this layout + * @param super the super block data read from the superblock + * @param table the region table read from the superblock + * @param firstBlock the first block number in the region + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int reconstituteSingleFileLayout(IndexLayout *layout, + SuperBlockData *super, + RegionTable *table, + uint64_t firstBlock) +{ + layout->super = *super; + layout->totalBlocks = table->header.regionBlocks; + + RegionIterator iter = { + .nextRegion = table->regions, + .lastRegion = table->regions + table->header.numRegions, + .nextBlock = firstBlock, + .result = UDS_SUCCESS + }; + + expectLayout(true, &layout->header, &iter, 1, RL_KIND_HEADER, + RL_SOLE_INSTANCE); + expectLayout(true, &layout->config, &iter, 1, RL_KIND_CONFIG, + RL_SOLE_INSTANCE); + expectSubIndex(&layout->index, &iter, &layout->super, 0); + expectLayout(true, &layout->seal, &iter, 1, RL_KIND_SEAL, RL_SOLE_INSTANCE); + + if (iter.result != UDS_SUCCESS) { + return iter.result; + } + + if (iter.nextBlock != firstBlock + layout->totalBlocks) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "layout table does not span total blocks"); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int saveSubIndexRegions(IndexLayout *layout) +{ + SubIndexLayout *sil = &layout->index; + unsigned int j; + for (j = 0; j < layout->super.maxSaves; ++j) { + IndexSaveLayout *isl = &sil->saves[j]; + int result = writeIndexSaveLayout(layout, isl); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "unable to format index %u save 0 layout", + j); + } + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int makeSingleFileRegionTable(IndexLayout *layout, + unsigned int *numRegionsPtr, + RegionTable **tablePtr) +{ + unsigned int numRegions = + 1 + // header + 1 + // config + 1 + // index + 1 + // volume + layout->super.maxSaves + // saves + 1; // seal + + RegionTable *table; + int result = ALLOCATE_EXTENDED(RegionTable, numRegions, LayoutRegion, + "layout region table", &table); + if (result != UDS_SUCCESS) { + return result; + } + + LayoutRegion *lr = &table->regions[0]; + *lr++ = layout->header; + *lr++ = layout->config; + SubIndexLayout *sil = &layout->index; + *lr++ = sil->subIndex; + *lr++ = sil->volume; + unsigned int j; + for (j = 0; j < layout->super.maxSaves; ++j) { + *lr++ = sil->saves[j].indexSave; + } + *lr++ = layout->seal; + + result = ASSERT((lr == &table->regions[numRegions]), + "incorrect number of regions"); + if (result != UDS_SUCCESS) { + return result; + } + + *numRegionsPtr = numRegions; + *tablePtr = table; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int encodeIndexSaveData(Buffer *buffer, IndexSaveData *saveData) +{ + int result = putUInt64LEIntoBuffer(buffer, saveData->timestamp); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, saveData->nonce); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, saveData->version); + if (result != UDS_SUCCESS) { + return result; + } + result = zeroBytes(buffer, 4); /* padding */ + if (result != 
UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == sizeof *saveData, + "%zu bytes encoded of %zu expected", + contentLength(buffer), sizeof(*saveData)); + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int encodeRegionHeader(Buffer *buffer, RegionHeader *header) +{ + size_t startingLength = contentLength(buffer); + int result = putUInt64LEIntoBuffer(buffer, REGION_MAGIC); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, header->regionBlocks); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt16LEIntoBuffer(buffer, header->type); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt16LEIntoBuffer(buffer, header->version); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt16LEIntoBuffer(buffer, header->numRegions); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt16LEIntoBuffer(buffer, header->payload); + if (result != UDS_SUCCESS) { + return result; + } + result + = ASSERT_LOG_ONLY(contentLength(buffer) - startingLength == sizeof(*header), + "%zu bytes encoded, of %zu expected", + contentLength(buffer) - startingLength, sizeof(*header)); + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int encodeLayoutRegion(Buffer *buffer, LayoutRegion *region) +{ + size_t startingLength = contentLength(buffer); + int result = putUInt64LEIntoBuffer(buffer, region->startBlock); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, region->numBlocks); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, region->checksum); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt16LEIntoBuffer(buffer, region->kind); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt16LEIntoBuffer(buffer, region->instance); + if (result != UDS_SUCCESS) { + return result; + } + result + = ASSERT_LOG_ONLY(contentLength(buffer) - startingLength == sizeof(*region), + "%zu bytes encoded, of %zu expected", + contentLength(buffer) - startingLength, sizeof(*region)); + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int encodeSuperBlockData(Buffer *buffer, SuperBlockData *super) +{ + int result = putBytes(buffer, 32, &super->magicLabel); + if (result != UDS_SUCCESS) { + return result; + } + result = putBytes(buffer, 32, &super->nonceInfo); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, super->nonce); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, super->version); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, super->blockSize); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt16LEIntoBuffer(buffer, super->numIndexes); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt16LEIntoBuffer(buffer, super->maxSaves); + if (result != UDS_SUCCESS) { + return result; + } + result = zeroBytes(buffer, 4); // aligment + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, super->openChapterBlocks); + if (result != UDS_SUCCESS) { + return result; + } + result = 
putUInt64LEIntoBuffer(buffer, super->pageMapBlocks); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == sizeof(SuperBlockData), + "%zu bytes encoded, of %zu expected", + contentLength(buffer), sizeof(SuperBlockData)); + return result; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int writeSingleFileHeader(IndexLayout *layout, + RegionTable *table, + unsigned int numRegions, + BufferedWriter *writer) +{ + table->header = (RegionHeader) { + .magic = REGION_MAGIC, + .regionBlocks = layout->totalBlocks, + .type = RH_TYPE_SUPER, + .version = 1, + .numRegions = numRegions, + .payload = sizeof(layout->super), + }; + + size_t tableSize = sizeof(RegionTable) + numRegions * sizeof(LayoutRegion); + + Buffer *buffer; + int result = makeBuffer(tableSize, &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = encodeRegionHeader(buffer, &table->header); + + unsigned int i; + for (i = 0; i < numRegions; i++) { + if (result == UDS_SUCCESS) { + result = encodeLayoutRegion(buffer, &table->regions[i]); + } + } + + if (result == UDS_SUCCESS) { + result = writeToBufferedWriter(writer, getBufferContents(buffer), + contentLength(buffer)); + } + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = makeBuffer(sizeof(layout->super), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = encodeSuperBlockData(buffer, &layout->super); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + + result = writeToBufferedWriter(writer, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + return flushBufferedWriter(writer); +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int saveSingleFileConfiguration(IndexLayout *layout) +{ + int result = saveSubIndexRegions(layout); + if (result != UDS_SUCCESS) { + return result; + } + + RegionTable *table; + unsigned int numRegions; + result = makeSingleFileRegionTable(layout, &numRegions, &table); + if (result != UDS_SUCCESS) { + return result; + } + + BufferedWriter *writer = NULL; + result = openLayoutWriter(layout, &layout->header, &writer); + if (result != UDS_SUCCESS) { + FREE(table); + return result; + } + + result = writeSingleFileHeader(layout, table, numRegions, writer); + FREE(table); + freeBufferedWriter(writer); + + return result; +} + +/*****************************************************************************/ +void putIndexLayout(IndexLayout **layoutPtr) +{ + if (layoutPtr == NULL) { + return; + } + IndexLayout *layout = *layoutPtr; + *layoutPtr = NULL; + if ((layout == NULL) || (--layout->refCount > 0)) { + return; + } + + SubIndexLayout *sil = &layout->index; + if (sil->saves != NULL) { + unsigned int j; + for (j = 0; j < layout->super.maxSaves; ++j) { + IndexSaveLayout *isl = &sil->saves[j]; + FREE(isl->masterIndexZones); + FREE(isl->openChapter); + freeBuffer(&isl->indexStateBuffer); + } + } + FREE(sil->saves); + + if (layout->factory != NULL) { + putIOFactory(layout->factory); + } + FREE(layout); +} + +/*****************************************************************************/ +void getIndexLayout(IndexLayout *layout, IndexLayout **layoutPtr) +{ + ++layout->refCount; + *layoutPtr = layout; +} + +/*****************************************************************************/ 
+const struct index_version *getIndexVersion(IndexLayout *layout) +{ + return &layout->indexVersion; +} + +/*****************************************************************************/ +int writeIndexConfig(IndexLayout *layout, UdsConfiguration config) +{ + BufferedWriter *writer = NULL; + int result = openLayoutWriter(layout, &layout->config, &writer); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "failed to open config region"); + } + + result = writeConfigContents(writer, config); + if (result != UDS_SUCCESS) { + freeBufferedWriter(writer); + return logErrorWithStringError(result, "failed to write config region"); + } + result = flushBufferedWriter(writer); + if (result != UDS_SUCCESS) { + freeBufferedWriter(writer); + return logErrorWithStringError(result, "cannot flush config writer"); + } + freeBufferedWriter(writer); + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int verifyIndexConfig(IndexLayout *layout, UdsConfiguration config) +{ + BufferedReader *reader = NULL; + int result = openLayoutReader(layout, &layout->config, &reader); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "failed to open config reader"); + } + + struct udsConfiguration storedConfig; + result = readConfigContents(reader, &storedConfig); + if (result != UDS_SUCCESS) { + freeBufferedReader(reader); + return logErrorWithStringError(result, "failed to read config region"); + } + freeBufferedReader(reader); + + return (areUdsConfigurationsEqual(&storedConfig, config) + ? UDS_SUCCESS + : UDS_NO_INDEX); +} + +#ifdef __KERNEL__ +/*****************************************************************************/ +int openVolumeBufio(IndexLayout *layout, + size_t blockSize, + unsigned int reservedBuffers, + struct dm_bufio_client **clientPtr) +{ + off_t offset = layout->index.volume.startBlock * layout->super.blockSize; + return makeBufio(layout->factory, offset, blockSize, reservedBuffers, + clientPtr); +} +#else +/*****************************************************************************/ +int openVolumeRegion(IndexLayout *layout, IORegion **regionPtr) +{ + LayoutRegion *lr = &layout->index.volume; + off_t start = lr->startBlock * layout->super.blockSize; + size_t size = lr->numBlocks * layout->super.blockSize; + int result = makeIORegion(layout->factory, start, size, regionPtr); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "cannot access index volume region"); + } + return UDS_SUCCESS; +} +#endif + +/*****************************************************************************/ +uint64_t getVolumeNonce(IndexLayout *layout) +{ + return layout->index.nonce; +} + +/*****************************************************************************/ +static uint64_t generateIndexSaveNonce(uint64_t volumeNonce, + IndexSaveLayout *isl) +{ + struct SaveNonceData { + IndexSaveData data; + uint64_t offset; + } nonceData; + + nonceData.data = isl->saveData; + nonceData.data.nonce = 0; + nonceData.offset = isl->indexSave.startBlock; + + byte buffer[sizeof(nonceData)]; + size_t offset = 0; + encodeUInt64LE(buffer, &offset, nonceData.data.timestamp); + encodeUInt64LE(buffer, &offset, nonceData.data.nonce); + encodeUInt32LE(buffer, &offset, nonceData.data.version); + encodeUInt32LE(buffer, &offset, 0U); // padding + encodeUInt64LE(buffer, &offset, nonceData.offset); + ASSERT_LOG_ONLY(offset == sizeof(nonceData), + "%zu bytes encoded of %zu expected", + offset, sizeof(nonceData)); + return 
generateSecondaryNonce(volumeNonce, buffer, sizeof(buffer)); +} + +/*****************************************************************************/ +static int validateIndexSaveLayout(IndexSaveLayout *isl, + uint64_t volumeNonce, + uint64_t *saveTimePtr) +{ + if (isl->saveType == NO_SAVE || isl->numZones == 0 || + isl->saveData.timestamp == 0) + { + return UDS_BAD_STATE; + } + if (isl->saveData.nonce != generateIndexSaveNonce(volumeNonce, isl)) { + return UDS_BAD_STATE; + } + if (saveTimePtr != NULL) { + *saveTimePtr = isl->saveData.timestamp; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int selectOldestIndexSaveLayout(SubIndexLayout *sil, + unsigned int maxSaves, + IndexSaveLayout **islPtr) +{ + IndexSaveLayout *oldest = NULL; + uint64_t oldestTime = 0; + + // find the oldest valid or first invalid slot + IndexSaveLayout *isl; + for (isl = sil->saves; isl < sil->saves + maxSaves; ++isl) { + uint64_t saveTime = 0; + int result = validateIndexSaveLayout(isl, sil->nonce, &saveTime); + if (result != UDS_SUCCESS) { + saveTime = 0; + } + if (oldest == NULL || saveTime < oldestTime) { + oldest = isl; + oldestTime = saveTime; + } + } + + int result = ASSERT((oldest != NULL), "no oldest or free save slot"); + if (result != UDS_SUCCESS) { + return result; + } + *islPtr = oldest; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int selectLatestIndexSaveLayout(SubIndexLayout *sil, + unsigned int maxSaves, + IndexSaveLayout **islPtr) +{ + IndexSaveLayout *latest = NULL; + uint64_t latestTime = 0; + + // find the latest valid save slot + IndexSaveLayout *isl; + for (isl = sil->saves; isl < sil->saves + maxSaves; ++isl) { + uint64_t saveTime = 0; + int result = validateIndexSaveLayout(isl, sil->nonce, &saveTime); + if (result != UDS_SUCCESS) { + continue; + } + if (saveTime > latestTime) { + latest = isl; + latestTime = saveTime; + } + } + + if (latest == NULL) { + return UDS_INDEX_NOT_SAVED_CLEANLY; + } + *islPtr = latest; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static uint64_t getTimeMS(AbsTime time) +{ + time_t t = asTimeT(time); + RelTime r = timeDifference(time, fromTimeT(t)); + return (uint64_t) t * 1000 + relTimeToMilliseconds(r); +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int instantiateIndexSaveLayout(IndexSaveLayout *isl, + SuperBlockData *super, + uint64_t volumeNonce, + unsigned int numZones, + IndexSaveType saveType) +{ + int result = UDS_SUCCESS; + if (isl->openChapter && saveType == IS_CHECKPOINT) { + FREE(isl->openChapter); + isl->openChapter = NULL; + } else if (isl->openChapter == NULL && saveType == IS_SAVE) { + result = ALLOCATE(1, LayoutRegion, "open chapter layout", + &isl->openChapter); + if (result != UDS_SUCCESS) { + return result; + } + } + if (numZones != isl->numZones) { + if (isl->masterIndexZones != NULL) { + FREE(isl->masterIndexZones); + } + result = ALLOCATE(numZones, LayoutRegion, "master index zone layouts", + &isl->masterIndexZones); + if (result != UDS_SUCCESS) { + return result; + } + isl->numZones = numZones; + } + + populateIndexSaveLayout(isl, super, numZones, saveType); + + result = makeBuffer(INDEX_STATE_BUFFER_SIZE, &isl->indexStateBuffer); + if (result != UDS_SUCCESS) { + return result; + } + + 
isl->read = isl->written = false; + isl->saveType = saveType; + memset(&isl->saveData, 0, sizeof(isl->saveData)); + isl->saveData.timestamp = getTimeMS(currentTime(CLOCK_REALTIME)); + isl->saveData.version = 1; + + isl->saveData.nonce = generateIndexSaveNonce(volumeNonce, isl); + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int invalidateOldSave(IndexLayout *layout, IndexSaveLayout *isl) +{ + uint64_t startBlock = isl->indexSave.startBlock; + uint64_t saveBlocks = isl->indexSave.numBlocks; + unsigned int save = isl->indexSave.instance; + + int result = resetIndexSaveLayout(isl, &startBlock, saveBlocks, + layout->super.pageMapBlocks, save); + if (result != UDS_SUCCESS) { + return result; + } + + return writeIndexSaveLayout(layout, isl); +} + +/*****************************************************************************/ +int setupIndexSaveSlot(IndexLayout *layout, + unsigned int numZones, + IndexSaveType saveType, + unsigned int *saveSlotPtr) +{ + SubIndexLayout *sil = &layout->index; + + IndexSaveLayout *isl = NULL; + int result = selectOldestIndexSaveLayout(sil, layout->super.maxSaves, &isl); + if (result != UDS_SUCCESS) { + return result; + } + + result = invalidateOldSave(layout, isl); + if (result != UDS_SUCCESS) { + return result; + } + + result = instantiateIndexSaveLayout(isl, &layout->super, sil->nonce, + numZones, saveType); + if (result != UDS_SUCCESS) { + return result; + } + + *saveSlotPtr = isl - sil->saves; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int findLatestIndexSaveSlot(IndexLayout *layout, + unsigned int *numZonesPtr, + unsigned int *slotPtr) +{ + SubIndexLayout *sil = &layout->index; + + IndexSaveLayout *isl = NULL; + int result = selectLatestIndexSaveLayout(sil, layout->super.maxSaves, &isl); + if (result != UDS_SUCCESS) { + return result; + } + + if (numZonesPtr != NULL) { + *numZonesPtr = isl->numZones; + } + if (slotPtr != NULL) { + *slotPtr = isl - sil->saves; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int makeIndexSaveRegionTable(IndexSaveLayout *isl, + unsigned int *numRegionsPtr, + RegionTable **tablePtr) +{ + unsigned int numRegions = + 1 + // header + 1 + // index page map + isl->numZones + // master index zones + (bool) isl->openChapter; // open chapter if needed + + if (isl->freeSpace.numBlocks > 0) { + numRegions++; + } + + RegionTable *table; + int result = ALLOCATE_EXTENDED(RegionTable, numRegions, LayoutRegion, + "layout region table for ISL", &table); + if (result != UDS_SUCCESS) { + return result; + } + + LayoutRegion *lr = &table->regions[0]; + *lr++ = isl->header; + *lr++ = isl->indexPageMap; + unsigned int z; + for (z = 0; z < isl->numZones; ++z) { + *lr++ = isl->masterIndexZones[z]; + } + if (isl->openChapter) { + *lr++ = *isl->openChapter; + } + if (isl->freeSpace.numBlocks > 0) { + *lr++ = isl->freeSpace; + } + + result = ASSERT((lr == &table->regions[numRegions]), + "incorrect number of ISL regions"); + if (result != UDS_SUCCESS) { + return result; + } + + *numRegionsPtr = numRegions; + *tablePtr = table; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static unsigned int regionTypeForSaveType(IndexSaveType saveType) +{ + switch (saveType) { + case IS_SAVE: + return RH_TYPE_SAVE; + + case 
IS_CHECKPOINT: + return RH_TYPE_CHECKPOINT; + + default: + break; + } + + return RH_TYPE_UNSAVED; +} + +/*****************************************************************************/ +__attribute__((warn_unused_result)) +static int writeIndexSaveHeader(IndexSaveLayout *isl, + RegionTable *table, + unsigned int numRegions, + BufferedWriter *writer) +{ + size_t payload = sizeof(isl->saveData); + if (isl->indexStateBuffer != NULL) { + payload += contentLength(isl->indexStateBuffer); + } + + table->header = (RegionHeader) { + .magic = REGION_MAGIC, + .regionBlocks = isl->indexSave.numBlocks, + .type = regionTypeForSaveType(isl->saveType), + .version = 1, + .numRegions = numRegions, + .payload = payload, + }; + + size_t tableSize = sizeof(RegionTable) + numRegions * sizeof(LayoutRegion); + Buffer *buffer; + int result = makeBuffer(tableSize, &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = encodeRegionHeader(buffer, &table->header); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + + unsigned int i; + for (i = 0; i < numRegions; i++) { + result = encodeLayoutRegion(buffer, &table->regions[i]); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == tableSize, + "%zu bytes encoded of %zu expected", + contentLength(buffer), tableSize); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + + result = writeToBufferedWriter(writer, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = makeBuffer(sizeof(isl->saveData), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = encodeIndexSaveData(buffer, &isl->saveData); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + + result = writeToBufferedWriter(writer, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + + if (isl->indexStateBuffer != NULL) { + result = writeToBufferedWriter(writer, + getBufferContents(isl->indexStateBuffer), + contentLength(isl->indexStateBuffer)); + if (result != UDS_SUCCESS) { + return result; + } + } + + return flushBufferedWriter(writer); +} + +/*****************************************************************************/ +static int writeIndexSaveLayout(IndexLayout *layout, IndexSaveLayout *isl) +{ + unsigned int numRegions; + RegionTable *table; + int result = makeIndexSaveRegionTable(isl, &numRegions, &table); + if (result != UDS_SUCCESS) { + return result; + } + + BufferedWriter *writer = NULL; + result = openLayoutWriter(layout, &isl->header, &writer); + if (result != UDS_SUCCESS) { + FREE(table); + return result; + } + + result = writeIndexSaveHeader(isl, table, numRegions, writer); + FREE(table); + freeBufferedWriter(writer); + + isl->written = true; + return result; +} + +/*****************************************************************************/ +int commitIndexSave(IndexLayout *layout, unsigned int saveSlot) +{ + int result = ASSERT((saveSlot < layout->super.maxSaves), + "save slot out of range"); + if (result != UDS_SUCCESS) { + return result; + } + + IndexSaveLayout *isl = &layout->index.saves[saveSlot]; + + if (bufferUsed(isl->indexStateBuffer) == 0) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "%s: no index state data saved", __func__); + } + + return writeIndexSaveLayout(layout, isl); +} + 
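+/*
+ * Illustrative sketch of a typical save cycle using the functions above
+ * (assumes a 'layout' and 'numZones' obtained from a running index):
+ *
+ *   unsigned int slot;
+ *   int result = setupIndexSaveSlot(layout, numZones, IS_SAVE, &slot);
+ *   if (result == UDS_SUCCESS) {
+ *     // ... write each region via openIndexBufferedWriter() and fill
+ *     // getIndexStateBuffer(layout, slot) with the index state ...
+ *     result = commitIndexSave(layout, slot);
+ *   }
+ */
+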
+/*****************************************************************************/ + +static void mutilateIndexSaveInfo(IndexSaveLayout *isl) +{ + memset(&isl->saveData, 0, sizeof(isl->saveData)); + isl->read = isl->written = 0; + isl->saveType = NO_SAVE; + isl->numZones = 0; + freeBuffer(&isl->indexStateBuffer); +} + +/*****************************************************************************/ +int cancelIndexSave(IndexLayout *layout, unsigned int saveSlot) +{ + int result = ASSERT((saveSlot < layout->super.maxSaves), + "save slot out of range"); + if (result != UDS_SUCCESS) { + return result; + } + + mutilateIndexSaveInfo(&layout->index.saves[saveSlot]); + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int discardIndexSaves(IndexLayout *layout, bool all) +{ + int result = UDS_SUCCESS; + SubIndexLayout *sil = &layout->index; + + if (all) { + unsigned int i; + for (i = 0; i < layout->super.maxSaves; ++i) { + IndexSaveLayout *isl = &sil->saves[i]; + result = firstError(result, invalidateOldSave(layout, isl)); + } + } else { + IndexSaveLayout *isl; + result = selectLatestIndexSaveLayout(sil, layout->super.maxSaves, &isl); + if (result == UDS_SUCCESS) { + result = invalidateOldSave(layout, isl); + } + } + + return result; +} + +/*****************************************************************************/ +static int createIndexLayout(IndexLayout *layout, + uint64_t size, + const UdsConfiguration config) +{ + if (config == NULL) { + return UDS_CONF_PTR_REQUIRED; + } + + SaveLayoutSizes sizes; + int result = computeSizes(&sizes, config, UDS_BLOCK_SIZE, 0); + if (result != UDS_SUCCESS) { + return result; + } + + if (size < sizes.totalBlocks * sizes.blockSize) { + return logErrorWithStringError(UDS_INSUFFICIENT_INDEX_SPACE, + "layout requires at least %" PRIu64 + " bytes", + sizes.totalBlocks * sizes.blockSize); + } + + result = initSingleFileLayout(layout, layout->offset, size, &sizes); + if (result != UDS_SUCCESS) { + return result; + } + + result = saveSingleFileConfiguration(layout); + if (result != UDS_SUCCESS) { + return result; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +Buffer *getIndexStateBuffer(IndexLayout *layout, unsigned int slot) +{ + return layout->index.saves[slot].indexStateBuffer; +} + +/*****************************************************************************/ +static int findLayoutRegion(IndexLayout *layout, + unsigned int slot, + const char *operation, + RegionKind kind, + unsigned int zone, + LayoutRegion **lrPtr) +{ + int result = ASSERT((slot < layout->super.maxSaves), "%s not started", + operation); + if (result != UDS_SUCCESS) { + return result; + } + + IndexSaveLayout *isl = &layout->index.saves[slot]; + + LayoutRegion *lr = NULL; + switch (kind) { + case RL_KIND_INDEX_PAGE_MAP: + lr = &isl->indexPageMap; + break; + + case RL_KIND_OPEN_CHAPTER: + if (isl->openChapter == NULL) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "%s: %s has no open chapter", + __func__, operation); + } + lr = isl->openChapter; + break; + + case RL_KIND_MASTER_INDEX: + if (isl->masterIndexZones == NULL || zone >= isl->numZones) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "%s: %s has no master index zone %u", + __func__, operation, zone); + } + lr = &isl->masterIndexZones[zone]; + break; + + default: + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "%s: unexpected kind %u", + __func__, kind); + } + + *lrPtr = lr; + return 
UDS_SUCCESS; +} + +/*****************************************************************************/ +int openIndexBufferedReader(IndexLayout *layout, + unsigned int slot, + RegionKind kind, + unsigned int zone, + BufferedReader **readerPtr) +{ + LayoutRegion *lr = NULL; + int result = findLayoutRegion(layout, slot, "load", kind, zone, &lr); + if (result != UDS_SUCCESS) { + return result; + } + return openLayoutReader(layout, lr, readerPtr); +} + +/*****************************************************************************/ +int openIndexBufferedWriter(IndexLayout *layout, + unsigned int slot, + RegionKind kind, + unsigned int zone, + BufferedWriter **writerPtr) +{ + LayoutRegion *lr = NULL; + int result = findLayoutRegion(layout, slot, "save", kind, zone, &lr); + if (result != UDS_SUCCESS) { + return result; + } + return openLayoutWriter(layout, lr, writerPtr); +} + +/*****************************************************************************/ +int makeIndexLayoutFromFactory(IOFactory *factory, + off_t offset, + uint64_t namedSize, + bool newLayout, + const UdsConfiguration config, + IndexLayout **layoutPtr) +{ + // Get the device size and round it down to a multiple of UDS_BLOCK_SIZE. + size_t size = getWritableSize(factory) & -UDS_BLOCK_SIZE; + if (namedSize > size) { + return logErrorWithStringError(UDS_INSUFFICIENT_INDEX_SPACE, + "index storage (%zu) is smaller than the" + " requested size %llu", + size, namedSize); + } + if ((namedSize > 0) && (namedSize < size)) { + size = namedSize; + } + + // Get the index size according the the config + uint64_t configSize; + int result = udsComputeIndexSize(config, 0, &configSize); + if (result != UDS_SUCCESS) { + return result; + } + if (size < configSize) { + return logErrorWithStringError(UDS_INSUFFICIENT_INDEX_SPACE, + "index storage (%zu) is smaller than the" + " required size %llu", + size, configSize); + } + size = configSize; + + IndexLayout *layout = NULL; + result = ALLOCATE(1, IndexLayout, __func__, &layout); + if (result != UDS_SUCCESS) { + return result; + } + layout->refCount = 1; + + getIOFactory(factory); + layout->factory = factory; + layout->offset = offset; + + if (newLayout) { + // Populate the layout from the UDSConfiguration + result = createIndexLayout(layout, size, config); + } else { + // Populate the layout from the saved index. + result = loadIndexLayout(layout); + } + if (result != UDS_SUCCESS) { + putIndexLayout(&layout); + return result; + } + *layoutPtr = layout; + return UDS_SUCCESS; +} diff --git a/source/uds/indexLayout.h b/source/uds/indexLayout.h new file mode 100644 index 0000000..4144799 --- /dev/null +++ b/source/uds/indexLayout.h @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/indexLayout.h#13 $ + */ + +#ifndef INDEX_LAYOUT_H +#define INDEX_LAYOUT_H + +#include "buffer.h" +#include "indexState.h" +#include "indexVersion.h" +#include "ioFactory.h" +#include "uds.h" + +typedef struct indexLayout IndexLayout; + +/** + * Construct an index layout. This is a platform specific function that uses + * the name string, a flag that indicates old vs. new indices, and a + * UDSConfiguration (for new indices) to make an IOFactory and invoke + * makeIndexLayoutFromFactory. + * + * @param name String naming the index. Each platform will use its own + * conventions to interpret the string, but in general it is + * a space-separated sequence of param=value settings. For + * backward compatibility a string without an equals is + * treated as a platform-specific default parameter value. + * @param newLayout Whether this is a new layout. + * @param config The UdsConfiguration required for a new layout. + * @param layoutPtr Where to store the new index layout + * + * @return UDS_SUCCESS or an error code. + **/ +int makeIndexLayout(const char *name, + bool newLayout, + const UdsConfiguration config, + IndexLayout **layoutPtr) + __attribute__((warn_unused_result)); + +/** + * Construct an index layout using an IOFactory. This method is common to all + * platforms. + * + * @param factory The IOFactory for the block storage containing the index. + * @param offset The offset of the start of the index within the block + * storage address space. + * @param namedSize The size in bytes of the space within the block storage + * address space, as specified in the name string. + * @param newLayout Whether this is a new layout. + * @param config The UdsConfiguration required for a new layout. + * @param layoutPtr Where to store the new index layout + * + * @return UDS_SUCCESS or an error code. + **/ +int makeIndexLayoutFromFactory(IOFactory *factory, + off_t offset, + uint64_t namedSize, + bool newLayout, + const UdsConfiguration config, + IndexLayout **layoutPtr) + __attribute__((warn_unused_result)); + +/** + * Decrement the use count of an index layout. If the count goes to zero, free + * the index layout. + * + * @param layoutPtr Where the layout is being stored. Always reset to NULL. + **/ +void putIndexLayout(IndexLayout **layoutPtr); + +/*****************************************************************************/ +int cancelIndexSave(IndexLayout *layout, unsigned int saveSlot) + __attribute__((warn_unused_result)); + +/*****************************************************************************/ +int commitIndexSave(IndexLayout *layout, unsigned int saveSlot) + __attribute__((warn_unused_result)); + +/*****************************************************************************/ +int discardIndexSaves(IndexLayout *layout, bool all) + __attribute__((warn_unused_result)); + +/** + * Find the latest index save slot. + * + * @param [in] layout The single file layout. + * @param [out] numZonesPtr Where to store the actual number of zones + * that were saved. + * @param [out] slotPtr Where to store the slot number we found. + * + * @return UDS_SUCCESS or an error code. + **/ +int findLatestIndexSaveSlot(IndexLayout *layout, + unsigned int *numZonesPtr, + unsigned int *slotPtr) + __attribute__((warn_unused_result)); + +/** + * Get another reference to an index layout, incrementing it's use count. + * + * @param layout The index layout. + * @param layoutPtr Where the new layout pointer is being stored. 
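+ *
+ * Note: each successful call must eventually be balanced by a call to
+ * putIndexLayout().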
+ **/
+void getIndexLayout(IndexLayout *layout, IndexLayout **layoutPtr);
+
+/**
+ * Open a BufferedReader for a specified state, kind, and zone.
+ *
+ * @param layout     The index layout
+ * @param slot       The save slot
+ * @param kind       The kind of index save region to open.
+ * @param zone       The zone number for the region.
+ * @param readerPtr  Where to store the BufferedReader.
+ *
+ * @return UDS_SUCCESS or an error code.
+ **/
+int openIndexBufferedReader(IndexLayout *layout,
+                            unsigned int slot,
+                            RegionKind kind,
+                            unsigned int zone,
+                            BufferedReader **readerPtr)
+  __attribute__((warn_unused_result));
+
+/**
+ * Open a BufferedWriter for a specified state, kind, and zone.
+ *
+ * @param layout     The index layout
+ * @param slot       The save slot
+ * @param kind       The kind of index save region to open.
+ * @param zone       The zone number for the region.
+ * @param writerPtr  Where to store the BufferedWriter.
+ *
+ * @return UDS_SUCCESS or an error code.
+ **/
+int openIndexBufferedWriter(IndexLayout *layout,
+                            unsigned int slot,
+                            RegionKind kind,
+                            unsigned int zone,
+                            BufferedWriter **writerPtr)
+  __attribute__((warn_unused_result));
+
+/**
+ * Obtain the nonce to be used to store or validate the loading of volume index
+ * pages.
+ *
+ * @param [in] layout  The index layout.
+ *
+ * @return The nonce to use.
+ **/
+uint64_t getVolumeNonce(IndexLayout *layout)
+  __attribute__((warn_unused_result));
+
+#ifdef __KERNEL__
+/**
+ * Obtain a dm_bufio_client for the specified index volume.
+ *
+ * @param [in] layout           The index layout.
+ * @param [in] blockSize        The size of a volume page
+ * @param [in] reservedBuffers  The count of reserved buffers
+ * @param [out] clientPtr       Where to put the new dm_bufio_client
+ *
+ * @return UDS_SUCCESS or an error code.
+ **/
+int openVolumeBufio(IndexLayout *layout,
+                    size_t blockSize,
+                    unsigned int reservedBuffers,
+                    struct dm_bufio_client **clientPtr)
+  __attribute__((warn_unused_result));
+#else
+/**
+ * Obtain an IORegion for the specified index volume.
+ *
+ * @param [in] layout      The index layout.
+ * @param [out] regionPtr  Where to put the new region.
+ *
+ * @return UDS_SUCCESS or an error code.
+ **/
+int openVolumeRegion(IndexLayout *layout, struct ioRegion **regionPtr)
+  __attribute__((warn_unused_result));
+#endif
+
+/**
+ * Read the index configuration, and verify that it matches the given
+ * configuration.
+ *
+ * @param layout  the generic index layout
+ * @param config  the index configuration
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int verifyIndexConfig(IndexLayout *layout, UdsConfiguration config)
+  __attribute__((warn_unused_result));
+
+/**
+ * Determine which index save slot to use for a new index save.
+ *
+ * Also allocates the masterIndex regions and, if needed, the openChapter
+ * region.
+ *
+ * @param [in] layout        The index layout.
+ * @param [in] numZones      Actual number of zones currently in use.
+ * @param [in] saveType      The index save type.
+ * @param [out] saveSlotPtr  Where to store the save slot number.
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int setupIndexSaveSlot(IndexLayout *layout,
+                       unsigned int numZones,
+                       IndexSaveType saveType,
+                       unsigned int *saveSlotPtr)
+  __attribute__((warn_unused_result));
+
+/**
+ * Write the index configuration.
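+ * The configuration is written to the layout's dedicated config region,
+ * where verifyIndexConfig() can later check it against a caller-supplied
+ * configuration.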
+ * + * @param layout the generic index layout + * @param config the index configuration to write + * + * @return UDS_SUCCESS or an error code + **/ +int writeIndexConfig(IndexLayout *layout, UdsConfiguration config) + __attribute__((warn_unused_result)); + +/** + * Get the index state buffer + * + * @param layout the index layout + * @param slot the save slot + * + * @return UDS_SUCCESS or an error code + **/ +Buffer *getIndexStateBuffer(IndexLayout *layout, unsigned int slot) + __attribute__((warn_unused_result)); + +/** + * Get the index version parameters. + * + * @param layout the index layout + * + * @return the index version parameters. + **/ +const struct index_version *getIndexVersion(IndexLayout *layout) + __attribute__((warn_unused_result)); + +#endif // INDEX_LAYOUT_H diff --git a/source/uds/indexLayoutLinuxKernel.c b/source/uds/indexLayoutLinuxKernel.c new file mode 100644 index 0000000..8301166 --- /dev/null +++ b/source/uds/indexLayoutLinuxKernel.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/indexLayoutLinuxKernel.c#5 $ + */ + +#include "indexLayout.h" +#include "indexLayoutParser.h" +#include "memoryAlloc.h" + +/*****************************************************************************/ +int makeIndexLayout(const char *name, + bool newLayout, + const UdsConfiguration config, + IndexLayout **layoutPtr) +{ + char *dev = NULL; + uint64_t offset = 0; + uint64_t size = 0; + + LayoutParameter parameterTable[] = { + { "dev", LP_STRING | LP_DEFAULT, { .str = &dev } }, + { "offset", LP_UINT64, { .num = &offset } }, + { "size", LP_UINT64, { .num = &size } }, + }; + size_t numParameters = sizeof(parameterTable) / sizeof(*parameterTable); + + char *params = NULL; + int result = duplicateString(name, "makeIndexLayout parameters", ¶ms); + if (result != UDS_SUCCESS) { + return result; + } + + // note dev will be set to memory owned by params + result = parseLayoutString(params, parameterTable, numParameters); + if (result != UDS_SUCCESS) { + FREE(params); + return result; + } + + IOFactory *factory = NULL; + result = makeIOFactory(dev, &factory); + FREE(params); + if (result != UDS_SUCCESS) { + return result; + } + IndexLayout *layout; + result = makeIndexLayoutFromFactory(factory, offset, size, newLayout, config, + &layout); + putIOFactory(factory); + if (result != UDS_SUCCESS) { + return result; + } + *layoutPtr = layout; + return UDS_SUCCESS; +} diff --git a/source/uds/indexLayoutParser.c b/source/uds/indexLayoutParser.c new file mode 100644 index 0000000..808def7 --- /dev/null +++ b/source/uds/indexLayoutParser.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/indexLayoutParser.c#2 $
+ */
+
+#include "indexLayoutParser.h"
+
+#include "errors.h"
+#include "logger.h"
+#include "permassert.h"
+#include "stringUtils.h"
+#include "typeDefs.h"
+#include "uds.h"
+
+/*****************************************************************************/
+__attribute__((warn_unused_result))
+static int setParameterValue(LayoutParameter *lp, char *data)
+{
+  if ((lp->type & LP_TYPE_MASK) == LP_UINT64) {
+    int result = parseUint64(data, lp->value.num);
+    if (result != UDS_SUCCESS) {
+      return logErrorWithStringError(UDS_INDEX_NAME_REQUIRED,
+                                     "bad numeric value %s", data);
+    }
+  } else if ((lp->type & LP_TYPE_MASK) == LP_STRING) {
+    *lp->value.str = data;
+  } else {
+    return logErrorWithStringError(UDS_INVALID_ARGUMENT,
+                                   "unknown LayoutParameter type code %x",
+                                   (lp->type & LP_TYPE_MASK));
+  }
+  return UDS_SUCCESS;
+}
+
+/*****************************************************************************/
+int parseLayoutString(char *info, LayoutParameter *params, size_t count)
+{
+  if (!strchr(info, '=')) {
+    LayoutParameter *lp;
+    for (lp = params; lp < params + count; ++lp) {
+      if (lp->type & LP_DEFAULT) {
+        int result = setParameterValue(lp, info);
+        if (result != UDS_SUCCESS) {
+          return result;
+        }
+        break;
+      }
+    }
+  } else {
+    char *data = NULL;
+    char *token;
+    for (token = nextToken(info, " ", &data);
+         token;
+         token = nextToken(NULL, " ", &data))
+    {
+      char *equal = strchr(token, '=');
+      LayoutParameter *lp;
+      for (lp = params; lp < params + count; ++lp) {
+        if (!equal && (lp->type & LP_DEFAULT)) {
+          break;
+        } else if (strncmp(token, lp->name, equal - token) == 0 &&
+                   strlen(lp->name) == (size_t) (equal - token)) {
+          break;
+        }
+      }
+      if (lp == params + count) {
+        return logErrorWithStringError(UDS_INDEX_NAME_REQUIRED,
+                                       "unknown index parameter %s",
+                                       token);
+      }
+      if (lp->seen) {
+        return logErrorWithStringError(UDS_INDEX_NAME_REQUIRED,
+                                       "duplicate index parameter %s",
+                                       token);
+      }
+      lp->seen = true;
+      int result = setParameterValue(lp, equal ? equal + 1 : token);
+      if (result != UDS_SUCCESS) {
+        return result;
+      }
+    }
+  }
+  return UDS_SUCCESS;
+}
diff --git a/source/uds/indexLayoutParser.h b/source/uds/indexLayoutParser.h
new file mode 100644
index 0000000..35b492a
--- /dev/null
+++ b/source/uds/indexLayoutParser.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexLayoutParser.h#1 $ + */ + +#ifndef INDEX_LAYOUT_PARSER_H +#define INDEX_LAYOUT_PARSER_H + +#include "typeDefs.h" + +typedef enum { + LP_STRING = 0x001, + LP_UINT64 = 0x002, + LP_TYPE_MASK = 0x0FF, + LP_DEFAULT = 0x100, +} LPType; + +typedef struct layoutParameter { + const char *name; + LPType type; + union { + char **str; + uint64_t *num; + } value; + bool seen; +} LayoutParameter; + +/** + * Function to parse an index layout specification. + * + * This parser treats the specification as a set of name=value parameters + * or, in the absence of an '=' character, a single value for a default + * parameter. The list of acceptable parameters is specified as an array + * of LayoutParameter entries. Each such parameter contains the address + * of the variable in which the value is to be stored. + * + * @param info A copy of the index layout specification that + * will be altered by the parser to insert null + * characters after each value. Note that string + * parameter values will point into the memory of + * this string, so this specification cannot be + * deallocated until all uses of the parameter + * values are over. + * @param params The table of parameters the caller expects to + * find in the ``info'' string. Currently this + * parser can handle string and uint64_t values. + * @param count The size of the parameter table. + * + * @return UDS_SUCCESS or an error code, particularly + * UDS_INDEX_NAME_REQUIRED for all parsing errors. + **/ +int parseLayoutString(char *info, LayoutParameter *params, size_t count) + __attribute__((warn_unused_result)); + +#endif // INDEX_LAYOUT_PARSER_H diff --git a/source/uds/indexPageMap.c b/source/uds/indexPageMap.c new file mode 100644 index 0000000..a915179 --- /dev/null +++ b/source/uds/indexPageMap.c @@ -0,0 +1,361 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
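A self-contained sketch of driving parseLayoutString() directly, mirroring the parameter table used by makeIndexLayout() earlier in this patch; the specification string and values here are invented for illustration:

char spec[] = "offset=4096 size=8388608 /dev/vdb";  /* mutable copy, as the doc requires */
char *dev = NULL;
uint64_t offset = 0;
uint64_t size = 0;
LayoutParameter table[] = {
  { "dev",    LP_STRING | LP_DEFAULT, { .str = &dev    } },
  { "offset", LP_UINT64,              { .num = &offset } },
  { "size",   LP_UINT64,              { .num = &size   } },
};
int result = parseLayoutString(spec, table, sizeof(table) / sizeof(*table));
/* On UDS_SUCCESS: dev points at "/dev/vdb" inside spec (the bare token matched
 * the LP_DEFAULT entry), offset == 4096, and size == 8388608. A repeated
 * parameter, such as a second offset= token, is rejected with
 * UDS_INDEX_NAME_REQUIRED via the per-entry 'seen' flag. */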
+ * + * $Id: //eng/uds-releases/jasper/src/uds/indexPageMap.c#4 $ + */ + +#include "indexPageMap.h" + +#include "buffer.h" +#include "bufferedWriter.h" +#include "compiler.h" +#include "errors.h" +#include "hashUtils.h" +#include "indexComponent.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "stringUtils.h" +#include "threads.h" +#include "uds.h" + +static int readIndexPageMap(ReadPortal *portal); +static int writeIndexPageMap(IndexComponent *component, + BufferedWriter *writer, + unsigned int zone); + +static const byte INDEX_PAGE_MAP_MAGIC[] = "ALBIPM02"; +enum { + INDEX_PAGE_MAP_MAGIC_LENGTH = sizeof(INDEX_PAGE_MAP_MAGIC) - 1, +}; + +const IndexComponentInfo INDEX_PAGE_MAP_INFO = { + .kind = RL_KIND_INDEX_PAGE_MAP, + .name = "index page map", + .saveOnly = false, + .chapterSync = true, + .multiZone = false, + .ioStorage = true, + .loader = readIndexPageMap, + .saver = writeIndexPageMap, + .incremental = NULL, +}; + +/*****************************************************************************/ +static INLINE size_t numEntries(const Geometry *geometry) +{ + return geometry->chaptersPerVolume * (geometry->indexPagesPerChapter - 1); +} + +/*****************************************************************************/ +int makeIndexPageMap(const Geometry *geometry, IndexPageMap **mapPtr) +{ + unsigned int deltaListsPerChapter = geometry->deltaListsPerChapter; + int result + = ASSERT_WITH_ERROR_CODE(((deltaListsPerChapter - 1) <= UINT16_MAX), + UDS_BAD_STATE, + "delta lists per chapter (%u) is too large", + deltaListsPerChapter); + if (result != UDS_SUCCESS) { + return result; + } + + IndexPageMap *map; + result = ALLOCATE(1, IndexPageMap, "Index Page Map", &map); + if (result != UDS_SUCCESS) { + return result; + } + + map->geometry = geometry; + + result = ALLOCATE(numEntries(geometry), + IndexPageMapEntry, + "Index Page Map Entries", + &map->entries); + if (result != UDS_SUCCESS) { + freeIndexPageMap(map); + return result; + } + + *mapPtr = map; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +void freeIndexPageMap(IndexPageMap *map) +{ + if (map != NULL) { + FREE(map->entries); + FREE(map); + } +} + +/*****************************************************************************/ +uint64_t getLastUpdate(const IndexPageMap *map) +{ + return map->lastUpdate; +} + +/*****************************************************************************/ +int updateIndexPageMap(IndexPageMap *map, + uint64_t virtualChapterNumber, + unsigned int chapterNumber, + unsigned int indexPageNumber, + unsigned int deltaListNumber) +{ + const Geometry *geometry = map->geometry; + if ((virtualChapterNumber < map->lastUpdate) + || (virtualChapterNumber > map->lastUpdate + 1)) { + // if the lastUpdate is 0, this is likely to be normal because we are + // replaying the volume + if (map->lastUpdate != 0) { + logWarning("unexpected index page map update, jumping from %" PRIu64 + " to %llu", + map->lastUpdate, virtualChapterNumber); + } + } + map->lastUpdate = virtualChapterNumber; + + if (chapterNumber >= geometry->chaptersPerVolume) { + return logErrorWithStringError( + UDS_INVALID_ARGUMENT, "chapter number %u exceeds maximum %u", + chapterNumber, geometry->chaptersPerVolume - 1); + } + if (indexPageNumber >= geometry->indexPagesPerChapter) { + return logErrorWithStringError( + UDS_INVALID_ARGUMENT, "index page number %u exceeds maximum %u", + indexPageNumber, geometry->indexPagesPerChapter - 1); + } + if (deltaListNumber 
>= geometry->deltaListsPerChapter) { + return logErrorWithStringError( + UDS_INVALID_ARGUMENT, "delta list number %u exceeds maximum %u", + deltaListNumber, geometry->deltaListsPerChapter - 1); + } + + if (indexPageNumber == (geometry->indexPagesPerChapter - 1)) { + /* + * There is no entry for the last index page of a chapter since its entry + * would always be geometry->deltaListsPerChapter - 1. + */ + return UDS_SUCCESS; + } + + size_t slot + = (chapterNumber * (geometry->indexPagesPerChapter - 1)) + indexPageNumber; + map->entries[slot] = (IndexPageMapEntry) deltaListNumber; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int findIndexPageNumber(const IndexPageMap *map, + const UdsChunkName *name, + unsigned int chapterNumber, + unsigned int *indexPageNumberPtr) +{ + const Geometry *geometry = map->geometry; + if (chapterNumber >= geometry->chaptersPerVolume) { + return logErrorWithStringError( + UDS_INVALID_ARGUMENT, "chapter number %u exceeds maximum %u", + chapterNumber, geometry->chaptersPerVolume - 1); + } + + unsigned int deltaListNumber = hashToChapterDeltaList(name, geometry); + unsigned int slot = (chapterNumber * (geometry->indexPagesPerChapter - 1)); + unsigned int limit = slot + (geometry->indexPagesPerChapter - 1); + unsigned int indexPageNumber = 0; + for (; slot < limit; indexPageNumber++, slot++) { + if (deltaListNumber <= map->entries[slot]) { + break; + } + } + + // This should be a clear post-condition of the loop above, but just in case + // it's not obvious, the check is cheap. + int result = ASSERT((indexPageNumber < geometry->indexPagesPerChapter), + "index page number too large"); + if (result != UDS_SUCCESS) { + return result; + } + + *indexPageNumberPtr = indexPageNumber; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getListNumberBounds(const IndexPageMap *map, + unsigned int chapterNumber, + unsigned int indexPageNumber, + IndexPageBounds *bounds) +{ + const Geometry *geometry = map->geometry; + int result = ASSERT((chapterNumber < geometry->chaptersPerVolume), + "chapter number is valid"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT((indexPageNumber < geometry->indexPagesPerChapter), + "index page number is valid"); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int slot = chapterNumber * (geometry->indexPagesPerChapter - 1); + bounds->lowestList = ((indexPageNumber == 0) + ? 0 + : map->entries[slot + indexPageNumber - 1] + 1); + bounds->highestList = ((indexPageNumber == geometry->indexPagesPerChapter - 1) + ? 
geometry->deltaListsPerChapter - 1 + : map->entries[slot + indexPageNumber]); + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +size_t indexPageMapSize(const Geometry *geometry) +{ + return sizeof(IndexPageMapEntry) * numEntries(geometry); +} + +/*****************************************************************************/ +static int writeIndexPageMap(IndexComponent *component, + BufferedWriter *writer, + unsigned int zone) +{ + int result = ASSERT((zone == 0), "unimplemented zone %d", zone); + if (result != UDS_SUCCESS) { + return result; + } + + IndexPageMap *map = indexComponentData(component); + + Buffer *buffer; + result = makeBuffer(INDEX_PAGE_MAP_MAGIC_LENGTH + sizeof(map->lastUpdate), + &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = putBytes(buffer, INDEX_PAGE_MAP_MAGIC_LENGTH, INDEX_PAGE_MAP_MAGIC); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = putUInt64LEIntoBuffer(buffer, map->lastUpdate); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = writeToBufferedWriter(writer, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "cannot write index page map header"); + } + result = makeBuffer(indexPageMapSize(map->geometry), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result + = putUInt16LEsIntoBuffer(buffer, numEntries(map->geometry), map->entries); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = writeToBufferedWriter(writer, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "cannot write index page map data"); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +uint64_t computeIndexPageMapSaveSize(const Geometry *geometry) +{ + return indexPageMapSize(geometry) + + INDEX_PAGE_MAP_MAGIC_LENGTH + sizeof(((IndexPageMap *) 0)->lastUpdate); +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int decodeIndexPageMap(Buffer *buffer, IndexPageMap *map) +{ + int result = getUInt64LEFromBuffer(buffer, &map->lastUpdate); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt16LEsFromBuffer(buffer, numEntries(map->geometry), + map->entries); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, + "%zu bytes decoded of %zu expected", + bufferLength(buffer) - contentLength(buffer), + bufferLength(buffer)); + return result; +} + +/*****************************************************************************/ +static int readIndexPageMap(ReadPortal *portal) +{ + IndexPageMap *map = indexComponentData(portal->component); + + BufferedReader *reader = NULL; + + int result = getBufferedReaderForPortal(portal, 0, &reader); + if (result != UDS_SUCCESS) { + return result; + } + + result = verifyBufferedData(reader, INDEX_PAGE_MAP_MAGIC, + INDEX_PAGE_MAP_MAGIC_LENGTH); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "bad index page map saved magic"); + } + + Buffer *buffer; + result + = makeBuffer(sizeof(map->lastUpdate) + indexPageMapSize(map->geometry), + &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(reader, 
getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + logErrorWithStringError(result, "cannot read index page map data"); + return result; + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = decodeIndexPageMap(buffer, map); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + logDebug("read index page map, last update %llu", map->lastUpdate); + return UDS_SUCCESS; +} diff --git a/source/uds/indexPageMap.h b/source/uds/indexPageMap.h new file mode 100644 index 0000000..3767cdd --- /dev/null +++ b/source/uds/indexPageMap.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexPageMap.h#2 $ + */ + +#ifndef INDEX_PAGE_MAP_H +#define INDEX_PAGE_MAP_H 1 + +#include "common.h" +#include "geometry.h" +#include "indexComponent.h" + +extern const IndexComponentInfo INDEX_PAGE_MAP_INFO; + +typedef struct indexPageMap IndexPageMap; + +typedef struct { + unsigned int lowestList; + unsigned int highestList; +} IndexPageBounds; + +/* + * Notes on IndexPageMap + * + * Each volume maintains an index page map which records how the chapter delta + * lists are distributed among the index pages for that chapter. + * + * The map is conceptually a two-dimensional array indexed by chapter number + * and index page number within the chapter. Each entry contains the number + * of the last delta list on that index page. In order to save memory, the + * information for the last page in each chapter is not recorded, as it is + * known from the geometry. + */ + +typedef uint16_t IndexPageMapEntry; + +struct indexPageMap { + const Geometry *geometry; + uint64_t lastUpdate; + IndexPageMapEntry *entries; +}; + +/** + * Create an index page map. + * + * @param geometry The geometry governing the index. + * @param mapPtr A pointer to hold the new map. + * + * @return A success or error code. + **/ +int makeIndexPageMap(const Geometry *geometry, IndexPageMap **mapPtr) + __attribute__((warn_unused_result)); + +/** + * Free an index page map. + * + * @param map The index page map to destroy. + **/ +void freeIndexPageMap(IndexPageMap *map); + +/** + * Get the virtual chapter number of the last update to the index page map. + * + * @param map The index page map + * + * @return the virtual chapter number of the last chapter updated + **/ +uint64_t getLastUpdate(const IndexPageMap *map); + +/** + * Update an index page map entry. + * + * @param map The map to update + * @param virtualChapterNumber The virtual chapter number being updated. 
+ * @param chapterNumber The chapter of the entry to update + * @param indexPageNumber The index page of the entry to update + * @param deltaListNumber The value of the new entry + * + * @return UDS_SUCCESS or an error code + **/ +int updateIndexPageMap(IndexPageMap *map, + uint64_t virtualChapterNumber, + unsigned int chapterNumber, + unsigned int indexPageNumber, + unsigned int deltaListNumber) + __attribute__((warn_unused_result)); + +/** + * Find the page number of the index page in a chapter that will contain the + * chapter index entry for a given chunk name, if it exists. + * + * @param [in] map The map to search + * @param [in] name The chunk name + * @param [in] chapterNumber The chapter containing the index page + * @param [out] indexPageNumberPtr A pointer to hold the result, guaranteed to + * be a valid index page number on UDS_SUCCESS + * + * @return UDS_SUCCESS, or UDS_INVALID_ARGUMENT if the chapter number + * is out of range + **/ +int findIndexPageNumber(const IndexPageMap *map, + const UdsChunkName *name, + unsigned int chapterNumber, + unsigned int *indexPageNumberPtr) + __attribute__((warn_unused_result)); + +/** + * Get the lowest and highest numbered delta lists for the given immutable + * chapter index page from the index page map. + * + * @param map The index page map + * @param chapterNumber The chapter containing the delta list + * @param indexPageNumber The index page number within the chapter + * @param bounds A structure to hold the list number bounds + * for the given page + * + * @return UDS_SUCCESS or an error code + **/ +int getListNumberBounds(const IndexPageMap *map, + unsigned int chapterNumber, + unsigned int indexPageNumber, + IndexPageBounds *bounds) + __attribute__((warn_unused_result)); + +/** + * Compute the size of the index page map save image, including all headers. + * + * @param geometry The index geometry. + * + * @return The number of bytes required to save the index page map. + **/ +uint64_t computeIndexPageMapSaveSize(const Geometry *geometry); + +/** + * Escaped for testing.... + * + * @param geometry The index geometry. + * + * @return The number of bytes required for the page map data, + * exclusive of headers. + **/ +size_t indexPageMapSize(const Geometry *geometry) + __attribute__((warn_unused_result)); + +#endif // INDEX_PAGE_MAP_H diff --git a/source/uds/indexRouter.c b/source/uds/indexRouter.c new file mode 100644 index 0000000..b9b0a9e --- /dev/null +++ b/source/uds/indexRouter.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
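A worked example of the lookup path declared above, using an invented geometry (indexPagesPerChapter == 4, deltaListsPerChapter == 64), so each chapter owns three entries and the final page is implicit; the map and chunk name are assumed to exist:

/* Hypothetical entries for chapter 9 (slot base = 9 * 3 = 27):
 *   entries[27] == 20   page 0 covers delta lists  0..20
 *   entries[28] == 41   page 1 covers delta lists 21..41
 *   entries[29] == 55   page 2 covers delta lists 42..55
 *                       page 3 implicitly covers  56..63  */
unsigned int page;
IndexPageBounds bounds;
int result = findIndexPageNumber(map, &name, 9, &page);  /* map and name assumed */
if (result == UDS_SUCCESS) {
  /* A name hashing to delta list 30 yields page == 1; getListNumberBounds()
   * then reports lowestList == 21 and highestList == 41. */
  result = getListNumberBounds(map, 9, page, &bounds);
}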
+ * + * $Id: //eng/uds-releases/jasper/src/uds/indexRouter.c#7 $ + */ + +#include "indexRouter.h" + +#include "compiler.h" +#include "indexCheckpoint.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "requestQueue.h" +#include "zone.h" + +/** + * This is the request processing function invoked by the zone's RequestQueue + * worker thread. + * + * @param request the request to be indexed or executed by the zone worker + **/ +static void executeZoneRequest(Request *request) +{ + executeIndexRouterRequest(request->router, request); +} + +/** + * Construct and enqueue asynchronous control messages to add the chapter + * index for a given virtual chapter to the sparse chapter index cache. + * + * @param router the router containing the relevant queues + * @param index the index with the relevant cache and chapter + * @param virtualChapter the virtual chapter number of the chapter to cache + **/ +static void enqueueBarrierMessages(IndexRouter *router, + Index *index, + uint64_t virtualChapter) +{ + ZoneMessage barrier = { + .index = index, + .data = { + .barrier = { + .virtualChapter = virtualChapter, + } + } + }; + unsigned int zone; + for (zone = 0; zone < router->zoneCount; zone++) { + int result = launchZoneControlMessage(REQUEST_SPARSE_CACHE_BARRIER, + barrier, zone, router); + ASSERT_LOG_ONLY((result == UDS_SUCCESS), "barrier message allocation"); + } +} + +/** + * This is the request processing function for the triage stage queue. Each + * request is resolved in the master index, determining if it is a hook or + * not, and if a hook, what virtual chapter (if any) it might be found in. If + * a virtual chapter is found, this enqueues a sparse chapter cache barrier in + * every zone before enqueueing the request in its zone. + * + * @param request the request to triage + **/ +static void triageRequest(Request *request) +{ + IndexRouter *router = request->router; + Index *index = router->index; + + // Check if the name is a hook in the index pointing at a sparse chapter. + uint64_t sparseVirtualChapter = triageIndexRequest(index, request); + if (sparseVirtualChapter != UINT64_MAX) { + // Generate and place a barrier request on every zone queue. + enqueueBarrierMessages(router, index, sparseVirtualChapter); + } + + enqueueRequest(request, STAGE_INDEX); +} + +/** + * Initialize the zone queues and the triage queue. + * + * @param router the router containing the queues + * @param geometry the geometry governing the indexes + * + * @return UDS_SUCCESS or error code + **/ +static int initializeLocalIndexQueues(IndexRouter *router, + const Geometry *geometry) +{ + unsigned int i; + for (i = 0; i < router->zoneCount; i++) { + int result = makeRequestQueue("indexW", &executeZoneRequest, + &router->zoneQueues[i]); + if (result != UDS_SUCCESS) { + return result; + } + } + + // The triage queue is only needed for sparse multi-zone indexes. 
+ if ((router->zoneCount > 1) && isSparse(geometry)) { + int result = makeRequestQueue("triageW", &triageRequest, + &router->triageQueue); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/**********************************************************************/ +static INLINE RequestQueue *getZoneQueue(IndexRouter *router, + unsigned int zoneNumber) +{ + return router->zoneQueues[zoneNumber]; +} + +/**********************************************************************/ +int makeIndexRouter(IndexLayout *layout, + const Configuration *config, + const struct uds_parameters *userParams, + LoadType loadType, + IndexLoadContext *loadContext, + IndexRouterCallback callback, + IndexRouter **routerPtr) +{ + unsigned int zoneCount = getZoneCount(userParams); + IndexRouter *router; + int result = ALLOCATE_EXTENDED(IndexRouter, zoneCount, RequestQueue *, + "index router", &router); + if (result != UDS_SUCCESS) { + return result; + } + + router->callback = callback; + router->zoneCount = zoneCount; + + result = initializeLocalIndexQueues(router, config->geometry); + if (result != UDS_SUCCESS) { + freeIndexRouter(router); + return result; + } + + result = makeIndex(layout, config, userParams, router->zoneCount, loadType, + loadContext, &router->index); + if (result != UDS_SUCCESS) { + freeIndexRouter(router); + return logErrorWithStringError(result, "failed to create index"); + } + + router->needToSave = (router->index->loadedType != LOAD_LOAD); + *routerPtr = router; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int saveIndexRouter(IndexRouter *router) +{ + if (!router->needToSave) { + return UDS_SUCCESS; + } + int result = saveIndex(router->index); + router->needToSave = (result != UDS_SUCCESS); + return result; +} + +/**********************************************************************/ +void freeIndexRouter(IndexRouter *router) +{ + if (router == NULL) { + return; + } + requestQueueFinish(router->triageQueue); + unsigned int i; + for (i = 0; i < router->zoneCount; i++) { + requestQueueFinish(router->zoneQueues[i]); + } + freeIndex(router->index); + FREE(router); +} + +/**********************************************************************/ +RequestQueue *selectIndexRouterQueue(IndexRouter *router, + Request *request, + RequestStage nextStage) +{ + if (request->isControlMessage) { + return getZoneQueue(router, request->zoneNumber); + } + + if (nextStage == STAGE_TRIAGE) { + // The triage queue is only needed for multi-zone sparse indexes and won't + // be allocated by the router if not needed, so simply check for NULL. + if (router->triageQueue != NULL) { + return router->triageQueue; + } + // Dense index or single zone, so route it directly to the zone queue. 
+ } else if (nextStage != STAGE_INDEX) { + ASSERT_LOG_ONLY(false, "invalid index stage: %d", nextStage); + return NULL; + } + + Index *index = router->index; + request->zoneNumber = getMasterIndexZone(index->masterIndex, + &request->chunkName); + return getZoneQueue(router, request->zoneNumber); +} + +/**********************************************************************/ +void executeIndexRouterRequest(IndexRouter *router, Request *request) +{ + if (request->isControlMessage) { + int result = dispatchIndexZoneControlRequest(request); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "error executing control message: %d", + request->action); + } + request->status = result; + enterCallbackStage(request); + return; + } + + router->needToSave = true; + if (request->requeued && !isSuccessful(request->status)) { + request->status = makeUnrecoverable(request->status); + router->callback(request); + return; + } + + Index *index = router->index; + int result = dispatchIndexRequest(index, request); + if (result == UDS_QUEUED) { + // Take the request off the pipeline. + return; + } + + request->status = result; + router->callback(request); +} diff --git a/source/uds/indexRouter.h b/source/uds/indexRouter.h new file mode 100644 index 0000000..a96262b --- /dev/null +++ b/source/uds/indexRouter.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexRouter.h#3 $ + */ + +#ifndef INDEX_ROUTER_H +#define INDEX_ROUTER_H + +#include "compiler.h" +#include "index.h" +#include "indexSession.h" +#include "request.h" + +/** + * Callback after a query, update or remove request completes and fills in + * select fields in the request: status for all requests, oldMetadata and + * hashExists for query and update requests. + * + * @param request request object. + **/ +typedef void (*IndexRouterCallback)(Request *request); + +struct indexRouter { + IndexRouterCallback callback; + unsigned int zoneCount; + bool needToSave; + Index *index; + RequestQueue *triageQueue; + RequestQueue *zoneQueues[]; +}; + +/** + * Construct and initialize an IndexRouter instance. + * + * @param layout the IndexLayout that describes the stored index + * @param config the configuration to use + * @param userParams the index session parameters. If NULL, the default + * session parameters will be used. 
+ * @param loadType selects whether to create, load, or rebuild the index + * @param loadContext the index load context to use + * @param callback the function to invoke when a request completes or fails + * @param routerPtr a pointer in which to store the new router + * + * @return UDS_SUCCESS or an error code + **/ +int makeIndexRouter(IndexLayout *layout, + const Configuration *config, + const struct uds_parameters *userParams, + LoadType loadType, + IndexLoadContext *loadContext, + IndexRouterCallback callback, + IndexRouter **routerPtr) + __attribute__((warn_unused_result)); + +/** + * Executes the index operation for a UDS request and calls the callback upon + * completion. + * + * @param router The index router. + * @param request A pointer to the Request to process. + **/ +void executeIndexRouterRequest(IndexRouter *router, Request *request); + +/** + * Save the index router state to persistent storage. + * + * It is the responsibility of the caller to ensure that there are no other + * uses of the index during a call to this method. It is necessary that there + * be no index requests from any block context nor any other attempt to save + * the index until after a call to saveIndexRouter returns. + * + * @param router the index router to save + * + * @return UDS_SUCCESS if successful. + **/ +int saveIndexRouter(IndexRouter *router) __attribute__((warn_unused_result)); + +/** + * Destroy the index router and free its memory. + * + * @param router the index router to destroy (may be NULL) + * + * @return UDS_SUCCESS if successful. + **/ +void freeIndexRouter(IndexRouter *router); + +/** + * Select and return the request queue responsible for executing the next + * index stage of a request, updating the request with any associated state + * (such as the zone number for UDS requests on a local index). + * + * @param router The index router. + * @param request The Request destined for the queue. + * @param nextStage The next request stage (STAGE_TRIAGE or STAGE_INDEX). + * + * @return the next index stage queue (the local triage queue, local zone + * queue, or remote RPC send queue) + **/ +RequestQueue *selectIndexRouterQueue(IndexRouter *router, + Request *request, + RequestStage nextStage); + +/** + * Wait for the index router to finish all operations that access a local + * storage device. + * + * @param router The index router. + **/ +static INLINE void waitForIdleIndexRouter(IndexRouter *router) +{ + waitForIdleChapterWriter(router->index->chapterWriter); +} + +#endif /* INDEX_ROUTER_H */ diff --git a/source/uds/indexSession.c b/source/uds/indexSession.c new file mode 100644 index 0000000..15e5b3f --- /dev/null +++ b/source/uds/indexSession.c @@ -0,0 +1,554 @@ +/* + * %Copyright% + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
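A hedged sketch of how the router entry points above fit together; the layout, configuration, and load context are assumed to come from the surrounding code, and both function names here are invented for illustration:

/* Hypothetical completion handler invoked once a request finishes. */
static void exampleCallback(Request *request)
{
  /* request->status was filled in by executeIndexRouterRequest(). */
}

static int exampleRouterLifecycle(IndexLayout       *layout,
                                  const Configuration *config,
                                  IndexLoadContext  *loadContext)
{
  IndexRouter *router = NULL;
  int result = makeIndexRouter(layout, config,
                               NULL,        /* default session parameters */
                               LOAD_LOAD,   /* load an existing index */
                               loadContext, exampleCallback, &router);
  if (result != UDS_SUCCESS) {
    return result;
  }
  /* Requests are dispatched to the queue chosen by selectIndexRouterQueue()
   * and processed by executeIndexRouterRequest(), which invokes the callback. */
  result = saveIndexRouter(router);  /* persist index state before teardown */
  freeIndexRouter(router);
  return result;
}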
+ * + * $Id: //eng/uds-releases/jasper/src/uds/indexSession.c#10 $ + */ + +#include "indexSession.h" + +#include "indexCheckpoint.h" +#include "indexRouter.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "requestQueue.h" + +/**********************************************************************/ +static void collectStats(const struct uds_index_session *indexSession, + UdsContextStats *stats) +{ + const SessionStats *sessionStats = &indexSession->stats; + + stats->currentTime = asTimeT(currentTime(CLOCK_REALTIME)); + + stats->postsFound = READ_ONCE(sessionStats->postsFound); + stats->inMemoryPostsFound = READ_ONCE(sessionStats->postsFoundOpenChapter); + stats->densePostsFound = READ_ONCE(sessionStats->postsFoundDense); + stats->sparsePostsFound = READ_ONCE(sessionStats->postsFoundSparse); + stats->postsNotFound = READ_ONCE(sessionStats->postsNotFound); + stats->updatesFound = READ_ONCE(sessionStats->updatesFound); + stats->updatesNotFound = READ_ONCE(sessionStats->updatesNotFound); + stats->deletionsFound = READ_ONCE(sessionStats->deletionsFound); + stats->deletionsNotFound = READ_ONCE(sessionStats->deletionsNotFound); + stats->queriesFound = READ_ONCE(sessionStats->queriesFound); + stats->queriesNotFound = READ_ONCE(sessionStats->queriesNotFound); + stats->requests = READ_ONCE(sessionStats->requests); +} + +/**********************************************************************/ +static void handleCallbacks(Request *request) +{ + if (request->status == UDS_SUCCESS) { + // Measure the turnaround time of this request and include that time, + // along with the rest of the request, in the context's StatCounters. + updateRequestContextStats(request); + } + + if (request->callback != NULL) { + // The request has specified its own callback and does not expect to be + // freed. + struct uds_index_session *indexSession = request->session; + request->found = (request->location != LOC_UNAVAILABLE); + request->callback((UdsRequest *) request); + // We do this release after the callback because of the contract of the + // udsFlushIndexSession method. + releaseIndexSession(indexSession); + return; + } + + // Should not get here, because this is either a control message or it has a + // callback method. 
+ freeRequest(request); +} + +/**********************************************************************/ +int checkIndexSession(struct uds_index_session *indexSession) +{ + lockMutex(&indexSession->requestMutex); + unsigned int state = indexSession->state; + unlockMutex(&indexSession->requestMutex); + + if (state == IS_FLAG_LOADED) { + return UDS_SUCCESS; + } else if (state & IS_FLAG_DISABLED) { + return UDS_DISABLED; + } else if ((state & IS_FLAG_LOADING) + || (state & IS_FLAG_SUSPENDED) + || (state & IS_FLAG_WAITING)) { + return UDS_SUSPENDED; + } + + return UDS_NO_INDEXSESSION; +} + +/**********************************************************************/ +int getIndexSession(struct uds_index_session *indexSession) +{ + lockMutex(&indexSession->requestMutex); + indexSession->requestCount++; + unlockMutex(&indexSession->requestMutex); + + int result = checkIndexSession(indexSession); + if (result != UDS_SUCCESS) { + releaseIndexSession(indexSession); + return result; + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +void releaseIndexSession(struct uds_index_session *indexSession) +{ + lockMutex(&indexSession->requestMutex); + if (--indexSession->requestCount == 0) { + broadcastCond(&indexSession->requestCond); + } + unlockMutex(&indexSession->requestMutex); +} + +/**********************************************************************/ +int startLoadingIndexSession(struct uds_index_session *indexSession) +{ + int result; + lockMutex(&indexSession->requestMutex); + if (indexSession->state & IS_FLAG_SUSPENDED) { + result = UDS_SUSPENDED; + } else if (indexSession->state != 0) { + result = UDS_INDEXSESSION_IN_USE; + } else { + indexSession->state |= IS_FLAG_LOADING; + result = UDS_SUCCESS; + } + unlockMutex(&indexSession->requestMutex); + return result; +} + +/**********************************************************************/ +void finishLoadingIndexSession(struct uds_index_session *indexSession, + int result) +{ + lockMutex(&indexSession->requestMutex); + indexSession->state &= ~IS_FLAG_LOADING; + if (result == UDS_SUCCESS) { + indexSession->state |= IS_FLAG_LOADED; + } + broadcastCond(&indexSession->requestCond); + unlockMutex(&indexSession->requestMutex); +} + +/**********************************************************************/ +void disableIndexSession(struct uds_index_session *indexSession) +{ + lockMutex(&indexSession->requestMutex); + indexSession->state |= IS_FLAG_DISABLED; + unlockMutex(&indexSession->requestMutex); +} + +/**********************************************************************/ +int makeEmptyIndexSession(struct uds_index_session **indexSessionPtr) +{ + struct uds_index_session *session; + int result = ALLOCATE(1, struct uds_index_session, __func__, &session); + if (result != UDS_SUCCESS) { + return result; + } + + result = initMutex(&session->requestMutex); + if (result != UDS_SUCCESS) { + FREE(session); + return result; + } + + result = initCond(&session->requestCond); + if (result != UDS_SUCCESS) { + destroyMutex(&session->requestMutex); + FREE(session); + return result; + } + + result = initMutex(&session->loadContext.mutex); + if (result != UDS_SUCCESS) { + destroyCond(&session->requestCond); + destroyMutex(&session->requestMutex); + FREE(session); + return result; + } + + result = initCond(&session->loadContext.cond); + if (result != UDS_SUCCESS) { + destroyMutex(&session->loadContext.mutex); + destroyCond(&session->requestCond); + destroyMutex(&session->requestMutex); + FREE(session); + return 
result; + } + + result = makeRequestQueue("callbackW", &handleCallbacks, + &session->callbackQueue); + if (result != UDS_SUCCESS) { + destroyCond(&session->loadContext.cond); + destroyMutex(&session->loadContext.mutex); + destroyCond(&session->requestCond); + destroyMutex(&session->requestMutex); + FREE(session); + return result; + } + + *indexSessionPtr = session; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int udsSuspendIndexSession(struct uds_index_session *session, bool save) +{ + int result; + bool saveIndex = false; + bool suspendIndex = false; + lockMutex(&session->requestMutex); + // Wait for any pending close operation to complete. + while (session->state & IS_FLAG_CLOSING) { + waitCond(&session->requestCond, &session->requestMutex); + } + if ((session->state & IS_FLAG_WAITING) + || (session->state & IS_FLAG_DESTROYING)) { + result = EBUSY; + } else if (session->state & IS_FLAG_SUSPENDED) { + result = UDS_SUCCESS; + } else if (session->state & IS_FLAG_LOADING) { + session->state |= IS_FLAG_WAITING; + suspendIndex = true; + result = UDS_SUCCESS; + } else if (!(session->state & IS_FLAG_LOADED)) { + session->state |= IS_FLAG_SUSPENDED; + broadcastCond(&session->requestCond); + result = UDS_SUCCESS; + } else { + saveIndex = save; + if (saveIndex) { + session->state |= IS_FLAG_WAITING; + } else { + session->state |= IS_FLAG_SUSPENDED; + broadcastCond(&session->requestCond); + } + result = UDS_SUCCESS; + } + unlockMutex(&session->requestMutex); + + if (!saveIndex && !suspendIndex) { + return result; + } + + if (saveIndex) { + result = udsSaveIndex(session); + lockMutex(&session->requestMutex); + session->state &= ~IS_FLAG_WAITING; + session->state |= IS_FLAG_SUSPENDED; + broadcastCond(&session->requestCond); + unlockMutex(&session->requestMutex); + return result; + } + + lockMutex(&session->loadContext.mutex); + switch (session->loadContext.status) { + case INDEX_OPENING: + session->loadContext.status = INDEX_SUSPENDING; + + // Wait until the index indicates that it is not replaying. + while ((session->loadContext.status != INDEX_SUSPENDED) + && (session->loadContext.status != INDEX_READY)) { + waitCond(&session->loadContext.cond, + &session->loadContext.mutex); + } + break; + + case INDEX_READY: + // Index load does not need to be suspended. + break; + + case INDEX_SUSPENDED: + case INDEX_SUSPENDING: + case INDEX_FREEING: + default: + // These cases should not happen. 
+ ASSERT_LOG_ONLY(false, "Bad load context state %u", + session->loadContext.status); + break; + } + unlockMutex(&session->loadContext.mutex); + + lockMutex(&session->requestMutex); + session->state &= ~IS_FLAG_WAITING; + session->state |= IS_FLAG_SUSPENDED; + broadcastCond(&session->requestCond); + unlockMutex(&session->requestMutex); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int udsResumeIndexSession(struct uds_index_session *session) +{ + lockMutex(&session->requestMutex); + if (session->state & IS_FLAG_WAITING) { + unlockMutex(&session->requestMutex); + return EBUSY; + } + + /* If not suspended, just succeed */ + if (!(session->state & IS_FLAG_SUSPENDED)) { + unlockMutex(&session->requestMutex); + return UDS_SUCCESS; + } + + if (!(session->state & IS_FLAG_LOADING)) { + session->state &= ~IS_FLAG_SUSPENDED; + unlockMutex(&session->requestMutex); + return UDS_SUCCESS; + } + + session->state |= IS_FLAG_WAITING; + unlockMutex(&session->requestMutex); + + lockMutex(&session->loadContext.mutex); + switch (session->loadContext.status) { + case INDEX_SUSPENDED: + session->loadContext.status = INDEX_OPENING; + // Notify the index to start replaying again. + broadcastCond(&session->loadContext.cond); + break; + + case INDEX_READY: + // There is no index rebuild to resume. + break; + + case INDEX_OPENING: + case INDEX_SUSPENDING: + case INDEX_FREEING: + default: + // These cases should not happen; do nothing. + ASSERT_LOG_ONLY(false, "Bad load context state %u", + session->loadContext.status); + break; + } + unlockMutex(&session->loadContext.mutex); + + lockMutex(&session->requestMutex); + session->state &= ~IS_FLAG_WAITING; + session->state &= ~IS_FLAG_SUSPENDED; + broadcastCond(&session->requestCond); + unlockMutex(&session->requestMutex); + return UDS_SUCCESS; +} + +/**********************************************************************/ +static void waitForNoRequestsInProgress(struct uds_index_session *indexSession) +{ + lockMutex(&indexSession->requestMutex); + while (indexSession->requestCount > 0) { + waitCond(&indexSession->requestCond, &indexSession->requestMutex); + } + unlockMutex(&indexSession->requestMutex); +} + +/**********************************************************************/ +int saveAndFreeIndex(struct uds_index_session *indexSession) +{ + int result = UDS_SUCCESS; + IndexRouter *router = indexSession->router; + if (router != NULL) { + lockMutex(&indexSession->requestMutex); + bool suspended = (indexSession->state & IS_FLAG_SUSPENDED); + unlockMutex(&indexSession->requestMutex); + if (!suspended) { + result = saveIndexRouter(router); + if (result != UDS_SUCCESS) { + logWarningWithStringError(result, "ignoring error from saveIndexRouter"); + } + } + freeIndexRouter(router); + indexSession->router = NULL; + + // Reset all index state that happens to be in the index session, so it + // doesn't affect any future index. + lockMutex(&indexSession->loadContext.mutex); + indexSession->loadContext.status = INDEX_OPENING; + unlockMutex(&indexSession->loadContext.mutex); + + lockMutex(&indexSession->requestMutex); + // Only the suspend bit will remain relevant. 
+ indexSession->state &= IS_FLAG_SUSPENDED; + unlockMutex(&indexSession->requestMutex); + } + + logDebug("Closed index"); + return result; +} + +/**********************************************************************/ +int udsCloseIndex(struct uds_index_session *indexSession) +{ + lockMutex(&indexSession->requestMutex); + + // Wait for any pending suspend, resume or close operations to complete. + while ((indexSession->state & IS_FLAG_WAITING) + || (indexSession->state & IS_FLAG_CLOSING)) { + waitCond(&indexSession->requestCond, &indexSession->requestMutex); + } + + int result = UDS_SUCCESS; + if (indexSession->state & IS_FLAG_SUSPENDED) { + result = UDS_SUSPENDED; + } else if ((indexSession->state & IS_FLAG_DESTROYING) + || !(indexSession->state & IS_FLAG_LOADED)) { + // The index doesn't exist, hasn't finished loading, or is being destroyed. + result = UDS_NO_INDEXSESSION; + } else { + indexSession->state |= IS_FLAG_CLOSING; + } + unlockMutex(&indexSession->requestMutex); + if (result != UDS_SUCCESS) { + return result; + } + + logDebug("Closing index"); + waitForNoRequestsInProgress(indexSession); + result = saveAndFreeIndex(indexSession); + + lockMutex(&indexSession->requestMutex); + indexSession->state &= ~IS_FLAG_CLOSING; + broadcastCond(&indexSession->requestCond); + unlockMutex(&indexSession->requestMutex); + return result; +} + +/**********************************************************************/ +int udsDestroyIndexSession(struct uds_index_session *indexSession) +{ + logDebug("Destroying index session"); + + bool loadPending = false; + lockMutex(&indexSession->requestMutex); + + // Wait for any pending suspend, resume, or close operations to complete. + while ((indexSession->state & IS_FLAG_WAITING) + || (indexSession->state & IS_FLAG_CLOSING)) { + waitCond(&indexSession->requestCond, &indexSession->requestMutex); + } + + if (indexSession->state & IS_FLAG_DESTROYING) { + unlockMutex(&indexSession->requestMutex); + return EBUSY; + } + + indexSession->state |= IS_FLAG_DESTROYING; + loadPending = ((indexSession->state & IS_FLAG_LOADING) + && (indexSession->state & IS_FLAG_SUSPENDED)); + unlockMutex(&indexSession->requestMutex); + + if (loadPending) { + // Tell the index to terminate the rebuild. + lockMutex(&indexSession->loadContext.mutex); + if (indexSession->loadContext.status == INDEX_SUSPENDED) { + indexSession->loadContext.status = INDEX_FREEING; + broadcastCond(&indexSession->loadContext.cond); + } + unlockMutex(&indexSession->loadContext.mutex); + + // Wait until the load exits before proceeding. 
+ lockMutex(&indexSession->requestMutex); + while (indexSession->state & IS_FLAG_LOADING) { + waitCond(&indexSession->requestCond, &indexSession->requestMutex); + } + unlockMutex(&indexSession->requestMutex); + } + + waitForNoRequestsInProgress(indexSession); + int result = saveAndFreeIndex(indexSession); + requestQueueFinish(indexSession->callbackQueue); + indexSession->callbackQueue = NULL; + destroyCond(&indexSession->loadContext.cond); + destroyMutex(&indexSession->loadContext.mutex); + destroyCond(&indexSession->requestCond); + destroyMutex(&indexSession->requestMutex); + logDebug("Destroyed index session"); + FREE(indexSession); + return result; +} + +/**********************************************************************/ +int udsFlushIndexSession(struct uds_index_session *indexSession) +{ + waitForNoRequestsInProgress(indexSession); + // Wait until any open chapter writes are complete + waitForIdleIndexRouter(indexSession->router); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int udsSaveIndex(struct uds_index_session *indexSession) +{ + waitForNoRequestsInProgress(indexSession); + // saveIndexRouter waits for open chapter writes to complete + return saveIndexRouter(indexSession->router); +} + +/**********************************************************************/ +int udsSetCheckpointFrequency(struct uds_index_session *indexSession, + unsigned int frequency) +{ + setIndexCheckpointFrequency(indexSession->router->index->checkpoint, + frequency); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int udsGetIndexConfiguration(struct uds_index_session *indexSession, + UdsConfiguration *conf) +{ + if (conf == NULL) { + return logErrorWithStringError(UDS_CONF_PTR_REQUIRED, + "received a NULL config pointer"); + } + int result = ALLOCATE(1, struct udsConfiguration, __func__, conf); + if (result == UDS_SUCCESS) { + **conf = indexSession->userConfig; + } + return result; +} + +/**********************************************************************/ +int udsGetIndexStats(struct uds_index_session *indexSession, + UdsIndexStats *stats) +{ + if (stats == NULL) { + return logErrorWithStringError(UDS_INDEX_STATS_PTR_REQUIRED, + "received a NULL index stats pointer"); + } + getIndexStats(indexSession->router->index, stats); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int udsGetIndexSessionStats(struct uds_index_session *indexSession, + UdsContextStats *stats) +{ + if (stats == NULL) { + return logWarningWithStringError(UDS_CONTEXT_STATS_PTR_REQUIRED, + "received a NULL context stats pointer"); + } + collectStats(indexSession, stats); + return UDS_SUCCESS; +} diff --git a/source/uds/indexSession.h b/source/uds/indexSession.h new file mode 100644 index 0000000..1467fd2 --- /dev/null +++ b/source/uds/indexSession.h @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
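A usage sketch for the suspend, resume, stats, and teardown calls implemented above; it assumes a session whose index was created and loaded by code outside this hunk, and the helper name is invented for illustration:

static int exampleSessionShutdown(struct uds_index_session *session)
{
  /* Quiesce the index and write a full save before suspending; while the
   * session is suspended, checkIndexSession() reports UDS_SUSPENDED. */
  int result = udsSuspendIndexSession(session, true);
  if (result != UDS_SUCCESS) {
    return result;
  }
  result = udsResumeIndexSession(session);
  if (result != UDS_SUCCESS) {
    return result;
  }

  UdsContextStats stats;
  result = udsGetIndexSessionStats(session, &stats);
  if (result == UDS_SUCCESS) {
    /* stats.requests, stats.postsFound, etc. mirror the SessionStats counters
     * maintained by the callback thread. */
  }

  result = udsCloseIndex(session);         /* saves and frees the index */
  if (result != UDS_SUCCESS) {
    return result;
  }
  return udsDestroyIndexSession(session);  /* frees the session structure */
}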
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexSession.h#6 $ + */ + +#ifndef INDEX_SESSION_H +#define INDEX_SESSION_H + +#include "atomicDefs.h" +#include "config.h" +#include "cpu.h" +#include "opaqueTypes.h" +#include "threads.h" +#include "uds.h" + +/** + * The bit position of flags used to indicate index session states. + **/ +typedef enum { + IS_FLAG_BIT_START = 8, + /** Flag indicating that the session is loading */ + IS_FLAG_BIT_LOADING = IS_FLAG_BIT_START, + /** Flag indicating that that the session has been loaded */ + IS_FLAG_BIT_LOADED, + /** Flag indicating that the session is disabled permanently */ + IS_FLAG_BIT_DISABLED, + /** Flag indicating that the session is suspended */ + IS_FLAG_BIT_SUSPENDED, + /** Flag indicating that the session is waiting for an index state change */ + IS_FLAG_BIT_WAITING, + /** Flag indicating that that the session is closing */ + IS_FLAG_BIT_CLOSING, + /** Flag indicating that that the session is being destroyed */ + IS_FLAG_BIT_DESTROYING, +} IndexSessionFlagBit; + +/** + * The index session state flags. + **/ +typedef enum { + IS_FLAG_LOADED = (1 << IS_FLAG_BIT_LOADED), + IS_FLAG_LOADING = (1 << IS_FLAG_BIT_LOADING), + IS_FLAG_DISABLED = (1 << IS_FLAG_BIT_DISABLED), + IS_FLAG_SUSPENDED = (1 << IS_FLAG_BIT_SUSPENDED), + IS_FLAG_WAITING = (1 << IS_FLAG_BIT_WAITING), + IS_FLAG_CLOSING = (1 << IS_FLAG_BIT_CLOSING), + IS_FLAG_DESTROYING = (1 << IS_FLAG_BIT_DESTROYING), +} IndexSessionFlag; + +typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) sessionStats { + uint64_t postsFound; /* Post calls that found an entry */ + uint64_t postsFoundOpenChapter; /* Post calls found in the open chapter */ + uint64_t postsFoundDense; /* Post calls found in the dense index */ + uint64_t postsFoundSparse; /* Post calls found in the sparse index */ + uint64_t postsNotFound; /* Post calls that did not find an entry */ + uint64_t updatesFound; /* Update calls that found an entry */ + uint64_t updatesNotFound; /* Update calls that did not find an entry */ + uint64_t deletionsFound; /* Delete calls that found an entry */ + uint64_t deletionsNotFound; /* Delete calls that did not find an entry */ + uint64_t queriesFound; /* Query calls that found an entry */ + uint64_t queriesNotFound; /* Query calls that did not find an entry */ + uint64_t requests; /* Total number of requests */ +} SessionStats; + +/** + * States used in the index load context, reflecting the state of the index. + **/ +typedef enum { + /** The index has not been loaded or rebuilt completely */ + INDEX_OPENING = 0, + /** The index is able to handle requests */ + INDEX_READY, + /** The index has a pending request to suspend */ + INDEX_SUSPENDING, + /** The index is suspended in the midst of a rebuild */ + INDEX_SUSPENDED, + /** The index is being shut down while suspended */ + INDEX_FREEING, +} IndexSuspendStatus; + +/** + * The CondVar here must be notified when the status changes to + * INDEX_SUSPENDED, in order to wake up the waiting udsSuspendIndexSession() + * call. It must also be notified when the status changes away from + * INDEX_SUSPENDED, to resume rebuild the index from checkForSuspend() in the + * index. + **/ +typedef struct indexLoadContext { + Mutex mutex; + CondVar cond; + IndexSuspendStatus status; // Covered by indexLoadContext.mutex. 
+} IndexLoadContext; + +/** + * The request CondVar here must be notified when IS_FLAG_WAITING is cleared, + * in case udsCloseIndex() or udsDestroyIndexSession() is waiting on that flag. + * It must also be notified when IS_FLAG_CLOSING is cleared, in case + * udsSuspendIndexSession(), udsCloseIndex() or udsDestroyIndexSession() is + * waiting on that flag. + * Finally, it must also be notified when IS_FLAG_LOADING is cleared, to inform + * udsDestroyIndexSession() that the index session can be safely freed. + **/ +struct uds_index_session { + unsigned int state; // Covered by requestMutex. + IndexRouter *router; + RequestQueue *callbackQueue; + struct udsConfiguration userConfig; + IndexLoadContext loadContext; + // Asynchronous Request synchronization + Mutex requestMutex; + CondVar requestCond; + int requestCount; + // Request statistics, all owned by the callback thread + SessionStats stats; +}; + +/** + * Check that the index session is usable. + * + * @param indexSession the session to query + * + * @return UDS_SUCCESS or an error code + **/ +int checkIndexSession(struct uds_index_session *indexSession) + __attribute__((warn_unused_result)); + +/** + * Make sure that the IndexSession is allowed to load an index, and if so, set + * its state to indicate that the load has started. + * + * @param indexSession the session to load with + * + * @return UDS_SUCCESS, or an error code if an index already exists. + **/ +int startLoadingIndexSession(struct uds_index_session *indexSession) + __attribute__((warn_unused_result)); + +/** + * Update the IndexSession state after attempting to load an index, to indicate + * that the load has completed, and whether or not it succeeded. + * + * @param indexSession the session that was loading + * @param result the result of the load operation + **/ +void finishLoadingIndexSession(struct uds_index_session *indexSession, + int result); + +/** + * Disable an index session due to an error. + * + * @param indexSession the session to be disabled + **/ +void disableIndexSession(struct uds_index_session *indexSession); + +/** + * Acquire the index session for an asynchronous index request. + * + * The pointer must eventually be released with a corresponding call to + * releaseIndexSession(). + * + * @param indexSession The index session + * + * @return UDS_SUCCESS or an error code + **/ +int getIndexSession(struct uds_index_session *indexSession) + __attribute__((warn_unused_result)); + +/** + * Release a pointer to an index session. + * + * @param indexSession The session to release + **/ +void releaseIndexSession(struct uds_index_session *indexSession); + +/** + * Construct a new, empty index session. + * + * @param indexSessionPtr The pointer to receive the new session + * + * @return UDS_SUCCESS or an error code + **/ +int makeEmptyIndexSession(struct uds_index_session **indexSessionPtr) + __attribute__((warn_unused_result)); + +/** + * Save an index while the session is quiescent. + * + * During the call to #udsSaveIndex, there should be no other call to + * #udsSaveIndex and there should be no calls to #udsStartChunkOperation. + * + * @param indexSession The session to save + * + * @return Either #UDS_SUCCESS or an error code + **/ +int udsSaveIndex(struct uds_index_session *indexSession) + __attribute__((warn_unused_result)); + +/** + * Close the index by saving the underlying index. 
+ * + * @param indexSession The index session to be shut down and freed + **/ +int saveAndFreeIndex(struct uds_index_session *indexSession); + +/** + * Set the checkpoint frequency of the grid. + * + * @param session The index session to be modified. + * @param frequency New checkpoint frequency. + * + * @return Either UDS_SUCCESS or an error code. + * + **/ +int udsSetCheckpointFrequency(struct uds_index_session *session, + unsigned int frequency) + __attribute__((warn_unused_result)); + +#endif /* INDEX_SESSION_H */ diff --git a/source/uds/indexState.c b/source/uds/indexState.c new file mode 100644 index 0000000..86b9fd3 --- /dev/null +++ b/source/uds/indexState.c @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexState.c#6 $ + */ + +#include "indexState.h" + +#include "errors.h" +#include "indexComponent.h" +#include "indexLayout.h" +#include "logger.h" +#include "memoryAlloc.h" + + +/*****************************************************************************/ +int makeIndexState(IndexLayout *layout, + unsigned int numZones, + unsigned int maxComponents, + IndexState **statePtr) +{ + if (maxComponents == 0) { + return logErrorWithStringError( + UDS_INVALID_ARGUMENT, "cannot make index state with maxComponents 0"); + } + + IndexState *state = NULL; + int result = ALLOCATE_EXTENDED(IndexState, maxComponents, IndexComponent *, + "index state", &state); + if (result != UDS_SUCCESS) { + return result; + } + + state->count = 0; + state->layout = layout; + state->length = maxComponents; + state->loadZones = 0; + state->loadSlot = UINT_MAX; + state->saveSlot = UINT_MAX; + state->saving = false; + state->zoneCount = numZones; + + *statePtr = state; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +void freeIndexState(IndexState **statePtr) +{ + IndexState *state = *statePtr; + *statePtr = NULL; + if (state != NULL) { + unsigned int i; + for (i = 0; i < state->count; ++i) { + freeIndexComponent(&state->entries[i]); + } + FREE(state); + } +} + +/*****************************************************************************/ +/** + * Add a component to the index state. + * + * @param state The index state. + * @param component The index component. + * + * @return UDS_SUCCESS or an error code. 
+ **/ +static int addComponentToIndexState(IndexState *state, + IndexComponent *component) +{ + if (findIndexComponent(state, component->info) != NULL) { + return logErrorWithStringError( + UDS_INVALID_ARGUMENT, "cannot add state component %s: already present", + component->info->name); + } + + if (state->count >= state->length) { + return logErrorWithStringError( + UDS_RESOURCE_LIMIT_EXCEEDED, + "cannot add state component %s, %u components already added", + component->info->name, state->count); + } + + state->entries[state->count] = component; + ++state->count; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int addIndexStateComponent(IndexState *state, + const IndexComponentInfo *info, + void *data, + void *context) +{ + IndexComponent *component = NULL; + int result = makeIndexComponent(state, info, state->zoneCount, data, context, + &component); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "cannot make region index component"); + } + + result = addComponentToIndexState(state, component); + if (result != UDS_SUCCESS) { + freeIndexComponent(&component); + return result; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +IndexComponent *findIndexComponent(const IndexState *state, + const IndexComponentInfo *info) +{ + unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + if (info == component->info) { + return component; + } + } + return NULL; +} + +/*****************************************************************************/ +static const char *indexSaveTypeName(IndexSaveType saveType) +{ + return saveType == IS_SAVE ? "save" : "checkpoint"; +} + +/*****************************************************************************/ +int loadIndexState(IndexState *state, bool *replayPtr) +{ + int result = findLatestIndexSaveSlot(state->layout, &state->loadZones, + &state->loadSlot); + if (result != UDS_SUCCESS) { + return result; + } + + bool replayRequired = false; + unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + result = readIndexComponent(component); + if (result != UDS_SUCCESS) { + if (!missingIndexComponentRequiresReplay(component)) { + state->loadZones = 0; + state->loadSlot = UINT_MAX; + return logErrorWithStringError(result, "index component %s", + indexComponentName(component)); + } + replayRequired = true; + } + } + + state->loadZones = 0; + state->loadSlot = UINT_MAX; + if (replayPtr != NULL) { + *replayPtr = replayRequired; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int prepareToSaveIndexState(IndexState *state, IndexSaveType saveType) +{ + if (state->saving) { + return logErrorWithStringError(UDS_BAD_STATE, + "already saving the index state"); + } + int result = setupIndexSaveSlot(state->layout, state->zoneCount, saveType, + &state->saveSlot); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot prepare index %s", + indexSaveTypeName(saveType)); + } + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +/** + * Complete the saving of an index state. 
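
For orientation, here is a hedged usage sketch of the state helpers defined above: create an index state, register a single component, and load it. The component info, data pointer, and error handling are illustrative only.

static int loadWithOneComponent(IndexLayout *layout,
                                const IndexComponentInfo *info,
                                void *componentData,
                                unsigned int numZones)
{
  IndexState *state = NULL;
  int result = makeIndexState(layout, numZones, 1, &state);
  if (result != UDS_SUCCESS) {
    return result;
  }
  result = addIndexStateComponent(state, info, componentData, NULL);
  if (result == UDS_SUCCESS) {
    bool replayRequired = false;
    // replayRequired is set when a component was missing from the saved
    // state and the index must be rebuilt by replaying chapters.
    result = loadIndexState(state, &replayRequired);
  }
  freeIndexState(&state);
  return result;
}
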
+ * + * @param state the index state + * + * @return UDS_SUCCESS or an error code + **/ +static int completeIndexSaving(IndexState *state) +{ + state->saving = false; + int result = commitIndexSave(state->layout, state->saveSlot); + state->saveSlot = UINT_MAX; + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot commit index state"); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +static int cleanupSave(IndexState *state) +{ + int result = cancelIndexSave(state->layout, state->saveSlot); + state->saveSlot = UINT_MAX; + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot cancel index save"); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int saveIndexState(IndexState *state) +{ + int result = prepareToSaveIndexState(state, IS_SAVE); + if (result != UDS_SUCCESS) { + return result; + } + + +unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + result = writeIndexComponent(component); + if (result != UDS_SUCCESS) { + cleanupSave(state); + return result; + } + } + return completeIndexSaving(state); +} + +/*****************************************************************************/ +int writeIndexStateCheckpoint(IndexState *state) +{ + int result = prepareToSaveIndexState(state, IS_CHECKPOINT); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + if (skipIndexComponentOnCheckpoint(component)) { + continue; + } + result = writeIndexComponent(component); + if (result != UDS_SUCCESS) { + cleanupSave(state); + return result; + } + } + + return completeIndexSaving(state); +} + +/*****************************************************************************/ +int startIndexStateCheckpoint(IndexState *state) +{ + int result = prepareToSaveIndexState(state, IS_CHECKPOINT); + if (result != UDS_SUCCESS) { + return result; + } + + state->saving = true; + + unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + if (skipIndexComponentOnCheckpoint(component)) { + continue; + } + result = startIndexComponentIncrementalSave(component); + if (result != UDS_SUCCESS) { + abortIndexStateCheckpoint(state); + return result; + } + } + + return result; +} + +/*****************************************************************************/ +int performIndexStateCheckpointChapterSynchronizedSaves(IndexState *state) +{ + if (!state->saving) { + return UDS_SUCCESS; + } + + unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + if (skipIndexComponentOnCheckpoint(component) || + !deferIndexComponentCheckpointToChapterWriter(component)) { + continue; + } + int result = performIndexComponentChapterWriterSave(component); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/** + * Wrapper function to do a zone-based checkpoint operation. 
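
The incremental checkpoint functions above and the per-zone entry points that follow are meant to be driven in a fixed order. Below is a rough single-threaded sketch of that ordering; in the real index each zone thread drives its own slice and the error handling is more involved.

static int checkpointAllZones(IndexState *state, unsigned int zoneCount)
{
  int result = startIndexStateCheckpoint(state);
  if (result != UDS_SUCCESS) {
    return result;
  }
  unsigned int zone;
  for (zone = 0; zone < zoneCount; zone++) {
    CompletionStatus status = CS_NOT_COMPLETED;
    // Keep taking incremental steps until this zone reports completion.
    while (status == CS_NOT_COMPLETED) {
      result = performIndexStateCheckpointInZone(state, zone, &status);
      if (result != UDS_SUCCESS) {
        abortIndexStateCheckpoint(state);
        return result;
      }
    }
    result = finishIndexStateCheckpointInZone(state, zone, &status);
    if (result != UDS_SUCCESS) {
      abortIndexStateCheckpoint(state);
      return result;
    }
  }
  // Commit the save slot once every zone has finished its portion.
  return finishIndexStateCheckpoint(state);
}
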
+ * + * @param [in] state the index state + * @param [in] zone the zone number + * @param [in] compFunc the index component function to use + * @param [out] completed if non-NULL, where to save the completion status + * + * @return UDS_SUCCESS or an error code + * + **/ +static int doIndexStateCheckpointInZone(IndexState *state, + unsigned int zone, + int (*compFunc)(IndexComponent *, + unsigned int, + CompletionStatus *), + CompletionStatus *completed) +{ + if (!state->saving) { + if (completed != NULL) { + *completed = CS_COMPLETED_PREVIOUSLY; + } + return UDS_SUCCESS; + } + + CompletionStatus status = CS_COMPLETED_PREVIOUSLY; + + unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + if (skipIndexComponentOnCheckpoint(component)) { + continue; + } + if (zone > 0 && !component->info->multiZone) { + continue; + } + CompletionStatus componentStatus = CS_NOT_COMPLETED; + int result = (*compFunc)(component, zone, &componentStatus); + if (result != UDS_SUCCESS) { + return result; + } + // compute rolling least status + if (componentStatus < status) { + status = componentStatus; + } + } + + if (completed != NULL) { + *completed = status; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int performIndexStateCheckpointInZone(IndexState *state, + unsigned int zone, + CompletionStatus *completed) +{ + return doIndexStateCheckpointInZone(state, zone, + &performIndexComponentZoneSave, + completed); +} + +/*****************************************************************************/ +int finishIndexStateCheckpointInZone(IndexState *state, + unsigned int zone, + CompletionStatus *completed) +{ + return doIndexStateCheckpointInZone(state, zone, + &finishIndexComponentZoneSave, + completed); +} + +/*****************************************************************************/ +int abortIndexStateCheckpointInZone(IndexState *state, + unsigned int zone, + CompletionStatus *completed) +{ + return doIndexStateCheckpointInZone(state, zone, + &abortIndexComponentZoneSave, completed); +} + +/*****************************************************************************/ +int finishIndexStateCheckpoint(IndexState *state) +{ + if (!state->saving) { + return UDS_SUCCESS; + } + + unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + if (skipIndexComponentOnCheckpoint(component)) { + continue; + } + int result = finishIndexComponentIncrementalSave(component); + if (result != UDS_SUCCESS) { + abortIndexStateCheckpoint(state); + return result; + } + } + + int result = completeIndexSaving(state); + if (result != UDS_SUCCESS) { + return result; + } + + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int abortIndexStateCheckpoint(IndexState *state) +{ + if (!state->saving) { + return logErrorWithStringError(UDS_BAD_STATE, + "not saving the index state"); + } + + logError("aborting index state checkpoint"); + + int result = UDS_SUCCESS; + unsigned int i; + for (i = 0; i < state->count; ++i) { + IndexComponent *component = state->entries[i]; + if (skipIndexComponentOnCheckpoint(component)) { + continue; + } + int tmp = abortIndexComponentIncrementalSave(component); + if (result == UDS_SUCCESS) { + result = tmp; + } + } + + cleanupSave(state); + state->saving = false; + + return result; +} + +/*****************************************************************************/ +int 
discardIndexStateData(IndexState *state) +{ + int result = discardIndexSaves(state->layout, true); + state->saveSlot = UINT_MAX; + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "%s: cannot destroy all index saves", + __func__); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int discardLastIndexStateSave(IndexState *state) +{ + int result = discardIndexSaves(state->layout, false); + state->saveSlot = UINT_MAX; + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "%s: cannot destroy latest index save", + __func__); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +Buffer *getStateIndexStateBuffer(IndexState *state, IOAccessMode mode) +{ + unsigned int slot = mode == IO_READ ? state->loadSlot : state->saveSlot; + return getIndexStateBuffer(state->layout, slot); +} + +/*****************************************************************************/ +int openStateBufferedReader(IndexState *state, + RegionKind kind, + unsigned int zone, + BufferedReader **readerPtr) +{ + return openIndexBufferedReader(state->layout, state->loadSlot, kind, zone, + readerPtr); +} + +/*****************************************************************************/ +int openStateBufferedWriter(IndexState *state, + RegionKind kind, + unsigned int zone, + BufferedWriter **writerPtr) +{ + return openIndexBufferedWriter(state->layout, state->saveSlot, kind, zone, + writerPtr); +} diff --git a/source/uds/indexState.h b/source/uds/indexState.h new file mode 100644 index 0000000..82899c1 --- /dev/null +++ b/source/uds/indexState.h @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexState.h#5 $ + */ + +#ifndef INDEX_STATE_H +#define INDEX_STATE_H 1 + +#include "buffer.h" +#include "indexComponent.h" + + +/** + * Used here and in SingleFileLayout. + **/ +typedef enum { + IS_SAVE, + IS_CHECKPOINT, + NO_SAVE = 9999, +} IndexSaveType; + +/* + * Used in getStateIndexStateBuffer to identify whether the index state buffer + * is for the index being loaded or the index being saved. + */ +typedef enum { + IO_READ = 0x1, + IO_WRITE = 0x2, +} IOAccessMode; + +/** + * The index state structure controls the loading and saving of the index + * state. 
+ **/ +typedef struct indexState { + struct indexLayout *layout; + unsigned int zoneCount; // number of index zones to use + unsigned int loadZones; + unsigned int loadSlot; + unsigned int saveSlot; + unsigned int count; // count of registered entries (<= length) + unsigned int length; // total span of array allocation + bool saving; // incremental save in progress + IndexComponent *entries[]; // array of index component entries +} IndexState; + +/** + * Make an index state object, + * + * @param [in] layout The index layout. + * @param [in] numZones The number of zones to use. + * @param [in] maxComponents The maximum number of components to be handled. + * @param [out] statePtr Where to store the index state object. + * + * @return UDS_SUCCESS or an error code + **/ +int makeIndexState(struct indexLayout *layout, + unsigned int numZones, + unsigned int maxComponents, + IndexState **statePtr) + __attribute__((warn_unused_result)); + +/** + * Free an index state (generically). + * + * @param statePtr The pointer to the index state to be freed and + * set to NULL. + **/ +void freeIndexState(IndexState **statePtr); + +/** + * Add an index component to an index state. + * + * @param state The index directory in which to add this component. + * @param info The index component file specification. + * @param data The per-component data structure. + * @param context The load/save context of the component. + * + * @return UDS_SUCCESS or an error code. + **/ +int addIndexStateComponent(IndexState *state, + const IndexComponentInfo *info, + void *data, + void *context) + __attribute__((warn_unused_result)); + +/** + * Load index state + * + * @param state The index state. + * @param replayPtr If set, the place to hold whether a replay is required. + * + * @return UDS_SUCCESS or error + **/ +int loadIndexState(IndexState *state, bool *replayPtr) + __attribute__((warn_unused_result)); + +/** + * Save the current index state, including the open chapter. + * + * @param state The index state. + * + * @return UDS_SUCCESS or error + **/ +int saveIndexState(IndexState *state) __attribute__((warn_unused_result)); + +/** + * Prepare to save the index state. + * + * @param state the index state + * @param saveType whether a checkpoint or save + * + * @return UDS_SUCCESS or an error code + **/ +int prepareToSaveIndexState(IndexState *state, IndexSaveType saveType) + __attribute__((warn_unused_result)); + +/** + * Write index checkpoint non-incrementally (for testing). + * + * @param state The index state. + * + * @return UDS_SUCCESS or error + **/ +int writeIndexStateCheckpoint(IndexState *state) + __attribute__((warn_unused_result)); + +/** + * Sets up an index state checkpoint which will proceed incrementally. + * May create the directory but does not actually write any data. + * + * @param state The index state. + * + * @return UDS_SUCCESS or an error code. + **/ +int startIndexStateCheckpoint(IndexState *state) + __attribute__((warn_unused_result)); + +/** + * Perform operations on index state checkpoints that are synchronized to + * the chapter writer thread. + * + * @param state The index state. + * + * @return UDS_SUCCESS or an error code. + **/ +int performIndexStateCheckpointChapterSynchronizedSaves(IndexState *state) + __attribute__((warn_unused_result)); + +/** + * Performs zone-specific (and, for zone 0, general) incremental checkpointing. + * + * @param [in] state The index state. + * @param [in] zone The zone number. 
+ * @param [out] completed Set to whether the checkpoint has completed + * for this zone. + * + * @return UDS_SUCCESS or an error code. + **/ +int performIndexStateCheckpointInZone(IndexState *state, + unsigned int zone, + CompletionStatus *completed) + __attribute__((warn_unused_result)); + +/** + * Force the completion of an incremental index state checkpoint + * for a particular zone. + * + * @param [in] state The index state. + * @param [in] zone The zone number. + * @param [out] completed Set to whether the checkpoint has completed + * for this zone. + * + * @return UDS_SUCCESS or an error code. + **/ +int finishIndexStateCheckpointInZone(IndexState *state, + unsigned int zone, + CompletionStatus *completed) + __attribute__((warn_unused_result)); + +/** + * Force the completion of an incremental index state checkpoint once + * all zones are completed. + * + * @param [in] state The index state. + * + * @return UDS_SUCCESS or an error code. + **/ +int finishIndexStateCheckpoint(IndexState *state) + __attribute__((warn_unused_result)); + +/** + * Aborts an index state checkpoint which is proceeding incrementally + * for a particular zone. + * + * @param [in] state The index state. + * @param [in] zone The zone number. + * @param [out] completed Set to whether the checkpoint has completed or + * aborted for this zone. + * + * @return UDS_SUCCESS or an error code. + **/ +int abortIndexStateCheckpointInZone(IndexState *state, + unsigned int zone, + CompletionStatus *completed); + +/** + * Aborts an index state checkpoint which is proceeding incrementally, + * once all the zones are aborted. + * + * @param [in] state The index state. + * + * @return UDS_SUCCESS or an error code. + **/ +int abortIndexStateCheckpoint(IndexState *state); + +/** + * Remove or disable the index state data, for testing. + * + * @param state The index state + * + * @return UDS_SUCCESS or an error code + * + * @note the return value of this function is frequently ignored + **/ +int discardIndexStateData(IndexState *state); + +/** + * Discard the last index state save, for testing. + * + * @param state The index state + * + * @return UDS_SUCCESS or an error code + * + * @note the return value of this function is frequently ignored + **/ +int discardLastIndexStateSave(IndexState *state); + +/** + * Find index component, for testing. + * + * @param state The index state + * @param info The index component file specification + * + * @return The index component, or NULL if not found + **/ +IndexComponent *findIndexComponent(const IndexState *state, + const IndexComponentInfo *info) + __attribute__((warn_unused_result)); + +/** + * Get the indexStateBuffer for a specified mode. + * + * @param state The index state. + * @param mode One of IO_READ or IO_WRITE. + * + * @return the index state buffer + **/ +Buffer *getStateIndexStateBuffer(IndexState *state, IOAccessMode mode) + __attribute__((warn_unused_result)); + +/** + * Open a BufferedReader for a specified state, kind, and zone. + * This helper function is used by IndexComponent. + * + * @param state The index state. + * @param kind The kind if index save region to open. + * @param zone The zone number for the region. + * @param readerPtr Where to store the BufferedReader. + * + * @return UDS_SUCCESS or an error code. + **/ +int openStateBufferedReader(IndexState *state, + RegionKind kind, + unsigned int zone, + BufferedReader **readerPtr) + __attribute__((warn_unused_result)); + +/** + * Open a BufferedWriter for a specified state, kind, and zone. 
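
As a concrete (hypothetical) example of how a component might use the reader helper declared just above, the sketch below loads a fixed-size payload for one zone. It assumes the bufferedReader.h helpers readFromBufferedReader() and freeBufferedReader(); the kind and payload are placeholders.

static int loadZonePayload(IndexState *state,
                           RegionKind kind,
                           unsigned int zone,
                           void *payload,
                           size_t length)
{
  BufferedReader *reader = NULL;
  int result = openStateBufferedReader(state, kind, zone, &reader);
  if (result != UDS_SUCCESS) {
    return result;
  }
  result = readFromBufferedReader(reader, payload, length);
  freeBufferedReader(reader);
  return result;
}
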
+ * This helper function is used by IndexComponent. + * + * @param state The index state. + * @param kind The kind if index save region to open. + * @param zone The zone number for the region. + * @param writerPtr Where to store the BufferedWriter. + * + * @return UDS_SUCCESS or an error code. + **/ +int openStateBufferedWriter(IndexState *state, + RegionKind kind, + unsigned int zone, + BufferedWriter **writerPtr) + __attribute__((warn_unused_result)); + +#endif // INDEX_STATE_H diff --git a/source/uds/indexStateData.c b/source/uds/indexStateData.c new file mode 100644 index 0000000..62038f0 --- /dev/null +++ b/source/uds/indexStateData.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexStateData.c#3 $ + */ + +#include "indexStateData.h" + +#include "buffer.h" +#include "errors.h" +#include "index.h" +#include "logger.h" +#include "uds.h" + +/* The index state version header */ +typedef struct { + int32_t signature; + int32_t versionID; +} IndexStateVersion; + +/* The version 301 index state */ +typedef struct { + uint64_t newestChapter; + uint64_t oldestChapter; + uint64_t lastCheckpoint; + uint32_t unused; + uint32_t padding; +} IndexStateData301; + +static const IndexStateVersion INDEX_STATE_VERSION_301 = { + .signature = -1, + .versionID = 301, +}; + +/** + * The index state index component reader. 
+ * + * @param portal the ReadPortal that handles the read of the component + * + * @return UDS_SUCCESS or an error code + **/ +static int readIndexStateData(ReadPortal *portal) +{ + Buffer *buffer = getStateIndexStateBuffer(portal->component->state, IO_READ); + int result = rewindBuffer(buffer, uncompactedAmount(buffer)); + if (result != UDS_SUCCESS) { + return result; + } + + IndexStateVersion fileVersion; + result = getInt32LEFromBuffer(buffer, &fileVersion.signature); + if (result != UDS_SUCCESS) { + return result; + } + result = getInt32LEFromBuffer(buffer, &fileVersion.versionID); + if (result != UDS_SUCCESS) { + return result; + } + + if (fileVersion.signature != -1 || fileVersion.versionID != 301) { + return logErrorWithStringError(UDS_UNSUPPORTED_VERSION, + "Index state version %d,%d is unsupported", + fileVersion.signature, + fileVersion.versionID); + } + + IndexStateData301 state; + result = getUInt64LEFromBuffer(buffer, &state.newestChapter); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &state.oldestChapter); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &state.lastCheckpoint); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &state.unused); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &state.padding); + if (result != UDS_SUCCESS) { + return result; + } + + if ((state.unused != 0) || (state.padding != 0)) { + return UDS_CORRUPT_COMPONENT; + } + + Index *index = indexComponentData(portal->component); + index->newestVirtualChapter = state.newestChapter; + index->oldestVirtualChapter = state.oldestChapter; + index->lastCheckpoint = state.lastCheckpoint; + return UDS_SUCCESS; +} + +/** + * The index state index component writer. + * + * @param component The component whose state is to be saved (an Index) + * @param writer The buffered writer. + * @param zone The zone to write. 
+ * + * @return UDS_SUCCESS or an error code + **/ +static int writeIndexStateData(IndexComponent *component, + BufferedWriter *writer __attribute__((unused)), + unsigned int zone __attribute__((unused))) +{ + Buffer *buffer = getStateIndexStateBuffer(component->state, IO_WRITE); + int result = resetBufferEnd(buffer, 0); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, INDEX_STATE_VERSION_301.signature); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, INDEX_STATE_VERSION_301.versionID); + if (result != UDS_SUCCESS) { + return result; + } + + Index *index = indexComponentData(component); + IndexStateData301 state = { + .newestChapter = index->newestVirtualChapter, + .oldestChapter = index->oldestVirtualChapter, + .lastCheckpoint = index->lastCheckpoint, + }; + + result = putUInt64LEIntoBuffer(buffer, state.newestChapter); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, state.oldestChapter); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, state.lastCheckpoint); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, state.unused); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, state.padding); + if (result != UDS_SUCCESS) { + return result; + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ + +const IndexComponentInfo INDEX_STATE_INFO = { + .kind = RL_KIND_INDEX_STATE, + .name = "index state", + .saveOnly = false, + .chapterSync = true, + .multiZone = false, + .ioStorage = false, + .loader = readIndexStateData, + .saver = writeIndexStateData, + .incremental = NULL, +}; diff --git a/source/uds/indexStateData.h b/source/uds/indexStateData.h new file mode 100644 index 0000000..b6aa9b2 --- /dev/null +++ b/source/uds/indexStateData.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexStateData.h#1 $ + */ + +#ifndef INDEX_STATE_DATA_H +#define INDEX_STATE_DATA_H 1 + +#include "indexComponent.h" + +extern const IndexComponentInfo INDEX_STATE_INFO; + +#endif /* not INDEX_STATE_DATA_H */ diff --git a/source/uds/indexVersion.c b/source/uds/indexVersion.c new file mode 100644 index 0000000..df16e73 --- /dev/null +++ b/source/uds/indexVersion.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/indexVersion.c#1 $
+ */
+
+#include "indexVersion.h"
+
+void initializeIndexVersion(struct index_version *version,
+                            uint32_t superVersion)
+{
+  /*
+   * Version 1 was introduced for the first single file layout. It was used in
+   * RHEL7 and in RHEL8.0 Beta. No kernel index ever used an earlier version.
+   */
+
+  /*
+   * Version 2 was created when we discovered that the volume header page was
+   * written in native endian format. It was used in RHEL8.0 and RHEL8.1. We
+   * stopped reading and writing the volume header page, and changed to version 2
+   * so that an index created on RHEL8 cannot be taken back and used on RHEL7.
+   *
+   * Versions 1 and 2 are identical in normal operation (i.e. after the index
+   * is loaded).
+   */
+
+  /*
+   * Version 3 was created when we discovered that the chapter index headers
+   * were written in native endian format. It was first used in RHEL8.2 and is
+   * the current version for new indices.
+   *
+   * Versions before 3 read and write native endian chapter headers. Version 3
+   * reads chapter headers in any endian order, and writes little-endian
+   * chapter headers.
+   */
+  bool chapterIndexHeaderNativeEndian = superVersion < 3;
+
+  *version = (struct index_version) {
+    .chapterIndexHeaderNativeEndian = chapterIndexHeaderNativeEndian,
+  };
+}
diff --git a/source/uds/indexVersion.h b/source/uds/indexVersion.h
new file mode 100644
index 0000000..f46b2e9
--- /dev/null
+++ b/source/uds/indexVersion.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/indexVersion.h#1 $
+ */
+
+#ifndef INDEX_VERSION_H
+#define INDEX_VERSION_H
+
+#include "typeDefs.h"
+
+struct index_version {
+  bool chapterIndexHeaderNativeEndian;
+};
+
+enum {
+  SUPER_VERSION_MINIMUM = 1,
+  SUPER_VERSION_MAXIMUM = 3,
+  SUPER_VERSION_CURRENT = 3,
+};
+
+/**
+ * Initialize the version parameters that we normally learn when loading the
+ * index but need to use during index operation.
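
A short usage sketch of the structure and constants defined here: initialize the version parameters from a super block version and branch on the resulting endianness flag. Nothing below is part of the patch; it only restates the comments in indexVersion.c as code.

static void chooseChapterHeaderByteOrder(uint32_t superVersion)
{
  struct index_version version;
  initializeIndexVersion(&version, superVersion);
  if (version.chapterIndexHeaderNativeEndian) {
    // Super versions 1 and 2 wrote chapter index headers in native byte
    // order, so they must be read back the same way.
  } else {
    // Super version 3 writes little-endian headers and accepts either byte
    // order when reading.
  }
}
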
+ * + * @param version The version parameters + * @param superVersion The SuperBlock version number + **/ +void initializeIndexVersion(struct index_version *version, + uint32_t superVersion); + +#endif // INDEX_VERSION_H diff --git a/source/uds/indexZone.c b/source/uds/indexZone.c new file mode 100644 index 0000000..f3cd8ed --- /dev/null +++ b/source/uds/indexZone.c @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexZone.c#4 $ + */ + +#include "indexZone.h" + +#include "errors.h" +#include "index.h" +#include "indexCheckpoint.h" +#include "indexRouter.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "request.h" +#include "sparseCache.h" +#include "uds.h" + +/**********************************************************************/ +int makeIndexZone(struct index *index, unsigned int zoneNumber) +{ + IndexZone *zone; + int result = ALLOCATE(1, IndexZone, "index zone", &zone); + if (result != UDS_SUCCESS) { + return result; + } + + result = makeOpenChapter(index->volume->geometry, index->zoneCount, + &zone->openChapter); + if (result != UDS_SUCCESS) { + freeIndexZone(zone); + return result; + } + + result = makeOpenChapter(index->volume->geometry, index->zoneCount, + &zone->writingChapter); + if (result != UDS_SUCCESS) { + freeIndexZone(zone); + return result; + } + + zone->index = index; + zone->id = zoneNumber; + index->zones[zoneNumber] = zone; + + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeIndexZone(IndexZone *zone) +{ + if (zone == NULL) { + return; + } + + freeOpenChapter(zone->openChapter); + freeOpenChapter(zone->writingChapter); + FREE(zone); +} + +/**********************************************************************/ +bool isZoneChapterSparse(const IndexZone *zone, + uint64_t virtualChapter) +{ + return isChapterSparse(zone->index->volume->geometry, + zone->oldestVirtualChapter, + zone->newestVirtualChapter, + virtualChapter); +} + +/**********************************************************************/ +void setActiveChapters(IndexZone *zone) +{ + zone->oldestVirtualChapter = zone->index->oldestVirtualChapter; + zone->newestVirtualChapter = zone->index->newestVirtualChapter; +} + +/** + * Swap the open and writing chapters after blocking until there are no active + * chapter writers on the index. 
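
Zone allocation is driven from the index itself; below is a hedged sketch of that driving loop (the real version lives in index.c and its teardown path differs).

static int makeAllIndexZones(struct index *index)
{
  unsigned int z;
  for (z = 0; z < index->zoneCount; z++) {
    // makeIndexZone() stores the new zone in index->zones[z] and frees any
    // partially built zone itself on failure.
    int result = makeIndexZone(index, z);
    if (result != UDS_SUCCESS) {
      return result;
    }
  }
  return UDS_SUCCESS;
}
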
+ * + * @param zone The zone swapping chapters + * + * @return UDS_SUCCESS or a return code + **/ +static int swapOpenChapter(IndexZone *zone) +{ + // Wait for any currently writing chapter to complete + int result = finishPreviousChapter(zone->index->chapterWriter, + zone->newestVirtualChapter); + if (result != UDS_SUCCESS) { + return result; + } + + // Swap the writing and open chapters + OpenChapterZone *tempChapter = zone->openChapter; + zone->openChapter = zone->writingChapter; + zone->writingChapter = tempChapter; + return UDS_SUCCESS; +} + +/** + * Advance to a new open chapter, and forget the oldest chapter in the + * index if necessary. + * + * @param zone The zone containing the chapter to reap + * + * @return UDS_SUCCESS or an error code + **/ +static int reapOldestChapter(IndexZone *zone) +{ + Index *index = zone->index; + unsigned int chaptersPerVolume = index->volume->geometry->chaptersPerVolume; + int result + = ASSERT(((zone->newestVirtualChapter - zone->oldestVirtualChapter) + <= chaptersPerVolume), + "newest (%llu) and oldest (%llu) virtual chapters " + "less than or equal to chapters per volume (%u)", + zone->newestVirtualChapter, zone->oldestVirtualChapter, + chaptersPerVolume); + if (result != UDS_SUCCESS) { + return result; + } + + setMasterIndexZoneOpenChapter(index->masterIndex, zone->id, + zone->newestVirtualChapter); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int executeSparseCacheBarrierMessage(IndexZone *zone, + BarrierMessageData *barrier) +{ + /* + * Check if the chapter index for the virtual chapter is already in the + * cache, and if it's not, rendezvous with the other zone threads to add the + * chapter index to the sparse index cache. + */ + return updateSparseCache(zone, barrier->virtualChapter); +} + +/** + * Handle notification that some other zone has closed its open chapter. If + * the chapter that was closed is still the open chapter for this zone, + * close it now in order to minimize skew. + * + * @param zone The zone receiving the notification + * @param chapterClosed The notification + * + * @return UDS_SUCCESS or an error code + **/ +static int handleChapterClosed(IndexZone *zone, + ChapterClosedMessageData *chapterClosed) +{ + if (zone->newestVirtualChapter == chapterClosed->virtualChapter) { + return openNextChapter(zone, NULL); + } + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int dispatchIndexZoneControlRequest(Request *request) +{ + ZoneMessage *message = &request->zoneMessage; + IndexZone *zone = message->index->zones[request->zoneNumber]; + + switch (request->action) { + case REQUEST_SPARSE_CACHE_BARRIER: + return executeSparseCacheBarrierMessage(zone, &message->data.barrier); + + case REQUEST_ANNOUNCE_CHAPTER_CLOSED: + return handleChapterClosed(zone, &message->data.chapterClosed); + + default: + return ASSERT_FALSE("valid control message type: %d", request->action); + } +} + +/** + * Announce the closure of the current open chapter to the other zones. + * + * @param request The request which caused the chapter to close + * (may be NULL) + * @param zone The zone which first closed the chapter + * @param closedChapter The chapter which was closed + * + * @return UDS_SUCCESS or an error code + **/ +static int announceChapterClosed(Request *request, + IndexZone *zone, + uint64_t closedChapter) +{ + IndexRouter *router = ((request != NULL) ? 
request->router : NULL); + + ZoneMessage zoneMessage = { + .index = zone->index, + .data = { + .chapterClosed = { .virtualChapter = closedChapter } + } + }; + + unsigned int i; + for (i = 0; i < zone->index->zoneCount; i++) { + if (zone->id == i) { + continue; + } + int result; + if (router != NULL) { + result = launchZoneControlMessage(REQUEST_ANNOUNCE_CHAPTER_CLOSED, + zoneMessage, i, router); + } else { + // We're in a test which doesn't have zone queues, so we can just + // call the message function directly. + result = handleChapterClosed(zone->index->zones[i], + &zoneMessage.data.chapterClosed); + } + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int openNextChapter(IndexZone *zone, Request *request) +{ + logDebug("closing chapter %llu of zone %d after %u entries (%u short)", + zone->newestVirtualChapter, zone->id, zone->openChapter->size, + zone->openChapter->capacity - zone->openChapter->size); + + int result = swapOpenChapter(zone); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t closedChapter = zone->newestVirtualChapter++; + result = reapOldestChapter(zone); + if (result != UDS_SUCCESS) { + return logUnrecoverable(result, "reapOldestChapter failed"); + } + + resetOpenChapter(zone->openChapter); + + // begin, continue, or finish the checkpoint processing + // moved above startClosingChapter because some of the + // checkpoint processing now done by the chapter writer thread + result = processCheckpointing(zone->index, + zone->id, + zone->newestVirtualChapter); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int finishedZones = startClosingChapter(zone->index->chapterWriter, + zone->id, + zone->writingChapter); + if ((finishedZones == 1) && (zone->index->zoneCount > 1)) { + // This is the first zone of a multi-zone index to close this chapter, + // so inform the other zones in order to control zone skew. + result = announceChapterClosed(request, zone, closedChapter); + if (result != UDS_SUCCESS) { + return result; + } + } + + // If the chapter being opened won't overwrite the oldest chapter, we're + // done. + if (!areSamePhysicalChapter(zone->index->volume->geometry, + zone->newestVirtualChapter, + zone->oldestVirtualChapter)) { + return UDS_SUCCESS; + } + + uint64_t victim = zone->oldestVirtualChapter++; + if (finishedZones < zone->index->zoneCount) { + // We are not the last zone to close the chapter, so we're done + return UDS_SUCCESS; + } + + /* + * We are the last zone to close the chapter, so clean up the cache. That + * it is safe to let the last thread out of the previous chapter to do this + * relies on the fact that although the new open chapter shadows the oldest + * chapter in the cache, until we write the new open chapter to disk, we'll + * never look for it in the cache. + */ + return forgetChapter(zone->index->volume, victim, INVALIDATION_EXPIRE); +} + +/**********************************************************************/ +IndexRegion computeIndexRegion(const IndexZone *zone, + uint64_t virtualChapter) +{ + if (virtualChapter == zone->newestVirtualChapter) { + return LOC_IN_OPEN_CHAPTER; + } + return (isZoneChapterSparse(zone, virtualChapter) + ? 
LOC_IN_SPARSE : LOC_IN_DENSE); +} + +/**********************************************************************/ +int getRecordFromZone(IndexZone *zone, + Request *request, + bool *found, + uint64_t virtualChapter) +{ + if (virtualChapter == zone->newestVirtualChapter) { + searchOpenChapter(zone->openChapter, &request->chunkName, + &request->oldMetadata, found); + return UDS_SUCCESS; + } + + if ((zone->newestVirtualChapter > 0) + && (virtualChapter == (zone->newestVirtualChapter - 1)) + && (zone->writingChapter->size > 0)) { + // Only search the writing chapter if it is full, else look on disk. + searchOpenChapter(zone->writingChapter, &request->chunkName, + &request->oldMetadata, found); + return UDS_SUCCESS; + } + + // The slow lane thread has determined the location previously. We don't need + // to search again. Just return the location. + if (request->slLocationKnown) { + *found = request->slLocation != LOC_UNAVAILABLE; + return UDS_SUCCESS; + } + + Volume *volume = zone->index->volume; + if (isZoneChapterSparse(zone, virtualChapter) + && sparseCacheContains(volume->sparseCache, virtualChapter, + request->zoneNumber)) { + // The named chunk, if it exists, is in a sparse chapter that is cached, + // so just run the chunk through the sparse chapter cache search. + return searchSparseCacheInZone(zone, request, virtualChapter, found); + } + + return searchVolumePageCache(volume, request, &request->chunkName, + virtualChapter, &request->oldMetadata, found); +} + +/**********************************************************************/ +int putRecordInZone(IndexZone *zone, + Request *request, + const UdsChunkData *metadata) +{ + unsigned int remaining; + int result = putOpenChapter(zone->openChapter, &request->chunkName, metadata, + &remaining); + if (result != UDS_SUCCESS) { + return result; + } + + if (remaining == 0) { + return openNextChapter(zone, request); + } + + return UDS_SUCCESS; +} + +/**************************************************************************/ +int searchSparseCacheInZone(IndexZone *zone, + Request *request, + uint64_t virtualChapter, + bool *found) +{ + int recordPageNumber; + int result = searchSparseCache(zone, &request->chunkName, &virtualChapter, + &recordPageNumber); + if ((result != UDS_SUCCESS) || (virtualChapter == UINT64_MAX)) { + return result; + } + + Volume *volume = zone->index->volume; + // XXX map to physical chapter and validate. It would be nice to just pass + // the virtual in to the slow lane, since it's tracking invalidations. + unsigned int chapter + = mapToPhysicalChapter(volume->geometry, virtualChapter); + + return searchCachedRecordPage(volume, request, &request->chunkName, chapter, + recordPageNumber, &request->oldMetadata, + found); +} diff --git a/source/uds/indexZone.h b/source/uds/indexZone.h new file mode 100644 index 0000000..8301894 --- /dev/null +++ b/source/uds/indexZone.h @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
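
To show how the two lookup paths above fit together, here is a simplified, hypothetical caller: it assumes virtualChapter was produced by a master index lookup, which is handled elsewhere (the real dispatch lives in index.c).

static int lookupOrInsert(IndexZone *zone,
                          Request *request,
                          uint64_t virtualChapter,
                          const UdsChunkData *newMetadata)
{
  bool found = false;
  int result = getRecordFromZone(zone, request, &found, virtualChapter);
  if (result != UDS_SUCCESS) {
    return result;
  }
  if (found) {
    // request->oldMetadata now holds the existing record's metadata.
    return UDS_SUCCESS;
  }
  // Not present: add it to the open chapter, which may close the chapter and
  // open the next one.
  return putRecordInZone(zone, request, newMetadata);
}
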
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/indexZone.h#2 $ + */ + +#ifndef INDEX_ZONE_H +#define INDEX_ZONE_H + +#include "common.h" +#include "openChapterZone.h" +#include "request.h" + +typedef struct { + struct index *index; + OpenChapterZone *openChapter; + OpenChapterZone *writingChapter; + uint64_t oldestVirtualChapter; + uint64_t newestVirtualChapter; + unsigned int id; +} IndexZone; + +/** + * Allocate an index zone. + * + * @param index The index receiving the zone + * @param zoneNumber The number of the zone to allocate + * + * @return UDS_SUCCESS or an error code. + **/ +int makeIndexZone(struct index *index, unsigned int zoneNumber) + __attribute__((warn_unused_result)); + +/** + * Clean up an index zone. + * + * @param zone The index zone to free + * + * @return UDS_SUCCESS or an error code. + **/ +void freeIndexZone(IndexZone *zone); + +/** + * Check whether a chapter is sparse or dense based on the current state of + * the index zone. + * + * @param zone The index zone to check against + * @param virtualChapter The virtual chapter number of the chapter to check + * + * @return true if the chapter is in the sparse part of the volume + **/ +bool isZoneChapterSparse(const IndexZone *zone, + uint64_t virtualChapter) + __attribute__((warn_unused_result)); + +/** + * Set the active chapter numbers for a zone based on its index. The active + * chapters consist of the range of chapters from the current oldest to + * the current newest virtual chapter. + * + * @param zone The zone to set + **/ +void setActiveChapters(IndexZone *zone); + +/** + * Dispatch a control request to an index zone. + * + * @param request The request to dispatch + * + * @return UDS_SUCCESS or an error code + **/ +int dispatchIndexZoneControlRequest(Request *request) + __attribute__((warn_unused_result)); + +/** + * Execute a sparse chapter index cache barrier control request on the zone + * worker thread. This call into the sparse cache to coordinate the cache + * update with the other zones. + * + * @param zone The index zone receiving the barrier message + * @param barrier The barrier control message data + * + * @return UDS_SUCCESS or an error code if the chapter index could not be + * read or decoded + **/ +int executeSparseCacheBarrierMessage(IndexZone *zone, + BarrierMessageData *barrier) + __attribute__((warn_unused_result)); + +/** + * Open the next chapter. + * + * @param zone The zone containing the open chapter + * @param request The request which requires the next chapter to be + * opened + * + * @return UDS_SUCCESS if successful. + **/ +int openNextChapter(IndexZone *zone, Request *request) + __attribute__((warn_unused_result)); + +/** + * Determine the IndexRegion in which a block was found. + * + * @param zone The zone that was searched + * @param virtualChapter The virtual chapter number + * + * @return the IndexRegion of the chapter in which the block was found + **/ +IndexRegion computeIndexRegion(const IndexZone *zone, + uint64_t virtualChapter); + +/** + * Get a record from either the volume or the open chapter in a zone. + * + * @param zone The index zone to query + * @param request The request originating the query + * @param found A pointer to a bool which will be set to + * true if the record was found. 
+ * @param virtualChapter The chapter in which to search + * + * @return UDS_SUCCESS or an error code + **/ +int getRecordFromZone(IndexZone *zone, + Request *request, + bool *found, + uint64_t virtualChapter) + __attribute__((warn_unused_result)); + +/** + * Put a record in the open chapter. If this fills the chapter, the chapter + * will be closed and a new one will be opened. + * + * @param zone The index zone containing the chapter + * @param request The request containing the name of the record + * @param metadata The record metadata + * + * @return UDS_SUCCESS or an error + **/ +int putRecordInZone(IndexZone *zone, + Request *request, + const UdsChunkData *metadata) + __attribute__((warn_unused_result)); + +/** + * Search the cached sparse chapter index, either for a cached sparse hook, or + * as the last chance for finding the record named by a request. + * + * @param [in] zone the index zone + * @param [in] request the request originating the search + * @param [in] virtualChapter if UINT64_MAX, search the entire cache; + * otherwise search this chapter, if cached + * @param [out] found A pointer to a bool which will be set to + * true if the record was found + * + * @return UDS_SUCCESS or an error code + **/ +int searchSparseCacheInZone(IndexZone *zone, + Request *request, + uint64_t virtualChapter, + bool *found) + __attribute__((warn_unused_result)); + +#endif /* INDEX_ZONE_H */ diff --git a/source/uds/ioFactory.h b/source/uds/ioFactory.h new file mode 100644 index 0000000..ef6cc90 --- /dev/null +++ b/source/uds/ioFactory.h @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/ioFactory.h#7 $ + */ + +#ifndef IO_FACTORY_H +#define IO_FACTORY_H + +#include "bufferedReader.h" +#include "bufferedWriter.h" +#ifdef __KERNEL__ +#include +#else +#include "fileUtils.h" +#include "ioRegion.h" +#endif + +/* + * An IOFactory object is responsible for controlling access to index storage. + * The index is a contiguous range of blocks on a block device or within a + * file. + * + * The IOFactory holds the open device or file and is responsible for closing + * it. The IOFactory has methods to make IORegions that are used to access + * sections of the index. + */ +typedef struct ioFactory IOFactory; + +/* + * Define the UDS block size as 4K. Historically, we wrote the volume file in + * large blocks, but wrote all the other index data into byte streams stored in + * files. When we converted to writing an index into a block device, we + * changed to writing the byte streams into page sized blocks. Now that we + * support multiple architectures, we write into 4K blocks on all platforms. + * + * XXX We must convert all the rogue 4K constants to use UDS_BLOCK_SIZE. 
+ */ +enum { UDS_BLOCK_SIZE = 4096 }; + +#ifdef __KERNEL__ +/** + * Create an IOFactory. The IOFactory is returned with a reference count of 1. + * + * @param path The path to the block device or file that contains the + * block stream + * @param factoryPtr The IOFactory is returned here + * + * @return UDS_SUCCESS or an error code + **/ +int makeIOFactory(const char *path, IOFactory **factoryPtr) + __attribute__((warn_unused_result)); +#else +/** + * Create an IOFactory. The IOFactory is returned with a reference count of 1. + * + * @param path The path to the block device or file that contains the + * block stream + * @param access The requested access kind. + * @param factoryPtr The IOFactory is returned here + * + * @return UDS_SUCCESS or an error code + **/ +int makeIOFactory(const char *path, + FileAccess access, + IOFactory **factoryPtr) + __attribute__((warn_unused_result)); +#endif + +/** + * Get another reference to an IOFactory, incrementing its reference count. + * + * @param factory The IOFactory + **/ +void getIOFactory(IOFactory *factory); + +/** + * Free a reference to an IOFactory. If the reference count drops to zero, + * free the IOFactory and release all its resources. + * + * @param factory The IOFactory + **/ +void putIOFactory(IOFactory *factory); + +/** + * Get the maximum potential size of the device or file. For a device, this is + * the actual size of the device. For a file, this is the largest file that we + * can possibly write. + * + * @param factory The IOFactory + * + * @return the writable size (in bytes) + **/ +size_t getWritableSize(IOFactory *factory) __attribute__((warn_unused_result)); + +#ifdef __KERNEL__ +/** + * Create a struct dm_bufio_client for a region of the index. + * + * @param factory The IOFactory + * @param offset The byte offset to the region within the index + * @param size The size of a block, in bytes + * @param reservedBuffers The number of buffers that can be reserved + * @param clientPtr The struct dm_bufio_client is returned here + * + * @return UDS_SUCCESS or an error code + **/ +int makeBufio(IOFactory *factory, + off_t offset, + size_t blockSize, + unsigned int reservedBuffers, + struct dm_bufio_client **clientPtr) + __attribute__((warn_unused_result)); +#else +/** + * Create an IORegion for a region of the index. + * + * @param factory The IOFactory + * @param offset The byte offset to the region within the index + * @param size The size in bytes of the region + * @param regionPtr The IORegion is returned here + * + * @return UDS_SUCCESS or an error code + **/ +int makeIORegion(IOFactory *factory, + off_t offset, + size_t size, + IORegion **regionPtr) + __attribute__((warn_unused_result)); +#endif + +/** + * Create a BufferedReader for a region of the index. + * + * @param factory The IOFactory + * @param offset The byte offset to the region within the index + * @param size The size in bytes of the region + * @param regionPtr The IORegion is returned here + * + * @return UDS_SUCCESS or an error code + **/ +int openBufferedReader(IOFactory *factory, + off_t offset, + size_t size, + BufferedReader **readerPtr) + __attribute__((warn_unused_result)); + +/** + * Create a BufferedWriter for a region of the index. 
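
A small usage sketch for the reader side of this API, using the kernel variant of makeIOFactory() shown above. It assumes bufferedReader.h provides freeBufferedReader(); the path, offset, and size are placeholders, with the offset sector-aligned and the size a multiple of UDS_BLOCK_SIZE (per the checks in ioFactoryLinuxKernel.c below).

static int probeOneRegion(const char *path, off_t offset, size_t size)
{
  IOFactory *factory = NULL;
  int result = makeIOFactory(path, &factory);
  if (result != UDS_SUCCESS) {
    return result;
  }
  // openBufferedReader() rejects region sizes that are not 4K multiples.
  BufferedReader *reader = NULL;
  result = openBufferedReader(factory, offset, size, &reader);
  if (result == UDS_SUCCESS) {
    freeBufferedReader(reader);
  }
  putIOFactory(factory);
  return result;
}
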
+ * + * @param factory The IOFactory + * @param offset The byte offset to the region within the index + * @param size The size in bytes of the region + * @param regionPtr The IORegion is returned here + * + * @return UDS_SUCCESS or an error code + **/ +int openBufferedWriter(IOFactory *factory, + off_t offset, + size_t size, + BufferedWriter **writerPtr) + __attribute__((warn_unused_result)); + +#endif // IO_FACTORY_H diff --git a/source/uds/ioFactoryLinuxKernel.c b/source/uds/ioFactoryLinuxKernel.c new file mode 100644 index 0000000..9e45920 --- /dev/null +++ b/source/uds/ioFactoryLinuxKernel.c @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/ioFactoryLinuxKernel.c#9 $ + */ + +#include +#include + +#include "atomicDefs.h" +#include "ioFactory.h" +#include "logger.h" +#include "memoryAlloc.h" + +enum { BLK_FMODE = FMODE_READ | FMODE_WRITE }; + +/* + * A kernel mode IOFactory object controls access to an index stored on a block + * device. + */ +struct ioFactory { + struct block_device *bdev; + atomic_t refCount; +}; + +/*****************************************************************************/ +void getIOFactory(IOFactory *factory) +{ + atomic_inc(&factory->refCount); +} + +/*****************************************************************************/ +int makeIOFactory(const char *path, IOFactory **factoryPtr) +{ + struct block_device *bdev; + dev_t device = name_to_dev_t(path); + if (device != 0) { + bdev = blkdev_get_by_dev(device, BLK_FMODE, NULL); + } else { + bdev = blkdev_get_by_path(path, BLK_FMODE, NULL); + } + if (IS_ERR(bdev)) { + logErrorWithStringError(-PTR_ERR(bdev), "%s is not a block device", path); + return UDS_INVALID_ARGUMENT; + } + + IOFactory *factory; + int result = ALLOCATE(1, IOFactory, __func__, &factory); + if (result != UDS_SUCCESS) { + blkdev_put(bdev, BLK_FMODE); + return result; + } + + factory->bdev = bdev; + atomic_set_release(&factory->refCount, 1); + + *factoryPtr = factory; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +void putIOFactory(IOFactory *factory) +{ + if (atomic_add_return(-1, &factory->refCount) <= 0) { + blkdev_put(factory->bdev, BLK_FMODE); + FREE(factory); + } +} + +/*****************************************************************************/ +size_t getWritableSize(IOFactory *factory) +{ + return i_size_read(factory->bdev->bd_inode); +} + +/*****************************************************************************/ +int makeBufio(IOFactory *factory, + off_t offset, + size_t blockSize, + unsigned int reservedBuffers, + struct dm_bufio_client **clientPtr) +{ + if (offset % SECTOR_SIZE != 0) { + return logErrorWithStringError(UDS_INCORRECT_ALIGNMENT, + "offset %zd not multiple of %d", 
+ offset, SECTOR_SIZE); + } + if (blockSize % UDS_BLOCK_SIZE != 0) { + return logErrorWithStringError(UDS_INCORRECT_ALIGNMENT, + "blockSize %zd not multiple of %d", + blockSize, UDS_BLOCK_SIZE); + } + + struct dm_bufio_client *client = dm_bufio_client_create(factory->bdev, + blockSize, + reservedBuffers, 0, + NULL, NULL); + if (IS_ERR(client)) { + return -PTR_ERR(client); + } + + dm_bufio_set_sector_offset(client, offset >> SECTOR_SHIFT); + *clientPtr = client; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int openBufferedReader(IOFactory *factory, + off_t offset, + size_t size, + BufferedReader **readerPtr) +{ + if (size % UDS_BLOCK_SIZE != 0) { + return logErrorWithStringError(UDS_INCORRECT_ALIGNMENT, + "region size %zd is not multiple of %d", + size, UDS_BLOCK_SIZE); + } + + struct dm_bufio_client *client = NULL; + int result = makeBufio(factory, offset, UDS_BLOCK_SIZE, 1, &client); + if (result != UDS_SUCCESS) { + return result; + } + + result = makeBufferedReader(factory, client, size / UDS_BLOCK_SIZE, + readerPtr); + if (result != UDS_SUCCESS) { + dm_bufio_client_destroy(client); + } + return result; +} + +/*****************************************************************************/ +int openBufferedWriter(IOFactory *factory, + off_t offset, + size_t size, + BufferedWriter **writerPtr) +{ + if (size % UDS_BLOCK_SIZE != 0) { + return logErrorWithStringError(UDS_INCORRECT_ALIGNMENT, + "region size %zd is not multiple of %d", + size, UDS_BLOCK_SIZE); + } + + struct dm_bufio_client *client = NULL; + int result = makeBufio(factory, offset, UDS_BLOCK_SIZE, 1, &client); + if (result != UDS_SUCCESS) { + return result; + } + + result = makeBufferedWriter(factory, client, size / UDS_BLOCK_SIZE, + writerPtr); + if (result != UDS_SUCCESS) { + dm_bufio_client_destroy(client); + } + return result; +} diff --git a/source/uds/layoutRegion.h b/source/uds/layoutRegion.h new file mode 100644 index 0000000..b49f979 --- /dev/null +++ b/source/uds/layoutRegion.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/layoutRegion.h#1 $ + */ + +#ifndef LAYOUT_REGION_H +#define LAYOUT_REGION_H + +/** + * Single file layouts are defined in terms of data regions. Each data region + * is a sub-section of the available space. Some data regions may contain + * subsidiary data regions, for example, a checkpoint or index save will + * contain master index regions (according to the number of zones), an + * index page map region, and possibly an open chapter region. 
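openBufferedReader and openBufferedWriter above share one shape: reject a region size that is not block aligned, create a dm_bufio client, and destroy that client again if building the reader or writer fails. The following is only an illustrative user-space sketch of that validate-then-undo-on-failure pattern; the Client type, makeClient, makeReader and the 4096-byte block size are invented stand-ins for the real UDS types, not part of this patch.

#include <stdio.h>
#include <stdlib.h>

#define MY_BLOCK_SIZE 4096                     /* invented stand-in for UDS_BLOCK_SIZE */

typedef struct { size_t blockSize; } Client;   /* stand-in for the dm_bufio client */

static int makeClient(size_t blockSize, Client **clientPtr)
{
  Client *client = malloc(sizeof(*client));
  if (client == NULL) {
    return -1;
  }
  client->blockSize = blockSize;
  *clientPtr = client;
  return 0;
}

/* Stand-in for building the reader; fails when asked for zero blocks. */
static int makeReader(Client *client, size_t blockCount)
{
  (void) client;
  return (blockCount == 0) ? -1 : 0;
}

static int openReader(size_t size)
{
  if (size % MY_BLOCK_SIZE != 0) {
    fprintf(stderr, "region size %zu is not a multiple of %d\n",
            size, MY_BLOCK_SIZE);
    return -1;
  }

  Client *client = NULL;
  if (makeClient(MY_BLOCK_SIZE, &client) != 0) {
    return -1;
  }

  int result = makeReader(client, size / MY_BLOCK_SIZE);
  if (result != 0) {
    free(client);   /* undo the client if the reader cannot be built */
  }
  /* On success the real code hands the client over to the new reader. */
  return result;
}

int main(void)
{
  return openReader(8 * MY_BLOCK_SIZE);
}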
+ **/ + +static const uint64_t REGION_MAGIC = 0x416c6252676e3031; // 'AlbRgn01' + +typedef struct regionHeader { + uint64_t magic; // REGION_MAGIC + uint64_t regionBlocks; // size of whole region + uint16_t type; // RH_TYPE_... + uint16_t version; // 1 + uint16_t numRegions; // number of layouts in the table + uint16_t payload; // extra data beyond region table +} RegionHeader; + +typedef struct layoutRegion { + uint64_t startBlock; + uint64_t numBlocks; + uint32_t checksum; // only used for save regions + uint16_t kind; + uint16_t instance; +} LayoutRegion; + +typedef struct regionTable { + RegionHeader header; + LayoutRegion regions[]; +} RegionTable; + +#endif // LAYOUT_REGION_H diff --git a/source/uds/loadType.c b/source/uds/loadType.c new file mode 100644 index 0000000..125f8b0 --- /dev/null +++ b/source/uds/loadType.c @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/loadType.c#1 $ + */ + +#include "loadType.h" + +#include "logger.h" + +/**********************************************************************/ +const char *getLoadType(LoadType loadType) +{ + switch (loadType) { + case LOAD_CREATE: + return "creating index"; + case LOAD_LOAD: + return "loading index"; + case LOAD_REBUILD: + return "loading or rebuilding index"; + default: + return "no load method specified"; + } +} diff --git a/source/uds/loadType.h b/source/uds/loadType.h new file mode 100644 index 0000000..2b93e72 --- /dev/null +++ b/source/uds/loadType.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/loadType.h#1 $ + */ + +#ifndef LOAD_TYPE_H +#define LOAD_TYPE_H + +/** + * Methods of starting the index. (Keep getLoadType() in sync.) + * + * Usage number 1 is to note the interface method that initiates loading the + * index. 
As in this table: + * + * name type opened by + * =========== ====== ==================== + * LOAD_CREATE local udsCreateLocalIndex + * LOAD_LOAD local udsLoadLocalIndex + * LOAD_REBUILD local udsRebuildLocalIndex + * + * Usage number 2 is to record how an index was really opened. As in this + * table: + * + * LOAD_CREATE new empty index + * LOAD_LOAD loaded saved index + * LOAD_REPLAY loaded checkpoint and replayed new chapters + * LOAD_EMPTY empty master index from empty volume data + * LOAD_REBUILD rebuilt master index from volume data + **/ +typedef enum { + LOAD_UNDEFINED = 0, + LOAD_CREATE, + LOAD_LOAD, + LOAD_REBUILD, + LOAD_EMPTY, + LOAD_REPLAY, +} LoadType; + +/** + * get a string indicating how an index is to be loaded. + * + * @param loadType The load type to log + **/ +const char *getLoadType(LoadType loadType); + +#endif /* LOAD_TYPE_H */ diff --git a/source/uds/logger.c b/source/uds/logger.c new file mode 100644 index 0000000..311bae1 --- /dev/null +++ b/source/uds/logger.c @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
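For reference, the LoadType values and getLoadType above pair an enum with a human-readable description that the index logs when it starts. A self-contained sketch of that pairing, trimmed to three values and using printf in place of the UDS logger (the logging call is the only invented part):

#include <stdio.h>

/* Reduced copy of the LoadType values and getLoadType shown above. */
typedef enum { LOAD_UNDEFINED = 0, LOAD_CREATE, LOAD_LOAD, LOAD_REBUILD } LoadType;

static const char *getLoadType(LoadType loadType)
{
  switch (loadType) {
  case LOAD_CREATE:  return "creating index";
  case LOAD_LOAD:    return "loading index";
  case LOAD_REBUILD: return "loading or rebuilding index";
  default:           return "no load method specified";
  }
}

int main(void)
{
  /* The index code logs this string before starting the requested load. */
  printf("%s\n", getLoadType(LOAD_REBUILD));
  return 0;
}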
+ * + * $Id: //eng/uds-releases/jasper/src/uds/logger.c#3 $ + */ + +#include "logger.h" + +#include "common.h" +#include "errors.h" +#include "stringUtils.h" +#include "threads.h" +#include "uds.h" + +typedef struct { + const char *name; + const int priority; +} PriorityName; + +static const PriorityName PRIORITIES[] = { + { "ALERT", LOG_ALERT }, + { "CRITICAL", LOG_CRIT }, + { "CRIT", LOG_CRIT }, + { "DEBUG", LOG_DEBUG }, + { "EMERGENCY", LOG_EMERG }, + { "EMERG", LOG_EMERG }, + { "ERROR", LOG_ERR }, + { "ERR", LOG_ERR }, + { "INFO", LOG_INFO }, + { "NOTICE", LOG_NOTICE }, + { "PANIC", LOG_EMERG }, + { "WARN", LOG_WARNING }, + { "WARNING", LOG_WARNING }, + { NULL, -1 }, +}; + +static const char *const PRIORITY_STRINGS[] = { + "EMERGENCY", + "ALERT", + "CRITICAL", + "ERROR", + "WARN", + "NOTICE", + "INFO", + "DEBUG", +}; + +static int logLevel = LOG_INFO; + +/*****************************************************************************/ +int getLogLevel(void) +{ + return logLevel; +} + +/*****************************************************************************/ +void setLogLevel(int newLogLevel) +{ + logLevel = newLogLevel; +} + +/*****************************************************************************/ +int stringToPriority(const char *string) +{ + int i; + for (i = 0; PRIORITIES[i].name != NULL; i++) { + if (strcasecmp(string, PRIORITIES[i].name) == 0) { + return PRIORITIES[i].priority; + } + } + return LOG_INFO; +} + +/*****************************************************************************/ +const char *priorityToString(int priority) +{ + if ((priority < 0) || (priority >= (int) COUNT_OF(PRIORITY_STRINGS))) { + return "unknown"; + } + return PRIORITY_STRINGS[priority]; +} + +/*****************************************************************************/ +void logEmbeddedMessage(int priority, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + ...) +{ + va_list ap; + va_start(ap, fmt2); + logMessagePack(priority, prefix, fmt1, args1, fmt2, ap); + va_end(ap); +} + +#pragma GCC diagnostic push +/* + * GCC (version 8.1.1 20180502 (Red Hat 8.1.1-1)) on Fedora 28 seems + * to think that this function should get a printf format + * attribute. But we have no second format string, and no additional + * arguments at the call site, and GCC also gets unhappy trying to + * analyze the format and values when there are none. So we'll just + * shut it up. + */ +#pragma GCC diagnostic ignored "-Wsuggest-attribute=format" +/** + * Log a message. + * + * This helper function exists solely to create a valid va_list with + * no useful info. It does the real work of vLogMessage, which wants a + * second va_list object to pass down. + * + * @param priority The syslog priority value for the message. + * @param format The format of the message (a printf style format) + * @param args The variadic argument list of format parameters. + **/ +static void vLogMessageHelper(int priority, + const char *format, + va_list args, + ...) +{ + va_list dummy; + va_start(dummy, args); + logMessagePack(priority, NULL, format, args, NULL, dummy); + va_end(dummy); +} +#pragma GCC diagnostic pop + +/*****************************************************************************/ +void vLogMessage(int priority, const char *format, va_list args) +{ + vLogMessageHelper(priority, format, args); +} + +/*****************************************************************************/ +void logMessage(int priority, const char *format, ...) 
+{ + va_list args; + + va_start(args, format); + vLogMessage(priority, format, args); + va_end(args); +} + +/*****************************************************************************/ +void logDebug(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_DEBUG, format, args); + va_end(args); +} + +/*****************************************************************************/ +void logInfo(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_INFO, format, args); + va_end(args); +} + +/*****************************************************************************/ +void logNotice(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_NOTICE, format, args); + va_end(args); +} + +/*****************************************************************************/ +void logWarning(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_WARNING, format, args); + va_end(args); +} + +/*****************************************************************************/ +void logError(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_ERR, format, args); + va_end(args); +} + +/*****************************************************************************/ +int vLogWithStringError(int priority, + int errnum, + const char *format, + va_list args) +{ + char errbuf[ERRBUF_SIZE]; + logEmbeddedMessage(priority, NULL, format, args, ": %s (%d)", + stringError(errnum, errbuf, sizeof(errbuf)), + errnum); + return errnum; +} + +/*****************************************************************************/ +int logWithStringError(int priority, int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(priority, errnum, format, args); + va_end(args); + return errnum; +} + +/*****************************************************************************/ +int logErrorWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_ERR, errnum, format, args); + va_end(args); + return errnum; +} + +/*****************************************************************************/ +int logWarningWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_WARNING, errnum, format, args); + va_end(args); + return errnum; +} + +/*****************************************************************************/ +int logDebugWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_DEBUG, errnum, format, args); + va_end(args); + return errnum; +} + +/*****************************************************************************/ +int logInfoWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_INFO, errnum, format, args); + va_end(args); + return errnum; +} + +/*****************************************************************************/ +int logNoticeWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_NOTICE, errnum, format, args); + va_end(args); + return errnum; +} + +/*****************************************************************************/ +int logFatalWithStringError(int errnum, const char *format, ...) 
+{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_CRIT, errnum, format, args); + va_end(args); + return errnum; +} + +/*****************************************************************************/ +int logUnrecoverable(int errnum, const char *format, ...) +{ + if (isSuccessful(errnum)) { + return errnum; + } + va_list args; + va_start(args, format); + vLogWithStringError(LOG_CRIT, errnum, format, args); + va_end(args); + return makeUnrecoverable(errnum); +} + +/*****************************************************************************/ +void logFatal(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_CRIT, format, args); + va_end(args); +} diff --git a/source/uds/logger.h b/source/uds/logger.h new file mode 100644 index 0000000..b1f9d56 --- /dev/null +++ b/source/uds/logger.h @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/logger.h#5 $ + */ + +#ifndef LOGGER_H +#define LOGGER_H 1 + +#ifdef __KERNEL__ +#include +#include +#else +#include +#include "minisyslog.h" +#endif + +#ifdef __KERNEL__ +#define LOG_EMERG 0 /* system is unusable */ +#define LOG_ALERT 1 /* action must be taken immediately */ +#define LOG_CRIT 2 /* critical conditions */ +#define LOG_ERR 3 /* error conditions */ +#define LOG_WARNING 4 /* warning conditions */ +#define LOG_NOTICE 5 /* normal but significant condition */ +#define LOG_INFO 6 /* informational */ +#define LOG_DEBUG 7 /* debug-level messages */ +#endif + +#ifdef __KERNEL__ +// Make it easy to log real pointer values using %px when in development. +#ifdef LOG_INTERNAL +#define PRIptr "px" +#else +#define PRIptr "pK" +#endif +#else // not __KERNEL__ +// For compatibility with hooks we need when compiling in kernel mode. +#define PRIptr "p" +#endif + +/* + * Apply a rate limiter to a log method call. + * + * @param logFunc A method that does logging, which is not invoked if we are + * running in the kernel and the ratelimiter detects that we + * are calling it frequently. + */ +#ifdef __KERNEL__ +#define logRatelimit(logFunc, ...) \ + do { \ + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) { \ + logFunc(__VA_ARGS__); \ + } \ + } while (0) +#else +#define logRatelimit(logFunc, ...) logFunc(__VA_ARGS__) +#endif + +/** + * @file + * + * All of the log() functions will preserve the callers value of errno. + **/ + +#ifndef __KERNEL__ +/* + * In user mode, the functions in this file are not thread safe in the sense + * that nothing prevents multiple threads from closing loggers out from under + * other threads. In reality this isn't a problem since there are no calls to + * closeLogger() in production code. + */ + +/** + * Start the logger. 
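In user space the logRatelimit macro above is a plain pass-through; only kernel builds wrap the call in a rate limiter. A minimal sketch of the user-space expansion, using printf as a stand-in for the logInfo/logWarning functions declared later in this header:

#include <stdio.h>

/* User-space form of the macro above: no rate limiting, just the call. */
#define logRatelimit(logFunc, ...) logFunc(__VA_ARGS__)

int main(void)
{
  int chapter = 42;
  /* In the index code the first argument would be logInfo or logWarning. */
  logRatelimit(printf, "expiring chapter %d early\n", chapter);
  return 0;
}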
+ **/ +void openLogger(void); + +/** + * Stop the logger. + **/ +void closeLogger(void); +#endif + +/** + * Get the current logging level. + * + * @return the current logging priority level. + **/ +int getLogLevel(void); + +/** + * Set the current logging level. + * + * @param newLogLevel the new value for the logging priority level. + **/ +void setLogLevel(int newLogLevel); + +/** + * Return the integer logging priority represented by a name. + * + * @param string the name of the logging priority (case insensitive). + * + * @return the integer priority named by string, or LOG_INFO if not recognized. + **/ +int stringToPriority(const char *string); + +/** + * Return the printable name of a logging priority. + * + * @return the priority name + **/ +const char *priorityToString(int priority); + +/** + * Log a debug message. + * + * @param format The format of the message (a printf style format) + **/ +void logDebug(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log an informational message. + * + * @param format The format of the message (a printf style format) + **/ +void logInfo(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log a normal (but notable) condition. + * + * @param format The format of the message (a printf style format) + **/ +void logNotice(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log a warning. + * + * @param format The format of the message (a printf style format) + **/ +void logWarning(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log an error. + * + * @param format The format of the message (a printf style format) + **/ +void logError(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log a message embedded within another message. + * + * @param priority the priority at which to log the message + * @param prefix optional string prefix to message, may be NULL + * @param fmt1 format of message first part, may be NULL + * @param args1 arguments for message first part + * @param fmt2 format of message second part + **/ +void logEmbeddedMessage(int priority, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + ...) + __attribute__((format(printf, 3, 0), format(printf, 5, 6))); + +/** + * Log a message pack consisting of multiple variable sections. + * + * @param priority the priority at which to log the message + * @param prefix optional string prefix to message, may be NULL + * @param fmt1 format of message first part, may be NULL + * @param args1 arguments for message first part + * @param fmt2 format of message second part, may be NULL + * @param args2 arguments for message second part + **/ +void logMessagePack(int priority, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + va_list args2) + __attribute__((format(printf, 3, 0))); + +/** + * Log a stack backtrace. + * + * @param priority The priority at which to log the backtrace + **/ +void logBacktrace(int priority); + +/** + * Log a message with an error from an error code. + * + * @param priority The priority of the logging entry + * @param errnum Int value of errno or a UDS_* value. + * @param format The format of the message (a printf style format) + * + * @return errnum + **/ +int logWithStringError(int priority, int errnum, const char *format, ...) + __attribute__((format(printf, 3, 4))); + +/** + * Log a message with an error from an error code. 
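The __attribute__((format(printf, 1, 2))) annotations on the declarations above are what let the compiler check variadic log calls against their format strings. A small standalone illustration with a hypothetical wrapper named myLog (a GCC/Clang extension; the wrapper itself is not part of this patch):

#include <stdarg.h>
#include <stdio.h>

/* "Argument 1 is a printf format string; the values to check start at 2." */
static void myLog(const char *format, ...) __attribute__((format(printf, 1, 2)));

static void myLog(const char *format, ...)
{
  va_list args;
  va_start(args, format);
  vfprintf(stderr, format, args);
  va_end(args);
}

int main(void)
{
  myLog("loaded %u chapters\n", 128u);
  /* myLog("loaded %u chapters\n", "oops");  // would draw a -Wformat warning */
  return 0;
}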
+ * + * @param priority The priority of the logging entry + * @param errnum Int value of errno or a UDS_* value. + * @param format The format of the message (a printf style format) + * @param args The list of arguments with format. + * + * @return errnum + **/ +int vLogWithStringError(int priority, + int errnum, + const char *format, + va_list args) + __attribute__((format(printf, 3, 0))); + +/** + * Log an error prefixed with the string associated with the errnum. + * + * @param errnum Int value of errno or a UDS_* value. + * @param format The format of the message (a printf style format) + * + * @return errnum + **/ +int logErrorWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logDebugWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logInfoWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logNoticeWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logWarningWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logFatalWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/** + * IF the result is an error, log a FATAL level message and return the result + * after marking it unrecoverable. The UDS_SUCCESS and UDS_QUEUED results are + * not considered errors and are returned unmodified. + * + * @param errnum int value of errno or a UDS_* value. + * @param format The format of the message (a printf style format) + * + * @return makeUnrecoverable(errnum) or UDS_SUCCESS or UDS_QUEUED + **/ +int logUnrecoverable(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/** + * Log a fatal error. + * + * @param format The format of the message (a printf style format) + **/ +void logFatal(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log a message -- for internal use only. + * + * @param priority The syslog priority value for the message. + * @param format The format of the message (a printf style format) + * @param args The variadic argument list of format parameters. + **/ +void vLogMessage(int priority, const char *format, va_list args) + __attribute__((format(printf, 2, 0))); + +/** + * Log a message + * + * @param priority The syslog priority value for the message. + * @param format The format of the message (a printf style format) + **/ +void logMessage(int priority, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/** + * Sleep or delay a short time (likely a few milliseconds) in an attempt allow + * the log buffers to be written out in case they might be overrun. This is + * unnecessary in user-space (and is a no-op there), but is needed when + * quickly issuing a lot of log output in the Linux kernel, as when dumping a + * large number of data structures. 
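Each ...WithStringError helper above returns the errnum it was given, so callers can log a failure and propagate the code in one statement, as the ioFactory and master index code elsewhere in this patch does. A standalone sketch of that idiom with a hypothetical logAndReturn helper standing in for logErrorWithStringError:

#include <stdio.h>

/* Hypothetical stand-in: log the failure, then hand the error code back
 * so the caller can return it directly. */
static int logAndReturn(int errnum, const char *message)
{
  fprintf(stderr, "%s (%d)\n", message, errnum);
  return errnum;
}

static int doWork(int size, int blockSize)
{
  if (size % blockSize != 0) {
    return logAndReturn(-22, "region size is not a multiple of the block size");
  }
  return 0;
}

int main(void)
{
  (void) doWork(4097, 4096);   /* misaligned: logs and returns -22 */
  return doWork(8192, 4096);   /* aligned: returns 0 */
}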
+ **/ +void pauseForLogger(void); + +#endif /* LOGGER_H */ diff --git a/source/uds/loggerLinuxKernel.c b/source/uds/loggerLinuxKernel.c new file mode 100644 index 0000000..bb1ad0b --- /dev/null +++ b/source/uds/loggerLinuxKernel.c @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/loggerLinuxKernel.c#2 $ + */ + +#include +#include +#include +#include + +#include "logger.h" + +/**********************************************************************/ +static const char *priorityToLogLevel(int priority) +{ + switch (priority) { + case LOG_EMERG: + case LOG_ALERT: + case LOG_CRIT: + return KERN_CRIT; + case LOG_ERR: + return KERN_ERR; + case LOG_WARNING: + return KERN_WARNING; + case LOG_NOTICE: + return KERN_NOTICE; + case LOG_INFO: + return KERN_INFO; + case LOG_DEBUG: + return KERN_DEBUG; + default: + return ""; + } +} + +/**********************************************************************/ +static const char *getCurrentInterruptType(void) +{ + if (in_nmi()) { + return "NMI"; + } + if (in_irq()) { + return "HI"; + } + if (in_softirq()) { + return "SI"; + } + return "INTR"; +} + +/**********************************************************************/ +void logMessagePack(int priority, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + va_list args2) +{ + if (priority > getLogLevel()) { + return; + } + + /* + * The kernel's printk has some magic for indirection to a secondary + * va_list. It wants us to supply a pointer to the va_list. + * + * However, va_list varies across platforms and can be an array + * type, which makes passing it around as an argument kind of + * tricky, due to the automatic conversion to a pointer. This makes + * taking the address of the argument a dicey thing; if we use "&a" + * it works fine for non-array types, but for array types we get the + * address of a pointer. Functions like va_copy and sprintf don't + * care as they get "va_list" values passed and are written to do + * the right thing, but printk explicitly wants the address of the + * va_list. + * + * So, we copy the va_list values to ensure that "&" consistently + * works the way we want. + */ + va_list args1Copy; + va_copy(args1Copy, args1); + va_list args2Copy; + va_copy(args2Copy, args2); + struct va_format vaf1 = { + .fmt = (fmt1 != NULL) ? fmt1 : "", + .va = &args1Copy, + }; + struct va_format vaf2 = { + .fmt = (fmt2 != NULL) ? fmt2 : "", + .va = &args2Copy, + }; + + if (prefix == NULL) { + prefix = ""; + } + + /* + * Context info formats: + * + * interrupt: uds[NMI]: blah + * process: uds: myprog: blah + * + * Fields: module name, interrupt level or process name. 
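The va_copy precaution described in the comment above is not specific to printk: any code that reuses a va_list, or needs its address portably, should work on a copy. A user-space illustration using vsnprintf (the logTwice function is invented for the example):

#include <stdarg.h>
#include <stdio.h>

static void logTwice(const char *fmt, ...)
{
  va_list args;
  va_start(args, fmt);

  /* Copy before each use: a va_list may be an array type, and it may not
   * be consumed twice without being re-copied. */
  va_list copy;
  char line[128];

  va_copy(copy, args);
  vsnprintf(line, sizeof(line), fmt, copy);
  va_end(copy);
  printf("first:  %s\n", line);

  va_copy(copy, args);
  vsnprintf(line, sizeof(line), fmt, copy);
  va_end(copy);
  printf("second: %s\n", line);

  va_end(args);
}

int main(void)
{
  logTwice("chapter %d of %d", 7, 1024);
  return 0;
}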
+ * + * XXX need the equivalent of VDO's deviceInstance here + */ + if (in_interrupt()) { + printk("%s%s[%s]: %s%pV%pV\n", priorityToLogLevel(priority), + THIS_MODULE->name, getCurrentInterruptType(), prefix, &vaf1, &vaf2); + } else { + printk("%s%s: %s: %s%pV%pV\n", priorityToLogLevel(priority), + THIS_MODULE->name, current->comm, prefix, &vaf1, &vaf2); + } + + va_end(args1Copy); + va_end(args2Copy); +} + +/**********************************************************************/ +void logBacktrace(int priority) +{ + if (priority > getLogLevel()) { + return; + } + logMessage(priority, "[backtrace]"); + dump_stack(); +} + +/**********************************************************************/ +void pauseForLogger(void) +{ + // Hopefully, a few milliseconds of sleep will be large enough + // for the kernel log buffer to be flushed. + msleep(4); +} diff --git a/source/uds/masterIndex005.c b/source/uds/masterIndex005.c new file mode 100644 index 0000000..3f9a5b2 --- /dev/null +++ b/source/uds/masterIndex005.c @@ -0,0 +1,1470 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/masterIndex005.c#3 $ + */ +#include "masterIndex005.h" + +#include "buffer.h" +#include "compiler.h" +#include "errors.h" +#include "hashUtils.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "uds.h" +#include "zone.h" + +/* + * The master index is a kept as a delta index where the payload is a + * chapter number. The master index adds 2 basic functions to the delta + * index: + * + * (1) How to get the delta list number and address out of the chunk name. + * + * (2) Dealing with chapter numbers, and especially the lazy flushing of + * chapters from the index. + * + * There are three ways of expressing chapter numbers: virtual, index, and + * rolling. The interface to the the master index uses virtual chapter + * numbers, which are 64 bits long. We do not store such large values in + * memory, so we internally use a binary value using the minimal number of + * bits. + * + * The delta index stores the index chapter number, which is the low-order + * bits of the virtual chapter number. + * + * When we need to deal with ordering of index chapter numbers, we roll the + * index chapter number around so that the smallest one we are using has + * the representation 0. See convertIndexToVirtual() or + * flushInvalidEntries() for an example of this technique. 
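The rolling-chapter technique described above depends on the chapter mask being one less than a power of two, so that subtracting the low chapter and masking wraps correctly even when the stored index chapter is numerically smaller. A standalone sketch of the conversion, assuming a made-up 10-bit chapter field rather than the real index configuration:

#include <stdint.h>
#include <stdio.h>

#define CHAPTER_MASK 0x3ffu   /* invented: 10 bits of index chapter */

/* Recover the 64-bit virtual chapter from its truncated index form, given
 * the lowest virtual chapter currently indexed (the same arithmetic as
 * convertIndexToVirtual above). */
static uint64_t convertIndexToVirtual(unsigned int indexChapter,
                                      uint64_t virtualChapterLow)
{
  unsigned int rolling = (indexChapter - (unsigned int) virtualChapterLow)
                         & CHAPTER_MASK;
  return virtualChapterLow + rolling;
}

int main(void)
{
  uint64_t low = 5000;                        /* lowest indexed virtual chapter */
  unsigned int stored = 5321u & CHAPTER_MASK; /* what the delta index keeps */
  printf("recovered virtual chapter: %llu\n",
         (unsigned long long) convertIndexToVirtual(stored, low));  /* 5321 */
  return 0;
}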
+ */ + +typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) masterIndexZone { + uint64_t virtualChapterLow; // The lowest virtual chapter indexed + uint64_t virtualChapterHigh; // The highest virtual chapter indexed + long numEarlyFlushes; // The number of early flushes +} MasterIndexZone; + +typedef struct { + MasterIndex common; // Common master index methods + DeltaIndex deltaIndex; // The delta index + uint64_t *flushChapters; // The first chapter to be flushed + MasterIndexZone *masterZones; // The Zones + uint64_t volumeNonce; // The volume nonce + uint64_t chapterZoneBits; // Expected size of a chapter (per zone) + uint64_t maxZoneBits; // Maximum size index (per zone) + unsigned int addressBits; // Number of bits in address mask + unsigned int addressMask; // Mask to get address within delta list + unsigned int chapterBits; // Number of bits in chapter number + unsigned int chapterMask; // Largest storable chapter number + unsigned int numChapters; // Number of chapters used + unsigned int numDeltaLists; // The number of delta lists + unsigned int numZones; // The number of zones +} MasterIndex5; + +typedef struct chapterRange { + unsigned int chapterStart; // The first chapter + unsigned int chapterCount; // The number of chapters +} ChapterRange; + +// Constants for the magic byte of a MasterIndexRecord +static const byte masterIndexRecordMagic = 0xAA; +static const byte badMagic = 0; + +/* + * In production, the default value for minMasterIndexDeltaLists will be + * replaced by MAX_ZONES*MAX_ZONES. Some unit tests will replace + * minMasterIndexDeltaLists with the non-default value 1, because those + * tests really want to run with a single delta list. + */ +unsigned int minMasterIndexDeltaLists; + +/** + * Maximum of two unsigned ints + * + * @param a One unsigned int + * @param b Another unsigned int + * + * @return the bigger one + **/ +static INLINE unsigned int maxUint(unsigned int a, unsigned int b) +{ + return a > b ? a : b; +} + +/** + * Extract the address from a block name. + * + * @param mi5 The master index + * @param name The block name + * + * @return the address + **/ +static INLINE unsigned int extractAddress(const MasterIndex5 *mi5, + const UdsChunkName *name) +{ + return extractMasterIndexBytes(name) & mi5->addressMask; +} + +/** + * Extract the delta list number from a block name. + * + * @param mi5 The master index + * @param name The block name + * + * @return the delta list number + **/ +static INLINE unsigned int extractDListNum(const MasterIndex5 *mi5, + const UdsChunkName *name) +{ + uint64_t bits = extractMasterIndexBytes(name); + return (bits >> mi5->addressBits) % mi5->numDeltaLists; +} + +/** + * Get the master index zone containing a given master index record + * + * @param record The master index record + * + * @return the master index zone + **/ +static INLINE const MasterIndexZone *getMasterZone(const MasterIndexRecord *record) +{ + const MasterIndex5 *mi5 = container_of(record->masterIndex, MasterIndex5, + common); + return &mi5->masterZones[record->zoneNumber]; +} + +/** + * Convert an index chapter number to a virtual chapter number. 
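extractAddress and extractDListNum above split the same block-name bits into two fields: the low addressBits give the address within a delta list, and the bits above them, taken modulo the number of delta lists, pick the list. An illustrative sketch with invented field widths (the real values come from the index configuration):

#include <stdint.h>
#include <stdio.h>

#define ADDRESS_BITS    12u                        /* invented width */
#define ADDRESS_MASK    ((1u << ADDRESS_BITS) - 1)
#define NUM_DELTA_LISTS 1024u                      /* invented count */

static unsigned int extractAddress(uint64_t nameBits)
{
  return (unsigned int) (nameBits & ADDRESS_MASK);
}

static unsigned int extractDListNum(uint64_t nameBits)
{
  return (unsigned int) ((nameBits >> ADDRESS_BITS) % NUM_DELTA_LISTS);
}

int main(void)
{
  uint64_t nameBits = 0x123456789abcdef0ULL;  /* stand-in for the hashed name */
  printf("address %u, delta list %u\n",
         extractAddress(nameBits), extractDListNum(nameBits));
  return 0;
}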
+ * + * @param record The master index record + * @param indexChapter The index chapter number + * + * @return the virtual chapter number + **/ +static INLINE uint64_t convertIndexToVirtual(const MasterIndexRecord *record, + unsigned int indexChapter) +{ + const MasterIndex5 *mi5 = container_of(record->masterIndex, MasterIndex5, + common); + const MasterIndexZone *masterZone = getMasterZone(record); + unsigned int rollingChapter + = ((indexChapter - masterZone->virtualChapterLow) & mi5->chapterMask); + return masterZone->virtualChapterLow + rollingChapter; +} + +/** + * Convert a virtual chapter number to an index chapter number. + * + * @param mi5 The master index + * @param virtualChapter The virtual chapter number + * + * @return the index chapter number + **/ +static INLINE unsigned int convertVirtualToIndex(const MasterIndex5 *mi5, + uint64_t virtualChapter) +{ + return virtualChapter & mi5->chapterMask; +} + +/** + * Determine whether a virtual chapter number is in the range being indexed + * + * @param record The master index record + * @param virtualChapter The virtual chapter number + * + * @return true if the virtual chapter number is being indexed + **/ +static INLINE bool isVirtualChapterIndexed(const MasterIndexRecord *record, + uint64_t virtualChapter) +{ + const MasterIndexZone *masterZone = getMasterZone(record); + return ((virtualChapter >= masterZone->virtualChapterLow) + && (virtualChapter <= masterZone->virtualChapterHigh)); +} + +/***********************************************************************/ +/** + * Flush an invalid entry from the master index, advancing to the next + * valid entry. + * + * @param record Updated to describe the next valid record + * @param flushRange Range of chapters to flush from the index + * @param nextChapterToInvalidate Updated to record the next chapter that we + * will need to invalidate + * + * @return UDS_SUCCESS or an error code + **/ +static INLINE int flushInvalidEntries(MasterIndexRecord *record, + ChapterRange *flushRange, + unsigned int *nextChapterToInvalidate) +{ + const MasterIndex5 *mi5 = container_of(record->masterIndex, MasterIndex5, + common); + int result = nextDeltaIndexEntry(&record->deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + while (!record->deltaEntry.atEnd) { + unsigned int indexChapter = getDeltaEntryValue(&record->deltaEntry); + unsigned int relativeChapter = ((indexChapter - flushRange->chapterStart) + & mi5->chapterMask); + if (likely(relativeChapter >= flushRange->chapterCount)) { + if (relativeChapter < *nextChapterToInvalidate) { + *nextChapterToInvalidate = relativeChapter; + } + break; + } + result = removeDeltaIndexEntry(&record->deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + } + return UDS_SUCCESS; +} + +/** + * Find the delta index entry, or the insertion point for a delta index + * entry, while processing chapter LRU flushing. 
+ * + * @param record Updated to describe the entry being looked for + * @param listNumber The delta list number + * @param key The address field being looked for + * @param flushRange The range of chapters to flush from the index + * + * @return UDS_SUCCESS or an error code + **/ +static int getMasterIndexEntry(MasterIndexRecord *record, + unsigned int listNumber, + unsigned int key, + ChapterRange *flushRange) +{ + const MasterIndex5 *mi5 = container_of(record->masterIndex, MasterIndex5, + common); + unsigned int nextChapterToInvalidate = mi5->chapterMask; + + int result = startDeltaIndexSearch(&mi5->deltaIndex, listNumber, 0, + false, &record->deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + do { + result = flushInvalidEntries(record, flushRange, &nextChapterToInvalidate); + if (result != UDS_SUCCESS) { + return result; + } + } while (!record->deltaEntry.atEnd && (key > record->deltaEntry.key)); + + result = rememberDeltaIndexOffset(&record->deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + + // We probably found the record we want, but we need to keep going + MasterIndexRecord otherRecord = *record; + if (!otherRecord.deltaEntry.atEnd && (key == otherRecord.deltaEntry.key)) { + for (;;) { + result = flushInvalidEntries(&otherRecord, flushRange, + &nextChapterToInvalidate); + if (result != UDS_SUCCESS) { + return result; + } + if (otherRecord.deltaEntry.atEnd + || !otherRecord.deltaEntry.isCollision) { + break; + } + byte collisionName[UDS_CHUNK_NAME_SIZE]; + result = getDeltaEntryCollision(&otherRecord.deltaEntry, collisionName); + if (result != UDS_SUCCESS) { + return result; + } + if (memcmp(collisionName, record->name, UDS_CHUNK_NAME_SIZE) == 0) { + // This collision record is the one we are looking for + *record = otherRecord; + break; + } + } + } + while (!otherRecord.deltaEntry.atEnd) { + result = flushInvalidEntries(&otherRecord, flushRange, + &nextChapterToInvalidate); + if (result != UDS_SUCCESS) { + return result; + } + } + nextChapterToInvalidate += flushRange->chapterStart; + nextChapterToInvalidate &= mi5->chapterMask; + flushRange->chapterStart = nextChapterToInvalidate; + flushRange->chapterCount = 0; + return UDS_SUCCESS; +} + +/***********************************************************************/ +/** + * Terminate and clean up the master index + * + * @param masterIndex The master index to terminate + **/ +static void freeMasterIndex_005(MasterIndex *masterIndex) +{ + if (masterIndex != NULL) { + MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); + FREE(mi5->flushChapters); + mi5->flushChapters = NULL; + FREE(mi5->masterZones); + mi5->masterZones = NULL; + uninitializeDeltaIndex(&mi5->deltaIndex); + FREE(masterIndex); + } +} + +/** + * Constants and structures for the saved master index file. "MI5" is for + * masterIndex005, and "-XXXX" is a number to increment when the format of + * the data changes. + **/ +enum { MAGIC_SIZE = 8 }; +static const char MAGIC_MI_START[] = "MI5-0005"; + +struct mi005_data { + char magic[MAGIC_SIZE]; // MAGIC_MI_START + uint64_t volumeNonce; + uint64_t virtualChapterLow; + uint64_t virtualChapterHigh; + unsigned int firstList; + unsigned int numLists; +}; + +/***********************************************************************/ +/** + * Set the tag value used when saving and/or restoring a master index. 
+ * + * @param masterIndex The master index + * @param tag The tag value + **/ +static void setMasterIndexTag_005(MasterIndex *masterIndex, byte tag) +{ + MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); + setDeltaIndexTag(&mi5->deltaIndex, tag); +} + +/***********************************************************************/ +__attribute__((warn_unused_result)) +static int encodeMasterIndexHeader(Buffer *buffer, struct mi005_data *header) +{ + int result = putBytes(buffer, MAGIC_SIZE, MAGIC_MI_START); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, header->volumeNonce); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, header->virtualChapterLow); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt64LEIntoBuffer(buffer, header->virtualChapterHigh); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, header->firstList); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, header->numLists); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == sizeof(struct mi005_data), + "%zu bytes of config written, of %zu expected", + contentLength(buffer), sizeof(struct mi005_data)); + return result; +} + +/** + * Start saving a master index to a buffered output stream. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * @param bufferedWriter The index state component being written + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static int startSavingMasterIndex_005(const MasterIndex *masterIndex, + unsigned int zoneNumber, + BufferedWriter *bufferedWriter) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + MasterIndexZone *masterZone = &mi5->masterZones[zoneNumber]; + unsigned int firstList = getDeltaIndexZoneFirstList(&mi5->deltaIndex, + zoneNumber); + unsigned int numLists = getDeltaIndexZoneNumLists(&mi5->deltaIndex, + zoneNumber); + + struct mi005_data header; + memset(&header, 0, sizeof(header)); + memcpy(header.magic, MAGIC_MI_START, MAGIC_SIZE); + header.volumeNonce = mi5->volumeNonce; + header.virtualChapterLow = masterZone->virtualChapterLow; + header.virtualChapterHigh = masterZone->virtualChapterHigh; + header.firstList = firstList; + header.numLists = numLists; + + Buffer *buffer; + int result = makeBuffer(sizeof(struct mi005_data), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = encodeMasterIndexHeader(buffer, &header); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = writeToBufferedWriter(bufferedWriter, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to write master index header"); + } + result = makeBuffer(numLists * sizeof(uint64_t), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + uint64_t *firstFlushChapter = &mi5->flushChapters[firstList]; + result = putUInt64LEsIntoBuffer(buffer, numLists, firstFlushChapter); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = writeToBufferedWriter(bufferedWriter, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to write master index flush " + "ranges"); + 
} + + return startSavingDeltaIndex(&mi5->deltaIndex, zoneNumber, bufferedWriter); +} + +/***********************************************************************/ +/** + * Have all the data been written while saving a master index to an output + * stream? If the answer is yes, it is still necessary to call + * finishSavingMasterIndex(), which will return quickly. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return true if all the data are written + **/ +static bool isSavingMasterIndexDone_005(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + return isSavingDeltaIndexDone(&mi5->deltaIndex, zoneNumber); +} + +/***********************************************************************/ +/** + * Finish saving a master index to an output stream. Force the writing of + * all of the remaining data. If an error occurred asynchronously during + * the save operation, it will be returned here. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static int finishSavingMasterIndex_005(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + return finishSavingDeltaIndex(&mi5->deltaIndex, zoneNumber); +} + +/***********************************************************************/ +/** + * Abort saving a master index to an output stream. If an error occurred + * asynchronously during the save operation, it will be dropped. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static int abortSavingMasterIndex_005(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + return abortSavingDeltaIndex(&mi5->deltaIndex, zoneNumber); +} + +/***********************************************************************/ +__attribute__((warn_unused_result)) +static int decodeMasterIndexHeader(Buffer *buffer, struct mi005_data *header) +{ + int result = getBytesFromBuffer(buffer, sizeof(header->magic), + &header->magic); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &header->volumeNonce); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &header->virtualChapterLow); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt64LEFromBuffer(buffer, &header->virtualChapterHigh); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &header->firstList); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &header->numLists); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, + "%zu bytes decoded of %zu expected", + bufferLength(buffer) - contentLength(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + result = UDS_CORRUPT_COMPONENT; + } + return result; +} + +/** + * Start restoring the master index from multiple buffered readers + * + * @param masterIndex The master index to restore into + * @param bufferedReaders The buffered readers to read the master index from + * @param numReaders The number of buffered 
readers + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static int startRestoringMasterIndex_005(MasterIndex *masterIndex, + BufferedReader **bufferedReaders, + int numReaders) +{ + if (masterIndex == NULL) { + return logWarningWithStringError(UDS_BAD_STATE, + "cannot restore to null master index"); + } + MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); + emptyDeltaIndex(&mi5->deltaIndex); + + uint64_t virtualChapterLow = 0; + uint64_t virtualChapterHigh = 0; + int i; + for (i = 0; i < numReaders; i++) { + Buffer *buffer; + int result = makeBuffer(sizeof(struct mi005_data), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(bufferedReaders[i], + getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return logWarningWithStringError(result, + "failed to read master index header"); + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + struct mi005_data header; + result = decodeMasterIndexHeader(buffer, &header); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + if (memcmp(header.magic, MAGIC_MI_START, MAGIC_SIZE) != 0) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "master index file had bad magic" + " number"); + } + if (mi5->volumeNonce == 0) { + mi5->volumeNonce = header.volumeNonce; + } else if (header.volumeNonce != mi5->volumeNonce) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "master index volume nonce incorrect"); + } + if (i == 0) { + virtualChapterLow = header.virtualChapterLow; + virtualChapterHigh = header.virtualChapterHigh; + } else if (virtualChapterHigh != header.virtualChapterHigh) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "Inconsistent master index zone files:" + " Chapter range is [%llu,%" + PRIu64 "], chapter range %d is [%" + PRIu64 ",%llu]", + virtualChapterLow, virtualChapterHigh, + i, header.virtualChapterLow, + header.virtualChapterHigh); + } else if (virtualChapterLow < header.virtualChapterLow) { + virtualChapterLow = header.virtualChapterLow; + } + uint64_t *firstFlushChapter = &mi5->flushChapters[header.firstList]; + result = makeBuffer(header.numLists * sizeof(uint64_t), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(bufferedReaders[i], + getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return logWarningWithStringError(result, + "failed to read master index flush" + " ranges"); + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = getUInt64LEsFromBuffer(buffer, header.numLists, + firstFlushChapter); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + } + + unsigned int z; + for (z = 0; z < mi5->numZones; z++) { + memset(&mi5->masterZones[z], 0, sizeof(MasterIndexZone)); + mi5->masterZones[z].virtualChapterLow = virtualChapterLow; + mi5->masterZones[z].virtualChapterHigh = virtualChapterHigh; + } + + int result = startRestoringDeltaIndex(&mi5->deltaIndex, bufferedReaders, + numReaders); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, "restoring delta index failed"); + } + return UDS_SUCCESS; +} + +/***********************************************************************/ +/** + * Have all the data been read while 
restoring a master index from an + * input stream? + * + * @param masterIndex The master index to restore into + * + * @return true if all the data are read + **/ +static bool isRestoringMasterIndexDone_005(const MasterIndex *masterIndex) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + return isRestoringDeltaIndexDone(&mi5->deltaIndex); +} + +/***********************************************************************/ +/** + * Restore a saved delta list + * + * @param masterIndex The master index to restore into + * @param dlsi The DeltaListSaveInfo describing the delta list + * @param data The saved delta list bit stream + * + * @return error code or UDS_SUCCESS + **/ +static int restoreDeltaListToMasterIndex_005(MasterIndex *masterIndex, + const DeltaListSaveInfo *dlsi, + const byte data[DELTA_LIST_MAX_BYTE_COUNT]) +{ + MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); + return restoreDeltaListToDeltaIndex(&mi5->deltaIndex, dlsi, data); +} + +/***********************************************************************/ +/** + * Abort restoring a master index from an input stream. + * + * @param masterIndex The master index + **/ +static void abortRestoringMasterIndex_005(MasterIndex *masterIndex) +{ + MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); + abortRestoringDeltaIndex(&mi5->deltaIndex); +} + +/***********************************************************************/ +static void removeNewestChapters(MasterIndex5 *mi5, + unsigned int zoneNumber, + uint64_t virtualChapter) +{ + // Get the range of delta lists belonging to this zone + unsigned int firstList = getDeltaIndexZoneFirstList(&mi5->deltaIndex, + zoneNumber); + unsigned int numLists = getDeltaIndexZoneNumLists(&mi5->deltaIndex, + zoneNumber); + unsigned int lastList = firstList + numLists - 1; + + if (virtualChapter > mi5->chapterMask) { + // The virtual chapter number is large enough so that we can use the + // normal LRU mechanism without an unsigned underflow. + virtualChapter -= mi5->chapterMask + 1; + // Eliminate the newest chapters by renumbering them to become the + // oldest chapters + unsigned int i; + for (i = firstList; i <= lastList; i++) { + if (virtualChapter < mi5->flushChapters[i]) { + mi5->flushChapters[i] = virtualChapter; + } + } + } else { + // Underflow will prevent the fast path. Do it the slow and painful way. + MasterIndexZone *masterZone = &mi5->masterZones[zoneNumber]; + ChapterRange range; + range.chapterStart = convertVirtualToIndex(mi5, virtualChapter); + range.chapterCount = (mi5->chapterMask + 1 + - (virtualChapter - masterZone->virtualChapterLow)); + UdsChunkName name; + memset(&name, 0, sizeof(UdsChunkName)); + MasterIndexRecord record = (MasterIndexRecord) { + .magic = masterIndexRecordMagic, + .masterIndex = &mi5->common, + .name = &name, + .zoneNumber = zoneNumber, + }; + unsigned int i; + for (i = firstList; i <= lastList; i++) { + ChapterRange tempRange = range; + getMasterIndexEntry(&record, i, 0, &tempRange); + } + } +} + +/***********************************************************************/ +/** + * Set the open chapter number on a zone. The master index zone will be + * modified to index the proper number of chapters ending with the new open + * chapter. 
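removeNewestChapters above takes its fast renumbering path only when virtualChapter exceeds the chapter mask, because the subtraction would otherwise wrap a 64-bit unsigned value; the same guard appears below when the new low chapter is computed. A tiny standalone illustration of that hazard (the values are made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
  uint64_t virtualChapter = 3;
  uint64_t numChapters    = 1024;

  /* Naive: wraps to an enormous value whenever virtualChapter < numChapters. */
  uint64_t wrong = virtualChapter - numChapters + 1;

  /* Guarded, in the style of the open-chapter code below. */
  uint64_t right = (virtualChapter >= numChapters)
                   ? virtualChapter - numChapters + 1
                   : 0;

  printf("naive: %llu, guarded: %llu\n",
         (unsigned long long) wrong, (unsigned long long) right);
  return 0;
}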
+ * + * @param masterIndex The master index + * @param zoneNumber The zone number + * @param virtualChapter The new open chapter number + **/ +static void setMasterIndexZoneOpenChapter_005(MasterIndex *masterIndex, + unsigned int zoneNumber, + uint64_t virtualChapter) +{ + MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); + MasterIndexZone *masterZone = &mi5->masterZones[zoneNumber]; + // Take care here to avoid underflow of an unsigned value. Note that + // this is the smallest valid virtual low. We may or may not actually + // use this value. + uint64_t newVirtualLow = (virtualChapter >= mi5->numChapters + ? virtualChapter - mi5->numChapters + 1 + : 0); + + if (virtualChapter <= masterZone->virtualChapterLow) { + /* + * Moving backwards and the new range is totally before the old range. + * Note that moving to the lowest virtual chapter counts as totally before + * the old range, as we need to remove the entries in the open chapter. + */ + emptyDeltaIndexZone(&mi5->deltaIndex, zoneNumber); + masterZone->virtualChapterLow = virtualChapter; + masterZone->virtualChapterHigh = virtualChapter; + } else if (virtualChapter <= masterZone->virtualChapterHigh) { + // Moving backwards and the new range overlaps the old range. Note + // that moving to the same open chapter counts as backwards, as we need + // to remove the entries in the open chapter. + removeNewestChapters(mi5, zoneNumber, virtualChapter); + masterZone->virtualChapterHigh = virtualChapter; + } else if (newVirtualLow < masterZone->virtualChapterLow) { + // Moving forwards and we can keep all the old chapters + masterZone->virtualChapterHigh = virtualChapter; + } else if (newVirtualLow <= masterZone->virtualChapterHigh) { + // Moving forwards and we can keep some old chapters + masterZone->virtualChapterLow = newVirtualLow; + masterZone->virtualChapterHigh = virtualChapter; + } else { + // Moving forwards and the new range is totally after the old range + masterZone->virtualChapterLow = virtualChapter; + masterZone->virtualChapterHigh = virtualChapter; + } + // Check to see if the zone data has grown to be too large + if (masterZone->virtualChapterLow < masterZone->virtualChapterHigh) { + uint64_t usedBits = getDeltaIndexZoneDlistBitsUsed(&mi5->deltaIndex, + zoneNumber); + if (usedBits > mi5->maxZoneBits) { + // Expire enough chapters to free the desired space + uint64_t expireCount + = 1 + (usedBits - mi5->maxZoneBits) / mi5->chapterZoneBits; + if (expireCount == 1) { + logRatelimit(logInfo, + "masterZone %u: At chapter %" PRIu64 + ", expiring chapter %llu early", + zoneNumber, virtualChapter, + masterZone->virtualChapterLow); + masterZone->numEarlyFlushes++; + masterZone->virtualChapterLow++; + } else { + uint64_t firstExpired = masterZone->virtualChapterLow; + if (firstExpired + expireCount < masterZone->virtualChapterHigh) { + masterZone->numEarlyFlushes += expireCount; + masterZone->virtualChapterLow += expireCount; + } else { + masterZone->numEarlyFlushes + += masterZone->virtualChapterHigh - masterZone->virtualChapterLow; + masterZone->virtualChapterLow = masterZone->virtualChapterHigh; + } + logRatelimit(logInfo, + "masterZone %u: At chapter %" PRIu64 + ", expiring chapters %llu to %llu early", + zoneNumber, virtualChapter, firstExpired, + masterZone->virtualChapterLow - 1); + } + } + } +} + +/***********************************************************************/ +/** + * Set the open chapter number. 
The master index will be modified to index + * the proper number of chapters ending with the new open chapter. + * + * @param masterIndex The master index + * @param virtualChapter The new open chapter number + **/ +static void setMasterIndexOpenChapter_005(MasterIndex *masterIndex, + uint64_t virtualChapter) +{ + MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); + unsigned int z; + for (z = 0; z < mi5->numZones; z++) { + // In normal operation, we advance forward one chapter at a time. + // Log all abnormal changes. + MasterIndexZone *masterZone = &mi5->masterZones[z]; + bool logMove = virtualChapter != masterZone->virtualChapterHigh + 1; + if (logMove) { + logDebug("masterZone %u: The range of indexed chapters is moving from [%" + PRIu64 ", %llu] ...", + z, + masterZone->virtualChapterLow, + masterZone->virtualChapterHigh); + } + + setMasterIndexZoneOpenChapter_005(masterIndex, z, virtualChapter); + + if (logMove) { + logDebug("masterZone %u: ... and moving to [%llu, %llu]", + z, + masterZone->virtualChapterLow, + masterZone->virtualChapterHigh); + } + } +} + +/***********************************************************************/ +/** + * Find the master index zone associated with a chunk name + * + * @param masterIndex The master index + * @param name The chunk name + * + * @return the zone that the chunk name belongs to + **/ +static unsigned int getMasterIndexZone_005(const MasterIndex *masterIndex, + const UdsChunkName *name) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + unsigned int deltaListNumber = extractDListNum(mi5, name); + return getDeltaIndexZone(&mi5->deltaIndex, deltaListNumber); +} + +/***********************************************************************/ +/** + * Do a quick read-only lookup of the chunk name and return information + * needed by the index code to process the chunk name. + * + * @param masterIndex The master index + * @param name The chunk name + * @param triage Information about the chunk name + * + * @return UDS_SUCCESS or an error code + **/ +static int lookupMasterIndexName_005(const MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexTriage *triage) +{ + triage->isSample = false; + triage->inSampledChapter = false; + triage->zone = getMasterIndexZone_005(masterIndex, name); + return UDS_SUCCESS; +} + +/***********************************************************************/ +/** + * Do a quick read-only lookup of the sampled chunk name and return + * information needed by the index code to process the chunk name. + * + * @param masterIndex The master index + * @param name The chunk name + * @param triage Information about the chunk name. The zone and + * isSample fields are already filled in. Set + * inSampledChapter and virtualChapter if the chunk + * name is found in the index. 
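Editorial aside (not part of the patch): the early-flush branch in setMasterIndexZoneOpenChapter_005() above trades a little index coverage for memory. When a zone's delta lists exceed maxZoneBits, virtualChapterLow is advanced just far enough to get back under budget. A worked example of the expireCount computation, with made-up numbers:

/* Illustrative numbers only; the formula is the one used above. */
static uint64_t sketchExpireCount(void)
{
  uint64_t usedBits        = 1050000; // bits used by this zone's delta lists
  uint64_t maxZoneBits     = 1000000; // the zone's memory budget in bits
  uint64_t chapterZoneBits =   20000; // expected bits per chapter in this zone
  // One chapter, plus a whole chapter for each chapterZoneBits of overrun:
  // 1 + 50000 / 20000 == 3, so the three oldest chapters are flushed early.
  return 1 + (usedBits - maxZoneBits) / chapterZoneBits;
}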
+ * + * @return UDS_SUCCESS or an error code + **/ +static int lookupMasterIndexSampledName_005(const MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexTriage *triage) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + unsigned int address = extractAddress(mi5, name); + unsigned int deltaListNumber = extractDListNum(mi5, name); + DeltaIndexEntry deltaEntry; + int result = getDeltaIndexEntry(&mi5->deltaIndex, deltaListNumber, address, + name->name, true, &deltaEntry); + if (result != UDS_SUCCESS) { + return result; + } + triage->inSampledChapter = !deltaEntry.atEnd && (deltaEntry.key == address); + if (triage->inSampledChapter) { + const MasterIndexZone *masterZone = &mi5->masterZones[triage->zone]; + unsigned int indexChapter = getDeltaEntryValue(&deltaEntry); + unsigned int rollingChapter = ((indexChapter + - masterZone->virtualChapterLow) + & mi5->chapterMask); + triage->virtualChapter = masterZone->virtualChapterLow + rollingChapter; + if (triage->virtualChapter > masterZone->virtualChapterHigh) { + triage->inSampledChapter = false; + } + } + return UDS_SUCCESS; +} + +/***********************************************************************/ +/** + * Find the master index record associated with a block name + * + * This is always the first routine to be called when dealing with a delta + * master index entry. The fields of the record parameter should be + * examined to determine the state of the record: + * + * If isFound is false, then we did not find an entry for the block + * name. Information is saved in the MasterIndexRecord so that + * putMasterIndexRecord() will insert an entry for that block name at + * the proper place. + * + * If isFound is true, then we did find an entry for the block name. + * Information is saved in the MasterIndexRecord so that the "chapter" + * and "isCollision" fields reflect the entry found. + * Calls to removeMasterIndexRecord() will remove the entry, calls to + * setMasterIndexRecordChapter() can modify the entry, and calls to + * putMasterIndexRecord() can insert a collision record with this + * entry. + * + * @param masterIndex The master index to search + * @param name The chunk name + * @param record Set to the info about the record searched for + * + * @return UDS_SUCCESS or an error code + **/ +static int getMasterIndexRecord_005(MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexRecord *record) +{ + MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); + unsigned int address = extractAddress(mi5, name); + unsigned int deltaListNumber = extractDListNum(mi5, name); + uint64_t flushChapter = mi5->flushChapters[deltaListNumber]; + record->magic = masterIndexRecordMagic; + record->masterIndex = masterIndex; + record->mutex = NULL; + record->name = name; + record->zoneNumber = getDeltaIndexZone(&mi5->deltaIndex, deltaListNumber); + const MasterIndexZone *masterZone = getMasterZone(record); + + int result; + if (flushChapter < masterZone->virtualChapterLow) { + ChapterRange range; + uint64_t flushCount = masterZone->virtualChapterLow - flushChapter; + range.chapterStart = convertVirtualToIndex(mi5, flushChapter); + range.chapterCount = (flushCount > mi5->chapterMask + ? 
mi5->chapterMask + 1 + : flushCount); + result = getMasterIndexEntry(record, deltaListNumber, address, &range); + flushChapter = convertIndexToVirtual(record, range.chapterStart); + if (flushChapter > masterZone->virtualChapterHigh) { + flushChapter = masterZone->virtualChapterHigh; + } + mi5->flushChapters[deltaListNumber] = flushChapter; + } else { + result = getDeltaIndexEntry(&mi5->deltaIndex, deltaListNumber, address, + name->name, false, &record->deltaEntry); + } + if (result != UDS_SUCCESS) { + return result; + } + record->isFound = (!record->deltaEntry.atEnd + && (record->deltaEntry.key == address)); + if (record->isFound) { + unsigned int indexChapter = getDeltaEntryValue(&record->deltaEntry); + record->virtualChapter = convertIndexToVirtual(record, indexChapter); + } + record->isCollision = record->deltaEntry.isCollision; + return UDS_SUCCESS; +} + +/***********************************************************************/ +/** + * Create a new record associated with a block name. + * + * @param record The master index record found by getRecord() + * @param virtualChapter The chapter number where block info is found + * + * @return UDS_SUCCESS or an error code + **/ +int putMasterIndexRecord(MasterIndexRecord *record, uint64_t virtualChapter) +{ + const MasterIndex5 *mi5 = container_of(record->masterIndex, MasterIndex5, + common); + if (record->magic != masterIndexRecordMagic) { + return logWarningWithStringError(UDS_BAD_STATE, + "bad magic number in master index record"); + } + if (!isVirtualChapterIndexed(record, virtualChapter)) { + const MasterIndexZone *masterZone = getMasterZone(record); + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot put record into chapter number %" + PRIu64 " that is out of the valid range %" + PRIu64 " to %llu", + virtualChapter, + masterZone->virtualChapterLow, + masterZone->virtualChapterHigh); + } + unsigned int address = extractAddress(mi5, record->name); + if (unlikely(record->mutex != NULL)) { + lockMutex(record->mutex); + } + int result = putDeltaIndexEntry(&record->deltaEntry, address, + convertVirtualToIndex(mi5, virtualChapter), + record->isFound ? record->name->name : NULL); + if (unlikely(record->mutex != NULL)) { + unlockMutex(record->mutex); + } + switch (result) { + case UDS_SUCCESS: + record->virtualChapter = virtualChapter; + record->isCollision = record->deltaEntry.isCollision; + record->isFound = true; + break; + case UDS_OVERFLOW: + logRatelimit(logWarningWithStringError, UDS_OVERFLOW, + "Master index entry dropped due to overflow condition"); + logDeltaIndexEntry(&record->deltaEntry); + break; + default: + break; + } + return result; +} + +/**********************************************************************/ +static INLINE int validateRecord(MasterIndexRecord *record) +{ + if (record->magic != masterIndexRecordMagic) { + return logWarningWithStringError( + UDS_BAD_STATE, "bad magic number in master index record"); + } + if (!record->isFound) { + return logWarningWithStringError(UDS_BAD_STATE, + "illegal operation on new record"); + } + return UDS_SUCCESS; +} + +/***********************************************************************/ +/** + * Remove an existing record. 
+ * + * @param record The master index record found by getRecord() + * + * @return UDS_SUCCESS or an error code + **/ +int removeMasterIndexRecord(MasterIndexRecord *record) +{ + int result = validateRecord(record); + if (result != UDS_SUCCESS) { + return result; + } + // Mark the record so that it cannot be used again + record->magic = badMagic; + if (unlikely(record->mutex != NULL)) { + lockMutex(record->mutex); + } + result = removeDeltaIndexEntry(&record->deltaEntry); + if (unlikely(record->mutex != NULL)) { + unlockMutex(record->mutex); + } + return result; +} + +/***********************************************************************/ +/** + * Set the chapter number associated with a block name. + * + * @param record The master index record found by getRecord() + * @param virtualChapter The chapter number where the block info is now found. + * + * @return UDS_SUCCESS or an error code + **/ +int setMasterIndexRecordChapter(MasterIndexRecord *record, + uint64_t virtualChapter) +{ + const MasterIndex5 *mi5 = container_of(record->masterIndex, MasterIndex5, + common); + int result = validateRecord(record); + if (result != UDS_SUCCESS) { + return result; + } + if (!isVirtualChapterIndexed(record, virtualChapter)) { + const MasterIndexZone *masterZone = getMasterZone(record); + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cannot set chapter number %" PRIu64 + " that is out of the valid range %" PRIu64 + " to %llu", + virtualChapter, + masterZone->virtualChapterLow, + masterZone->virtualChapterHigh); + } + if (unlikely(record->mutex != NULL)) { + lockMutex(record->mutex); + } + result = setDeltaEntryValue(&record->deltaEntry, + convertVirtualToIndex(mi5, virtualChapter)); + if (unlikely(record->mutex != NULL)) { + unlockMutex(record->mutex); + } + if (result != UDS_SUCCESS) { + return result; + } + record->virtualChapter = virtualChapter; + return UDS_SUCCESS; +} + +/***********************************************************************/ +/** + * Get the number of bytes used for master index entries. + * + * @param masterIndex The master index + * + * @return The number of bytes in use + **/ +static size_t getMasterIndexMemoryUsed_005(const MasterIndex *masterIndex) +{ + const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, + common); + uint64_t bits = getDeltaIndexDlistBitsUsed(&mi5->deltaIndex); + return (bits + CHAR_BIT - 1) / CHAR_BIT; +} + +/***********************************************************************/ +/** + * Return the master index stats. There is only one portion of the master + * index in this implementation, and we call it the dense portion of the + * index. 
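Editorial aside (not part of the patch): taken together, getMasterIndexRecord_005(), putMasterIndexRecord(), removeMasterIndexRecord() and setMasterIndexRecordChapter() form the read-modify-write cycle for one chunk name. A sketch of how a caller might drive that cycle, using the generic getMasterIndexRecord() entry point seen elsewhere in this patch; indexOneChunk is a hypothetical name.

static int indexOneChunk(MasterIndex *masterIndex,
                         const UdsChunkName *name,
                         uint64_t openChapter)
{
  MasterIndexRecord record;
  int result = getMasterIndexRecord(masterIndex, name, &record);
  if (result != UDS_SUCCESS) {
    return result;
  }
  if (record.isFound) {
    // The name is already indexed; repoint its entry at the open chapter.
    return setMasterIndexRecordChapter(&record, openChapter);
  }
  // Not indexed yet; insert a new entry for the open chapter.
  return putMasterIndexRecord(&record, openChapter);
}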
+ *
+ * @param masterIndex The master index
+ * @param dense       Stats for the dense portion of the index
+ * @param sparse      Stats for the sparse portion of the index
+ **/
+static void getMasterIndexStats_005(const MasterIndex *masterIndex,
+                                    MasterIndexStats *dense,
+                                    MasterIndexStats *sparse)
+{
+  const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5,
+                                               common);
+  DeltaIndexStats dis;
+  getDeltaIndexStats(&mi5->deltaIndex, &dis);
+  dense->memoryAllocated = (dis.memoryAllocated
+                            + sizeof(MasterIndex5)
+                            + mi5->numDeltaLists * sizeof(uint64_t)
+                            + mi5->numZones * sizeof(MasterIndexZone));
+  dense->rebalanceTime  = dis.rebalanceTime;
+  dense->rebalanceCount = dis.rebalanceCount;
+  dense->recordCount    = dis.recordCount;
+  dense->collisionCount = dis.collisionCount;
+  dense->discardCount   = dis.discardCount;
+  dense->overflowCount  = dis.overflowCount;
+  dense->numLists       = dis.numLists;
+  dense->earlyFlushes   = 0;
+  unsigned int z;
+  for (z = 0; z < mi5->numZones; z++) {
+    dense->earlyFlushes += mi5->masterZones[z].numEarlyFlushes;
+  }
+  memset(sparse, 0, sizeof(MasterIndexStats));
+}
+
+/***********************************************************************/
+/**
+ * Determine whether a given chunk name is a hook.
+ *
+ * @param masterIndex The master index
+ * @param name        The block name
+ *
+ * @return whether to use as sample
+ **/
+static bool isMasterIndexSample_005(const MasterIndex *masterIndex
+                                    __attribute__((unused)),
+                                    const UdsChunkName *name
+                                    __attribute__((unused)))
+{
+  return false;
+}
+
+/***********************************************************************/
+typedef struct {
+  unsigned int addressBits;    // Number of bits in address mask
+  unsigned int chapterBits;    // Number of bits in chapter number
+  unsigned int meanDelta;      // The mean delta
+  unsigned long numDeltaLists; // The number of delta lists
+  unsigned long numChapters;   // Number of chapters used
+  size_t numBitsPerChapter;    // The number of bits per chapter
+  size_t memorySize;           // The number of bytes of delta list memory
+  size_t targetFreeSize;       // The number of free bytes we desire
+} Parameters005;
+
+/***********************************************************************/
+static int computeMasterIndexParameters005(const Configuration *config,
+                                           Parameters005 *params)
+{
+  enum { DELTA_LIST_SIZE = 256 };
+  /*
+   * For a given zone count, setting the minimum number of delta lists
+   * to the square of the number of zones ensures that the distribution of
+   * delta lists over zones doesn't underflow, leaving the last zone with
+   * an invalid number of delta lists.  See the explanation in
+   * initializeDeltaIndex().  Because we can restart with a different number
+   * of zones but the number of delta lists is invariant across restart,
+   * we must use the largest number of zones to compute this minimum.
+   */
+  unsigned long minDeltaLists = (minMasterIndexDeltaLists
+                                 ? minMasterIndexDeltaLists
+                                 : MAX_ZONES * MAX_ZONES);
+
+  Geometry *geometry = config->geometry;
+  unsigned long recordsPerChapter = geometry->recordsPerChapter;
+  params->numChapters = geometry->chaptersPerVolume;
+  unsigned long recordsPerVolume = recordsPerChapter * params->numChapters;
+  unsigned int numAddresses = config->masterIndexMeanDelta * DELTA_LIST_SIZE;
+  params->numDeltaLists
+    = maxUint(recordsPerVolume / DELTA_LIST_SIZE, minDeltaLists);
+  params->addressBits = computeBits(numAddresses - 1);
+  params->chapterBits = computeBits(params->numChapters - 1);
+
+  if ((unsigned int) params->numDeltaLists != params->numDeltaLists) {
+    return logWarningWithStringError(UDS_INVALID_ARGUMENT,
+                                     "cannot initialize master index with %lu"
+                                     " delta lists",
+                                     params->numDeltaLists);
+  }
+  if (params->addressBits > 31) {
+    return logWarningWithStringError(UDS_INVALID_ARGUMENT,
+                                     "cannot initialize master index with %u"
+                                     " address bits",
+                                     params->addressBits);
+  }
+  if (geometry->sparseChaptersPerVolume > 0) {
+    return logWarningWithStringError(UDS_INVALID_ARGUMENT,
+                                     "cannot initialize dense master index"
+                                     " with %u sparse chapters",
+                                     geometry->sparseChaptersPerVolume);
+  }
+  if (recordsPerChapter == 0) {
+    return logWarningWithStringError(UDS_INVALID_ARGUMENT,
+                                     "cannot initialize master index with %lu"
+                                     " records per chapter",
+                                     recordsPerChapter);
+  }
+  if (params->numChapters == 0) {
+    return logWarningWithStringError(UDS_INVALID_ARGUMENT,
+                                     "cannot initialize master index with %lu"
+                                     " chapters per volume",
+                                     params->numChapters);
+  }
+
+  /*
+   * We can now compute the probability that a delta list is not touched
+   * during the writing of an entire chapter.  The computation is:
+   *
+   * double pNotTouched = pow((double) (params->numDeltaLists - 1)
+   *                          / params->numDeltaLists,
+   *                          recordsPerChapter);
+   *
+   * For the standard index sizes, about 78% of the delta lists are not
+   * touched, and therefore contain dead index entries that have not been
+   * eliminated by the lazy LRU processing.  We can then compute how many dead
+   * index entries accumulate over time.  The computation is:
+   *
+   * double invalidChapters = pNotTouched / (1.0 - pNotTouched);
+   *
+   * For the standard index sizes, we will need about 3.5 chapters of space for
+   * the dead index entries in a 1K chapter index.  Since we do not want to do
+   * that floating point computation, we use 4 chapters per 1K of chapters.
+   */
+  unsigned long invalidChapters = maxUint(params->numChapters / 256, 2);
+  unsigned long chaptersInMasterIndex = params->numChapters + invalidChapters;
+  unsigned long entriesInMasterIndex
+    = recordsPerChapter * chaptersInMasterIndex;
+  // Compute the mean delta
+  unsigned long addressSpan = params->numDeltaLists << params->addressBits;
+  params->meanDelta = addressSpan / entriesInMasterIndex;
+  // Project how large we expect a chapter to be
+  params->numBitsPerChapter = getDeltaMemorySize(recordsPerChapter,
+                                                 params->meanDelta,
+                                                 params->chapterBits);
+  // Project how large we expect the index to be
+  size_t numBitsPerIndex = params->numBitsPerChapter * chaptersInMasterIndex;
+  size_t expectedIndexSize = numBitsPerIndex / CHAR_BIT;
+  /*
+   * Set the total memory to be 6% larger than the expected index size.  We
+   * want this number to be large enough that we do not do a great many
+   * rebalances as the list fills up.  We use MasterIndex_p1
+   * to tune this setting.
+ */ + params->memorySize = expectedIndexSize * 106 / 100; + // Set the target free size to 5% of the expected index size + params->targetFreeSize = expectedIndexSize / 20; + return UDS_SUCCESS; +} + +/***********************************************************************/ +int computeMasterIndexSaveBytes005(const Configuration *config, + size_t *numBytes) +{ + Parameters005 params = { .addressBits = 0 }; + int result = computeMasterIndexParameters005(config, ¶ms); + if (result != UDS_SUCCESS) { + return result; + } + // Saving a MasterIndex005 needs a header plus one uint64_t per delta + // list plus the delta index. + *numBytes = (sizeof(struct mi005_data) + + params.numDeltaLists * sizeof(uint64_t) + + computeDeltaIndexSaveBytes(params.numDeltaLists, + params.memorySize)); + return UDS_SUCCESS; +} + +/***********************************************************************/ +int makeMasterIndex005(const Configuration *config, unsigned int numZones, + uint64_t volumeNonce, MasterIndex **masterIndex) +{ + Parameters005 params = { .addressBits = 0 }; + int result = computeMasterIndexParameters005(config, ¶ms); + if (result != UDS_SUCCESS) { + return result; + } + + MasterIndex5 *mi5; + result = ALLOCATE(1, MasterIndex5, "master index", &mi5); + if (result != UDS_SUCCESS) { + *masterIndex = NULL; + return result; + } + + mi5->common.abortRestoringMasterIndex = abortRestoringMasterIndex_005; + mi5->common.abortSavingMasterIndex = abortSavingMasterIndex_005; + mi5->common.finishSavingMasterIndex = finishSavingMasterIndex_005; + mi5->common.freeMasterIndex = freeMasterIndex_005; + mi5->common.getMasterIndexMemoryUsed = getMasterIndexMemoryUsed_005; + mi5->common.getMasterIndexRecord = getMasterIndexRecord_005; + mi5->common.getMasterIndexStats = getMasterIndexStats_005; + mi5->common.getMasterIndexZone = getMasterIndexZone_005; + mi5->common.isMasterIndexSample = isMasterIndexSample_005; + mi5->common.isRestoringMasterIndexDone = isRestoringMasterIndexDone_005; + mi5->common.isSavingMasterIndexDone = isSavingMasterIndexDone_005; + mi5->common.lookupMasterIndexName = lookupMasterIndexName_005; + mi5->common.lookupMasterIndexSampledName = lookupMasterIndexSampledName_005; + mi5->common.restoreDeltaListToMasterIndex = restoreDeltaListToMasterIndex_005; + mi5->common.setMasterIndexOpenChapter = setMasterIndexOpenChapter_005; + mi5->common.setMasterIndexTag = setMasterIndexTag_005; + mi5->common.setMasterIndexZoneOpenChapter = setMasterIndexZoneOpenChapter_005; + mi5->common.startRestoringMasterIndex = startRestoringMasterIndex_005; + mi5->common.startSavingMasterIndex = startSavingMasterIndex_005; + + mi5->addressBits = params.addressBits; + mi5->addressMask = (1u << params.addressBits) - 1; + mi5->chapterBits = params.chapterBits; + mi5->chapterMask = (1u << params.chapterBits) - 1; + mi5->numChapters = params.numChapters; + mi5->numDeltaLists = params.numDeltaLists; + mi5->numZones = numZones; + mi5->chapterZoneBits = params.numBitsPerChapter / numZones; + mi5->volumeNonce = volumeNonce; + + result = initializeDeltaIndex(&mi5->deltaIndex, numZones, + params.numDeltaLists, params.meanDelta, + params.chapterBits, params.memorySize); + if (result == UDS_SUCCESS) { + mi5->maxZoneBits = ((getDeltaIndexDlistBitsAllocated(&mi5->deltaIndex) + - params.targetFreeSize * CHAR_BIT) + / numZones); + } + + // Initialize the chapter flush ranges to be empty. This depends upon + // allocate returning zeroed memory. 
+ if (result == UDS_SUCCESS) { + result = ALLOCATE(params.numDeltaLists, uint64_t, + "first chapter to flush", &mi5->flushChapters); + } + + // Initialize the virtual chapter ranges to start at zero. This depends + // upon allocate returning zeroed memory. + if (result == UDS_SUCCESS) { + result = ALLOCATE(numZones, MasterIndexZone, "master index zones", + &mi5->masterZones); + } + + if (result == UDS_SUCCESS) { + *masterIndex = &mi5->common; + } else { + freeMasterIndex_005(&mi5->common); + *masterIndex = NULL; + } + return result; +} diff --git a/source/uds/masterIndex005.h b/source/uds/masterIndex005.h new file mode 100644 index 0000000..5436c7f --- /dev/null +++ b/source/uds/masterIndex005.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/masterIndex005.h#1 $ + */ + +#ifndef MASTERINDEX005_H +#define MASTERINDEX005_H 1 + +#include "masterIndexOps.h" + +/** + * Make a new master index. + * + * @param config The configuration of the master index + * @param numZones The number of zones + * @param volumeNonce The nonce used to authenticate the index + * @param masterIndex Location to hold new master index ptr + * + * @return error code or UDS_SUCCESS + **/ +int makeMasterIndex005(const Configuration *config, unsigned int numZones, + uint64_t volumeNonce, MasterIndex **masterIndex) + __attribute__((warn_unused_result)); + +/** + * Compute the number of bytes required to save a master index of a given + * configuration. + * + * @param config The configuration of the master index + * @param numBytes The number of bytes required to save the master index + * + * @return UDS_SUCCESS or an error code. + **/ +int computeMasterIndexSaveBytes005(const Configuration *config, + size_t *numBytes) + __attribute__((warn_unused_result)); + +#endif /* MASTERINDEX005_H */ diff --git a/source/uds/masterIndex006.c b/source/uds/masterIndex006.c new file mode 100644 index 0000000..3e1ef00 --- /dev/null +++ b/source/uds/masterIndex006.c @@ -0,0 +1,791 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
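Editorial aside (not part of the patch): the masterIndex005.h interface above is small: size the saved form, create the index, and operate on it generically through the common function table that makeMasterIndex005() fills in. The sketch below assumes the generic wrappers in masterIndexOps.h simply forward through those function pointers, which is what the generic calls elsewhere in this patch suggest; sketchCreateDenseIndex is a hypothetical caller.

/* One plausible form of a generic wrapper assumed to live in masterIndexOps.h. */
static INLINE void setMasterIndexOpenChapter(MasterIndex *masterIndex,
                                             uint64_t virtualChapter)
{
  masterIndex->setMasterIndexOpenChapter(masterIndex, virtualChapter);
}

/* An illustrative caller of the 005 creation interface. */
static int sketchCreateDenseIndex(const Configuration *config,
                                  unsigned int numZones,
                                  uint64_t volumeNonce,
                                  MasterIndex **masterIndex)
{
  size_t saveBytes;
  int result = computeMasterIndexSaveBytes005(config, &saveBytes);
  if (result != UDS_SUCCESS) {
    return result;
  }
  // saveBytes would be used to size the index state region on storage.
  return makeMasterIndex005(config, numZones, volumeNonce, masterIndex);
}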
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/masterIndex006.c#2 $
+ */
+#include "masterIndex006.h"
+
+#include "buffer.h"
+#include "compiler.h"
+#include "errors.h"
+#include "hashUtils.h"
+#include "logger.h"
+#include "masterIndex005.h"
+#include "memoryAlloc.h"
+#include "permassert.h"
+#include "threads.h"
+#include "uds.h"
+
+/*
+ * The master index is kept as a wrapper around 2 master index
+ * implementations, one for dense chapters and one for sparse chapters.
+ * Methods will be routed to one or the other, or both, depending on the
+ * method and data passed in.
+ *
+ * The master index is divided into zones, and in normal operation there is
+ * one thread operating on each zone.  Any operation that operates on all
+ * the zones needs to do its operation at a safe point that ensures that
+ * only one thread is operating on the master index.
+ *
+ * The only multithreaded operation supported by the sparse master index is
+ * the lookupMasterIndexName() method.  It is called by the thread that
+ * assigns an index request to the proper zone, and needs to do a master
+ * index query for sampled chunk names.  The zone mutexes are used to make
+ * this lookup operation safe.
+ */
+
+typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) masterIndexZone {
+  Mutex hookMutex; // Protects the sampled index in this zone
+} MasterIndexZone;
+
+typedef struct {
+  MasterIndex common;            // Common master index methods
+  unsigned int sparseSampleRate; // The sparse sample rate
+  unsigned int numZones;         // The number of zones
+  MasterIndex *miNonHook;        // The non-hook index
+  MasterIndex *miHook;           // The hook index == sample index
+  MasterIndexZone *masterZones;  // The zones
+} MasterIndex6;
+
+/**
+ * Determine whether a given chunk name is a hook.
+ *
+ * @param masterIndex The master index
+ * @param name        The block name
+ *
+ * @return whether to use as sample
+ **/
+static INLINE bool isMasterIndexSample_006(const MasterIndex *masterIndex,
+                                           const UdsChunkName *name)
+{
+  const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6,
+                                               common);
+  return (extractSamplingBytes(name) % mi6->sparseSampleRate) == 0;
+}
+
+/***********************************************************************/
+/**
+ * Get the subindex for the given chunk name
+ *
+ * @param masterIndex The master index
+ * @param name        The block name
+ *
+ * @return the subindex
+ **/
+static INLINE MasterIndex *getSubIndex(const MasterIndex *masterIndex,
+                                       const UdsChunkName *name)
+{
+  const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6,
+                                               common);
+  return (isMasterIndexSample_006(masterIndex, name)
+          ?
mi6->miHook + : mi6->miNonHook); +} + +/***********************************************************************/ +/** + * Terminate and clean up the master index + * + * @param masterIndex The master index to terminate + **/ +static void freeMasterIndex_006(MasterIndex *masterIndex) +{ + if (masterIndex != NULL) { + MasterIndex6 *mi6 = container_of(masterIndex, MasterIndex6, common); + if (mi6->masterZones != NULL) { + unsigned int zone; + for (zone = 0; zone < mi6->numZones; zone++) { + destroyMutex(&mi6->masterZones[zone].hookMutex); + } + FREE(mi6->masterZones); + mi6->masterZones = NULL; + } + if (mi6->miNonHook != NULL) { + freeMasterIndex(mi6->miNonHook); + mi6->miNonHook = NULL; + } + if (mi6->miHook != NULL) { + freeMasterIndex(mi6->miHook); + mi6->miHook = NULL; + } + FREE(masterIndex); + } +} + +/***********************************************************************/ +/** + * Constants and structures for the saved master index file. "MI6" is for + * masterIndex006, and "-XXXX" is a number to increment when the format of + * the data changes. + **/ +enum { MAGIC_SIZE = 8 }; +static const char MAGIC_MI_START[] = "MI6-0001"; + +struct mi006_data { + char magic[MAGIC_SIZE]; // MAGIC_MI_START + unsigned int sparseSampleRate; +}; + +/***********************************************************************/ +/** + * Set the tag value used when saving and/or restoring a master index. + * + * @param masterIndex The master index + * @param tag The tag value + **/ +static void setMasterIndexTag_006(MasterIndex *masterIndex + __attribute__((unused)), + byte tag __attribute__((unused))) +{ +} + +/***********************************************************************/ +__attribute__((warn_unused_result)) +static int encodeMasterIndexHeader(Buffer *buffer, struct mi006_data *header) +{ + int result = putBytes(buffer, MAGIC_SIZE, MAGIC_MI_START); + if (result != UDS_SUCCESS) { + return result; + } + result = putUInt32LEIntoBuffer(buffer, header->sparseSampleRate); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == sizeof(struct mi006_data), + "%zu bytes of config written, of %zu expected", + contentLength(buffer), sizeof(struct mi006_data)); + return result; +} + +/** + * Start saving a master index to a buffered output stream. 
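Editorial aside (not part of the patch): everything in masterIndex006.c hinges on the hook/non-hook split established by isMasterIndexSample_006() and getSubIndex() above: a name is a hook when its sampling bytes are an exact multiple of the sparse sample rate. A small restatement with illustrative numbers:

/* Sketch only.  With an assumed sparseSampleRate of 32, sampling bytes of
 * 64 give 64 % 32 == 0 (a hook, routed to miHook), while 65 gives a
 * non-zero remainder (routed to miNonHook). */
static bool sketchIsHook(uint64_t samplingBytes, unsigned int sparseSampleRate)
{
  return (samplingBytes % sparseSampleRate) == 0;
}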
+ * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * @param bufferedWriter The index state component being written + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static int startSavingMasterIndex_006(const MasterIndex *masterIndex, + unsigned int zoneNumber, + BufferedWriter *bufferedWriter) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + Buffer *buffer; + int result = makeBuffer(sizeof(struct mi006_data), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + struct mi006_data header; + memset(&header, 0, sizeof(header)); + memcpy(header.magic, MAGIC_MI_START, MAGIC_SIZE); + header.sparseSampleRate = mi6->sparseSampleRate; + result = encodeMasterIndexHeader(buffer, &header); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + result = writeToBufferedWriter(bufferedWriter, getBufferContents(buffer), + contentLength(buffer)); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + logWarningWithStringError(result, "failed to write master index header"); + return result; + } + + result = startSavingMasterIndex(mi6->miNonHook, zoneNumber, bufferedWriter); + if (result != UDS_SUCCESS) { + return result; + } + + result = startSavingMasterIndex(mi6->miHook, zoneNumber, bufferedWriter); + if (result != UDS_SUCCESS) { + return result; + } + return UDS_SUCCESS; +} + +/***********************************************************************/ +/** + * Have all the data been written while saving a master index to an output + * stream? If the answer is yes, it is still necessary to call + * finishSavingMasterIndex(), which will return quickly. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return true if all the data are written + **/ +static bool isSavingMasterIndexDone_006(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + return (isSavingMasterIndexDone(mi6->miNonHook, zoneNumber) + && isSavingMasterIndexDone(mi6->miHook, zoneNumber)); +} + +/***********************************************************************/ +/** + * Finish saving a master index to an output stream. Force the writing of + * all of the remaining data. If an error occurred asynchronously during + * the save operation, it will be returned here. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static int finishSavingMasterIndex_006(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + int result = finishSavingMasterIndex(mi6->miNonHook, zoneNumber); + if (result == UDS_SUCCESS) { + result = finishSavingMasterIndex(mi6->miHook, zoneNumber); + } + return result; +} + +/***********************************************************************/ +/** + * Abort saving a master index to an output stream. If an error occurred + * asynchronously during the save operation, it will be dropped. 
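Editorial aside (not part of the patch): the three routines above implement a start/poll/finish protocol for saving one zone; masterIndexOps.c later in this patch drives the same protocol through its incremental writer commands. A sketch of the protocol from a caller's point of view; sketchSaveOneZone is hypothetical, and the real driver interleaves the polling with other work rather than spinning.

static int sketchSaveOneZone(const MasterIndex *masterIndex,
                             unsigned int zoneNumber,
                             BufferedWriter *writer)
{
  int result = startSavingMasterIndex(masterIndex, zoneNumber, writer);
  if (result != UDS_SUCCESS) {
    return result;
  }
  while (!isSavingMasterIndexDone(masterIndex, zoneNumber)) {
    // Poll until the asynchronous portion of the save has completed.
  }
  result = finishSavingMasterIndex(masterIndex, zoneNumber);
  if (result != UDS_SUCCESS) {
    abortSavingMasterIndex(masterIndex, zoneNumber);
  }
  return result;
}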
+ * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static int abortSavingMasterIndex_006(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + int result = abortSavingMasterIndex(mi6->miNonHook, zoneNumber); + int result2 = abortSavingMasterIndex(mi6->miHook, zoneNumber); + if (result == UDS_SUCCESS) { + result = result2; + } + return result; +} + +/***********************************************************************/ +__attribute__((warn_unused_result)) +static int decodeMasterIndexHeader(Buffer *buffer, struct mi006_data *header) +{ + int result = getBytesFromBuffer(buffer, sizeof(header->magic), + &header->magic); + if (result != UDS_SUCCESS) { + return result; + } + result = getUInt32LEFromBuffer(buffer, &header->sparseSampleRate); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, + "%zu bytes decoded of %zu expected", + bufferLength(buffer) - contentLength(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + result = UDS_CORRUPT_COMPONENT; + } + return result; +} + +/** + * Start restoring the master index from multiple buffered readers + * + * @param masterIndex The master index to restore into + * @param bufferedReaders The buffered reader to read the master index from + * @param numReaders The number of buffered readers + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static int startRestoringMasterIndex_006(MasterIndex *masterIndex, + BufferedReader **bufferedReaders, + int numReaders) +{ + MasterIndex6 *mi6 = container_of(masterIndex, MasterIndex6, common); + int result = ASSERT_WITH_ERROR_CODE(masterIndex != NULL, UDS_BAD_STATE, + "cannot restore to null master index"); + if (result != UDS_SUCCESS) { + return result; + } + + int i; + for (i = 0; i < numReaders; i++) { + Buffer *buffer; + result = makeBuffer(sizeof(struct mi006_data), &buffer); + if (result != UDS_SUCCESS) { + return result; + } + result = readFromBufferedReader(bufferedReaders[i], + getBufferContents(buffer), + bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return logWarningWithStringError(result, + "failed to read master index header"); + } + result = resetBufferEnd(buffer, bufferLength(buffer)); + if (result != UDS_SUCCESS) { + freeBuffer(&buffer); + return result; + } + struct mi006_data header; + result = decodeMasterIndexHeader(buffer, &header); + freeBuffer(&buffer); + if (result != UDS_SUCCESS) { + return result; + } + if (memcmp(header.magic, MAGIC_MI_START, MAGIC_SIZE) != 0) { + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "master index file had bad magic" + " number"); + } + if (i == 0) { + mi6->sparseSampleRate = header.sparseSampleRate; + } else if (mi6->sparseSampleRate != header.sparseSampleRate) { + logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "Inconsistent sparse sample rate in delta" + " index zone files: %u vs. 
%u", + mi6->sparseSampleRate, + header.sparseSampleRate); + return UDS_CORRUPT_COMPONENT; + } + } + + result = startRestoringMasterIndex(mi6->miNonHook, bufferedReaders, + numReaders); + if (result != UDS_SUCCESS) { + return result; + } + return startRestoringMasterIndex(mi6->miHook, bufferedReaders, numReaders); +} + +/***********************************************************************/ +/** + * Have all the data been read while restoring a master index from an + * input stream? + * + * @param masterIndex The master index to restore into + * + * @return true if all the data are read + **/ +static bool isRestoringMasterIndexDone_006(const MasterIndex *masterIndex) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + return (isRestoringMasterIndexDone(mi6->miNonHook) + && isRestoringMasterIndexDone(mi6->miHook)); +} + +/***********************************************************************/ +/** + * Restore a saved delta list + * + * @param masterIndex The master index to restore into + * @param dlsi The DeltaListSaveInfo describing the delta list + * @param data The saved delta list bit stream + * + * @return error code or UDS_SUCCESS + **/ +static int restoreDeltaListToMasterIndex_006(MasterIndex *masterIndex, + const DeltaListSaveInfo *dlsi, + const byte data[DELTA_LIST_MAX_BYTE_COUNT]) +{ + MasterIndex6 *mi6 = container_of(masterIndex, MasterIndex6, common); + int result = restoreDeltaListToMasterIndex(mi6->miNonHook, dlsi, data); + if (result != UDS_SUCCESS) { + result = restoreDeltaListToMasterIndex(mi6->miHook, dlsi, data); + } + return result; +} + +/***********************************************************************/ +/** + * Abort restoring a master index from an input stream. + * + * @param masterIndex The master index + **/ +static void abortRestoringMasterIndex_006(MasterIndex *masterIndex) +{ + MasterIndex6 *mi6 = container_of(masterIndex, MasterIndex6, common); + abortRestoringMasterIndex(mi6->miNonHook); + abortRestoringMasterIndex(mi6->miHook); +} + +/***********************************************************************/ +/** + * Set the open chapter number on a zone. The master index zone will be + * modified to index the proper number of chapters ending with the new open + * chapter. + * + * @param masterIndex The master index + * @param zoneNumber The zone number + * @param virtualChapter The new open chapter number + **/ +static void setMasterIndexZoneOpenChapter_006(MasterIndex *masterIndex, + unsigned int zoneNumber, + uint64_t virtualChapter) +{ + MasterIndex6 *mi6 = container_of(masterIndex, MasterIndex6, common); + setMasterIndexZoneOpenChapter(mi6->miNonHook, zoneNumber, virtualChapter); + + // We need to prevent a lookupMasterIndexName() happening while we are + // changing the open chapter number + Mutex *mutex = &mi6->masterZones[zoneNumber].hookMutex; + lockMutex(mutex); + setMasterIndexZoneOpenChapter(mi6->miHook, zoneNumber, virtualChapter); + unlockMutex(mutex); +} + +/***********************************************************************/ +/** + * Set the open chapter number. The master index will be modified to index + * the proper number of chapters ending with the new open chapter. 
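Editorial aside (not part of the patch): the hookMutex pattern in setMasterIndexZoneOpenChapter_006() above is the concurrency story for this file. The per-zone worker thread updates the hook (sampled) index when the open chapter moves, while the request-routing thread reads it through lookupMasterIndexName(); both bracket their access to a zone's hook index with that zone's hookMutex. Schematically:

/*
 *   zone worker thread                      request-routing thread
 *   ------------------                      ----------------------
 *   lockMutex(&zone->hookMutex);            lockMutex(&zone->hookMutex);
 *   setMasterIndexZoneOpenChapter(          lookupMasterIndexSampledName(
 *       mi6->miHook, zoneNumber, chapter);      mi6->miHook, name, triage);
 *   unlockMutex(&zone->hookMutex);          unlockMutex(&zone->hookMutex);
 */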
+ * + * @param masterIndex The master index + * @param virtualChapter The new open chapter number + **/ +static void setMasterIndexOpenChapter_006(MasterIndex *masterIndex, + uint64_t virtualChapter) +{ + MasterIndex6 *mi6 = container_of(masterIndex, MasterIndex6, common); + unsigned int zone; + for (zone = 0; zone < mi6->numZones; zone++) { + setMasterIndexZoneOpenChapter_006(masterIndex, zone, virtualChapter); + } +} + +/***********************************************************************/ +/** + * Find the master index zone associated with a chunk name + * + * @param masterIndex The master index + * @param name The chunk name + * + * @return the zone that the chunk name belongs to + **/ +static unsigned int getMasterIndexZone_006(const MasterIndex *masterIndex, + const UdsChunkName *name) +{ + return getMasterIndexZone(getSubIndex(masterIndex, name), name); +} + +/***********************************************************************/ +/** + * Do a quick read-only lookup of the chunk name and return information + * needed by the index code to process the chunk name. + * + * @param masterIndex The master index + * @param name The chunk name + * @param triage Information about the chunk name + * + * @return UDS_SUCCESS or an error code + **/ +static int lookupMasterIndexName_006(const MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexTriage *triage) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + triage->isSample = isMasterIndexSample_006(masterIndex, name); + triage->inSampledChapter = false; + triage->zone = getMasterIndexZone_006(masterIndex, name); + int result = UDS_SUCCESS; + if (triage->isSample) { + Mutex *mutex = &mi6->masterZones[triage->zone].hookMutex; + lockMutex(mutex); + result = lookupMasterIndexSampledName(mi6->miHook, name, triage); + unlockMutex(mutex); + } + return result; +} + +/***********************************************************************/ +/** + * Do a quick read-only lookup of the sampled chunk name and return + * information needed by the index code to process the chunk name. + * + * @param masterIndex The master index + * @param name The chunk name + * @param triage Information about the chunk name. The zone and + * isSample fields are already filled in. Set + * inSampledChapter and virtualChapter if the chunk + * name is found in the index. + * + * @return UDS_SUCCESS or an error code + **/ +static int lookupMasterIndexSampledName_006(const MasterIndex *masterIndex + __attribute__((unused)), + const UdsChunkName *name + __attribute__((unused)), + MasterIndexTriage *triage + __attribute__((unused))) +{ + return ASSERT_WITH_ERROR_CODE(false, UDS_BAD_STATE, + "%s should not be called", __func__); +} + +/***********************************************************************/ +/** + * Find the master index record associated with a block name + * + * This is always the first routine to be called when dealing with a delta + * master index entry. The fields of the record parameter should be + * examined to determine the state of the record: + * + * If isFound is false, then we did not find an entry for the block + * name. Information is saved in the MasterIndexRecord so that + * putMasterIndexRecord() will insert an entry for that block name at + * the proper place. + * + * If isFound is true, then we did find an entry for the block name. + * Information is saved in the MasterIndexRecord so that the "chapter" + * and "isCollision" fields reflect the entry found. 
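Editorial aside (not part of the patch): a sketch of how the triage result produced by lookupMasterIndexName_006() above is presumably consumed when a request is assigned to a zone. sketchPickZone is a hypothetical name, and the sketch assumes the generic lookupMasterIndexName() wrapper dispatches through the function table like the other wrappers used in this patch.

static unsigned int sketchPickZone(const MasterIndex *masterIndex,
                                   const UdsChunkName *name)
{
  MasterIndexTriage triage;
  if (lookupMasterIndexName(masterIndex, name, &triage) != UDS_SUCCESS) {
    return 0; // illustrative fallback policy only
  }
  // triage.zone selects the worker queue.  For a hook name,
  // triage.inSampledChapter and triage.virtualChapter may already identify
  // the chapter, so the zone thread can skip a second index lookup.
  return triage.zone;
}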
+ * Calls to removeMasterIndexRecord() will remove the entry, calls to + * setMasterIndexRecordChapter() can modify the entry, and calls to + * putMasterIndexRecord() can insert a collision record with this + * entry. + * + * @param masterIndex The master index to search + * @param name The chunk name + * @param record Set to the info about the record searched for + * + * @return UDS_SUCCESS or an error code + **/ +static int getMasterIndexRecord_006(MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexRecord *record) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + int result; + if (isMasterIndexSample_006(masterIndex, name)) { + /* + * We need to prevent a lookupMasterIndexName() happening while we are + * finding the master index record. Remember that because of lazy LRU + * flushing of the master index, getMasterIndexRecord() is not a + * read-only operation. + */ + unsigned int zone = getMasterIndexZone(mi6->miHook, name); + Mutex *mutex = &mi6->masterZones[zone].hookMutex; + lockMutex(mutex); + result = getMasterIndexRecord(mi6->miHook, name, record); + unlockMutex(mutex); + // Remember the mutex so that other operations on the MasterIndexRecord + // can use it + record->mutex = mutex; + } else { + result = getMasterIndexRecord(mi6->miNonHook, name, record); + } + return result; +} + +/***********************************************************************/ +/** + * Get the number of bytes used for master index entries. + * + * @param masterIndex The master index + * + * @return The number of bytes in use + **/ +static size_t getMasterIndexMemoryUsed_006(const MasterIndex *masterIndex) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + return (getMasterIndexMemoryUsed(mi6->miNonHook) + + getMasterIndexMemoryUsed(mi6->miHook)); +} + +/***********************************************************************/ +/** + * Return the master index stats. There is only one portion of the master + * index in this implementation, and we call it the dense portion of the + * index. 
+ * + * @param masterIndex The master index + * @param dense Stats for the dense portion of the index + * @param sparse Stats for the sparse portion of the index + **/ +static void getMasterIndexStats_006(const MasterIndex *masterIndex, + MasterIndexStats *dense, + MasterIndexStats *sparse) +{ + const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, + common); + MasterIndexStats dummyStats; + getMasterIndexStats(mi6->miNonHook, dense, &dummyStats); + getMasterIndexStats(mi6->miHook, sparse, &dummyStats); +} + +/***********************************************************************/ +typedef struct { + Configuration hookConfig; // Describe the hook part of the index + Geometry hookGeometry; + Configuration nonHookConfig; // Describe the non-hook part of the index + Geometry nonHookGeometry; +} SplitConfig; + +/***********************************************************************/ +static int splitConfiguration006(const Configuration *config, + SplitConfig *split) +{ + int result + = ASSERT_WITH_ERROR_CODE(config->geometry->sparseChaptersPerVolume != 0, + UDS_INVALID_ARGUMENT, + "cannot initialize sparse+dense master index" + " with no sparse chapters"); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_WITH_ERROR_CODE(config->sparseSampleRate != 0, + UDS_INVALID_ARGUMENT, + "cannot initialize sparse+dense master" + " index with a sparse sample rate of %u", + config->sparseSampleRate); + if (result != UDS_SUCCESS) { + return result; + } + + // Start with copies of the base configuration + split->hookConfig = *config; + split->hookGeometry = *config->geometry; + split->hookConfig.geometry = &split->hookGeometry; + split->nonHookConfig = *config; + split->nonHookGeometry = *config->geometry; + split->nonHookConfig.geometry = &split->nonHookGeometry; + + uint64_t sampleRate = config->sparseSampleRate; + uint64_t numChapters = config->geometry->chaptersPerVolume; + uint64_t numSparseChapters = config->geometry->sparseChaptersPerVolume; + uint64_t numDenseChapters = numChapters - numSparseChapters; + uint64_t sampleRecords = config->geometry->recordsPerChapter / sampleRate; + + // Adjust the number of records indexed for each chapter + split->hookGeometry.recordsPerChapter = sampleRecords; + split->nonHookGeometry.recordsPerChapter -= sampleRecords; + + // Adjust the number of chapters indexed + split->hookGeometry.sparseChaptersPerVolume = 0; + split->nonHookGeometry.sparseChaptersPerVolume = 0; + split->nonHookGeometry.chaptersPerVolume = numDenseChapters; + return UDS_SUCCESS; +} + +/***********************************************************************/ +int computeMasterIndexSaveBytes006(const Configuration *config, + size_t *numBytes) +{ + SplitConfig split; + int result = splitConfiguration006(config, &split); + if (result != UDS_SUCCESS) { + return result; + } + size_t hookBytes, nonHookBytes; + result = computeMasterIndexSaveBytes005(&split.hookConfig, &hookBytes); + if (result != UDS_SUCCESS) { + return result; + } + result = computeMasterIndexSaveBytes005(&split.nonHookConfig, &nonHookBytes); + if (result != UDS_SUCCESS) { + return result; + } + // Saving a MasterIndex006 needs a header plus the hook index plus the + // non-hook index + *numBytes = sizeof(struct mi006_data) + hookBytes + nonHookBytes; + return UDS_SUCCESS; +} + +/***********************************************************************/ +int makeMasterIndex006(const Configuration *config, unsigned int numZones, + uint64_t volumeNonce, MasterIndex **masterIndex) +{ + SplitConfig 
split; + int result = splitConfiguration006(config, &split); + if (result != UDS_SUCCESS) { + return result; + } + + MasterIndex6 *mi6; + result = ALLOCATE(1, MasterIndex6, "master index", &mi6); + if (result != UDS_SUCCESS) { + return result; + } + + mi6->common.abortRestoringMasterIndex = abortRestoringMasterIndex_006; + mi6->common.abortSavingMasterIndex = abortSavingMasterIndex_006; + mi6->common.finishSavingMasterIndex = finishSavingMasterIndex_006; + mi6->common.freeMasterIndex = freeMasterIndex_006; + mi6->common.getMasterIndexMemoryUsed = getMasterIndexMemoryUsed_006; + mi6->common.getMasterIndexRecord = getMasterIndexRecord_006; + mi6->common.getMasterIndexStats = getMasterIndexStats_006; + mi6->common.getMasterIndexZone = getMasterIndexZone_006; + mi6->common.isMasterIndexSample = isMasterIndexSample_006; + mi6->common.isRestoringMasterIndexDone = isRestoringMasterIndexDone_006; + mi6->common.isSavingMasterIndexDone = isSavingMasterIndexDone_006; + mi6->common.lookupMasterIndexName = lookupMasterIndexName_006; + mi6->common.lookupMasterIndexSampledName = lookupMasterIndexSampledName_006; + mi6->common.restoreDeltaListToMasterIndex = restoreDeltaListToMasterIndex_006; + mi6->common.setMasterIndexOpenChapter = setMasterIndexOpenChapter_006; + mi6->common.setMasterIndexTag = setMasterIndexTag_006; + mi6->common.setMasterIndexZoneOpenChapter = setMasterIndexZoneOpenChapter_006; + mi6->common.startRestoringMasterIndex = startRestoringMasterIndex_006; + mi6->common.startSavingMasterIndex = startSavingMasterIndex_006; + + mi6->numZones = numZones; + mi6->sparseSampleRate = config->sparseSampleRate; + + result = ALLOCATE(numZones, MasterIndexZone, "master index zones", + &mi6->masterZones); + unsigned int zone; + for (zone = 0; zone < numZones; zone++) { + if (result == UDS_SUCCESS) { + result = initMutex(&mi6->masterZones[zone].hookMutex); + } + } + if (result != UDS_SUCCESS) { + freeMasterIndex_006(&mi6->common); + return result; + } + + result = makeMasterIndex005(&split.nonHookConfig, numZones, volumeNonce, + &mi6->miNonHook); + if (result != UDS_SUCCESS) { + freeMasterIndex_006(&mi6->common); + return logErrorWithStringError(result, + "Error creating non hook master index"); + } + setMasterIndexTag(mi6->miNonHook, 'd'); + + result = makeMasterIndex005(&split.hookConfig, numZones, volumeNonce, + &mi6->miHook); + if (result != UDS_SUCCESS) { + freeMasterIndex_006(&mi6->common); + return logErrorWithStringError(result, + "Error creating hook master index"); + } + setMasterIndexTag(mi6->miHook, 's'); + + *masterIndex = &mi6->common; + return UDS_SUCCESS; +} diff --git a/source/uds/masterIndex006.h b/source/uds/masterIndex006.h new file mode 100644 index 0000000..1d3b377 --- /dev/null +++ b/source/uds/masterIndex006.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
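Editorial aside (not part of the patch): splitConfiguration006(), used by makeMasterIndex006() above, carves one Configuration into a hook configuration (every chapter, sampled density) and a non-hook configuration (dense chapters only, full density). A worked example with illustrative numbers:

/*
 * Illustrative numbers only: with recordsPerChapter = 65536,
 * sparseSampleRate = 32, chaptersPerVolume = 1024 and
 * sparseChaptersPerVolume = 768,
 *
 *   sampleRecords                     = 65536 / 32   = 2048
 *   hookGeometry.recordsPerChapter    = 2048           (all 1024 chapters)
 *   nonHookGeometry.recordsPerChapter = 65536 - 2048 = 63488
 *   nonHookGeometry.chaptersPerVolume = 1024 - 768   = 256
 *
 * so the hook index sees one record in 32 from every chapter, while the
 * non-hook index sees the remaining records from the dense chapters only.
 */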
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/masterIndex006.h#1 $ + */ + +#ifndef MASTERINDEX006_H +#define MASTERINDEX006_H 1 + +#include "masterIndexOps.h" + +/** + * Make a new master index. + * + * @param config The configuration of the master index + * @param numZones The number of zones + * @param volumeNonce The nonce used to authenticate the index + * @param masterIndex Location to hold new master index ptr + * + * @return error code or UDS_SUCCESS + **/ +int makeMasterIndex006(const Configuration *config, unsigned int numZones, + uint64_t volumeNonce, MasterIndex **masterIndex) + __attribute__((warn_unused_result)); + +/** + * Compute the number of bytes required to save a master index of a given + * configuration. + * + * @param config The configuration of the master index + * @param numBytes The number of bytes required to save the master index + * + * @return UDS_SUCCESS or an error code. + **/ +int computeMasterIndexSaveBytes006(const Configuration *config, + size_t *numBytes) + __attribute__((warn_unused_result)); + +#endif /* MASTERINDEX006_H */ diff --git a/source/uds/masterIndexOps.c b/source/uds/masterIndexOps.c new file mode 100644 index 0000000..1cbd10b --- /dev/null +++ b/source/uds/masterIndexOps.c @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/masterIndexOps.c#4 $ + */ +#include "masterIndexOps.h" + +#include "compiler.h" +#include "errors.h" +#include "indexComponent.h" +#include "logger.h" +#include "masterIndex005.h" +#include "masterIndex006.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "uds.h" +#include "zone.h" + +/**********************************************************************/ +static INLINE bool usesSparse(const Configuration *config) +{ + return config->geometry->sparseChaptersPerVolume > 0; +} + +/**********************************************************************/ +void getMasterIndexCombinedStats(const MasterIndex *masterIndex, + MasterIndexStats *stats) +{ + MasterIndexStats dense, sparse; + getMasterIndexStats(masterIndex, &dense, &sparse); + stats->memoryAllocated = dense.memoryAllocated + sparse.memoryAllocated; + stats->rebalanceTime = dense.rebalanceTime + sparse.rebalanceTime; + stats->rebalanceCount = dense.rebalanceCount + sparse.rebalanceCount; + stats->recordCount = dense.recordCount + sparse.recordCount; + stats->collisionCount = dense.collisionCount + sparse.collisionCount; + stats->discardCount = dense.discardCount + sparse.discardCount; + stats->overflowCount = dense.overflowCount + sparse.overflowCount; + stats->numLists = dense.numLists + sparse.numLists; + stats->earlyFlushes = dense.earlyFlushes + sparse.earlyFlushes; +} + +/**********************************************************************/ +int makeMasterIndex(const Configuration *config, unsigned int numZones, + uint64_t volumeNonce, MasterIndex **masterIndex) +{ + if (usesSparse(config)) { + return makeMasterIndex006(config, numZones, volumeNonce, masterIndex); + } else { + return makeMasterIndex005(config, numZones, volumeNonce, masterIndex); + } +} + +/**********************************************************************/ +int computeMasterIndexSaveBlocks(const Configuration *config, + size_t blockSize, uint64_t *blockCount) +{ + size_t numBytes; + int result = (usesSparse(config) + ? 
computeMasterIndexSaveBytes006(config, &numBytes) + : computeMasterIndexSaveBytes005(config, &numBytes)); + if (result != UDS_SUCCESS) { + return result; + } + numBytes += sizeof(DeltaListSaveInfo); + *blockCount = (numBytes + blockSize - 1) / blockSize + MAX_ZONES; + return UDS_SUCCESS; +} + +/**********************************************************************/ +static int readMasterIndex(ReadPortal *portal) +{ + MasterIndex *masterIndex = indexComponentContext(portal->component); + unsigned int numZones = portal->zones; + if (numZones > MAX_ZONES) { + return logErrorWithStringError(UDS_BAD_STATE, + "zone count %u must not exceed MAX_ZONES", + numZones); + } + + BufferedReader *readers[MAX_ZONES]; + unsigned int z; + for (z = 0; z < numZones; ++z) { + int result = getBufferedReaderForPortal(portal, z, &readers[z]); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "cannot read component for zone %u", z); + } + } + return restoreMasterIndex(readers, numZones, masterIndex); +} + +/**********************************************************************/ +static int writeMasterIndex(IndexComponent *component, + BufferedWriter *writer, + unsigned int zone, + IncrementalWriterCommand command, + bool *completed) +{ + MasterIndex *masterIndex = indexComponentContext(component); + bool isComplete = false; + + int result = UDS_SUCCESS; + + switch (command) { + case IWC_START: + result = startSavingMasterIndex(masterIndex, zone, writer); + isComplete = result != UDS_SUCCESS; + break; + case IWC_CONTINUE: + isComplete = isSavingMasterIndexDone(masterIndex, zone); + break; + case IWC_FINISH: + result = finishSavingMasterIndex(masterIndex, zone); + if (result == UDS_SUCCESS) { + result = writeGuardDeltaList(writer); + } + isComplete = true; + break; + case IWC_ABORT: + result = abortSavingMasterIndex(masterIndex, zone); + isComplete = true; + break; + default: + result = logWarningWithStringError(UDS_INVALID_ARGUMENT, + "Invalid writer command"); + break; + } + if (completed != NULL) { + *completed = isComplete; + } + return result; +} + +/**********************************************************************/ + +static const IndexComponentInfo MASTER_INDEX_INFO_DATA = { + .kind = RL_KIND_MASTER_INDEX, + .name = "master index", + .saveOnly = false, + .chapterSync = false, + .multiZone = true, + .ioStorage = true, + .loader = readMasterIndex, + .saver = NULL, + .incremental = writeMasterIndex, +}; +const IndexComponentInfo *const MASTER_INDEX_INFO = &MASTER_INDEX_INFO_DATA; + +/**********************************************************************/ +static int restoreMasterIndexBody(BufferedReader **bufferedReaders, + unsigned int numReaders, + MasterIndex *masterIndex, + byte dlData[DELTA_LIST_MAX_BYTE_COUNT]) +{ + // Start by reading the "header" section of the stream + int result = startRestoringMasterIndex(masterIndex, bufferedReaders, + numReaders); + if (result != UDS_SUCCESS) { + return result; + } + // Loop to read the delta lists, stopping when they have all been processed. 
+ unsigned int z; + for (z = 0; z < numReaders; z++) { + for (;;) { + DeltaListSaveInfo dlsi; + result = readSavedDeltaList(&dlsi, dlData, bufferedReaders[z]); + if (result == UDS_END_OF_FILE) { + break; + } else if (result != UDS_SUCCESS) { + abortRestoringMasterIndex(masterIndex); + return result; + } + result = restoreDeltaListToMasterIndex(masterIndex, &dlsi, dlData); + if (result != UDS_SUCCESS) { + abortRestoringMasterIndex(masterIndex); + return result; + } + } + } + if (!isRestoringMasterIndexDone(masterIndex)) { + abortRestoringMasterIndex(masterIndex); + return logWarningWithStringError(UDS_CORRUPT_COMPONENT, + "incomplete delta list data"); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int restoreMasterIndex(BufferedReader **bufferedReaders, + unsigned int numReaders, + MasterIndex *masterIndex) +{ + byte *dlData; + int result = ALLOCATE(DELTA_LIST_MAX_BYTE_COUNT, byte, __func__, &dlData); + if (result != UDS_SUCCESS) { + return result; + } + result = restoreMasterIndexBody(bufferedReaders, numReaders, masterIndex, + dlData); + FREE(dlData); + return result; +} diff --git a/source/uds/masterIndexOps.h b/source/uds/masterIndexOps.h new file mode 100644 index 0000000..90802ac --- /dev/null +++ b/source/uds/masterIndexOps.h @@ -0,0 +1,527 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/masterIndexOps.h#1 $ + */ + +#ifndef MASTERINDEXOPS_H +#define MASTERINDEXOPS_H 1 + +#include "compiler.h" +#include "deltaIndex.h" +#include "indexComponent.h" +#include "indexConfig.h" +#include "threads.h" +#include "uds.h" + +extern const IndexComponentInfo *const MASTER_INDEX_INFO; +extern unsigned int minMasterIndexDeltaLists; + +typedef struct masterIndex MasterIndex; + +typedef struct { + size_t memoryAllocated; // Number of bytes allocated + RelTime rebalanceTime; // The number of seconds spent rebalancing + int rebalanceCount; // Number of memory rebalances + long recordCount; // The number of records in the index + long collisionCount; // The number of collision records + long discardCount; // The number of records removed + long overflowCount; // The number of UDS_OVERFLOWs detected + unsigned int numLists; // The number of delta lists + long earlyFlushes; // Number of early flushes +} MasterIndexStats; + +/* + * The MasterIndexTriage structure is used by lookupMasterIndexName(), + * which is a read-only operation that looks at the chunk name and returns + * some information used by the index to select the thread/queue/code_path + * that will process the chunk. 
+ */ +typedef struct { + uint64_t virtualChapter; // If inSampledChapter is true, then this is the + // chapter containing the entry for the chunk name + unsigned int zone; // The zone containing the chunk name + bool isSample; // If true, this chunk name belongs to the + // sampled index + bool inSampledChapter; // If true, this chunk already has an entry in the + // sampled index and virtualChapter is valid +} MasterIndexTriage; + +/* + * The MasterIndexRecord structure is used for normal index read-write + * processing of a chunk name. The first call must be to + * getMasterIndexRecord() to find the master index record for a chunk name. + * This call can be followed by putMasterIndexRecord() to add a master + * index record, or by setMasterIndexRecordChapter() to associate the chunk + * name with a different chapter, or by removeMasterIndexRecord() to delete + * a master index record. + */ +typedef struct { + // Public fields + uint64_t virtualChapter; // Chapter where the block info is found + bool isCollision; // This record is a collision + bool isFound; // This record is the block searched for + + // Private fields + unsigned char magic; // The magic number for valid records + unsigned int zoneNumber; // Zone that contains this block + MasterIndex *masterIndex; // The master index + Mutex *mutex; // Mutex that must be held while accessing + // this delta index entry; used only for + // a sampled index; otherwise is NULL + const UdsChunkName *name; // The blockname to which this record refers + DeltaIndexEntry deltaEntry; // The delta index entry for this record +} MasterIndexRecord; + +struct masterIndex { + void (*abortRestoringMasterIndex)(MasterIndex *masterIndex); + int (*abortSavingMasterIndex)(const MasterIndex *masterIndex, + unsigned int zoneNumber); + int (*finishSavingMasterIndex)(const MasterIndex *masterIndex, + unsigned int zoneNumber); + void (*freeMasterIndex)(MasterIndex *masterIndex); + size_t (*getMasterIndexMemoryUsed)(const MasterIndex *masterIndex); + int (*getMasterIndexRecord)(MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexRecord *record); + void (*getMasterIndexStats)(const MasterIndex *masterIndex, + MasterIndexStats *dense, + MasterIndexStats *sparse); + unsigned int (*getMasterIndexZone)(const MasterIndex *masterIndex, + const UdsChunkName *name); + bool (*isMasterIndexSample)(const MasterIndex *masterIndex, + const UdsChunkName *name); + bool (*isRestoringMasterIndexDone)(const MasterIndex *masterIndex); + bool (*isSavingMasterIndexDone)(const MasterIndex *masterIndex, + unsigned int zoneNumber); + int (*lookupMasterIndexName)(const MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexTriage *triage); + int (*lookupMasterIndexSampledName)(const MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexTriage *triage); + int (*restoreDeltaListToMasterIndex)(MasterIndex *masterIndex, + const DeltaListSaveInfo *dlsi, + const byte data[DELTA_LIST_MAX_BYTE_COUNT]); + void (*setMasterIndexOpenChapter)(MasterIndex *masterIndex, + uint64_t virtualChapter); + void (*setMasterIndexTag)(MasterIndex *masterIndex, byte tag); + void (*setMasterIndexZoneOpenChapter)(MasterIndex *masterIndex, + unsigned int zoneNumber, + uint64_t virtualChapter); + int (*startRestoringMasterIndex)(MasterIndex *masterIndex, + BufferedReader **bufferedReaders, + int numReaders); + int (*startSavingMasterIndex)(const MasterIndex *masterIndex, + unsigned int zoneNumber, + BufferedWriter *bufferedWriter); +}; + +/** + * Return the combined master 
index stats. + * + * @param masterIndex The master index + * @param stats Combined stats for the index + **/ +void getMasterIndexCombinedStats(const MasterIndex *masterIndex, + MasterIndexStats *stats); + +/** + * Make a new master index. + * + * @param config The configuration of the master index + * @param numZones The number of zones + * @param volumeNonce The nonce used to store the index + * @param masterIndex Location to hold new master index ptr + * + * @return error code or UDS_SUCCESS + **/ +int makeMasterIndex(const Configuration *config, unsigned int numZones, + uint64_t volumeNonce, MasterIndex **masterIndex) + __attribute__((warn_unused_result)); + +/** + * Compute the number of blocks required to save a master index of a given + * configuration. + * + * @param [in] config The configuration of a master index + * @param [in] blockSize The size of a block in bytes. + * @param [out] blockCount The resulting number of blocks. + * + * @return UDS_SUCCESS or an error code. + **/ +int computeMasterIndexSaveBlocks(const Configuration *config, + size_t blockSize, + uint64_t *blockCount) + __attribute__((warn_unused_result)); + +/** + * Restore a master index. This is exposed for unit tests. + * + * @param readers The readers to read from. + * @param numReaders The number of readers. + * @param masterIndex The master index + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +int restoreMasterIndex(BufferedReader **readers, + unsigned int numReaders, + MasterIndex *masterIndex) + __attribute__((warn_unused_result)); + +/** + * Abort restoring a master index from an input stream. + * + * @param masterIndex The master index + **/ +static INLINE void abortRestoringMasterIndex(MasterIndex *masterIndex) +{ + masterIndex->abortRestoringMasterIndex(masterIndex); +} + +/** + * Abort saving a master index to an output stream. If an error occurred + * asynchronously during the save operation, it will be dropped. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static INLINE int abortSavingMasterIndex(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + return masterIndex->abortSavingMasterIndex(masterIndex, zoneNumber); +} + +/** + * Finish saving a master index to an output stream. Force the writing of + * all of the remaining data. If an error occurred asynchronously during + * the save operation, it will be returned here. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static INLINE int finishSavingMasterIndex(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + return masterIndex->finishSavingMasterIndex(masterIndex, zoneNumber); +} + +/** + * Terminate and clean up the master index + * + * @param masterIndex The master index to terminate + **/ +static INLINE void freeMasterIndex(MasterIndex *masterIndex) +{ + masterIndex->freeMasterIndex(masterIndex); +} + +/** + * Get the number of bytes used for master index entries. 
+ * + * @param masterIndex The master index + * + * @return The number of bytes in use + **/ +static INLINE size_t getMasterIndexMemoryUsed(const MasterIndex *masterIndex) +{ + return masterIndex->getMasterIndexMemoryUsed(masterIndex); +} + +/** + * Find the master index record associated with a block name + * + * This is always the first routine to be called when dealing with a delta + * master index entry. The fields of the record parameter should be + * examined to determine the state of the record: + * + * If isFound is false, then we did not find an entry for the block name. + * Information is saved in the MasterIndexRecord so that + * putMasterIndexRecord() will insert an entry for that block name at the + * proper place. + * + * If isFound is true, then we did find an entry for the block name. + * Information is saved in the MasterIndexRecord so that the "chapter" and + * "isCollision" fields reflect the entry found. Calls to + * removeMasterIndexRecord() will remove the entry, calls to + * setMasterIndexRecordChapter() can modify the entry, and calls to + * putMasterIndexRecord() can insert a collision record with this entry. + * + * @param masterIndex The master index to search + * @param name The chunk name + * @param record Set to the info about the record searched for + * + * @return UDS_SUCCESS or an error code + **/ +static INLINE int getMasterIndexRecord(MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexRecord *record) +{ + return masterIndex->getMasterIndexRecord(masterIndex, name, record); +} + +/** + * Return the master index stats. + * + * @param masterIndex The master index + * @param dense Stats for the dense portion of the index + * @param sparse Stats for the sparse portion of the index + **/ +static INLINE void getMasterIndexStats(const MasterIndex *masterIndex, + MasterIndexStats *dense, + MasterIndexStats *sparse) +{ + masterIndex->getMasterIndexStats(masterIndex, dense, sparse); +} + +/** + * Find the master index zone associated with a chunk name + * + * @param masterIndex The master index + * @param name The chunk name + * + * @return the zone that the chunk name belongs to + **/ +static INLINE unsigned int getMasterIndexZone(const MasterIndex *masterIndex, + const UdsChunkName *name) +{ + return masterIndex->getMasterIndexZone(masterIndex, name); +} + +/** + * Determine whether a given chunk name is a hook. + * + * @param masterIndex The master index + * @param name The block name + * + * @return whether to use as sample + **/ +static INLINE bool isMasterIndexSample(const MasterIndex *masterIndex, + const UdsChunkName *name) +{ + return masterIndex->isMasterIndexSample(masterIndex, name); +} + +/** + * Have all the data been read while restoring a master index from an input + * stream? + * + * @param masterIndex The master index to restore into + * + * @return true if all the data are read + **/ +static INLINE bool isRestoringMasterIndexDone(const MasterIndex *masterIndex) +{ + return masterIndex->isRestoringMasterIndexDone(masterIndex); +} + +/** + * Have all the data been written while saving a master index to an + * output stream? If the answer is yes, it is still necessary to call + * finishSavingMasterIndex(), which will return quickly. 
+ * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * + * @return true if all the data are written + **/ +static INLINE bool isSavingMasterIndexDone(const MasterIndex *masterIndex, + unsigned int zoneNumber) +{ + return masterIndex->isSavingMasterIndexDone(masterIndex, zoneNumber); +} + +/** + * Do a quick read-only lookup of the chunk name and return information + * needed by the index code to process the chunk name. + * + * @param masterIndex The master index + * @param name The chunk name + * @param triage Information about the chunk name + * + * @return UDS_SUCCESS or an error code + **/ +static INLINE int lookupMasterIndexName(const MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexTriage *triage) +{ + return masterIndex->lookupMasterIndexName(masterIndex, name, triage); +} + +/** + * Do a quick read-only lookup of the sampled chunk name and return + * information needed by the index code to process the chunk name. + * + * @param masterIndex The master index + * @param name The chunk name + * @param triage Information about the chunk name. The zone and + * isSample fields are already filled in. Set + * inSampledChapter and virtualChapter if the chunk + * name is found in the index. + * + * @return UDS_SUCCESS or an error code + **/ +static INLINE int lookupMasterIndexSampledName(const MasterIndex *masterIndex, + const UdsChunkName *name, + MasterIndexTriage *triage) +{ + return masterIndex->lookupMasterIndexSampledName(masterIndex, name, triage); +} + +/** + * Create a new record associated with a block name. + * + * @param record The master index record found by getRecord() + * @param virtualChapter The chapter number where block info is found + * + * @return UDS_SUCCESS or an error code + **/ +int putMasterIndexRecord(MasterIndexRecord *record, uint64_t virtualChapter) + __attribute__((warn_unused_result)); + +/** + * Remove an existing record. + * + * @param record The master index record found by getRecord() + * + * @return UDS_SUCCESS or an error code + **/ +int removeMasterIndexRecord(MasterIndexRecord *record) + __attribute__((warn_unused_result)); + +/** + * Restore a saved delta list + * + * @param masterIndex The master index to restore into + * @param dlsi The DeltaListSaveInfo describing the delta list + * @param data The saved delta list bit stream + * + * @return error code or UDS_SUCCESS + **/ +static INLINE int restoreDeltaListToMasterIndex(MasterIndex *masterIndex, + const DeltaListSaveInfo *dlsi, + const byte data[DELTA_LIST_MAX_BYTE_COUNT]) +{ + return masterIndex->restoreDeltaListToMasterIndex(masterIndex, dlsi, data); +} + +/** + * Set the open chapter number. The master index will be modified to index + * the proper number of chapters ending with the new open chapter. + * + * In normal operation, the virtual chapter number will be the next chapter + * following the currently open chapter. We will advance the master index + * one chapter forward in the virtual chapter space, invalidating the + * oldest chapter in the index and be prepared to add index entries for the + * newly opened chapter. + * + * In abnormal operation we make a potentially large change to the range of + * chapters being indexed. This happens when we are replaying chapters or + * rebuilding an entire index. If we move the open chapter forward, we + * will invalidate many chapters (potentially the entire index). 
If we + * move the open chapter backward, we invalidate any entry in the newly + * open chapter and any higher numbered chapter (potentially the entire + * index). + * + * @param masterIndex The master index + * @param virtualChapter The new open chapter number + **/ +static INLINE void setMasterIndexOpenChapter(MasterIndex *masterIndex, + uint64_t virtualChapter) +{ + masterIndex->setMasterIndexOpenChapter(masterIndex, virtualChapter); +} + +/** + * Set the chapter number associated with a block name. + * + * @param record The master index record found by getRecord() + * @param virtualChapter The chapter number where block info is now found. + * + * @return UDS_SUCCESS or an error code + **/ +int setMasterIndexRecordChapter(MasterIndexRecord *record, uint64_t chapter) + __attribute__((warn_unused_result)); + +/** + * Set the tag value used when saving and/or restoring a master index. + * + * @param masterIndex The master index + * @param tag The tag value + **/ +static INLINE void setMasterIndexTag(MasterIndex *masterIndex, byte tag) +{ + masterIndex->setMasterIndexTag(masterIndex, tag); +} + +/** + * Set the open chapter number on a zone. The master index zone will be + * modified to index the proper number of chapters ending with the new open + * chapter. + * + * @param masterIndex The master index + * @param zoneNumber The zone number + * @param virtualChapter The new open chapter number + **/ +static INLINE void setMasterIndexZoneOpenChapter(MasterIndex *masterIndex, + unsigned int zoneNumber, + uint64_t virtualChapter) +{ + masterIndex->setMasterIndexZoneOpenChapter(masterIndex, zoneNumber, + virtualChapter); +} + +/** + * Start restoring the master index from multiple buffered readers + * + * @param masterIndex The master index to restore into + * @param bufferedReaders The buffered reader to read the master index from + * @param numReaders The number of buffered readers + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static INLINE int startRestoringMasterIndex(MasterIndex *masterIndex, + BufferedReader **bufferedReaders, + int numReaders) +{ + return masterIndex->startRestoringMasterIndex(masterIndex, bufferedReaders, + numReaders); +} + +/** + * Start saving a master index to a buffered output stream. + * + * @param masterIndex The master index + * @param zoneNumber The number of the zone to save + * @param bufferedWriter The index state component being written + * + * @return UDS_SUCCESS on success, or an error code on failure + **/ +static INLINE int startSavingMasterIndex(const MasterIndex *masterIndex, + unsigned int zoneNumber, + BufferedWriter *bufferedWriter) +{ + return masterIndex->startSavingMasterIndex(masterIndex, zoneNumber, + bufferedWriter); +} + +#endif /* MASTERINDEXOPS_H */ diff --git a/source/uds/memoryAlloc.c b/source/uds/memoryAlloc.c new file mode 100644 index 0000000..e47494c --- /dev/null +++ b/source/uds/memoryAlloc.c @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/memoryAlloc.c#1 $ + */ + +#include "memoryAlloc.h" + +#include "stringUtils.h" + +/**********************************************************************/ +int duplicateString(const char *string, const char *what, char **newString) +{ + return memdup(string, strlen(string) + 1, what, newString); +} + +/**********************************************************************/ +int memdup(const void *buffer, size_t size, const char *what, void *dupPtr) +{ + byte *dup; + int result = ALLOCATE(size, byte, what, &dup); + if (result != UDS_SUCCESS) { + return result; + } + + memcpy(dup, buffer, size); + *((void **) dupPtr) = dup; + return UDS_SUCCESS; +} diff --git a/source/uds/memoryAlloc.h b/source/uds/memoryAlloc.h new file mode 100644 index 0000000..c669e2b --- /dev/null +++ b/source/uds/memoryAlloc.h @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/memoryAlloc.h#2 $ + */ + +#ifndef MEMORY_ALLOC_H +#define MEMORY_ALLOC_H 1 + +#include + +#include "compiler.h" +#include "cpu.h" +#include "memoryDefs.h" +#include "permassert.h" + +/** + * Allocate storage based on memory size and alignment, logging an error if + * the allocation fails. The memory will be zeroed. + * + * @param size The size of an object + * @param align The required alignment + * @param what What is being allocated (for error logging) + * @param ptr A pointer to hold the allocated memory + * + * @return UDS_SUCCESS or an error code + **/ +int allocateMemory(size_t size, size_t align, const char *what, void *ptr) + __attribute__((warn_unused_result)); + +/** + * Free storage + * + * @param ptr The memory to be freed + **/ +void freeMemory(void *ptr); + +/** + * Allocate storage based on element counts, sizes, and alignment. + * + * This is a generalized form of our allocation use case: It allocates + * an array of objects, optionally preceded by one object of another + * type (i.e., a struct with trailing variable-length array), with the + * alignment indicated. + * + * Why is this inline? The sizes and alignment will always be + * constant, when invoked through the macros below, and often the + * count will be a compile-time constant 1 or the number of extra + * bytes will be a compile-time constant 0. So at least some of the + * arithmetic can usually be optimized away, and the run-time + * selection between allocation functions always can. In many cases, + * it'll boil down to just a function call with a constant size. 
+ * + * @param count The number of objects to allocate + * @param size The size of an object + * @param extra The number of additional bytes to allocate + * @param align The required alignment + * @param what What is being allocated (for error logging) + * @param ptr A pointer to hold the allocated memory + * + * @return UDS_SUCCESS or an error code + **/ +static INLINE int doAllocation(size_t count, + size_t size, + size_t extra, + size_t align, + const char *what, + void *ptr) +{ + size_t totalSize = count * size + extra; + // Overflow check: + if ((size > 0) && (count > ((SIZE_MAX - extra) / size))) { + /* + * This is kind of a hack: We rely on the fact that SIZE_MAX would + * cover the entire address space (minus one byte) and thus the + * system can never allocate that much and the call will always + * fail. So we can report an overflow as "out of memory" by asking + * for "merely" SIZE_MAX bytes. + */ + totalSize = SIZE_MAX; + } + + return allocateMemory(totalSize, align, what, ptr); +} + +/** + * Reallocate dynamically allocated memory. There are no alignment guarantees + * for the reallocated memory. + * + * @param ptr The memory to reallocate. + * @param oldSize The old size of the memory + * @param size The new size to allocate + * @param what What is being allocated (for error logging) + * @param newPtr A pointer to hold the reallocated pointer + * + * @return UDS_SUCCESS or an error code + **/ +int reallocateMemory(void *ptr, + size_t oldSize, + size_t size, + const char *what, + void *newPtr) + __attribute__((warn_unused_result)); + +/** + * Allocate one or more elements of the indicated type, logging an + * error if the allocation fails. The memory will be zeroed. + * + * @param COUNT The number of objects to allocate + * @param TYPE The type of objects to allocate. This type determines the + * alignment of the allocated memory. + * @param WHAT What is being allocated (for error logging) + * @param PTR A pointer to hold the allocated memory + * + * @return UDS_SUCCESS or an error code + **/ +#define ALLOCATE(COUNT, TYPE, WHAT, PTR) \ + doAllocation(COUNT, sizeof(TYPE), 0, __alignof__(TYPE), WHAT, PTR) + +/** + * Allocate one object of an indicated type, followed by one or more + * elements of a second type, logging an error if the allocation + * fails. The memory will be zeroed. + * + * @param TYPE1 The type of the primary object to allocate. This type + * determines the alignment of the allocated memory. + * @param COUNT The number of objects to allocate + * @param TYPE2 The type of array objects to allocate + * @param WHAT What is being allocated (for error logging) + * @param PTR A pointer to hold the allocated memory + * + * @return UDS_SUCCESS or an error code + **/ +#define ALLOCATE_EXTENDED(TYPE1, COUNT, TYPE2, WHAT, PTR) \ + __extension__ ({ \ + TYPE1 **_ptr = (PTR); \ + STATIC_ASSERT(__alignof__(TYPE1) >= __alignof__(TYPE2)); \ + int _result = doAllocation(COUNT, sizeof(TYPE2), sizeof(TYPE1), \ + __alignof__(TYPE1), WHAT, _ptr); \ + _result; \ + }) + +/** + * Free memory allocated with ALLOCATE(). + * + * @param ptr Pointer to the memory to free + **/ +static INLINE void FREE(void *ptr) +{ + freeMemory(ptr); +} + +/** + * Allocate memory starting on a cache line boundary, logging an error if the + * allocation fails. The memory will be zeroed. 
+ * + * @param size The number of bytes to allocate + * @param what What is being allocated (for error logging) + * @param ptr A pointer to hold the allocated memory + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static INLINE int allocateCacheAligned(size_t size, + const char *what, + void *ptr) +{ + return allocateMemory(size, CACHE_LINE_BYTES, what, ptr); +} + +/** + * Duplicate a string. + * + * @param string The string to duplicate + * @param what What is being allocated (for error logging) + * @param newString A pointer to hold the duplicated string + * + * @return UDS_SUCCESS or an error code + **/ +int duplicateString(const char *string, const char *what, char **newString) + __attribute__((warn_unused_result)); + +/** + * Duplicate a buffer, logging an error if the allocation fails. + * + * @param ptr The buffer to copy + * @param size The size of the buffer + * @param what What is being duplicated (for error logging) + * @param dupPtr A pointer to hold the allocated array + * + * @return UDS_SUCCESS or ENOMEM + **/ +int memdup(const void *ptr, size_t size, const char *what, void *dupPtr) + __attribute__((warn_unused_result)); + +/** + * Wrapper which permits freeing a const pointer. + * + * @param pointer the pointer to be freed + **/ +static INLINE void freeConst(const void *pointer) +{ + union { + const void *constP; + void *notConst; + } u = { .constP = pointer }; + FREE(u.notConst); +} + +/** + * Wrapper which permits freeing a volatile pointer. + * + * @param pointer the pointer to be freed + **/ +static INLINE void freeVolatile(volatile void *pointer) +{ + union { + volatile void *volP; + void *notVol; + } u = { .volP = pointer }; + FREE(u.notVol); +} + +#endif /* MEMORY_ALLOC_H */ diff --git a/source/uds/memoryDefs.h b/source/uds/memoryDefs.h new file mode 100644 index 0000000..3f8041e --- /dev/null +++ b/source/uds/memoryDefs.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/memoryDefs.h#2 $ + */ + +#ifndef LINUX_KERNEL_MEMORY_DEFS_H +#define LINUX_KERNEL_MEMORY_DEFS_H 1 + +#include // for PAGE_SIZE + +#include "compiler.h" +#include "threadRegistry.h" +#include "typeDefs.h" + +/** + * Allocate one or more elements of the indicated type, aligning them + * on the boundary that will allow them to be used in io, logging an + * error if the allocation fails. The memory will be zeroed. 
+ * + * @param COUNT The number of objects to allocate + * @param TYPE The type of objects to allocate + * @param WHAT What is being allocated (for error logging) + * @param PTR A pointer to hold the allocated memory + * + * @return UDS_SUCCESS or an error code + **/ +#define ALLOCATE_IO_ALIGNED(COUNT, TYPE, WHAT, PTR) \ + doAllocation(COUNT, sizeof(TYPE), 0, PAGE_SIZE, WHAT, PTR) + +/** + * Allocate one element of the indicated type immediately, failing if the + * required memory is not immediately available. + * + * @param TYPE The type of objects to allocate + * @param WHAT What is being allocated (for error logging) + * + * @return pointer to the memory, or NULL if the memory is not available. + **/ +#define ALLOCATE_NOWAIT(TYPE, WHAT) allocateMemoryNowait(sizeof(TYPE), WHAT) + +/** + * Perform termination of the memory allocation subsystem. + **/ +void memoryExit(void); + +/** + * Perform initialization of the memory allocation subsystem. + **/ +void memoryInit(void); + +/** + * Allocate storage based on memory size, failing immediately if the required + * memory is not available. The memory will be zeroed. + * + * @param size The size of an object. + * @param what What is being allocated (for error logging) + * + * @return pointer to the allocated memory, or NULL if the required space is + * not available. + **/ +void *allocateMemoryNowait(size_t size, const char *what) + __attribute__((warn_unused_result)); + + +/** + * Register the current thread as an allocating thread. + * + * An optional flag location can be supplied indicating whether, at + * any given point in time, the threads associated with that flag + * should be allocating storage. If the flag is false, a message will + * be logged. + * + * If no flag is supplied, the thread is always allowed to allocate + * storage without complaint. + * + * @param newThread RegisteredThread structure to use for the current thread + * @param flagPtr Location of the allocation-allowed flag + **/ +void registerAllocatingThread(RegisteredThread *newThread, + const bool *flagPtr); + +/** + * Unregister the current thread as an allocating thread. + **/ +void unregisterAllocatingThread(void); + +/** + * Get the memory statistics. + * + * @param bytesUsed A pointer to hold the number of bytes in use + * @param peakBytesUsed A pointer to hold the maximum value bytesUsed has + * attained + **/ +void getMemoryStats(uint64_t *bytesUsed, uint64_t *peakBytesUsed); + +/** + * Report stats on any allocated memory that we're tracking. + * + * Not all allocation types are guaranteed to be tracked in bytes + * (e.g., bios). + **/ +void reportMemoryUsage(void); + + +#endif /* LINUX_KERNEL_MEMORY_DEFS_H */ diff --git a/source/uds/memoryLinuxKernel.c b/source/uds/memoryLinuxKernel.c new file mode 100644 index 0000000..5a42583 --- /dev/null +++ b/source/uds/memoryLinuxKernel.c @@ -0,0 +1,426 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/memoryLinuxKernel.c#6 $ + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "compilerDefs.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + + +/* + ****************************************************************************** + * Production: UDS and VDO keep track of which threads are allowed to allocate + * memory freely, and which threads must be careful to not do a memory + * allocation that does an I/O request. The allocatingThreads ThreadsRegistry + * and its associated methods implement this tracking. + */ + +static ThreadRegistry allocatingThreads; + +/*****************************************************************************/ +static bool allocationsAllowed(void) +{ + const bool *pointer = lookupThread(&allocatingThreads); + return pointer != NULL ? *pointer : false; +} + +/*****************************************************************************/ +void registerAllocatingThread(RegisteredThread *newThread, const bool *flagPtr) +{ + if (flagPtr == NULL) { + static const bool allocationAlwaysAllowed = true; + flagPtr = &allocationAlwaysAllowed; + } + registerThread(&allocatingThreads, newThread, flagPtr); +} + +/*****************************************************************************/ +void unregisterAllocatingThread(void) +{ + unregisterThread(&allocatingThreads); +} + +/* + ****************************************************************************** + * Production: We track how much memory has been allocated and freed. When we + * unload the UDS module, we log an error if we have not freed all the memory + * that we allocated. Nearly all memory allocation and freeing is done using + * this module. + * + * We do not use kernel functions like the kvasprintf() method, which allocate + * memory indirectly using kmalloc. + * + * These data structures and methods are used to track the amount of memory + * used. + */ + +// We allocate very few large objects, and allocation/deallocation isn't done +// in a performance-critical stage for us, so a linked list should be fine. 
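/*
 * A minimal illustrative sketch of how a kernel thread that may issue I/O is
 * expected to use the registry above: it registers itself with an
 * "allocations allowed" flag so that allocateMemory() can switch to
 * memalloc_noio_save() while the flag is false.  The thread function and
 * flag names here are hypothetical, not part of this module.
 */
static bool exampleThreadMayAllocate = true;

static int exampleThreadMain(void *arg __attribute__((unused)))
{
  RegisteredThread thread;
  void *buffer;
  int result;

  registerAllocatingThread(&thread, &exampleThreadMayAllocate);
  result = allocateMemory(4096, 0, "example buffer", &buffer);
  if (result == UDS_SUCCESS) {
    freeMemory(buffer);
  }
  unregisterAllocatingThread();
  return result;
}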
+typedef struct vmallocBlockInfo { + void *ptr; + size_t size; + struct vmallocBlockInfo *next; +} VmallocBlockInfo; + +static struct { + spinlock_t lock; + size_t kmallocBlocks; + size_t kmallocBytes; + size_t vmallocBlocks; + size_t vmallocBytes; + size_t peakBytes; + VmallocBlockInfo *vmallocList; +} memoryStats __cacheline_aligned; + +/*****************************************************************************/ +static void updatePeakUsage(void) +{ + size_t totalBytes = memoryStats.kmallocBytes + memoryStats.vmallocBytes; + if (totalBytes > memoryStats.peakBytes) { + memoryStats.peakBytes = totalBytes; + } +} + +/*****************************************************************************/ +static void addKmallocBlock(size_t size) +{ + unsigned long flags; + spin_lock_irqsave(&memoryStats.lock, flags); + memoryStats.kmallocBlocks++; + memoryStats.kmallocBytes += size; + updatePeakUsage(); + spin_unlock_irqrestore(&memoryStats.lock, flags); +} + +/*****************************************************************************/ +static void removeKmallocBlock(size_t size) +{ + unsigned long flags; + spin_lock_irqsave(&memoryStats.lock, flags); + memoryStats.kmallocBlocks--; + memoryStats.kmallocBytes -= size; + spin_unlock_irqrestore(&memoryStats.lock, flags); +} + +/*****************************************************************************/ +static void addVmallocBlock(VmallocBlockInfo *block) +{ + unsigned long flags; + spin_lock_irqsave(&memoryStats.lock, flags); + block->next = memoryStats.vmallocList; + memoryStats.vmallocList = block; + memoryStats.vmallocBlocks++; + memoryStats.vmallocBytes += block->size; + updatePeakUsage(); + spin_unlock_irqrestore(&memoryStats.lock, flags); +} + +/*****************************************************************************/ +static void removeVmallocBlock(void *ptr) +{ + VmallocBlockInfo *block, **blockPtr; + unsigned long flags; + spin_lock_irqsave(&memoryStats.lock, flags); + for (blockPtr = &memoryStats.vmallocList; + (block = *blockPtr) != NULL; + blockPtr = &block->next) { + if (block->ptr == ptr) { + *blockPtr = block->next; + memoryStats.vmallocBlocks--; + memoryStats.vmallocBytes -= block->size; + break; + } + } + spin_unlock_irqrestore(&memoryStats.lock, flags); + if (block != NULL) { + FREE(block); + } else { + logInfo("attempting to remove ptr %" PRIptr " not found in vmalloc list", + ptr); + } +} + + + +/** + * Determine whether allocating a memory block should use kmalloc or vmalloc. + * + * vmalloc can allocate any integral number of pages. + * + * kmalloc can allocate any number of bytes up to a configured limit, which + * defaults to 8 megabytes on some of our systems. kmalloc is especially good + * when memory is being both allocated and freed, and it does this efficiently + * in a multi CPU environment. + * + * kmalloc usually rounds the size of the block up to the next power of two. + * So when the requested block is bigger than PAGE_SIZE / 2 bytes, kmalloc will + * never give you less space than the corresponding vmalloc allocation. + * Sometimes vmalloc will use less overhead than kmalloc. + * + * The advantages of kmalloc do not help out UDS or VDO, because we allocate + * all our memory up front and do not free and reallocate it. Sometimes we + * have problems using kmalloc, because the Linux memory page map can become so + * fragmented that kmalloc will not give us a 32KB chunk. We have used vmalloc + * as a backup to kmalloc in the past, and a followup vmalloc of 32KB will + * work. 
But there is no strong case to be made for using kmalloc over vmalloc + * for these size chunks. + * + * The kmalloc/vmalloc boundary is set at 4KB, and kmalloc gets the 4KB + * requests. There is no strong reason for favoring either kmalloc or vmalloc + * for 4KB requests, except that the keeping of vmalloc statistics uses a + * linked list implementation. Using a simple test, this choice of boundary + * results in 132 vmalloc calls. Using vmalloc for requests of exactly 4KB + * results in an additional 6374 vmalloc calls, which will require a change to + * the code that tracks vmalloc statistics. + * + * @param size How many bytes to allocate + **/ +static INLINE bool useKmalloc(size_t size) +{ + return size <= PAGE_SIZE; +} + +/*****************************************************************************/ +int allocateMemory(size_t size, size_t align, const char *what, void *ptr) +{ + if (ptr == NULL) { + return UDS_INVALID_ARGUMENT; + } + if (size == 0) { + *((void **) ptr) = NULL; + return UDS_SUCCESS; + } + + + /* + * The __GFP_RETRY_MAYFAIL means: The VM implementation will retry memory + * reclaim procedures that have previously failed if there is some indication + * that progress has been made else where. It can wait for other tasks to + * attempt high level approaches to freeing memory such as compaction (which + * removes fragmentation) and page-out. There is still a definite limit to + * the number of retries, but it is a larger limit than with __GFP_NORETRY. + * Allocations with this flag may fail, but only when there is genuinely + * little unused memory. While these allocations do not directly trigger the + * OOM killer, their failure indicates that the system is likely to need to + * use the OOM killer soon. The caller must handle failure, but can + * reasonably do so by failing a higher-level request, or completing it only + * in a much less efficient manner. + */ + const gfp_t gfpFlags = GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL; + + bool allocationsRestricted = !allocationsAllowed(); + unsigned int noioFlags; + if (allocationsRestricted) { + noioFlags = memalloc_noio_save(); + } + + unsigned long startTime = jiffies; + void *p = NULL; + if (useKmalloc(size) && (align < PAGE_SIZE)) { + p = kmalloc(size, gfpFlags | __GFP_NOWARN); + if (p == NULL) { + /* + * If we had just done kmalloc(size, gfpFlags) it is possible that the + * allocation would fail (see VDO-3688). The kernel log would then + * contain a long report about the failure. Although the failure occurs + * because there is no page available to allocate, by the time it logs + * the available space, there is a page available. So hopefully a short + * sleep will allow the page reclaimer to free a single page, which is + * all that we need. + */ + msleep(1); + p = kmalloc(size, gfpFlags); + } + if (p != NULL) { + addKmallocBlock(ksize(p)); + } + } else { + VmallocBlockInfo *block; + if (ALLOCATE(1, VmallocBlockInfo, __func__, &block) == UDS_SUCCESS) { + /* + * If we just do __vmalloc(size, gfpFlags, PAGE_KERNEL) it is possible + * that the allocation will fail (see VDO-3661). The kernel log will + * then contain a long report about the failure. Although the failure + * occurs because there are not enough pages available to allocate, by + * the time it logs the available space, there may enough pages available + * for smaller allocations. So hopefully a short sleep will allow the + * page reclaimer to free enough pages for us. 
+ * + * For larger allocations, the kernel page_alloc code is racing against + * the page reclaimer. If the page reclaimer can stay ahead of + * page_alloc, the __vmalloc will succeed. But if page_alloc overtakes + * the page reclaimer, the allocation fails. It is possible that more + * retries will succeed. + */ + for (;;) { + p = __vmalloc(size, gfpFlags | __GFP_NOWARN, PAGE_KERNEL); + // Try again unless we succeeded or more than 1 second has elapsed. + if ((p != NULL) || (jiffies_to_msecs(jiffies - startTime) > 1000)) { + break; + } + msleep(1); + } + if (p == NULL) { + // Try one more time, logging a failure for this call. + p = __vmalloc(size, gfpFlags, PAGE_KERNEL); + } + if (p == NULL) { + FREE(block); + } else { + block->ptr = p; + block->size = PAGE_ALIGN(size); + addVmallocBlock(block); + } + } + } + + if (allocationsRestricted) { + memalloc_noio_restore(noioFlags); + } + + if (p == NULL) { + unsigned int duration = jiffies_to_msecs(jiffies - startTime); + logError("Could not allocate %zu bytes for %s in %u msecs", + size, what, duration); + return ENOMEM; + } + *((void **) ptr) = p; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +void *allocateMemoryNowait(size_t size, + const char *what __attribute__((unused))) +{ + void *p = kmalloc(size, GFP_NOWAIT | __GFP_ZERO); + if (p != NULL) { + addKmallocBlock(ksize(p)); + } + return p; +} + +/*****************************************************************************/ +void freeMemory(void *ptr) +{ + if (ptr != NULL) { + if (is_vmalloc_addr(ptr)) { + removeVmallocBlock(ptr); + vfree(ptr); + } else { + removeKmallocBlock(ksize(ptr)); + kfree(ptr); + } + } +} + +/*****************************************************************************/ +int reallocateMemory(void *ptr, + size_t oldSize, + size_t size, + const char *what, + void *newPtr) +{ + // Handle special case of zero sized result + if (size == 0) { + FREE(ptr); + *(void **)newPtr = NULL; + return UDS_SUCCESS; + } + + int result = ALLOCATE(size, char, what, newPtr); + if (result != UDS_SUCCESS) { + return result; + } + + if (ptr != NULL) { + if (oldSize < size) { + size = oldSize; + } + memcpy(*((void **) newPtr), ptr, size); + FREE(ptr); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +void memoryInit(void) +{ + + spin_lock_init(&memoryStats.lock); + initializeThreadRegistry(&allocatingThreads); +} + + +/*****************************************************************************/ +void memoryExit(void) +{ + + ASSERT_LOG_ONLY(memoryStats.kmallocBytes == 0, + "kmalloc memory used (%zd bytes in %zd blocks)" + " is returned to the kernel", + memoryStats.kmallocBytes, memoryStats.kmallocBlocks); + ASSERT_LOG_ONLY(memoryStats.vmallocBytes == 0, + "vmalloc memory used (%zd bytes in %zd blocks)" + " is returned to the kernel", + memoryStats.vmallocBytes, memoryStats.vmallocBlocks); + logDebug("%s peak usage %zd bytes", THIS_MODULE->name, + memoryStats.peakBytes); +} + +/**********************************************************************/ +void getMemoryStats(uint64_t *bytesUsed, uint64_t *peakBytesUsed) +{ + unsigned long flags; + spin_lock_irqsave(&memoryStats.lock, flags); + *bytesUsed = memoryStats.kmallocBytes + memoryStats.vmallocBytes; + *peakBytesUsed = memoryStats.peakBytes; + spin_unlock_irqrestore(&memoryStats.lock, flags); +} + +/**********************************************************************/ +void reportMemoryUsage() +{ + unsigned long 
flags; + spin_lock_irqsave(&memoryStats.lock, flags); + uint64_t kmallocBlocks = memoryStats.kmallocBlocks; + uint64_t kmallocBytes = memoryStats.kmallocBytes; + uint64_t vmallocBlocks = memoryStats.vmallocBlocks; + uint64_t vmallocBytes = memoryStats.vmallocBytes; + uint64_t peakUsage = memoryStats.peakBytes; + spin_unlock_irqrestore(&memoryStats.lock, flags); + uint64_t totalBytes = kmallocBytes + vmallocBytes; + logInfo("current module memory tracking" + " (actual allocation sizes, not requested):"); + logInfo(" %llu bytes in %llu kmalloc blocks", + kmallocBytes, kmallocBlocks); + logInfo(" %llu bytes in %llu vmalloc blocks", + vmallocBytes, vmallocBlocks); + logInfo(" total %llu bytes, peak usage %llu bytes", + totalBytes, peakUsage); +} diff --git a/source/uds/murmur/MurmurHash3.c b/source/uds/murmur/MurmurHash3.c new file mode 100644 index 0000000..42af11a --- /dev/null +++ b/source/uds/murmur/MurmurHash3.c @@ -0,0 +1,379 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +// Note - The x86 and x64 versions do _not_ produce the same results, as the +// algorithms are optimized for their respective platforms. You can still +// compile and run any of them on any platform, but your performance with the +// non-native version will be less than optimal. + +#include "MurmurHash3.h" + +#include "cpu.h" + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +#define FORCE_INLINE __forceinline + +#include + +#define ROTL32(x,y) _rotl(x,y) +#define ROTL64(x,y) _rotl64(x,y) + +#define BIG_CONSTANT(x) (x) + +// Other compilers + +#else // defined(_MSC_VER) + +#if __GNUC__ >= 7 +#pragma GCC diagnostic warning "-Wimplicit-fallthrough=0" +#endif + +#define FORCE_INLINE __attribute__((always_inline)) inline + +static inline uint32_t rotl32 ( uint32_t x, int8_t r ) +{ + return (x << r) | (x >> (32 - r)); +} + +static inline uint64_t rotl64 ( uint64_t x, int8_t r ) +{ + return (x << r) | (x >> (64 - r)); +} + +#define ROTL32(x,y) rotl32(x,y) +#define ROTL64(x,y) rotl64(x,y) + +#define BIG_CONSTANT(x) (x##LLU) + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- +// Block read - if your platform needs to do endian-swapping or can only +// handle aligned reads, do the conversion here + +static FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i ) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return p[i]; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return __builtin_bswap32(p[i]); +#else +#error "can't figure out byte order" +#endif +} + +static FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i ) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return p[i]; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return __builtin_bswap64(p[i]); +#else +#error "can't figure out byte order" +#endif +} + +// Block write +static FORCE_INLINE void putblock (uint32_t *p, int i, uint32_t value) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + p[i] = value; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + p[i] = __builtin_bswap32(value); +#else +#error "can't figure out byte order" +#endif +} + +static FORCE_INLINE void putblock64 (uint64_t *p, int i, uint64_t value) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + p[i] = 
value; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + p[i] = __builtin_bswap64(value); +#else +#error "can't figure out byte order" +#endif +} + +//----------------------------------------------------------------------------- +// Finalization mix - force all bits of a hash block to avalanche + +static FORCE_INLINE uint32_t fmix32 ( uint32_t h ) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +//---------- + +static FORCE_INLINE uint64_t fmix64 ( uint64_t k ) +{ + k ^= k >> 33; + k *= BIG_CONSTANT(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + uint32_t c1 = 0xcc9e2d51; + uint32_t c2 = 0x1b873593; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); + + int i; + for(i = -nblocks; i; i++) + { + uint32_t k1 = getblock(blocks,i); + + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = ROTL32(h1,13); + h1 = h1*5+0xe6546b64; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + + switch(len & 3) + { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + default: break; + }; + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix32(h1); + + putblock(out, 0, h1); +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_128 ( const void * key, const int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint32_t h1 = seed; + uint32_t h2 = seed; + uint32_t h3 = seed; + uint32_t h4 = seed; + + uint32_t c1 = 0x239b961b; + uint32_t c2 = 0xab0e9789; + uint32_t c3 = 0x38b34ae5; + uint32_t c4 = 0xa1e38b93; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); + + int i; + for(i = -nblocks; i; i++) + { + uint32_t k1 = getblock(blocks,i*4+0); + uint32_t k2 = getblock(blocks,i*4+1); + uint32_t k3 = getblock(blocks,i*4+2); + uint32_t k4 = getblock(blocks,i*4+3); + + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + + h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; + + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; + + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; + + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint32_t k1 = 0; + uint32_t k2 = 0; + uint32_t k3 = 0; + uint32_t k4 = 0; + + switch(len & 15) + { + case 15: k4 ^= tail[14] << 16; + case 14: k4 ^= tail[13] << 8; + case 13: k4 ^= tail[12] << 0; + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + case 12: k3 ^= tail[11] << 24; + case 11: k3 ^= tail[10] << 16; + case 10: k3 ^= tail[ 9] << 8; + case 9: k3 ^= tail[ 8] << 0; + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + case 8: k2 ^= tail[ 7] << 24; + case 7: k2 ^= tail[ 6] << 16; + case 6: k2 ^= tail[ 5] << 8; + case 5: k2 ^= tail[ 4] << 0; + k2 *= 
c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + case 4: k1 ^= tail[ 3] << 24; + case 3: k1 ^= tail[ 2] << 16; + case 2: k1 ^= tail[ 1] << 8; + case 1: k1 ^= tail[ 0] << 0; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + default: break; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + h1 = fmix32(h1); + h2 = fmix32(h2); + h3 = fmix32(h3); + h4 = fmix32(h4); + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + putblock((uint32_t*)out, 0, h1); + putblock((uint32_t*)out, 1, h2); + putblock((uint32_t*)out, 2, h3); + putblock((uint32_t*)out, 3, h4); +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x64_128 ( const void * key, const int len, + const uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint64_t h1 = seed; + uint64_t h2 = seed; + + uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); + uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); + + //---------- + // body + + const uint64_t * blocks = (const uint64_t *)(data); + + int i; + for(i = 0; i < nblocks; i++) + { + uint64_t k1 = getblock64(blocks,i*2+0); + uint64_t k2 = getblock64(blocks,i*2+1); + + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + + h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; + + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch(len & 15) + { + case 15: k2 ^= ((uint64_t)tail[14]) << 48; + case 14: k2 ^= ((uint64_t)tail[13]) << 40; + case 13: k2 ^= ((uint64_t)tail[12]) << 32; + case 12: k2 ^= ((uint64_t)tail[11]) << 24; + case 11: k2 ^= ((uint64_t)tail[10]) << 16; + case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; + case 9: k2 ^= ((uint64_t)tail[ 8]) << 0; + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; + case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; + case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; + case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; + case 4: k1 ^= ((uint64_t)tail[ 3]) << 24; + case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; + case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; + case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + default: break; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + putblock64((uint64_t*)out, 0, h1); + putblock64((uint64_t*)out, 1, h2); +} diff --git a/source/uds/murmur/MurmurHash3.h b/source/uds/murmur/MurmurHash3.h new file mode 100644 index 0000000..bebb8fa --- /dev/null +++ b/source/uds/murmur/MurmurHash3.h @@ -0,0 +1,44 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. 
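//-----------------------------------------------------------------------------
// A minimal usage sketch of the 128-bit variant, assuming a userspace build
// with <stdint.h> and <string.h> available; the helper name and the choice of
// taking the low 8 bytes of the digest are illustrative only.  (nonce.c below
// follows a similar pattern, extracting 8 bytes of the 16-byte digest with
// getUInt64LE().)

#include <stdint.h>
#include <string.h>

static uint64_t exampleHash64(const void *data, size_t length, uint32_t seed)
{
  uint8_t digest[16];
  uint64_t low;

  MurmurHash3_x64_128(data, (int) length, seed, digest);
  memcpy(&low, digest, sizeof(low));  // first 8 bytes, host byte order
  return low;
}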
+ +#ifndef _MURMURHASH3_H_ +#define _MURMURHASH3_H_ + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Linux kernel + +#ifdef __KERNEL__ +# include + +// Microsoft Visual Studio + +#else // defined(__KERNEL__) +# if defined(_MSC_VER) + + typedef unsigned char uint8_t; + typedef unsigned long uint32_t; + typedef unsigned __int64 uint64_t; + +// Other compilers + +# else // defined(_MSC_VER) + +# include + +# endif // !defined(_MSC_VER) +#endif // !defined(__KERNEL__) + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); + +void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); + +void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); + +//----------------------------------------------------------------------------- + +#endif // _MURMURHASH3_H_ diff --git a/source/uds/nonce.c b/source/uds/nonce.c new file mode 100644 index 0000000..43b0f80 --- /dev/null +++ b/source/uds/nonce.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/nonce.c#3 $ + */ + +#include "nonce.h" + +#include "murmur/MurmurHash3.h" +#include "numeric.h" +#include "random.h" +#include "stringUtils.h" +#include "timeUtils.h" + +/*****************************************************************************/ +static uint64_t hashStuff(uint64_t start, const void *data, size_t len) +{ + uint32_t seed = start ^ (start >> 27); + byte hashBuffer[16]; + MurmurHash3_x64_128(data, len, seed, hashBuffer); + return getUInt64LE(hashBuffer + 4); +} + +/*****************************************************************************/ +static void *memput(void *buf, void *end, const void *data, size_t len) +{ + byte *bp = buf; + byte *be = end; + + size_t chunk = minSizeT(len, be - bp); + memcpy(bp, data, chunk); + return bp + chunk; +} + +/*****************************************************************************/ +size_t createUniqueNonceData(byte *buffer, size_t length) +{ + AbsTime now = currentTime(CLOCK_REALTIME); + + byte *be = buffer + length; + byte *bp = memput(buffer, be, &now, sizeof(now)); + + uint32_t rand = randomInRange(1, (1<<30) - 1); + + bp = memput(bp, be, &rand, sizeof(rand)); + + while (bp < be) { + size_t n = minSizeT(be - bp, bp - buffer); + memcpy(bp, buffer, n); + bp += n; + } + + return bp - buffer; +} + +/*****************************************************************************/ +uint64_t generateMasterNonce(const void *data, size_t len) +{ + return hashStuff(0xa1b1e0fc, data, len); +} + +/*****************************************************************************/ +uint64_t generateSecondaryNonce(uint64_t nonce, + const void *data, + size_t len) +{ + return hashStuff(nonce + 1, data, len); +} diff --git a/source/uds/nonce.h b/source/uds/nonce.h new file mode 100644 index 0000000..43f2054 --- /dev/null +++ b/source/uds/nonce.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/nonce.h#1 $ + */ + +#ifndef NONCE_H +#define NONCE_H + +#include "typeDefs.h" + +/** + * Create unique data for the master nonce, using system-specific + * methods such as the current time and a random number. + * + * @param buffer A buffer of length specified next. + * @param length Length of the buffer. + * + * @return the amount of the buffer that has been filled with unique data + **/ +size_t createUniqueNonceData(byte *buffer, size_t length); + +/** + * Generate a master nonce, using the specified data. + * + * @param data Some arbitrary information. + * @param len The length of the information. + * + * @return a number which will be fairly unique + **/ +uint64_t generateMasterNonce(const void *data, size_t len); + +/** + * Deterministically generate a secondary nonce based on an existing + * nonce and some arbitrary data. 
Effectively hashes the nonce and + * the data to produce a new nonce which is deterministic. + * + * @param nonce An existing nonce which is well known. + * @param data Some data related to the creation of this nonce. + * @param len The length of the data. + * + * @return a number which will be fairly unique and depend solely on + * the nonce and the data. + **/ +uint64_t generateSecondaryNonce(uint64_t nonce, + const void *data, + size_t len); + +#endif // NONCE_H diff --git a/source/uds/numeric.c b/source/uds/numeric.c new file mode 100644 index 0000000..4bc1e2d --- /dev/null +++ b/source/uds/numeric.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/numeric.c#2 $ + */ + +#include "numeric.h" +#include "permassert.h" + +#define STATIC_ASSERT_ALIGNOF(type, expectedAlignment) \ + STATIC_ASSERT(__alignof__(type) == (expectedAlignment)) + +/**********************************************************************/ +bool multiplyWouldOverflow(uint64_t a, uint64_t b) +{ + return b != 0 && a > UINT64_MAX / b; +} + +/**********************************************************************/ +void numericCompileTimeAssertions(void) +{ + STATIC_ASSERT_SIZEOF(uint64_t, 8); + STATIC_ASSERT_SIZEOF(uint32_t, 4); + STATIC_ASSERT_SIZEOF(uint16_t, 2); + + STATIC_ASSERT_SIZEOF(UNALIGNED_WRAPPER(uint64_t), 8); + STATIC_ASSERT_SIZEOF(UNALIGNED_WRAPPER(uint32_t), 4); + STATIC_ASSERT_SIZEOF(UNALIGNED_WRAPPER(uint16_t), 2); + + STATIC_ASSERT_ALIGNOF(UNALIGNED_WRAPPER(uint64_t), 1); + STATIC_ASSERT_ALIGNOF(UNALIGNED_WRAPPER(uint32_t), 1); + STATIC_ASSERT_ALIGNOF(UNALIGNED_WRAPPER(uint16_t), 1); +} diff --git a/source/uds/numeric.h b/source/uds/numeric.h new file mode 100644 index 0000000..06d7eee --- /dev/null +++ b/source/uds/numeric.h @@ -0,0 +1,721 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
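/*
 * [Editorial sketch, not part of the patch] One plausible way the nonce
 * helpers declared in nonce.h above fit together: gather unique bytes (time
 * plus a random value), hash them into a master nonce, then derive a
 * per-component nonce from it. The buffer size, the label string, and the
 * helper name exampleMakeNonces() are illustrative only.
 */
#include "nonce.h"

static uint64_t exampleMakeNonces(uint64_t *secondaryNonce)
{
  byte buffer[64];
  size_t length = createUniqueNonceData(buffer, sizeof(buffer));

  uint64_t masterNonce = generateMasterNonce(buffer, length);

  /* A secondary nonce depends only on the master nonce and the data. */
  const char label[] = "example component";
  *secondaryNonce = generateSecondaryNonce(masterNonce, label, sizeof(label));
  return masterNonce;
}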
+ * + * $Id: //eng/uds-releases/jasper/src/uds/numeric.h#2 $ + */ + +#ifndef NUMERIC_H +#define NUMERIC_H 1 + +#include "compiler.h" +#include "numericDefs.h" +#include "typeDefs.h" + +#if !defined(__ORDER_LITTLE_ENDIAN__) || !defined(__ORDER_BIG_ENDIAN__) \ + || !defined(__BYTE_ORDER__) +#error "GCC byte order macros not defined?" +#endif + +/* + * Define a type describing an integer value that is only byte-aligned + * and may explicitly alias other types. GCC keeps getting better + * about type-based alias analysis (both for optimization and for + * warnings), so simply casting a pointer to pointer-to-uintXX_t isn't + * good enough. + * + * C is okay with defining the structures directly in a cast, but + * C++ is not, and we use this header in some C++ code internally. + */ +#define UNALIGNED_WRAPPER(TYPE) \ + unaligned_wrap_##TYPE +#define UNALIGNED_WRAPPER_DEF(TYPE) \ + typedef struct __attribute__((packed, may_alias)) { TYPE value; } \ + UNALIGNED_WRAPPER(TYPE) +UNALIGNED_WRAPPER_DEF(int64_t); +UNALIGNED_WRAPPER_DEF(uint64_t); +UNALIGNED_WRAPPER_DEF(int32_t); +UNALIGNED_WRAPPER_DEF(uint32_t); +UNALIGNED_WRAPPER_DEF(uint16_t); + +#define GET_UNALIGNED(TYPE,ADDR) \ + (((const UNALIGNED_WRAPPER(TYPE) *)(ADDR))->value) +#define PUT_UNALIGNED(TYPE,ADDR,VALUE) \ + (((UNALIGNED_WRAPPER(TYPE) *)(ADDR))->value = (VALUE)) + +/** + * Find the minimum of two ints. + * + * @param a The first int + * @param b The second int + * + * @return The lesser of a and b + **/ +__attribute__((warn_unused_result)) +static INLINE int minInt(int a, int b) +{ + return ((a < b) ? a : b); +} + +/** + * Find the maximum of two ints. + * + * @param a The first int + * @param b The second int + * + * @return The greater of a and b + **/ +__attribute__((warn_unused_result)) +static INLINE int maxInt(int a, int b) +{ + return ((a > b) ? a : b); +} + +/** + * Find the maximum of two unsigned ints. + * + * @param a The first value + * @param b The second value + * + * @return The greater of a and b + **/ +__attribute__((warn_unused_result)) +static INLINE unsigned int maxUInt(unsigned int a, unsigned int b) +{ + return ((a > b) ? a : b); +} + +/** + * Find the maximum of two signed longs. + * + * @param a The first int + * @param b The second int + * + * @return The greater of a and b + **/ +__attribute__((warn_unused_result)) +static INLINE long maxLong(long a, long b) +{ + return ((a > b) ? a : b); +} + +/** + * Find the maximum of two unsigned longs. + * + * @param a The first int + * @param b The second int + * + * @return The greater of a and b + **/ +__attribute__((warn_unused_result)) +static INLINE unsigned long maxULong(unsigned long a, unsigned long b) +{ + return ((a > b) ? a : b); +} + +/** + * Find the minimum of two size_ts. + * + * @param a The first size_t + * @param b The second size_t + * + * @return The lesser of a and b + **/ +__attribute__((warn_unused_result)) +static INLINE size_t minSizeT(size_t a, size_t b) +{ + return ((a < b) ? a : b); +} + +/** + * Find the maximum of two size_ts. + * + * @param a The first size_t + * @param b The second size_t + * + * @return The greater of a and b + **/ +__attribute__((warn_unused_result)) +static INLINE size_t maxSizeT(size_t a, size_t b) +{ + return ((a > b) ? a : b); +} + +/** + * Find the minimum of two uint64_ts. + * + * @param a The first uint64_t + * @param b The second uint64_t + * + * @return The lesser of a and b + **/ +__attribute__((warn_unused_result)) +static INLINE uint64_t minUInt64(uint64_t a, uint64_t b) +{ + return ((a < b) ? 
a : b); +} + +/** + * Multiply two uint64_t and check for overflow. Does division. + **/ +bool multiplyWouldOverflow(uint64_t a, uint64_t b); + +/** + * Extract a 64 bit unsigned number from a buffer stored in + * big-endian representation. + * + * @param data The buffer from which to extract the number + * + * @return The extracted quantity + **/ +__attribute__((warn_unused_result)) +static INLINE uint64_t getUInt64BE(const byte* data) +{ + uint64_t num = GET_UNALIGNED(uint64_t, data); +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap64(num); +#endif + return num; +} + +/** + * Extract a 64 bit unsigned big-endian number from a buffer at a + * specified offset. The offset will be advanced to the first byte + * after the number. + * + * @param buffer The buffer from which to extract the number + * @param offset A pointer to the offset into the buffer at which to extract + * @param decoded A pointer to hold the extracted number + **/ +static INLINE void decodeUInt64BE(const byte *buffer, + size_t *offset, + uint64_t *decoded) +{ + *decoded = getUInt64BE(buffer + *offset); + *offset += sizeof(uint64_t); +} + +/** + * Store a 64 bit unsigned number in a buffer in + * big-endian representation. + * + * @param data The buffer in which to store the number + * @param num The number to store + **/ +static INLINE void storeUInt64BE(byte* data, uint64_t num) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap64(num); +#endif + PUT_UNALIGNED(uint64_t, data, num); +} + +/** + * Encode a 64 bit unsigned number into a buffer at a given offset + * using a big-endian representation. The offset will be advanced to + * first byte after the encoded number. + * + * @param data The buffer to encode into + * @param offset A pointer to the offset at which to start encoding + * @param toEncode The number to encode + **/ +static INLINE void encodeUInt64BE(byte *data, + size_t *offset, + uint64_t toEncode) +{ + storeUInt64BE(data + *offset, toEncode); + *offset += sizeof(uint64_t); +} + +/** + * Extract a 32 bit unsigned number from a buffer stored in big-endian + * representation. + * + * @param data The buffer from which to extract the number + * + * @return The extracted quantity + **/ +__attribute__((warn_unused_result)) +static INLINE uint32_t getUInt32BE(const byte* data) +{ + uint32_t num = GET_UNALIGNED(uint32_t, data); +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap32(num); +#endif + return num; +} + +/** + * Extract a 32 bit unsigned big-endian number from a buffer at a + * specified offset. The offset will be advanced to the first byte + * after the number. + * + * @param buffer The buffer from which to extract the number + * @param offset A pointer to the offset into the buffer at which to extract + * @param decoded A pointer to hold the extracted number + **/ +static INLINE void decodeUInt32BE(const byte *buffer, + size_t *offset, + uint32_t *decoded) +{ + *decoded = getUInt32BE(buffer + *offset); + *offset += sizeof(uint32_t); +} + +/** + * Store a 32 bit number in a buffer in + * big-endian representation. + * + * @param data The buffer in which to store the number + * @param num The number to store + **/ +static INLINE void storeUInt32BE(byte* data, uint32_t num) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap32(num); +#endif + PUT_UNALIGNED(uint32_t, data, num); +} + +/** + * Encode a 32 bit number into a buffer at a given offset using a + * big-endian representation. 
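/*
 * [Editorial sketch, not part of the patch] multiplyWouldOverflow(), declared
 * earlier in this header, tests whether a * b would exceed UINT64_MAX without
 * performing the multiplication (numeric.c implements it with a single
 * division, which is what the "Does division" note refers to). A typical
 * guard before computing a byte count might look like this; the function name
 * is hypothetical.
 */
static INLINE bool exampleComputeBytes(uint64_t count, uint64_t itemSize,
                                       uint64_t *bytes)
{
  if (multiplyWouldOverflow(count, itemSize)) {
    return false;                /* caller must treat the size as invalid */
  }
  *bytes = count * itemSize;     /* safe: the product cannot wrap around  */
  return true;
}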
The offset will be advanced to first byte + * after the encoded number. + * + * @param data The buffer to encode into + * @param offset A pointer to the offset at which to start encoding + * @param toEncode The number to encode + **/ +static INLINE void encodeUInt32BE(byte *data, + size_t *offset, + uint32_t toEncode) +{ + storeUInt32BE(data + *offset, toEncode); + *offset += sizeof(uint32_t); +} + +/** + * Extract a 16 bit number from a buffer stored in + * big-endian representation. + * + * @param data The buffer from which to extract the number + * + * @return The extracted quantity + **/ +__attribute__((warn_unused_result)) +static INLINE uint16_t getUInt16BE(const byte* data) +{ + uint16_t num = GET_UNALIGNED(uint16_t, data); +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + num = bswap_16(num); +#endif + return num; +} + +/** + * Extract a 16 bit, big-endian number from a buffer at a specified offset. + * The offset will be advanced to the first byte after the number. + * + * @param buffer The buffer from which to extract the number + * @param offset A pointer to the offset into the buffer at which to + * extract + * @param decoded A pointer to hold the extracted number + **/ +static INLINE void decodeUInt16BE(const byte *buffer, + size_t *offset, + uint16_t *decoded) +{ + *decoded = getUInt16BE(buffer + *offset); + *offset += sizeof(uint16_t); +} + +/** + * Store a 16 bit number in a buffer in + * big-endian representation. + * + * @param data The buffer in which to store the number + * @param num The number to store + **/ +static INLINE void storeUInt16BE(byte* data, uint16_t num) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + num = bswap_16(num); +#endif + PUT_UNALIGNED(uint16_t, data, num); +} + +/** + * Encode a 16 bit number into a buffer at a given offset using a + * big-endian representation. The offset will be advanced to first byte + * after the encoded number. + * + * @param data The buffer to encode into + * @param offset A pointer to the offset at which to start encoding + * @param toEncode The number to encode + **/ +static INLINE void encodeUInt16BE(byte *data, + size_t *offset, + uint16_t toEncode) +{ + storeUInt16BE(data + *offset, toEncode); + *offset += sizeof(uint16_t); +} + +/** + * Extract a 64 bit signed number from a buffer stored in + * little-endian representation. + * + * @param data The buffer from which to extract the number + * + * @return The extracted quantity + **/ +__attribute__((warn_unused_result)) +static INLINE int64_t getInt64LE(const byte* data) +{ + int64_t num = GET_UNALIGNED(int64_t, data); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap64(num); +#endif + return num; +} + +/** + * Extract a 64 bit signed little-endian number from a buffer at a + * specified offset. The offset will be advanced to the first byte + * after the number. + * + * @param buffer The buffer from which to extract the number + * @param offset A pointer to the offset into the buffer at which to extract + * @param decoded A pointer to hold the extracted number + **/ +static INLINE void decodeInt64LE(const byte *buffer, + size_t *offset, + int64_t *decoded) +{ + *decoded = getInt64LE(buffer + *offset); + *offset += sizeof(int64_t); +} + +/** + * Store a signed 64 bit number in a buffer in little-endian + * representation. 
+ * + * @param data The buffer in which to store the number + * @param num The number to store + **/ +static INLINE void storeInt64LE(byte* data, int64_t num) +{ +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap64(num); +#endif + PUT_UNALIGNED(int64_t, data, num); +} + +/** + * Encode a 64 bit signed number into a buffer at a given offset using + * a little-endian representation. The offset will be advanced to + * first byte after the encoded number. + * + * @param data The buffer to encode into + * @param offset A pointer to the offset at which to start encoding + * @param toEncode The number to encode + **/ +static INLINE void encodeInt64LE(byte *data, + size_t *offset, + int64_t toEncode) +{ + storeInt64LE(data + *offset, toEncode); + *offset += sizeof(int64_t); +} + +/** + * Extract a 64 bit number from a buffer stored in + * little-endian representation. + * + * @param data The buffer from which to extract the number + * + * @return The extracted quantity + **/ +__attribute__((warn_unused_result)) +static INLINE uint64_t getUInt64LE(const byte* data) +{ + uint64_t num = GET_UNALIGNED(uint64_t, data); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap64(num); +#endif + return num; +} + +/** + * Extract a 64 bit unsigned little-endian number from a buffer at a + * specified offset. The offset will be advanced to the first byte + * after the number. + * + * @param buffer The buffer from which to extract the number + * @param offset A pointer to the offset into the buffer at which to extract + * @param decoded A pointer to hold the extracted number + **/ +static INLINE void decodeUInt64LE(const byte *buffer, + size_t *offset, + uint64_t *decoded) +{ + *decoded = getUInt64LE(buffer + *offset); + *offset += sizeof(uint64_t); +} + +/** + * Store a 64 bit unsigned number in a buffer in little-endian + * representation. + * + * @param data The buffer in which to store the number + * @param num The number to store + **/ +static INLINE void storeUInt64LE(byte* data, uint64_t num) +{ +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap64(num); +#endif + PUT_UNALIGNED(uint64_t, data, num); +} + +/** + * Encode a 64 bit unsigned number into a buffer at a given offset + * using a little-endian representation. The offset will be advanced + * to first byte after the encoded number. + * + * @param data The buffer to encode into + * @param offset A pointer to the offset at which to start encoding + * @param toEncode The number to encode + **/ +static INLINE void encodeUInt64LE(byte *data, + size_t *offset, + uint64_t toEncode) +{ + storeUInt64LE(data + *offset, toEncode); + *offset += sizeof(uint64_t); +} + +/** + * Extract a 32 bit signed number from a buffer stored in + * little-endian representation. + * + * @param data The buffer from which to extract the number + * + * @return The extracted quantity + **/ +__attribute__((warn_unused_result)) +static INLINE int32_t getInt32LE(const byte* data) +{ + int32_t num = GET_UNALIGNED(int32_t, data); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap32(num); +#endif + return num; +} + +/** + * Extract a 32 bit signed little-endian number from a buffer at a + * specified offset. The offset will be advanced to the first byte + * after the number. 
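/*
 * [Editorial sketch, not part of the patch] The encode/decode helpers defined
 * above all advance a caller-owned offset, so serializing a small record is
 * just a sequence of calls against one buffer. The two-field layout shown
 * here and the function name are hypothetical.
 */
static INLINE void exampleRoundTrip(void)
{
  byte buffer[sizeof(uint64_t) + sizeof(uint32_t)];

  size_t offset = 0;
  encodeUInt64BE(buffer, &offset, 0x1122334455667788ULL);  /* bytes 0..7  */
  encodeUInt32BE(buffer, &offset, 42);                     /* bytes 8..11 */

  uint64_t nonce;
  uint32_t version;
  offset = 0;
  decodeUInt64BE(buffer, &offset, &nonce);    /* offset advances to 8  */
  decodeUInt32BE(buffer, &offset, &version);  /* offset advances to 12 */
}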
+ * + * @param buffer The buffer from which to extract the number + * @param offset A pointer to the offset into the buffer at which to extract + * @param decoded A pointer to hold the extracted number + **/ +static INLINE void decodeInt32LE(const byte *buffer, + size_t *offset, + int32_t *decoded) +{ + *decoded = getInt32LE(buffer + *offset); + *offset += sizeof(int32_t); +} + +/** + * Store a signed 32 bit number in a buffer in little-endian + * representation. + * + * @param data The buffer in which to store the number + * @param num The number to store + **/ +static INLINE void storeInt32LE(byte* data, int32_t num) +{ +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap32(num); +#endif + PUT_UNALIGNED(int32_t, data, num); +} + +/** + * Encode a 32 bit signed number into a buffer at a given offset using + * a little-endian representation. The offset will be advanced to + * first byte after the encoded number. + * + * @param data The buffer to encode into + * @param offset A pointer to the offset at which to start encoding + * @param toEncode The number to encode + **/ +static INLINE void encodeInt32LE(byte *data, + size_t *offset, + int32_t toEncode) +{ + storeInt32LE(data + *offset, toEncode); + *offset += sizeof(int32_t); +} + +/** + * Extract a 32 bit unsigned number from a buffer stored in + * little-endian representation. + + * + * @param data The buffer from which to extract the number + * + * @return The extracted quantity + **/ +__attribute__((warn_unused_result)) +static INLINE uint32_t getUInt32LE(const byte* data) +{ + uint32_t num = GET_UNALIGNED(uint32_t, data); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap32(num); +#endif + return num; +} + +/** + * Extract a 32 bit unsigned little-endian number from a buffer at a + * specified offset. The offset will be advanced to the first byte + * after the number. + * + * @param buffer The buffer from which to extract the number + * @param offset A pointer to the offset into the buffer at which to extract + * @param decoded A pointer to hold the extracted number + **/ +static INLINE void decodeUInt32LE(const byte *buffer, + size_t *offset, + uint32_t *decoded) +{ + *decoded = getUInt32LE(buffer + *offset); + *offset += sizeof(uint32_t); +} + +/** + * Store a 32 bit unsigned number in a buffer in little-endian + * representation. + * + * @param data The buffer in which to store the number + * @param num The number to store + **/ +static INLINE void storeUInt32LE(byte* data, uint32_t num) +{ +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = __builtin_bswap32(num); +#endif + PUT_UNALIGNED(uint32_t, data, num); +} + +/** + * Encode a 32 bit unsigned number into a buffer at a given offset + * using a little-endian representation. The offset will be advanced + * to first byte after the encoded number. + * + * @param data The buffer to encode into + * @param offset A pointer to the offset at which to start encoding + * @param toEncode The number to encode + **/ +static INLINE void encodeUInt32LE(byte *data, + size_t *offset, + uint32_t toEncode) +{ + storeUInt32LE(data + *offset, toEncode); + *offset += sizeof(uint32_t); +} + +/** + * Extract a 16 bit number from a buffer stored in + * little-endian representation. 
+ * + * @param data The buffer from which to extract the number + * + * @return The extracted quantity + **/ +__attribute__((warn_unused_result)) +static INLINE uint16_t getUInt16LE(const byte* data) +{ + uint16_t num = GET_UNALIGNED(uint16_t, data); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = bswap_16(num); +#endif + return num; +} + +/** + * Extract a 16 bit unsigned little-endian number from a buffer at a + * specified offset. The offset will be advanced to the first byte + * after the number. + * + * @param buffer The buffer from which to extract the number + * @param offset A pointer to the offset into the buffer at which to + * extract + * @param decoded A pointer to hold the extracted number + **/ +static INLINE void decodeUInt16LE(const byte *buffer, + size_t *offset, + uint16_t *decoded) +{ + *decoded = getUInt16LE(buffer + *offset); + *offset += sizeof(uint16_t); +} + +/** + * Store a 16 bit number in a buffer in little-endian representation. + * + * @param data The buffer in which to store the number + * @param num The number to store + **/ +static INLINE void storeUInt16LE(byte* data, uint16_t num) +{ +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + num = bswap_16(num); +#endif + PUT_UNALIGNED(uint16_t, data, num); +} + +/** + * Encode a 16 bit unsigned number into a buffer at a given offset + * using a little-endian representation. The offset will be advanced + * to first byte after the encoded number. + * + * @param data The buffer to encode into + * @param offset A pointer to the offset at which to start encoding + * @param toEncode The number to encode + **/ +static INLINE void encodeUInt16LE(byte *data, + size_t *offset, + uint16_t toEncode) +{ + storeUInt16LE(data + *offset, toEncode); + *offset += sizeof(uint16_t); +} + +/** + * Special function wrapper required for compile-time assertions. This + * function will fail to compile if any of the uint*_t types are not of the + * size we expect. This function should never be called. + **/ +void numericCompileTimeAssertions(void); + +#endif /* NUMERIC_H */ diff --git a/source/uds/numericDefs.h b/source/uds/numericDefs.h new file mode 100644 index 0000000..c8795a1 --- /dev/null +++ b/source/uds/numericDefs.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/numericDefs.h#1 $ + */ + +#ifndef LINUX_KERNEL_NUMERIC_DEFS_H +#define LINUX_KERNEL_NUMERIC_DEFS_H 1 + +#ifdef __x86_64__ +/* + * __builtin_bswap16 should work fine here too, but check for a + * performance impact before changing it, just to be safe. 
+ */ +#define bswap_16(x) \ + (__extension__ \ + ({ register unsigned short int __v, __x = (unsigned short int) (x); \ + __asm__ ("rorw $8, %w0" \ + : "=r" (__v) \ + : "0" (__x) \ + : "cc"); \ + __v; })) +#else +#define bswap_16(x) __builtin_bswap16(x) +#endif + +#endif /* LINUX_KERNEL_NUMERIC_DEFS_H */ diff --git a/source/uds/opaqueTypes.h b/source/uds/opaqueTypes.h new file mode 100644 index 0000000..478631a --- /dev/null +++ b/source/uds/opaqueTypes.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/opaqueTypes.h#3 $ + */ + +#ifndef OPAQUE_TYPES_H +#define OPAQUE_TYPES_H + +/* + * This file contains typedefs of structures internal to the UDS library + * for which many users of those structures do need to know the details + * of the structures themselves. + */ +typedef struct indexRouter IndexRouter; +typedef struct internalRequest Request; +typedef struct requestQueue RequestQueue; + +#endif /* OPAQUE_TYPES_H */ diff --git a/source/uds/openChapter.c b/source/uds/openChapter.c new file mode 100644 index 0000000..7a8a613 --- /dev/null +++ b/source/uds/openChapter.c @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/openChapter.c#4 $ + */ + +#include "openChapter.h" + +#include "compiler.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" +#include "zone.h" + +static int readOpenChapters(ReadPortal *portal); +static int writeOpenChapters(IndexComponent *component, + BufferedWriter *writer, + unsigned int zone); + +const IndexComponentInfo OPEN_CHAPTER_INFO = { + .kind = RL_KIND_OPEN_CHAPTER, + .name = "open chapter", + .saveOnly = true, + .chapterSync = false, + .multiZone = false, + .ioStorage = true, + .loader = readOpenChapters, + .saver = writeOpenChapters, + .incremental = NULL, +}; + +static const byte OPEN_CHAPTER_MAGIC[] = "ALBOC"; +static const byte OPEN_CHAPTER_VERSION[] = "02.00"; + +enum { + OPEN_CHAPTER_MAGIC_LENGTH = sizeof(OPEN_CHAPTER_MAGIC) - 1, + OPEN_CHAPTER_VERSION_LENGTH = sizeof(OPEN_CHAPTER_VERSION) - 1 +}; + +/**********************************************************************/ +static int fillDeltaChapterIndex(OpenChapterZone **chapterZones, + unsigned int zoneCount, + OpenChapterIndex *index, + UdsChunkRecord *collatedRecords) +{ + // Find a record to replace any deleted records, and fill the chapter if + // it was closed early. The last record in any filled zone is guaranteed + // to not have been deleted in this chapter, so use one of those. + OpenChapterZone *fillChapterZone = NULL; + UdsChunkRecord *fillRecord = NULL; + unsigned int z; + for (z = 0; z < zoneCount; ++z) { + fillChapterZone = chapterZones[z]; + if (fillChapterZone->size == fillChapterZone->capacity) { + fillRecord = &fillChapterZone->records[fillChapterZone->size]; + break; + } + } + int result = ASSERT((fillRecord != NULL), + "some open chapter zone filled"); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT(!fillChapterZone->slots[fillChapterZone->size].recordDeleted, + "chapter fill record not deleted"); + if (result != UDS_SUCCESS) { + return result; + } + + const Geometry *geometry = index->geometry; + unsigned int pagesPerChapter = geometry->recordPagesPerChapter; + unsigned int recordsPerPage = geometry->recordsPerPage; + int overflowCount = 0; + unsigned int recordsAdded = 0; + unsigned int zone = 0; + + unsigned int page; + for (page = 0; page < pagesPerChapter; page++) { + unsigned int i; + for (i = 0; + i < recordsPerPage; + i++, recordsAdded++, zone = (zone + 1) % zoneCount) { + + // The record arrays are 1-based. + unsigned int recordNumber = 1 + (recordsAdded / zoneCount); + + // If the zone has been exhausted, or the record was deleted, + // add the fill record to the chapter. 
+ if (recordNumber > chapterZones[zone]->size + || chapterZones[zone]->slots[recordNumber].recordDeleted) { + collatedRecords[1 + recordsAdded] = *fillRecord; + continue; + } + + UdsChunkRecord *nextRecord = &chapterZones[zone]->records[recordNumber]; + collatedRecords[1 + recordsAdded] = *nextRecord; + + int result = putOpenChapterIndexRecord(index, &nextRecord->name, page); + switch (result) { + case UDS_SUCCESS: + break; + case UDS_OVERFLOW: + overflowCount++; + break; + default: + logErrorWithStringError(result, "failed to build open chapter index"); + return result; + } + } + } + if (overflowCount > 0) { + logWarning("Failed to add %d entries to chapter index", overflowCount); + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int closeOpenChapter(OpenChapterZone **chapterZones, + unsigned int zoneCount, + Volume *volume, + OpenChapterIndex *chapterIndex, + UdsChunkRecord *collatedRecords, + uint64_t virtualChapterNumber) +{ + // Empty the delta chapter index, and prepare it for the new virtual chapter. + emptyOpenChapterIndex(chapterIndex, virtualChapterNumber); + + // Map each non-deleted record name to its record page number in the delta + // chapter index. + int result = fillDeltaChapterIndex(chapterZones, zoneCount, chapterIndex, + collatedRecords); + if (result != UDS_SUCCESS) { + return result; + } + + // Pass the populated chapter index and the records to the volume, which + // will generate and write the index and record pages for the chapter. + return writeChapter(volume, chapterIndex, collatedRecords); +} + +/**********************************************************************/ +int saveOpenChapters(Index *index, BufferedWriter *writer) +{ + int result = writeToBufferedWriter(writer, OPEN_CHAPTER_MAGIC, + OPEN_CHAPTER_MAGIC_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + + result = writeToBufferedWriter(writer, OPEN_CHAPTER_VERSION, + OPEN_CHAPTER_VERSION_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + + uint32_t totalRecords = 0; + unsigned int i; + for (i = 0; i < index->zoneCount; i++) { + totalRecords += openChapterSize(index->zones[i]->openChapter); + } + + // Store the record count in little-endian order. + byte totalRecordData[sizeof(totalRecords)]; + storeUInt32LE(totalRecordData, totalRecords); + + result = writeToBufferedWriter(writer, totalRecordData, + sizeof(totalRecordData)); + if (result != UDS_SUCCESS) { + return result; + } + + // Only write out the records that have been added and not deleted. 
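  /*
   * [Editorial note, not part of the patch] As the writes above imply, the
   * saved open chapter image is laid out as: 5 bytes of magic ("ALBOC"),
   * 5 bytes of version ("02.00"), a 4-byte little-endian record count, and
   * then the non-deleted records themselves, interleaved across zones by the
   * loop below. computeSavedOpenChapterSize() later in this file sizes the
   * same layout.
   */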
+ uint32_t recordsAdded = 0; + unsigned int recordIndex = 1; + while(recordsAdded < totalRecords) { + unsigned int i; + for (i = 0; i < index->zoneCount; i++) { + if (recordIndex > index->zones[i]->openChapter->size) { + continue; + } + if (index->zones[i]->openChapter->slots[recordIndex].recordDeleted) { + continue; + } + UdsChunkRecord *record + = &index->zones[i]->openChapter->records[recordIndex]; + result = writeToBufferedWriter(writer, record, sizeof(UdsChunkRecord)); + if (result != UDS_SUCCESS) { + return result; + } + recordsAdded++; + } + recordIndex++; + } + + return flushBufferedWriter(writer); +} + +/**********************************************************************/ +uint64_t computeSavedOpenChapterSize(Geometry *geometry) +{ + return OPEN_CHAPTER_MAGIC_LENGTH + OPEN_CHAPTER_VERSION_LENGTH + + sizeof(uint32_t) + geometry->recordsPerChapter * sizeof(UdsChunkRecord); +} + +/**********************************************************************/ +static int writeOpenChapters(IndexComponent *component, + BufferedWriter *writer, + unsigned int zone) +{ + int result = ASSERT((zone == 0), "open chapter write not zoned"); + if (result != UDS_SUCCESS) { + return result; + } + + Index *index = indexComponentData(component); + return saveOpenChapters(index, writer); +} + +/** + * Read the version field from a buffered reader, checking whether it is a + * supported version. Returns (via a pointer parameter) the matching + * version constant, which can be used by comparing to the version + * constants using simple pointer equality. + * + * @param [in] reader A buffered reader. + * @param [out] version The version constant that was matched. + * + * @return UDS_SUCCESS or an error code if the file could not be read or + * the version is invalid or unsupported + **/ +static int readVersion(BufferedReader *reader, const byte **version) +{ + byte buffer[OPEN_CHAPTER_VERSION_LENGTH]; + int result = readFromBufferedReader(reader, buffer, sizeof(buffer)); + if (result != UDS_SUCCESS) { + return result; + } + if (memcmp(OPEN_CHAPTER_VERSION, buffer, sizeof(buffer)) != 0) { + return logErrorWithStringError(UDS_CORRUPT_COMPONENT, + "Invalid open chapter version: %.*s", + (int) sizeof(buffer), buffer); + } + *version = OPEN_CHAPTER_VERSION; + return UDS_SUCCESS; +} + +/**********************************************************************/ +static int loadVersion20(Index *index, BufferedReader *reader) +{ + byte numRecordsData[sizeof(uint32_t)]; + int result + = readFromBufferedReader(reader, &numRecordsData, sizeof(numRecordsData)); + if (result != UDS_SUCCESS) { + return result; + } + uint32_t numRecords = getUInt32LE(numRecordsData); + + // Keep track of which zones cannot accept any more records. + bool fullFlags[MAX_ZONES] = { false, }; + + // Assign records to the correct zones. + UdsChunkRecord record; + uint32_t records; + for (records = 0; records < numRecords; records++) { + result = readFromBufferedReader(reader, &record, sizeof(UdsChunkRecord)); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int zone = 0; + if (index->zoneCount > 1) { + // A read-only index has no master index, but it also has only one zone. + zone = getMasterIndexZone(index->masterIndex, &record.name); + } + // Add records until the open chapter zone almost runs out of space. + // The chapter can't be closed here, so don't add the last record. 
+ if (!fullFlags[zone]) { + unsigned int remaining; + result = putOpenChapter(index->zones[zone]->openChapter, + &record.name, &record.data, &remaining); + fullFlags[zone] = (remaining <= 1); + if (result != UDS_SUCCESS) { + return result; + } + } + } + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int loadOpenChapters(Index *index, BufferedReader *reader) +{ + // Read and check the magic number. + int result = + verifyBufferedData(reader, OPEN_CHAPTER_MAGIC, OPEN_CHAPTER_MAGIC_LENGTH); + if (result != UDS_SUCCESS) { + return result; + } + + // Read and check the version. + const byte *version = NULL; + result = readVersion(reader, &version); + if (result != UDS_SUCCESS) { + return result; + } + + return loadVersion20(index, reader); +} + +/**********************************************************************/ +int readOpenChapters(ReadPortal *portal) +{ + Index *index = indexComponentData(portal->component); + + BufferedReader *reader; + int result = getBufferedReaderForPortal(portal, 0, &reader); + if (result != UDS_SUCCESS) { + return result; + } + return loadOpenChapters(index, reader); +} diff --git a/source/uds/openChapter.h b/source/uds/openChapter.h new file mode 100644 index 0000000..381badd --- /dev/null +++ b/source/uds/openChapter.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/openChapter.h#1 $ + */ + +#ifndef OPENCHAPTER_H +#define OPENCHAPTER_H 1 + +#include "common.h" +#include "geometry.h" +#include "index.h" +#include "indexComponent.h" + +extern const IndexComponentInfo OPEN_CHAPTER_INFO; + +/** + * OpenChapter handles writing the open chapter records to the volume. It also + * manages the open chapter index component, and all the tools to generate and + * parse the open chapter file. The open chapter file interleaves records from + * each openChapterZone structure. + * + *
Once each open chapter zone is filled, the records are interleaved to + * preserve temporal locality, the index pages are generated through a + * delta chapter index, and the record pages are derived by sorting each + * page-sized batch of records by their names. + * + *
Upon index shutdown, the open chapter zone records are again + * interleaved, and the records are stored as a single array. The hash + * slots are not preserved, since the records may be reassigned to new + * zones at load time. + **/ + +/** + * Close the open chapter and write it to disk. + * + * @param chapterZones The zones of the chapter to close + * @param zoneCount The number of zones + * @param volume The volume to which to write the chapter + * @param chapterIndex The OpenChapterIndex to use while writing + * @param collatedRecords Collated records array to use while writing + * @param virtualChapterNumber The virtual chapter number of the open chapter + * + * @return UDS_SUCCESS or an error code + **/ +int closeOpenChapter(OpenChapterZone **chapterZones, + unsigned int zoneCount, + Volume *volume, + OpenChapterIndex *chapterIndex, + UdsChunkRecord *collatedRecords, + uint64_t virtualChapterNumber) + __attribute__((warn_unused_result)); + +/** + * Write out a partially filled chapter to a file. + * + * @param index the index to save the data from + * @param writer the writer to write out the chapters + * + * @return UDS_SUCCESS on success + **/ +int saveOpenChapters(Index *index, BufferedWriter *writer) + __attribute__((warn_unused_result)); + +/** + * Read a partially filled chapter from a file. + * + * @param index the index to load the data into + * @param reader the buffered reader to read from + * + * @return UDS_SUCCESS on success + **/ +int loadOpenChapters(Index *index, BufferedReader *reader) + __attribute__((warn_unused_result)); + +/** + * Compute the size of the maximum open chapter save image. + * + * @param geometry the index geometry + * + * @return the number of bytes of the largest possible open chapter save + * image + **/ +uint64_t computeSavedOpenChapterSize(Geometry *geometry); + +#endif /* OPENCHAPTER_H */ diff --git a/source/uds/openChapterZone.c b/source/uds/openChapterZone.c new file mode 100644 index 0000000..f346409 --- /dev/null +++ b/source/uds/openChapterZone.c @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/openChapterZone.c#2 $ + */ + +#include "openChapterZone.h" + +#include "compiler.h" +#include "hashUtils.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +/**********************************************************************/ +static INLINE size_t recordsSize(const OpenChapterZone *openChapter) +{ + return (sizeof(UdsChunkRecord) * (1 + openChapter->capacity)); +} + +/**********************************************************************/ +static INLINE size_t slotsSize(size_t slotCount) +{ + return (sizeof(Slot) * slotCount); +} + +/** + * Round up to the first power of two greater than or equal + * to the supplied number. 
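/*
 * [Editorial note, not part of the patch] For example, nextPowerOfTwo(5) == 8
 * and nextPowerOfTwo(8) == 8. makeOpenChapter() below uses this to round
 * capacity * openChapterLoadRatio up to a power of two, which (per the
 * comment there) guarantees that hash insertion never fails while the table
 * is not full.
 */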
+ * + * @param val the number to round up + * + * @return the first power of two not smaller than val for any + * val <= 2^63 + **/ +static INLINE size_t nextPowerOfTwo(size_t val) +{ + if (val == 0) { + return 1; + } + return (1 << computeBits(val - 1)); +} + +/**********************************************************************/ +int makeOpenChapter(const Geometry *geometry, + unsigned int zoneCount, + OpenChapterZone **openChapterPtr) +{ + int result = ASSERT(zoneCount > 0, "zone count must be > 0"); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_WITH_ERROR_CODE(geometry->openChapterLoadRatio > 1, + UDS_BAD_STATE, + "Open chapter hash table is too small"); + if (result != UDS_SUCCESS) { + return result; + } + result = ASSERT_WITH_ERROR_CODE((geometry->recordsPerChapter + <= OPEN_CHAPTER_MAX_RECORD_NUMBER), + UDS_BAD_STATE, + "Too many records (%u) for a single chapter", + geometry->recordsPerChapter); + if (result != UDS_SUCCESS) { + return result; + } + + if (geometry->recordsPerChapter < zoneCount) { + return logUnrecoverable( + UDS_INVALID_ARGUMENT, + "zone count: %u is larger than the records per chapter %u", + zoneCount, geometry->recordsPerChapter); + } + size_t capacity = geometry->recordsPerChapter / zoneCount; + + // The slot count must be at least one greater than the capacity. + // Using a power of two slot count guarantees that hash insertion + // will never fail if the hash table is not full. + size_t slotCount = nextPowerOfTwo(capacity * geometry->openChapterLoadRatio); + OpenChapterZone *openChapter; + result = ALLOCATE_EXTENDED(OpenChapterZone, slotCount, Slot, + "open chapter", &openChapter); + if (result != UDS_SUCCESS) { + return result; + } + openChapter->slotCount = slotCount; + openChapter->capacity = capacity; + result = allocateCacheAligned(recordsSize(openChapter), "record pages", + &openChapter->records); + if (result != UDS_SUCCESS) { + freeOpenChapter(openChapter); + return result; + } + + *openChapterPtr = openChapter; + return UDS_SUCCESS; +} + +/**********************************************************************/ +size_t openChapterSize(const OpenChapterZone *openChapter) +{ + return openChapter->size - openChapter->deleted; +} + +/**********************************************************************/ +void resetOpenChapter(OpenChapterZone *openChapter) +{ + openChapter->size = 0; + openChapter->deleted = 0; + + memset(openChapter->records, 0, recordsSize(openChapter)); + memset(openChapter->slots, 0, slotsSize(openChapter->slotCount)); +} + +/**********************************************************************/ +static UdsChunkRecord *probeChapterSlots(OpenChapterZone *openChapter, + const UdsChunkName *name, + unsigned int *slotPtr, + unsigned int *recordNumberPtr) +{ + unsigned int slots = openChapter->slotCount; + unsigned int probe = nameToHashSlot(name, slots); + unsigned int firstSlot = 0; + + UdsChunkRecord *record; + unsigned int probeSlot; + unsigned int recordNumber; + unsigned int probeAttempts; + + for (probeAttempts = 1; ; ++probeAttempts) { + probeSlot = firstSlot + probe; + recordNumber = openChapter->slots[probeSlot].recordNumber; + + // If the hash slot is empty, we've reached the end of a chain without + // finding the record and should terminate the search. + if (recordNumber == 0) { + record = NULL; + break; + } + + // If the name of the record referenced by the slot matches and has not + // been deleted, then we've found the requested name. 
+ record = &openChapter->records[recordNumber]; + if ((memcmp(&record->name, name, UDS_CHUNK_NAME_SIZE) == 0) + && !openChapter->slots[recordNumber].recordDeleted) { + break; + } + + // Quadratic probing: advance the probe by 1, 2, 3, etc. and try again. + // This performs better than linear probing and works best for 2^N slots. + probe += probeAttempts; + if (probe >= slots) { + probe = probe % slots; + } + } + + // These NULL checks will be optimized away in callers who don't care about + // the values when this function is inlined. + if (slotPtr != NULL) { + *slotPtr = probeSlot; + } + if (recordNumberPtr != NULL) { + *recordNumberPtr = recordNumber; + } + + return record; +} + +/**********************************************************************/ +void searchOpenChapter(OpenChapterZone *openChapter, + const UdsChunkName *name, + UdsChunkData *metadata, + bool *found) +{ + UdsChunkRecord *record = probeChapterSlots(openChapter, name, NULL, NULL); + + if (record == NULL) { + *found = false; + } else { + *found = true; + if (metadata != NULL) { + *metadata = record->data; + } + } +} + +/**********************************************************************/ +int putOpenChapter(OpenChapterZone *openChapter, + const UdsChunkName *name, + const UdsChunkData *metadata, + unsigned int *remaining) +{ + unsigned int slot; + UdsChunkRecord *record = probeChapterSlots(openChapter, name, &slot, NULL); + + if (record != NULL) { + record->data = *metadata; + *remaining = openChapter->capacity - openChapter->size; + return UDS_SUCCESS; + } + + if (openChapter->size >= openChapter->capacity) { + return makeUnrecoverable(UDS_VOLUME_OVERFLOW); + } + + unsigned int recordNumber = ++openChapter->size; + openChapter->slots[slot].recordNumber = recordNumber; + record = &openChapter->records[recordNumber]; + record->name = *name; + record->data = *metadata; + + *remaining = openChapter->capacity - openChapter->size; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void removeFromOpenChapter(OpenChapterZone *openChapter, + const UdsChunkName *name, + bool *removed) +{ + unsigned int recordNumber; + UdsChunkRecord *record + = probeChapterSlots(openChapter, name, NULL, &recordNumber); + + if (record == NULL) { + *removed = false; + return; + } + + // Set the deleted flag on the recordNumber in the slot array so search + // won't find it and close won't index it. + openChapter->slots[recordNumber].recordDeleted = true; + openChapter->deleted += 1; + *removed = true; +} + +/**********************************************************************/ +void freeOpenChapter(OpenChapterZone *openChapter) +{ + if (openChapter != NULL) { + FREE(openChapter->records); + FREE(openChapter); + } +} diff --git a/source/uds/openChapterZone.h b/source/uds/openChapterZone.h new file mode 100644 index 0000000..cecee4b --- /dev/null +++ b/source/uds/openChapterZone.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/openChapterZone.h#1 $ + */ + +#ifndef OPEN_CHAPTER_ZONE_H +#define OPEN_CHAPTER_ZONE_H 1 + +#include "common.h" +#include "geometry.h" +#include "typeDefs.h" + +/** + * OpenChapterZone is the mutable, in-memory representation of one zone's + * section of an Albireo index chapter. + * + *
In addition to providing the same access to records as an on-disk + * chapter, the open chapter zone must allow records to be added or + * modified. It must provide a way to generate the on-disk representation + * without excessive work. It does that by accumulating records in the order + * they are added (maintaining temporal locality), and referencing them (as + * record numbers) from hash slots selected from the name. If the metadata for + * a name changes, the record field is just modified in place. + * + *
Storage for the records (names and metadata) is allocated when the zone + * is created. It keeps no references to the data passed to it, and performs + * no additional allocation when adding records. Opening a new chapter simply + * marks it as being empty. + * + *
Records are stored in a flat array. To allow a value of zero in a + * hash slot to indicate that the slot is empty, records are numbered starting + * at one (1-based). Since C arrays are 0-based, the records array contains + * enough space for N+1 records, and the record that starts at array index + * zero is never used or referenced. + * + *
The array of hash slots is actually two arrays, superimposed: an + * array of record numbers, indexed by hash value, and an array of deleted + * flags, indexed by record number. This overlay is possible because the + * number of hash slots always exceeds the number of records, and is done + * simply to save on memory. + **/ + +enum { + OPEN_CHAPTER_RECORD_NUMBER_BITS = 23, + OPEN_CHAPTER_MAX_RECORD_NUMBER = (1 << OPEN_CHAPTER_RECORD_NUMBER_BITS) - 1 +}; + +typedef struct { + /** If non-zero, the record number addressed by this hash slot */ + unsigned int recordNumber : OPEN_CHAPTER_RECORD_NUMBER_BITS; + /** If true, the record at the index of this hash slot was deleted */ + bool recordDeleted : 1; +} __attribute__((packed)) Slot; + +typedef struct openChapterZone { + /** Maximum number of records that can be stored */ + unsigned int capacity; + /** Number of records stored */ + unsigned int size; + /** Number of deleted records */ + unsigned int deleted; + /** Record data, stored as (name, metadata), 1-based */ + UdsChunkRecord *records; + /** The number of slots in the chapter zone hash table. */ + unsigned int slotCount; + /** Hash table, referencing virtual record numbers */ + Slot slots[]; +} OpenChapterZone; + +/** + * Allocate an open chapter zone. + * + * @param geometry the geometry of the volume + * @param zoneCount the total number of open chapter zones + * @param openChapterPtr a pointer to hold the new open chapter + * + * @return UDS_SUCCESS or an error code + **/ +int makeOpenChapter(const Geometry *geometry, + unsigned int zoneCount, + OpenChapterZone **openChapterPtr) + __attribute__((warn_unused_result)); + +/** + * Return the number of records in the open chapter zone that have not been + * deleted. + * + * @return the number of non-deleted records + **/ +size_t openChapterSize(const OpenChapterZone *openChapter) + __attribute__((warn_unused_result)); + +/** + * Open a chapter by marking it empty. + * + * @param openChapter The chapter to open + **/ +void resetOpenChapter(OpenChapterZone *openChapter); + +/** + * Search the open chapter for a chunk name. + * + * @param openChapter The chapter to search + * @param name The name of the desired chunk + * @param metadata The holder for the metadata associated with the + * chunk, if found (or NULL) + * @param found A pointer which will be set to true if the chunk + * name was found + **/ +void searchOpenChapter(OpenChapterZone *openChapter, + const UdsChunkName *name, + UdsChunkData *metadata, + bool *found); + +/** + * Put a record into the open chapter. + * + * @param openChapter The chapter into which to put the record + * @param name The name of the record + * @param metadata The record data + * @param remaining Pointer to an integer set to the number of additional + * records that can be added to this chapter + * + * @return UDS_SUCCESS or an error code + **/ +int putOpenChapter(OpenChapterZone *openChapter, + const UdsChunkName *name, + const UdsChunkData *metadata, + unsigned int *remaining) + __attribute__((warn_unused_result)); + +/** + * Remove a record from the open chapter. + * + * @param openChapter The chapter from which to remove the record + * @param name The name of the record + * @param removed Pointer to bool set to true if the + * record was found + **/ +void removeFromOpenChapter(OpenChapterZone *openChapter, + const UdsChunkName *name, + bool *removed); + +/** + * Clean up an open chapter and its memory. 
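/*
 * [Editorial sketch, not part of the patch] A rough outline of one zone's
 * lifecycle using this header's functions: create the zone from the volume
 * geometry, add and look up a record, delete it, then reset and free the
 * zone. Error handling is abbreviated, and the caller is assumed to supply
 * the geometry, name, and metadata; the function name is hypothetical.
 */
static int exampleUseZone(const Geometry     *geometry,
                          const UdsChunkName *name,
                          const UdsChunkData *metadata)
{
  OpenChapterZone *zone;
  int result = makeOpenChapter(geometry, 1, &zone);   /* single-zone index */
  if (result != UDS_SUCCESS) {
    return result;
  }

  unsigned int remaining;
  result = putOpenChapter(zone, name, metadata, &remaining);

  bool found;
  UdsChunkData data;
  searchOpenChapter(zone, name, &data, &found);       /* found == true     */

  bool removed;
  removeFromOpenChapter(zone, name, &removed);        /* marks it deleted  */

  resetOpenChapter(zone);                             /* empty for reuse   */
  freeOpenChapter(zone);
  return result;
}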
+ * + * @param openChapter the chapter to destroy + **/ +void freeOpenChapter(OpenChapterZone *openChapter); + +#endif /* OPEN_CHAPTER_ZONE_H */ diff --git a/source/uds/pageCache.c b/source/uds/pageCache.c new file mode 100644 index 0000000..b2db9a5 --- /dev/null +++ b/source/uds/pageCache.c @@ -0,0 +1,719 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/pageCache.c#6 $ + */ + +#include "pageCache.h" + +#include "atomicDefs.h" +#include "cacheCounters.h" +#include "chapterIndex.h" +#include "compiler.h" +#include "errors.h" +#include "geometry.h" +#include "hashUtils.h" +#include "indexConfig.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "recordPage.h" +#include "stringUtils.h" +#include "threads.h" +#include "zone.h" + +/**********************************************************************/ +int assertPageInCache(PageCache *cache, CachedPage *page) +{ + int result = ASSERT((page->cp_physicalPage < cache->numIndexEntries), + "physicalPage %u is valid (< %u)", + page->cp_physicalPage, cache->numIndexEntries); + if (result != UDS_SUCCESS) { + return result; + } + + uint16_t pageIndex = cache->index[page->cp_physicalPage]; + return ASSERT((pageIndex < cache->numCacheEntries) + && (&cache->cache[pageIndex] == page), + "page is at expected location in cache"); +} + +/** + * Clear a cache page. Note: this does not clear readPending - a read could + * still be pending and the read thread needs to be able to proceed and restart + * the requests regardless. This page will still be marked invalid, but it + * won't get reused (see getLeastRecentPage()) until the readPending flag + * is cleared. This is a valid case, e.g. the chapter gets forgotten and + * replaced with a new one in LRU. Restarting the requests will lead them to + * not find the records in the MI. + * + * @param cache the cache + * @param page the cached page to clear + * + **/ +static void clearPage(PageCache *cache, CachedPage *page) +{ + page->cp_physicalPage = cache->numIndexEntries; + WRITE_ONCE(page->cp_lastUsed, 0); +} + +/** + * Get a page from the cache, but with no stats + * + * @param cache the cache + * @param physicalPage the physical page to get + * @param queueIndex the index of the page in the read queue if + * queued, -1 otherwise + * @param pagePtr a pointer to hold the page + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int getPageNoStats(PageCache *cache, + unsigned int physicalPage, + int *queueIndex, + CachedPage **pagePtr) +{ + /* + * ASSERTION: We are either a zone thread holding a searchPendingCounter, + * or we are any thread holding the readThreadsMutex. + * + * Holding only a searchPendingCounter is the most frequent case. 
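+ * That is safe because invalidatePageInCache and selectVictimInCache both
+ * call waitForPendingSearches before a page is cleared or reused, so a page
+ * found here cannot be recycled while this zone's search is pending.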
+ */ + + int result = ASSERT((physicalPage < cache->numIndexEntries), + "physical page %u is invalid", physicalPage); + if (result != UDS_SUCCESS) { + return result; + } + + /* + * It would be unlikely that the compiler turns the usage of indexValue into + * two reads of cache->index, but it would be possible and very bad if those + * reads did not return the same bits. + */ + uint16_t indexValue = READ_ONCE(cache->index[physicalPage]); + bool queued = (indexValue & VOLUME_CACHE_QUEUED_FLAG) != 0; + uint16_t index = indexValue & ~VOLUME_CACHE_QUEUED_FLAG; + + if (!queued && (index < cache->numCacheEntries)) { + *pagePtr = &cache->cache[index]; + /* + * We have acquired access to the cached page, but unless we hold the + * readThreadsMutex, we need a read memory barrier now. The corresponding + * write memory barrier is in putPageInCache. + */ + smp_rmb(); + } else { + *pagePtr = NULL; + } + if (queueIndex != NULL) { + *queueIndex = queued ? index : -1; + } + return UDS_SUCCESS; +} + +/** + * Wait for all pending searches on a page in the cache to complete + * + * @param cache the page cache + * @param physicalPage the page to check searches on + **/ +static void waitForPendingSearches(PageCache *cache, unsigned int physicalPage) +{ + /* + * We hold the readThreadsMutex. We are waiting for threads that do not hold + * the readThreadsMutex. Those threads have "locked" their targeted page by + * setting the searchPendingCounter. The corresponding write memory barrier + * is in beginPendingSearch. + */ + smp_mb(); + + InvalidateCounter initialCounters[MAX_ZONES]; + unsigned int i; + for (i = 0; i < cache->zoneCount; i++) { + initialCounters[i] = getInvalidateCounter(cache, i); + } + for (i = 0; i < cache->zoneCount; i++) { + if (searchPending(initialCounters[i]) + && (pageBeingSearched(initialCounters[i]) == physicalPage)) { + // There is an active search using the physical page. + // We need to wait for the search to finish. + while (initialCounters[i] == getInvalidateCounter(cache, i)) { + yieldScheduler(); + } + } + } +} + +/** + * Invalidate a cache page + * + * @param cache the cache + * @param page the cached page + * @param reason the reason for invalidation, for stats + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int invalidatePageInCache(PageCache *cache, + CachedPage *page, + InvalidationReason reason) +{ + // We hold the readThreadsMutex. + if (page == NULL) { + return UDS_SUCCESS; + } + + if (page->cp_physicalPage != cache->numIndexEntries) { + switch (reason) { + case INVALIDATION_EVICT: + cache->counters.evictions++; + break; + case INVALIDATION_EXPIRE: + cache->counters.expirations++; + break; + default: + break; + } + + if (reason != INVALIDATION_ERROR) { + int result = assertPageInCache(cache, page); + if (result != UDS_SUCCESS) { + return result; + } + } + + WRITE_ONCE(cache->index[page->cp_physicalPage], cache->numCacheEntries); + waitForPendingSearches(cache, page->cp_physicalPage); + } + + clearPage(cache, page); + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int findInvalidateAndMakeLeastRecent(PageCache *cache, + unsigned int physicalPage, + QueuedRead *readQueue, + InvalidationReason reason, + bool mustFind) +{ + // We hold the readThreadsMutex. + if (cache == NULL) { + return UDS_SUCCESS; + } + + CachedPage *page; + int queuedIndex = -1; + int result + = getPageNoStats(cache, physicalPage, + ((readQueue != NULL) ? 
&queuedIndex : NULL), &page); + if (result != UDS_SUCCESS) { + return result; + } + + if (page == NULL) { + result = ASSERT(!mustFind, "found page"); + if (result != UDS_SUCCESS) { + return result; + } + + if (queuedIndex > -1) { + logDebug("setting pending read to invalid"); + readQueue[queuedIndex].invalid = true; + } + return UDS_SUCCESS; + } + + // Invalidate the page and unmap it from the cache. + result = invalidatePageInCache(cache, page, reason); + if (result != UDS_SUCCESS) { + return result; + } + + // Move the cached page to the least recently used end of the list + // so it will be replaced before any page with valid data. + WRITE_ONCE(page->cp_lastUsed, 0); + + return UDS_SUCCESS; +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int initializePageCache(PageCache *cache, + const Geometry *geometry, + unsigned int chaptersInCache, + unsigned int readQueueMaxSize, + unsigned int zoneCount) +{ + cache->geometry = geometry; + cache->numIndexEntries = geometry->pagesPerVolume + 1; + cache->numCacheEntries = chaptersInCache * geometry->recordPagesPerChapter; + cache->readQueueMaxSize = readQueueMaxSize; + cache->zoneCount = zoneCount; + atomic64_set(&cache->clock, 1); + + int result = ALLOCATE(readQueueMaxSize, QueuedRead, + "volume read queue", &cache->readQueue); + if (result != UDS_SUCCESS) { + return result; + } + + result = ALLOCATE(cache->zoneCount, SearchPendingCounter, + "Volume Cache Zones", &cache->searchPendingCounters); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT((cache->numCacheEntries <= VOLUME_CACHE_MAX_ENTRIES), + "requested cache size, %u, within limit %u", + cache->numCacheEntries, VOLUME_CACHE_MAX_ENTRIES); + if (result != UDS_SUCCESS) { + return result; + } + + result = ALLOCATE(cache->numIndexEntries, uint16_t, "page cache index", + &cache->index); + if (result != UDS_SUCCESS) { + return result; + } + + // Initialize index values to invalid values. 
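+ // (numCacheEntries is one past the largest valid cache slot index, so it
+ // also serves as the "page not cached" sentinel that getPageNoStats checks
+ // against.)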
+ unsigned int i; + for (i = 0; i < cache->numIndexEntries; i++) { + cache->index[i] = cache->numCacheEntries; + } + + result = ALLOCATE(cache->numCacheEntries, CachedPage, + "page cache cache", &cache->cache); + if (result != UDS_SUCCESS) { + return result; + } + + for (i = 0; i < cache->numCacheEntries; i++) { + CachedPage *page = &cache->cache[i]; + result = initializeVolumePage(geometry, &page->cp_pageData); + if (result != UDS_SUCCESS) { + return result; + } + clearPage(cache, page); + } + + return UDS_SUCCESS; +} + +/*********************************************************************/ +int makePageCache(const Geometry *geometry, + unsigned int chaptersInCache, + unsigned int readQueueMaxSize, + unsigned int zoneCount, + PageCache **cachePtr) +{ + if (chaptersInCache < 1) { + return logWarningWithStringError(UDS_BAD_STATE, + "cache size must be" + " at least one chapter"); + } + if (readQueueMaxSize <= 0) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "read queue max size must be" + " greater than 0"); + } + if (zoneCount < 1) { + return logWarningWithStringError(UDS_INVALID_ARGUMENT, + "cache must have at least one zone"); + } + + PageCache *cache; + int result = ALLOCATE(1, PageCache, "volume cache", &cache); + if (result != UDS_SUCCESS) { + return result; + } + + result = initializePageCache(cache, geometry, chaptersInCache, + readQueueMaxSize, zoneCount); + if (result != UDS_SUCCESS) { + freePageCache(cache); + return result; + } + + *cachePtr = cache; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freePageCache(PageCache *cache) +{ + if (cache == NULL) { + return; + } + if (cache->cache != NULL) { + unsigned int i; + for (i = 0; i < cache->numCacheEntries; i++) { + destroyVolumePage(&cache->cache[i].cp_pageData); + } + } + FREE(cache->index); + FREE(cache->cache); + FREE(cache->searchPendingCounters); + FREE(cache->readQueue); + FREE(cache); +} + +/**********************************************************************/ +int invalidatePageCacheForChapter(PageCache *cache, + unsigned int chapter, + unsigned int pagesPerChapter, + InvalidationReason reason) +{ + // We hold the readThreadsMutex. + if ((cache == NULL) || (cache->cache == NULL)) { + return UDS_SUCCESS; + } + + int result; + unsigned int i; + for (i = 0; i < pagesPerChapter; i++) { + unsigned int physicalPage = 1 + (pagesPerChapter * chapter) + i; + result = findInvalidateAndMakeLeastRecent(cache, physicalPage, + cache->readQueue, reason, false); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/*********************************************************************/ +void makePageMostRecent(PageCache *cache, CachedPage *page) +{ + // ASSERTION: We are either a zone thread holding a searchPendingCounter, + // or we are any thread holding the readThreadsMutex. + if (atomic64_read(&cache->clock) != READ_ONCE(page->cp_lastUsed)) { + WRITE_ONCE(page->cp_lastUsed, atomic64_inc_return(&cache->clock)); + } +} + +/** + * Get the least recent valid page from the cache. + * + * @param cache the cache + * @param pagePtr a pointer to hold the new page (will be set to NULL + * if the page was not found) + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int getLeastRecentPage(PageCache *cache, CachedPage **pagePtr) +{ + // We hold the readThreadsMutex. + int oldestIndex = 0; + // Our first candidate is any page that does have a pending read. 
We ensure + // above that there are more entries than read threads, so there must be one. + unsigned int i; + for (i = 0;; i++) { + if (i >= cache->numCacheEntries) { + // This should never happen. + return ASSERT(false, "oldest page is not NULL"); + } + if (!cache->cache[i].cp_readPending) { + oldestIndex = i; + break; + } + } + // Now find the least recently used page that does not have a pending read. + for (i = 0; i < cache->numCacheEntries; i++) { + if (!cache->cache[i].cp_readPending + && (READ_ONCE(cache->cache[i].cp_lastUsed) + <= READ_ONCE(cache->cache[oldestIndex].cp_lastUsed))) { + oldestIndex = i; + } + } + *pagePtr = &cache->cache[oldestIndex]; + return UDS_SUCCESS; +} + +/***********************************************************************/ +int getPageFromCache(PageCache *cache, + unsigned int physicalPage, + int probeType, + CachedPage **pagePtr) +{ + // ASSERTION: We are in a zone thread. + // ASSERTION: We holding a searchPendingCounter or the readThreadsMutex. + if (cache == NULL) { + return logWarningWithStringError(UDS_BAD_STATE, + "cannot get page with NULL cache"); + } + + // Get the cache page from the index + CachedPage *page; + int queueIndex = -1; + int result = getPageNoStats(cache, physicalPage, &queueIndex, &page); + if (result != UDS_SUCCESS) { + return result; + } + + CacheResultKind cacheResult = ((page != NULL) + ? CACHE_RESULT_HIT + : ((queueIndex != -1) + ? CACHE_RESULT_QUEUED + : CACHE_RESULT_MISS)); + incrementCacheCounter(&cache->counters, probeType, cacheResult); + + if (pagePtr != NULL) { + *pagePtr = page; + } + return UDS_SUCCESS; +} + +/***********************************************************************/ +int enqueueRead(PageCache *cache, Request *request, unsigned int physicalPage) +{ + // We hold the readThreadsMutex. + uint16_t first = cache->readQueueFirst; + uint16_t last = cache->readQueueLast; + uint16_t next = (last + 1) % cache->readQueueMaxSize; + uint16_t readQueuePos; + + if ((cache->index[physicalPage] & VOLUME_CACHE_QUEUED_FLAG) == 0) { + /* Not seen before, add this to the read queue and mark it as queued */ + if (next == first) { + /* queue is full */ + return UDS_SUCCESS; + } + /* fill the read queue entry */ + cache->readQueue[last].physicalPage = physicalPage; + cache->readQueue[last].invalid = false; + + /* point the cache index to it */ + readQueuePos = last; + WRITE_ONCE(cache->index[physicalPage], + readQueuePos | VOLUME_CACHE_QUEUED_FLAG); + cache->readQueue[readQueuePos].requestList.first = NULL; + cache->readQueue[readQueuePos].requestList.last = NULL; + /* bump the last pointer */ + cache->readQueueLast = next; + } else { + /* It's already queued, just add on to it */ + readQueuePos = cache->index[physicalPage] & ~VOLUME_CACHE_QUEUED_FLAG; + } + + int result = ASSERT((readQueuePos < cache->readQueueMaxSize), + "queue is not overfull"); + if (result != UDS_SUCCESS) { + return result; + } + + request->nextRequest = NULL; + if (cache->readQueue[readQueuePos].requestList.first == NULL) { + cache->readQueue[readQueuePos].requestList.first = request; + } else { + cache->readQueue[readQueuePos].requestList.last->nextRequest = request; + } + cache->readQueue[readQueuePos].requestList.last = request; + return UDS_QUEUED; +} + +/***********************************************************************/ +bool reserveReadQueueEntry(PageCache *cache, + unsigned int *queuePos, + Request **firstRequest, + unsigned int *physicalPage, + bool *invalid) +{ + // We hold the readThreadsMutex. 
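+ // A reader thread claims the entry at readQueueLastRead, marks it reserved,
+ // and hands its request list back to the caller; releaseReadQueueEntry
+ // later advances readQueueFirst past entries that are no longer reserved.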
+ uint16_t lastRead = cache->readQueueLastRead; + + // No items to dequeue + if (lastRead == cache->readQueueLast) { + return false; + } + + unsigned int pageNo = cache->readQueue[lastRead].physicalPage; + bool isInvalid = cache->readQueue[lastRead].invalid; + + uint16_t indexValue = cache->index[pageNo]; + bool queued = (indexValue & VOLUME_CACHE_QUEUED_FLAG) != 0; + + // ALB-1429 ... need to check to see if its still queued before resetting + if (isInvalid && queued) { + // invalidate cache index slot + WRITE_ONCE(cache->index[pageNo], cache->numCacheEntries); + } + + // If a sync read has taken this page, set invalid to true so we don't + // overwrite, we simply just requeue requests. + if (!queued) { + isInvalid = true; + } + + cache->readQueue[lastRead].reserved = true; + + *queuePos = lastRead; + *firstRequest = cache->readQueue[lastRead].requestList.first; + *physicalPage = pageNo; + *invalid = isInvalid; + cache->readQueueLastRead = (lastRead + 1) % cache->readQueueMaxSize; + + return true; +} + +/************************************************************************/ +void releaseReadQueueEntry(PageCache *cache, unsigned int queuePos) +{ + // We hold the readThreadsMutex. + cache->readQueue[queuePos].reserved = false; + + uint16_t lastRead = cache->readQueueLastRead; + + // Move the readQueueFirst pointer along when we can + while ((cache->readQueueFirst != lastRead) + && (!cache->readQueue[cache->readQueueFirst].reserved)) { + cache->readQueueFirst = + (cache->readQueueFirst + 1) % cache->readQueueMaxSize; + } +} + +/***********************************************************************/ +int selectVictimInCache(PageCache *cache, + CachedPage **pagePtr) +{ + // We hold the readThreadsMutex. + if (cache == NULL) { + return logWarningWithStringError(UDS_BAD_STATE, + "cannot put page in NULL cache"); + } + + CachedPage *page = NULL; + int result = getLeastRecentPage(cache, &page); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT((page != NULL), "least recent page was not NULL"); + if (result != UDS_SUCCESS) { + return result; + } + + // If the page is currently being pointed to by the page map, clear + // it from the page map, and update cache stats + if (page->cp_physicalPage != cache->numIndexEntries) { + cache->counters.evictions++; + WRITE_ONCE(cache->index[page->cp_physicalPage], cache->numCacheEntries); + waitForPendingSearches(cache, page->cp_physicalPage); + } + + page->cp_readPending = true; + + *pagePtr = page; + + return UDS_SUCCESS; +} + +/***********************************************************************/ +int putPageInCache(PageCache *cache, + unsigned int physicalPage, + CachedPage *page) +{ + // We hold the readThreadsMutex. 
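+ // The page was claimed earlier by selectVictimInCache, which set
+ // cp_readPending; here the new mapping is recorded and published to the
+ // index behind a write barrier so that zone threads calling getPageNoStats
+ // never see a partially initialized page.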
+ if (cache == NULL) { + return logWarningWithStringError(UDS_BAD_STATE, + "cannot complete page in NULL cache"); + } + + int result = ASSERT((page != NULL), "page to install exists"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT((page->cp_readPending), + "page to install has a pending read"); + if (result != UDS_SUCCESS) { + return result; + } + + clearPage(cache, page); + + page->cp_physicalPage = physicalPage; + + // Figure out the index into the cache array using pointer arithmetic + uint16_t value = page - cache->cache; + result = ASSERT((value < cache->numCacheEntries), "cache index is valid"); + if (result != UDS_SUCCESS) { + return result; + } + + makePageMostRecent(cache, page); + + page->cp_readPending = false; + + /* + * We hold the readThreadsMutex, but we must have a write memory barrier + * before making the CachedPage available to the readers that do not hold the + * mutex. The corresponding read memory barrier is in getPageNoStats. + */ + smp_wmb(); + + // Point the page map to the new page. Will clear queued flag + WRITE_ONCE(cache->index[physicalPage], value); + + return UDS_SUCCESS; +} + +/***********************************************************************/ +void cancelPageInCache(PageCache *cache, + unsigned int physicalPage, + CachedPage *page) +{ + // We hold the readThreadsMutex. + if (cache == NULL) { + logWarning("cannot cancel page in NULL cache"); + return; + } + + int result = ASSERT((page != NULL), "page to install exists"); + if (result != UDS_SUCCESS) { + return; + } + + result = ASSERT((page->cp_readPending), + "page to install has a pending read"); + if (result != UDS_SUCCESS) { + return; + } + + clearPage(cache, page); + page->cp_readPending = false; + + // Clear the page map for the new page. Will clear queued flag + WRITE_ONCE(cache->index[physicalPage], cache->numCacheEntries); +} + +/**********************************************************************/ +size_t getPageCacheSize(PageCache *cache) +{ + if (cache == NULL) { + return 0; + } + return sizeof(DeltaIndexPage) * cache->numCacheEntries; +} + diff --git a/source/uds/pageCache.h b/source/uds/pageCache.h new file mode 100644 index 0000000..d639b4a --- /dev/null +++ b/source/uds/pageCache.h @@ -0,0 +1,504 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/pageCache.h#5 $ + */ + +#ifndef PAGE_CACHE_H +#define PAGE_CACHE_H + +#include "atomicDefs.h" +#include "cacheCounters.h" +#include "chapterIndex.h" +#include "common.h" +#include "compiler.h" +#include "indexConfig.h" +#include "opaqueTypes.h" +#include "permassert.h" +#include "request.h" +#include "volumeStore.h" + +typedef struct requestList { + Request *first; + Request *last; +} RequestList; + +typedef struct cachedPage { + /* whether this page is currently being read asynchronously */ + bool cp_readPending; + /* if equal to numCacheEntries, the page is invalid */ + unsigned int cp_physicalPage; + /* the value of the volume clock when this page was last used */ + int64_t cp_lastUsed; + /* the cache page data */ + struct volume_page cp_pageData; + /* the chapter index page. This is here, even for record pages */ + DeltaIndexPage cp_indexPage; +} CachedPage; + +enum { + VOLUME_CACHE_MAX_ENTRIES = (UINT16_MAX >> 1), + VOLUME_CACHE_QUEUED_FLAG = (1 << 15), + VOLUME_CACHE_DEFAULT_MAX_QUEUED_READS = 4096 +}; + +typedef struct queuedRead { + /* whether this queue entry is invalid */ + bool invalid; + /* whether this queue entry has a pending read on it */ + bool reserved; + /* physical page to read */ + unsigned int physicalPage; + /* list of requests waiting on a queued read */ + RequestList requestList; +} QueuedRead; + +// Reason for invalidating a cache entry, used for gathering statistics +typedef enum invalidationReason { + INVALIDATION_EVICT, // cache is full, goodbye + INVALIDATION_EXPIRE, // your chapter is being overwritten + INVALIDATION_ERROR, // error happened; don't try to use data + INVALIDATION_INIT_SHUTDOWN +} InvalidationReason; + +/* + * Value stored atomically in a SearchPendingCounter. The low order 32 bits is + * the physical page number of the cached page being read. The high order 32 + * bits is a sequence number. + * + * An InvalidateCounter is only written by its zone thread by calling the + * beginPendingSearch or endPendingSearch methods. + * + * Any other thread that is accessing an InvalidateCounter is reading the value + * in the waitForPendingSearches method. + */ +typedef int64_t InvalidateCounter; +// Fields of InvalidateCounter. +// These must be 64 bit, so an enum cannot be not used. +#define PAGE_FIELD ((long)UINT_MAX) // The page number field +#define COUNTER_LSB (PAGE_FIELD + 1L) // The LSB of the counter field + +typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) { + atomic64_t atomicValue; +} SearchPendingCounter; + +typedef struct pageCache { + // Geometry governing the volume + const Geometry *geometry; + // The number of zones + unsigned int zoneCount; + // The number of index entries + unsigned int numIndexEntries; + // The max number of cached entries + uint16_t numCacheEntries; + // The index used to quickly access page in cache - top bit is a 'queued' + // flag + uint16_t *index; + // The cache + CachedPage *cache; + // A counter for each zone to keep track of when a search is occurring + // within that zone. + SearchPendingCounter *searchPendingCounters; + // Queued reads, as a circular array, with first and last indexes + QueuedRead *readQueue; + // Cache counters for stats. This is the first field of a PageCache that is + // not constant after the struct is initialized. + CacheCounters counters; + /** + * Entries are enqueued at readQueueLast. + * To 'reserve' entries, we get the entry pointed to by readQueueLastRead + * and increment last read. 
This is done with a lock so if another reader + * thread reserves a read, it will grab the next one. After every read + * is completed, the reader thread calls releaseReadQueueEntry which + * increments readQueueFirst until it is equal to readQueueLastRead, but only + * if the value pointed to by readQueueFirst is no longer pending. + * This means that if n reads are outstanding, readQueueFirst may not + * be incremented until the last of the reads finishes. + * + * First Last + * || | | | | | || + * LR (1) (2) + * + * Read thread 1 increments last read (1), then read thread 2 increments it + * (2). When each read completes, it checks to see if it can increment first, + * when all concurrent reads have completed, readQueueFirst should equal + * readQueueLastRead. + **/ + uint16_t readQueueFirst; + uint16_t readQueueLastRead; + uint16_t readQueueLast; + // The size of the read queue + unsigned int readQueueMaxSize; + // Page access counter + atomic64_t clock; +} PageCache; + +/** + * Allocate a cache for a volume. + * + * @param geometry The geometry governing the volume + * @param chaptersInCache The size (in chapters) of the page cache + * @param readQueueMaxSize The maximum size of the read queue + * @param zoneCount The number of zones in the index + * @param cachePtr A pointer to hold the new page cache + * + * @return UDS_SUCCESS or an error code + **/ +int makePageCache(const Geometry *geometry, + unsigned int chaptersInCache, + unsigned int readQueueMaxSize, + unsigned int zoneCount, + PageCache **cachePtr) + __attribute__((warn_unused_result)); + +/** + * Clean up a volume's cache + * + * @param cache the volumecache + **/ +void freePageCache(PageCache *cache); + +/** + * Invalidates a page cache for a particular chapter + * + * @param cache the page cache + * @param chapter the chapter + * @param pagesPerChapter the number of pages per chapter + * @param reason the reason for invalidation + * + * @return UDS_SUCCESS or an error code + **/ +int invalidatePageCacheForChapter(PageCache *cache, + unsigned int chapter, + unsigned int pagesPerChapter, + InvalidationReason reason) + __attribute__((warn_unused_result)); + +/** + * Find a page, invalidate it, and make its memory the least recent. This + * method is only exposed for the use of unit tests. + * + * @param cache The cache containing the page + * @param physicalPage The id of the page to invalidate + * @param readQueue The queue of pending reads (may be NULL) + * @param reason The reason for the invalidation, for stats + * @param mustFind If true, it is an error if the page + * can't be found + * + * @return UDS_SUCCESS or an error code + **/ +int findInvalidateAndMakeLeastRecent(PageCache *cache, + unsigned int physicalPage, + QueuedRead *readQueue, + InvalidationReason reason, + bool mustFind); + +/** + * Make the page the most recent in the cache + * + * @param cache the page cache + * @param pagePtr the page to make most recent + * + * @return UDS_SUCCESS or an error code + **/ +void makePageMostRecent(PageCache *cache, CachedPage *pagePtr); + +/** + * Verifies that a page is in the cache. This method is only exposed for the + * use of unit tests. + * + * @param cache the cache to verify + * @param page the page to find + * + * @return UDS_SUCCESS or an error code + **/ +int assertPageInCache(PageCache *cache, CachedPage *page) + __attribute__((warn_unused_result)); + +/** + * Gets a page from the cache. 
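+ *
+ * The caller must either be a zone thread holding a searchPendingCounter,
+ * or must hold the readThreadsMutex; see getPageNoStats in pageCache.c.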
+ * + * @param [in] cache the page cache + * @param [in] physicalPage the page number + * @param [in] probeType the type of cache access being done (CacheProbeType + * optionally OR'ed with CACHE_PROBE_IGNORE_FAILURE) + * @param [out] pagePtr the found page + * + * @return UDS_SUCCESS or an error code + **/ +int getPageFromCache(PageCache *cache, + unsigned int physicalPage, + int probeType, + CachedPage **pagePtr) + __attribute__((warn_unused_result)); + +/** + * Enqueue a read request + * + * @param cache the page cache + * @param request the request that depends on the read + * @param physicalPage the physicalPage for the request + * + * @return UDS_QUEUED if the page was queued + * UDS_SUCCESS if the queue was full + * an error code if there was an error + **/ +int enqueueRead(PageCache *cache, Request *request, unsigned int physicalPage) + __attribute__((warn_unused_result)); + +/** + * Reserves a queued read for future dequeuing, but does not remove it from + * the queue. Must call releaseReadQueueEntry to complete the process + * + * @param cache the page cache + * @param queuePos the position in the read queue for this pending read + * @param firstRequests list of requests for the pending read + * @param physicalPage the physicalPage for the requests + * @param invalid whether or not this entry is invalid + * + * @return UDS_SUCCESS or an error code + **/ +bool reserveReadQueueEntry(PageCache *cache, + unsigned int *queuePos, + Request **firstRequests, + unsigned int *physicalPage, + bool *invalid); + +/** + * Releases a read from the queue, allowing it to be reused by future + * enqueues + * + * @param cache the page cache + * @param queuePos queue entry position + * + * @return UDS_SUCCESS or an error code + **/ +void releaseReadQueueEntry(PageCache *cache, + unsigned int queuePos); + +/** + * Check for the page cache read queue being empty. + * + * @param cache the page cache for which to check the read queue. + * + * @return true if the read queue for cache is empty, false otherwise. + **/ +static INLINE bool readQueueIsEmpty(PageCache *cache) +{ + return (cache->readQueueFirst == cache->readQueueLast); +} + +/** + * Check for the page cache read queue being full. + * + * @param cache the page cache for which to check the read queue. + * + * @return true if the read queue for cache is full, false otherwise. + **/ +static INLINE bool readQueueIsFull(PageCache *cache) +{ + return (cache->readQueueFirst == + (cache->readQueueLast + 1) % cache->readQueueMaxSize); +} + +/** + * Selects a page in the cache to be used for a read. + * + * This will clear the pointer in the page map and + * set readPending to true on the cache page + * + * @param cache the page cache + * @param pagePtr the page to add + * + * @return UDS_SUCCESS or an error code + **/ +int selectVictimInCache(PageCache *cache, + CachedPage **pagePtr) + __attribute__((warn_unused_result)); + +/** + * Completes an async page read in the cache, so that + * the page can now be used for incoming requests. + * + * This will invalidate the old cache entry and point + * the page map for the new page to this entry + * + * @param cache the page cache + * @param physicalPage the page number + * @param page the page to complete processing on + * + * @return UDS_SUCCESS or an error code + **/ +int putPageInCache(PageCache *cache, + unsigned int physicalPage, + CachedPage *page) + __attribute__((warn_unused_result)); + +/** + * Cancels an async page read in the cache, so that + * the page can now be used for incoming requests. 
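A minimal sketch of the zone-thread lookup pattern these declarations support, assuming `cache`, `physicalPage`, `zoneNumber`, and `probeType` are supplied by the caller:

  /* Reader-side "lock": record which page this zone is about to search. */
  beginPendingSearch(cache, physicalPage, zoneNumber);

  CachedPage *page = NULL;
  int result = getPageFromCache(cache, physicalPage, probeType, &page);
  if ((result == UDS_SUCCESS) && (page != NULL)) {
    /* The page may be read here; the search-pending counter keeps
       invalidatePageInCache and selectVictimInCache from recycling it. */
  }

  /* Reader-side "unlock". */
  endPendingSearch(cache, zoneNumber);

  if ((result == UDS_SUCCESS) && (page == NULL)) {
    /* Miss: with the readThreadsMutex held, enqueueRead would hand the
       request to the reader threads and return UDS_QUEUED. */
  }

A reader thread then uses selectVictimInCache, performs the read, and publishes the page with putPageInCache (or cancelPageInCache on error).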
+ * + * This will invalidate the old cache entry and clear + * the read queued flag on the page map entry, if it + * was set. + * + * @param cache the page cache + * @param physicalPage the page number to clear the queued read flag on + * @param page the page to cancel processing on + * + * @return UDS_SUCCESS or an error code + **/ +void cancelPageInCache(PageCache *cache, + unsigned int physicalPage, + CachedPage *page); + +/** + * Get the page cache size + * + * @param cache the page cache + * + * @return the size of the page cache + **/ +size_t getPageCacheSize(PageCache *cache) + __attribute__((warn_unused_result)); + + +/** + * Read the InvalidateCounter for the given zone. + * + * @param cache the page cache + * @param zoneNumber the zone number + * + * @return the InvalidateCounter value + **/ +static INLINE InvalidateCounter getInvalidateCounter(PageCache *cache, + unsigned int zoneNumber) +{ + return atomic64_read(&cache->searchPendingCounters[zoneNumber].atomicValue); +} + +/** + * Write the InvalidateCounter for the given zone. + * + * @param cache the page cache + * @param zoneNumber the zone number + * @param invalidateCounter the InvalidateCounter value to write + **/ +static INLINE void setInvalidateCounter(PageCache *cache, + unsigned int zoneNumber, + InvalidateCounter invalidateCounter) +{ + atomic64_set(&cache->searchPendingCounters[zoneNumber].atomicValue, + invalidateCounter); +} + +/** + * Return the physical page number of the page being searched. The return + * value is only valid if searchPending indicates that a search is in progress. + * + * @param counter the InvalidateCounter value to check + * + * @return the page that the zone is searching + **/ +static INLINE unsigned int pageBeingSearched(InvalidateCounter counter) +{ + return counter & PAGE_FIELD; +} + +/** + * Determines whether a given value indicates that a search is occuring. + * + * @param invalidateCounter the InvalidateCounter value to check + * + * @return true if a search is pending, false otherwise + **/ +static INLINE bool searchPending(InvalidateCounter invalidateCounter) +{ + return (invalidateCounter & COUNTER_LSB) != 0; +} + +/** + * Determines whether there is a search occuring for the given zone. + * + * @param cache the page cache + * @param zoneNumber the zone number + * + * @return true if a search is pending, false otherwise + **/ +static INLINE bool isSearchPending(PageCache *cache, + unsigned int zoneNumber) +{ + return searchPending(getInvalidateCounter(cache, zoneNumber)); +} + +/** + * Increment the counter for the specified zone to signal that a search has + * begun. Also set which page is being searched. The searchPendingCounters + * are protecting read access to pages indexed by the cache. This is the + * "lock" action. + * + * @param cache the page cache + * @param physicalPage the page that the zone is searching + * @param zoneNumber the zone number + **/ +static INLINE void beginPendingSearch(PageCache *cache, + unsigned int physicalPage, + unsigned int zoneNumber) +{ + InvalidateCounter invalidateCounter = getInvalidateCounter(cache, + zoneNumber); + invalidateCounter &= ~PAGE_FIELD; + invalidateCounter |= physicalPage; + invalidateCounter += COUNTER_LSB; + setInvalidateCounter(cache, zoneNumber, invalidateCounter); + ASSERT_LOG_ONLY(searchPending(invalidateCounter), + "Search is pending for zone %u", zoneNumber); + /* + * This memory barrier ensures that the write to the invalidate counter is + * seen by other threads before this threads accesses the cached page. 
The + * corresponding read memory barrier is in waitForPendingSearches. + */ + smp_mb(); +} + +/** + * Increment the counter for the specified zone to signal that a search has + * finished. We do not need to reset the page since we only should ever look + * at the page value if the counter indicates a search is ongoing. The + * searchPendingCounters are protecting read access to pages indexed by the + * cache. This is the "unlock" action. + * + * @param cache the page cache + * @param zoneNumber the zone number + **/ +static INLINE void endPendingSearch(PageCache *cache, + unsigned int zoneNumber) +{ + // This memory barrier ensures that this thread completes reads of the + // cached page before other threads see the write to the invalidate counter. + smp_mb(); + + InvalidateCounter invalidateCounter = getInvalidateCounter(cache, + zoneNumber); + ASSERT_LOG_ONLY(searchPending(invalidateCounter), + "Search is pending for zone %u", zoneNumber); + invalidateCounter += COUNTER_LSB; + setInvalidateCounter(cache, zoneNumber, invalidateCounter); +} + +#endif /* PAGE_CACHE_H */ diff --git a/source/uds/permassert.c b/source/uds/permassert.c new file mode 100644 index 0000000..0c8afeb --- /dev/null +++ b/source/uds/permassert.c @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/permassert.c#1 $ + */ + +#include "permassert.h" +#include "permassertInternals.h" + +#include "errors.h" + +/*****************************************************************************/ +int assertionFailed(const char *expressionString, + int code, + const char *fileName, + int lineNumber, + const char *format, + ...) +{ + va_list args; + va_start(args, format); + handleAssertionFailure(expressionString, fileName, lineNumber, format, args); + va_end(args); + + return code; +} + +/*****************************************************************************/ +int assertionFailedLogOnly(const char *expressionString, + const char *fileName, + int lineNumber, + const char *format, + ...) +{ + va_list args; + va_start(args, format); + handleAssertionFailure(expressionString, fileName, lineNumber, format, args); + va_end(args); + + return UDS_ASSERTION_FAILED; +} diff --git a/source/uds/permassert.h b/source/uds/permassert.h new file mode 100644 index 0000000..d04336b --- /dev/null +++ b/source/uds/permassert.h @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/permassert.h#1 $ + */ + +#ifndef PERMASSERT_H +#define PERMASSERT_H + +#include "compiler.h" +#include "errors.h" +#include "uds-error.h" + +#define STRINGIFY(X) #X +#define STRINGIFY_VALUE(X) STRINGIFY(X) + +/* + * A hack to apply the "warn if unused" attribute to an integral expression. + * + * Since GCC doesn't propagate the warn_unused_result attribute to + * conditional expressions incorporating calls to functions with that + * attribute, this function can be used to wrap such an expression. + * With optimization enabled, this function contributes no additional + * instructions, but the warn_unused_result attribute still applies to + * the code calling it. + * + * @param value The value to return + * + * @return The supplied value + */ +__attribute__((warn_unused_result)) +static INLINE int mustUse(int value) +{ + return value; +} + +/* + * A replacement for assert() from assert.h. + * + * @param expr The boolean expression being asserted + * @param code The error code to return on non-fatal assertion + * failure + * @param format A printf() style format for the message to log on + * assertion failure + * @param arguments Any additional arguments required by the format + * + * @return UDS_SUCCESS If expr is true, code if expr is false and + * exitOnAssertionFailure is false. When exitOnAssertionFailure + * is true and expr is false, the program will exit from within + * this macro. + */ +#define ASSERT_WITH_ERROR_CODE(expr, code, ...) \ + mustUse(__builtin_expect(!!(expr), 1) \ + ? UDS_SUCCESS \ + : assertionFailed(STRINGIFY(expr), code, __FILE__, __LINE__, \ + __VA_ARGS__)) + +/* + * A replacement for assert() from assert.h. + * + * @param expr The boolean expression being asserted + * @param format A printf() style format for the message to log on + * assertion failure + * @param arguments Any additional arguments required by the format + * + * @return UDS_SUCCESS If expr is true, UDS_ASSERTION_FAILED if expr is + * false and exitOnAssertionFailure is false. When + * exitOnAssertionFailure is true and expr is false, the + * program will exit from within this macro. + */ +#define ASSERT(expr, ...) \ + ASSERT_WITH_ERROR_CODE(expr, UDS_ASSERTION_FAILED, __VA_ARGS__) + +/* + * A replacement for assert() which logs on failure, but does not return an + * error code. This should be used sparingly. If the expression is false and + * exitOnAssertionFailure is true, the program will exit from within this macro. + * + * @param expr The boolean expression being asserted + * @param format A printf() syle format for the message to log on + * assertion failure + * @param arguments Any additional arguments required by the format + */ +#define ASSERT_LOG_ONLY(expr, ...) \ + (__builtin_expect(!!(expr), 1) \ + ? UDS_SUCCESS \ + : assertionFailedLogOnly(STRINGIFY(expr), __FILE__, __LINE__, __VA_ARGS__)) + +/* + * This macro is a convenient wrapper for ASSERT(false, ...). + */ +#define ASSERT_FALSE(...) 
\ + ASSERT(false, __VA_ARGS__) + +#define STATIC_ASSERT(expr) \ + do { \ + switch (0) { \ + case 0: \ + case expr: \ + ; \ + default: \ + ; \ + } \ + } while(0) + +#define STATIC_ASSERT_SIZEOF(type, expectedSize) \ + STATIC_ASSERT(sizeof(type) == (expectedSize)) + +/** + * Set whether or not to exit on an assertion failure. + * + * @param shouldExit If true assertion failures will cause + * the program to exit + * + * @return The previous setting + **/ +bool setExitOnAssertionFailure(bool shouldExit); + +/** + * Log an assertion failure. + * + * @param expressionString The assertion + * @param errorCode The error code to return + * @param fileName The file in which the assertion appears + * @param lineNumber The line number on which the assertion + * appears + * @param format A printf() style format describing the + * assertion + * + * @return The supplied errorCode unless exitOnAssertionFailure is + * true, in which case the process will be aborted + **/ +int assertionFailed(const char *expressionString, + int errorCode, + const char *fileName, + int lineNumber, + const char *format, + ...) + __attribute__((format(printf, 5, 6), warn_unused_result)); + +/** + * Log an assertion failure. This function is different from + * assertionFailed() in that its return value may be ignored, and so should + * only be used in cases where the return value will be ignored. + * + * @param expressionString The assertion + * @param fileName The file in which the assertion appears + * @param lineNumber The line number on which the assertion + * appears + * @param format A printf() style format describing the + * assertion + * + * @return UDS_ASSERTION_FAILED unless exitOnAssertionFailure is + * true, in which case the process will be aborted + **/ +int assertionFailedLogOnly(const char *expressionString, + const char *fileName, + int lineNumber, + const char *format, + ...) + __attribute__((format(printf, 4, 5))); + +#endif /* PERMASSERT_H */ diff --git a/source/uds/permassertInternals.h b/source/uds/permassertInternals.h new file mode 100644 index 0000000..f0a3b95 --- /dev/null +++ b/source/uds/permassertInternals.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/permassertInternals.h#1 $ + */ + +#ifndef PERMASSERT_INTERNALS_H +#define PERMASSERT_INTERNALS_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void handleAssertionFailure(const char *expressionString, + const char *fileName, + int lineNumber, + const char *format, + va_list args) + __attribute__((format(printf, 4, 0))); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* PERMASSERT_INTERNALS_H */ diff --git a/source/uds/permassertLinuxKernel.c b/source/uds/permassertLinuxKernel.c new file mode 100644 index 0000000..67f66d9 --- /dev/null +++ b/source/uds/permassertLinuxKernel.c @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/permassertLinuxKernel.c#1 $ + */ + +#include "logger.h" +#include "permassert.h" +#include "permassertInternals.h" + +/**********************************************************************/ +__attribute__((format(printf, 4, 0))) +void handleAssertionFailure(const char *expressionString, + const char *fileName, + int lineNumber, + const char *format, + va_list args) +{ + logEmbeddedMessage(LOG_ERR, "assertion \"", format, args, + "\" (%s) failed at %s:%d", + expressionString, fileName, lineNumber); + logBacktrace(LOG_ERR); +} diff --git a/source/uds/random.c b/source/uds/random.c new file mode 100644 index 0000000..acad146 --- /dev/null +++ b/source/uds/random.c @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/random.c#2 $ + */ + +#include "random.h" + +#include "permassert.h" + +/*****************************************************************************/ +unsigned int randomInRange(unsigned int lo, unsigned int hi) +{ + return lo + random() % (hi - lo + 1); +} + +/*****************************************************************************/ +void randomCompileTimeAssertions(void) +{ + STATIC_ASSERT((((uint64_t) RAND_MAX + 1) & RAND_MAX) == 0); +} + +#ifndef __KERNEL__ +/*****************************************************************************/ +void fillRandomly(void *ptr, size_t len) +{ + uint64_t randNum = 0; + uint64_t randMask = 0; + const uint64_t multiplier = (uint64_t) RAND_MAX + 1; + + byte *bp = ptr; + for (size_t i = 0; i < len; ++i) { + if (randMask < 0xff) { + randNum = randNum * multiplier + random(); + randMask = randMask * multiplier + RAND_MAX; + } + bp[i] = randNum & 0xff; + randNum >>= 8; + randMask >>= 8; + } +} +#endif diff --git a/source/uds/random.h b/source/uds/random.h new file mode 100644 index 0000000..f5d2f49 --- /dev/null +++ b/source/uds/random.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/random.h#2 $ + */ + +#ifndef RANDOM_H +#define RANDOM_H + +#ifdef __KERNEL__ +#include +#else +#include +#endif + +#include "compiler.h" +#include "typeDefs.h" + +/** + * Get random unsigned integer in a given range + * + * @param lo Minimum unsigned integer value + * @param hi Maximum unsigned integer value + * + * @return unsigned integer in the interval [lo,hi] + **/ +unsigned int randomInRange(unsigned int lo, unsigned int hi); + +/** + * Special function wrapper required for compile-time assertions. This + * function will fail to compile if RAND_MAX is not of the form 2^n - 1. + **/ +void randomCompileTimeAssertions(void); + +/** + * Fill bytes with random data. + * + * @param ptr where to store bytes + * @param len number of bytes to write + **/ +#ifdef __KERNEL__ +static INLINE void fillRandomly(void *ptr, size_t len) +{ + prandom_bytes(ptr, len); +} +#else +void fillRandomly(void *ptr, size_t len); +#endif + +#ifdef __KERNEL__ +#define RAND_MAX 2147483647 + +/** + * Random number generator + * + * @return a random number in the rand 0 to RAND_MAX + **/ +static INLINE long random(void) +{ + long value; + fillRandomly(&value, sizeof(value)); + return value & RAND_MAX; +} +#endif + +#endif /* RANDOM_H */ diff --git a/source/uds/recordPage.c b/source/uds/recordPage.c new file mode 100644 index 0000000..f4c2572 --- /dev/null +++ b/source/uds/recordPage.c @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/recordPage.c#3 $ + */ + +#include "recordPage.h" + +#include "permassert.h" + +/**********************************************************************/ +static unsigned int encodeTree(byte recordPage[], + const UdsChunkRecord *sortedPointers[], + unsigned int nextRecord, + unsigned int node, + unsigned int nodeCount) +{ + if (node < nodeCount) { + unsigned int child = (2 * node) + 1; + nextRecord = encodeTree(recordPage, sortedPointers, nextRecord, + child, nodeCount); + + // In-order traversal: copy the contents of the next record + // into the page at the node offset. + memcpy(&recordPage[node * BYTES_PER_RECORD], + sortedPointers[nextRecord], + BYTES_PER_RECORD); + ++nextRecord; + + nextRecord = encodeTree(recordPage, sortedPointers, nextRecord, + child + 1, nodeCount); + } + return nextRecord; +} + +/**********************************************************************/ +int encodeRecordPage(const Volume *volume, + const UdsChunkRecord records[], + byte recordPage[]) +{ + unsigned int recordsPerPage = volume->geometry->recordsPerPage; + const UdsChunkRecord **recordPointers = volume->recordPointers; + + // Build an array of record pointers. We'll sort the pointers by the block + // names in the records, which is less work than sorting the record values. + unsigned int i; + for (i = 0; i < recordsPerPage; i++) { + recordPointers[i] = &records[i]; + } + + STATIC_ASSERT(offsetof(UdsChunkRecord, name) == 0); + int result = radixSort(volume->radixSorter, (const byte **) recordPointers, + recordsPerPage, UDS_CHUNK_NAME_SIZE); + if (result != UDS_SUCCESS) { + return result; + } + + // Use the sorted pointers to copy the records from the chapter to the + // record page in tree order. + encodeTree(recordPage, recordPointers, 0, 0, recordsPerPage); + return UDS_SUCCESS; +} + +/**********************************************************************/ +bool searchRecordPage(const byte recordPage[], + const UdsChunkName *name, + const Geometry *geometry, + UdsChunkData *metadata) +{ + // The record page is just an array of chunk records. + const UdsChunkRecord *records = (const UdsChunkRecord *) recordPage; + + // The array of records is sorted by name and stored as a binary tree in + // heap order, so the root of the tree is the first array element. + unsigned int node = 0; + while (node < geometry->recordsPerPage) { + const UdsChunkRecord *record = &records[node]; + int result = memcmp(name, &record->name, UDS_CHUNK_NAME_SIZE); + if (result == 0) { + if (metadata != NULL) { + *metadata = record->data; + } + return true; + } + // The children of node N are in the heap at indexes 2N+1 and 2N+2. + node = ((2 * node) + ((result < 0) ? 
1 : 2)); + } + return false; +} diff --git a/source/uds/recordPage.h b/source/uds/recordPage.h new file mode 100644 index 0000000..ecf9ddc --- /dev/null +++ b/source/uds/recordPage.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/recordPage.h#2 $ + */ + +#ifndef RECORDPAGE_H +#define RECORDPAGE_H 1 + +#include "common.h" +#include "volume.h" + +/** + * Generate the on-disk encoding of a record page from the list of records + * in the open chapter representation. + * + * @param volume The volume + * @param records The records to be encoded + * @param recordPage The record page + * + * @return UDS_SUCCESS or an error code + **/ +int encodeRecordPage(const Volume *volume, + const UdsChunkRecord records[], + byte recordPage[]); + +/** + * Find the metadata for a given block name in this page. + * + * @param recordPage The record page + * @param name The block name to look for + * @param geometry The geometry of the volume + * @param metadata an array in which to place the metadata of the + * record, if one was found + * + * @return true if the record was found + **/ +bool searchRecordPage(const byte recordPage[], + const UdsChunkName *name, + const Geometry *geometry, + UdsChunkData *metadata); + +#endif /* RECORDPAGE_H */ diff --git a/source/uds/regionIdentifiers.h b/source/uds/regionIdentifiers.h new file mode 100644 index 0000000..ff72b19 --- /dev/null +++ b/source/uds/regionIdentifiers.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/regionIdentifiers.h#1 $ + */ + +#ifndef REGION_IDENTIFIERS_H +#define REGION_IDENTIFIERS_H + +enum { + RH_TYPE_FREE = 0, // unused + RH_TYPE_SUPER = 1, + RH_TYPE_SAVE = 2, + RH_TYPE_CHECKPOINT = 3, + RH_TYPE_UNSAVED = 4, + + RL_KIND_SCRATCH = 0, // uninitialized or scrapped + RL_KIND_HEADER = 1, // for self-referential items + RL_KIND_CONFIG = 100, + RL_KIND_INDEX = 101, + RL_KIND_SEAL = 102, + RL_KIND_VOLUME = 201, + RL_KIND_SAVE = 202, + RL_KIND_INDEX_PAGE_MAP = 301, + RL_KIND_MASTER_INDEX = 302, + RL_KIND_OPEN_CHAPTER = 303, + RL_KIND_INDEX_STATE = 401, // not saved as region + + RL_SOLE_INSTANCE = 65535, +}; + +typedef unsigned int RegionType; +typedef unsigned int RegionKind; + +#endif // REGION_IDENTIFIERS_H diff --git a/source/uds/request.c b/source/uds/request.c new file mode 100644 index 0000000..c994181 --- /dev/null +++ b/source/uds/request.c @@ -0,0 +1,258 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/request.c#6 $ + */ + +#include "request.h" + +#include "indexRouter.h" +#include "indexSession.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "requestQueue.h" + +/**********************************************************************/ +int udsStartChunkOperation(UdsRequest *udsRequest) +{ + if (udsRequest->callback == NULL) { + return UDS_CALLBACK_REQUIRED; + } + switch (udsRequest->type) { + case UDS_DELETE: + case UDS_POST: + case UDS_QUERY: + case UDS_UPDATE: + break; + default: + return UDS_INVALID_OPERATION_TYPE; + } + memset(udsRequest->private, 0, sizeof(udsRequest->private)); + Request *request = (Request *)udsRequest; + + int result = getIndexSession(request->session); + if (result != UDS_SUCCESS) { + return sansUnrecoverable(result); + } + + request->found = false; + request->action = (RequestAction) request->type; + request->isControlMessage = false; + request->unbatched = false; + request->router = request->session->router; + + enqueueRequest(request, STAGE_TRIAGE); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int launchZoneControlMessage(RequestAction action, + ZoneMessage message, + unsigned int zone, + IndexRouter *router) +{ + Request *request; + int result = ALLOCATE(1, Request, __func__, &request); + if (result != UDS_SUCCESS) { + return result; + } + + request->router = router; + request->isControlMessage = true; + request->unbatched = true; + request->action = action; + request->zoneNumber = zone; + request->zoneMessage = message; + + enqueueRequest(request, STAGE_INDEX); + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeRequest(Request *request) +{ + if (request != NULL) { + FREE(request); + } 
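
As a rough illustration of how launchZoneControlMessage() above might be driven, here is a hedged sketch of announcing a sparse-cache barrier to every zone; announceBarrier is a hypothetical helper, and the index, router, zoneCount, and virtualChapter values are assumed to come from the caller rather than from this file:

static int announceBarrier(struct index  *index,
                           IndexRouter   *router,
                           unsigned int   zoneCount,
                           uint64_t       virtualChapter)
{
  ZoneMessage message = {
    .index = index,
    .data  = { .barrier = { .virtualChapter = virtualChapter } },
  };
  unsigned int zone;
  for (zone = 0; zone < zoneCount; zone++) {
    int result = launchZoneControlMessage(REQUEST_SPARSE_CACHE_BARRIER,
                                          message, zone, router);
    if (result != UDS_SUCCESS) {
      return result;
    }
  }
  return UDS_SUCCESS;
}
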
+} + +/**********************************************************************/ +static RequestQueue *getNextStageQueue(Request *request, + RequestStage nextStage) +{ + if (nextStage == STAGE_CALLBACK) { + return request->session->callbackQueue; + } + + // Local and remote index routers handle the rest of the pipeline + // differently, so delegate the choice of queue to the router. + return selectIndexRouterQueue(request->router, request, nextStage); +} + +/**********************************************************************/ +static void handleRequestErrors(Request *request) +{ + // XXX Use the router's callback function to hand back the error + // and clean up the request? (Possible thread issues doing that.) + + freeRequest(request); +} + +/**********************************************************************/ +void enqueueRequest(Request *request, RequestStage nextStage) +{ + RequestQueue *nextQueue = getNextStageQueue(request, nextStage); + if (nextQueue == NULL) { + handleRequestErrors(request); + return; + } + + requestQueueEnqueue(nextQueue, request); +} + +/* + * This function pointer allows unit test code to intercept the slow-lane + * requeuing of a request. + */ +static RequestRestarter requestRestarter = NULL; + +/**********************************************************************/ +void restartRequest(Request *request) +{ + request->requeued = true; + if (requestRestarter == NULL) { + enqueueRequest(request, STAGE_INDEX); + } else { + requestRestarter(request); + } +} + +/**********************************************************************/ +void setRequestRestarter(RequestRestarter restarter) +{ + requestRestarter = restarter; +} + +/**********************************************************************/ +static INLINE void increment_once(uint64_t *countPtr) +{ + WRITE_ONCE(*countPtr, READ_ONCE(*countPtr) + 1); +} + +/**********************************************************************/ +void updateRequestContextStats(Request *request) +{ + /* + * We don't need any synchronization since the context stats are only + * modified from the single callback thread. + * + * We increment either 2 or 3 counters in this method. + * + * XXX We always increment the "requests" counter. But there is no code + * that uses the value stored in this counter. + * + * We always increment exactly one of these counters (unless there is an + * error in the code, which never happens): + * postsFound postsNotFound + * updatesFound updatesNotFound + * deletionsFound deletionsNotFound + * queriesFound queriesNotFound + * + * XXX In the case of post request that were found in the index, we increment + * exactly one of these counters. But there is no code that uses the + * value stored in these counters. 
+ * inMemoryPostsFound + * densePostsFound + * sparsePostsFound + */ + + SessionStats *sessionStats = &request->session->stats; + + increment_once(&sessionStats->requests); + bool found = (request->location != LOC_UNAVAILABLE); + + switch (request->action) { + case REQUEST_INDEX: + if (found) { + increment_once(&sessionStats->postsFound); + + if (request->location == LOC_IN_OPEN_CHAPTER) { + increment_once(&sessionStats->postsFoundOpenChapter); + } else if (request->location == LOC_IN_DENSE) { + increment_once(&sessionStats->postsFoundDense); + } else if (request->location == LOC_IN_SPARSE) { + increment_once(&sessionStats->postsFoundSparse); + } + } else { + increment_once(&sessionStats->postsNotFound); + } + break; + + case REQUEST_UPDATE: + if (found) { + increment_once(&sessionStats->updatesFound); + } else { + increment_once(&sessionStats->updatesNotFound); + } + break; + + case REQUEST_DELETE: + if (found) { + increment_once(&sessionStats->deletionsFound); + } else { + increment_once(&sessionStats->deletionsNotFound); + } + break; + + case REQUEST_QUERY: + if (found) { + increment_once(&sessionStats->queriesFound); + } else { + increment_once(&sessionStats->queriesNotFound); + } + break; + + default: + request->status = ASSERT(false, "unknown next action in request: %d", + request->action); + } +} + +/**********************************************************************/ +void enterCallbackStage(Request *request) +{ + if (!request->isControlMessage) { + if (isUnrecoverable(request->status)) { + // Unrecoverable errors must disable the index session + disableIndexSession(request->session); + // The unrecoverable state is internal and must not sent to the client. + request->status = sansUnrecoverable(request->status); + } + + // Handle asynchronous client callbacks in the designated thread. + enqueueRequest(request, STAGE_CALLBACK); + } else { + /* + * Asynchronous control messages are complete when they are executed. + * There should be nothing they need to do on the callback thread. The + * message has been completely processed, so just free it. + */ + freeRequest(request); + } +} diff --git a/source/uds/request.h b/source/uds/request.h new file mode 100644 index 0000000..fb6250e --- /dev/null +++ b/source/uds/request.h @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/request.h#7 $ + */ + +#ifndef REQUEST_H +#define REQUEST_H + +#include "cacheCounters.h" +#include "common.h" +#include "compiler.h" +#include "opaqueTypes.h" +#include "threads.h" +#include "timeUtils.h" +#include "uds.h" +#include "util/funnelQueue.h" + +/** + * RequestAction values indicate what action, command, or query is to be + * performed when processing a Request instance. 
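
This direct aliasing is what lets udsStartChunkOperation() in request.c convert the public request type with a simple cast. A hypothetical helper making that explicit (not part of the patch):

static RequestAction actionForType(UdsCallbackType type)
{
  // Valid only because REQUEST_INDEX..REQUEST_QUERY alias the UDS_* values
  // in the enum below.
  return (RequestAction) type;
}
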
+ **/ +typedef enum { + // Map the API's UdsCallbackType values directly to a corresponding action. + REQUEST_INDEX = UDS_POST, + REQUEST_UPDATE = UDS_UPDATE, + REQUEST_DELETE = UDS_DELETE, + REQUEST_QUERY = UDS_QUERY, + + REQUEST_CONTROL, + + // REQUEST_SPARSE_CACHE_BARRIER is the action for the control request used + // by localIndexRouter. + REQUEST_SPARSE_CACHE_BARRIER, + + // REQUEST_ANNOUNCE_CHAPTER_CLOSED is the action for the control + // request used by an indexZone to signal the other zones that it + // has closed the current open chapter. + REQUEST_ANNOUNCE_CHAPTER_CLOSED, +} RequestAction; + +/** + * The block's rough location in the index, if any. + **/ +typedef enum { + /* the block doesn't exist or the location isn't available */ + LOC_UNAVAILABLE, + /* if the block was found in the open chapter */ + LOC_IN_OPEN_CHAPTER, + /* if the block was found in the dense part of the index */ + LOC_IN_DENSE, + /* if the block was found in the sparse part of the index */ + LOC_IN_SPARSE +} IndexRegion; + +/** + * Abstract request pipeline stages, which can also be viewed as stages in the + * life-cycle of a request. + **/ +typedef enum { + STAGE_TRIAGE, + STAGE_INDEX, + STAGE_CALLBACK, +} RequestStage; + +/** + * Control message fields for the barrier messages used to coordinate the + * addition of a chapter to the sparse chapter index cache. + **/ +typedef struct barrierMessageData { + /** virtual chapter number of the chapter index to add to the sparse cache */ + uint64_t virtualChapter; +} BarrierMessageData; + +/** + * Control message fields for the chapter closed messages used to inform + * lagging zones of the first zone to close a given open chapter. + **/ +typedef struct chapterClosedMessageData { + /** virtual chapter number of the chapter which was closed */ + uint64_t virtualChapter; +} ChapterClosedMessageData; + +/** + * Union of the all the zone control message fields. The RequestAction field + * (or launch function argument) selects which of the members is valid. + **/ +typedef union zoneMessageData { + BarrierMessageData barrier; // for REQUEST_SPARSE_CACHE_BARRIER + ChapterClosedMessageData chapterClosed; // for REQUEST_ANNOUNCE_CHAPTER_CLOSED +} ZoneMessageData; + +typedef struct zoneMessage { + /** the index to which the message is directed */ + struct index *index; + /** the message specific data */ + ZoneMessageData data; +} ZoneMessage; + +/** + * Request context for queuing throughout the uds pipeline + * + * XXX Note that the typedef for this struct defines "Request", and that this + * should therefore be "struct request". However, this conflicts with the + * Linux kernel which also has a "struct request". This is a workaround so + * that we can make upstreaming progress. The real solution is to expose + * this structure as the true "struct uds_request" and do a lot of + * renaming. + **/ +struct internalRequest { + /* + * The first part of this structure must be exactly parallel to the + * UdsRequest structure, which is part of the public UDS API. 
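
Because the public fields lead the structure, a caller only ever touches the UdsRequest view of it. A minimal usage sketch, assuming the usual UdsChunkCallback signature of void (UdsRequest *) and a caller who keeps the request allocated until the callback runs; postChunk and postFinished are illustrative names:

static void postFinished(UdsRequest *udsRequest)
{
  /* udsRequest->status and udsRequest->found are valid here. */
}

static int postChunk(struct uds_index_session *session,
                     UdsRequest               *request,
                     const UdsChunkName       *name,
                     const UdsChunkData       *metadata)
{
  memset(request, 0, sizeof(*request));
  request->callback    = postFinished;
  request->session     = session;
  request->type        = UDS_POST;
  request->chunkName   = *name;
  request->newMetadata = *metadata;
  return udsStartChunkOperation(request);
}
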
+ */ + UdsChunkName chunkName; // hash value + UdsChunkData oldMetadata; // metadata from index + UdsChunkData newMetadata; // metadata from request + UdsChunkCallback *callback; // callback method when complete + struct uds_index_session *session; // The public index session + UdsCallbackType type; // the type of request + int status; // success or error code for this request + bool found; // True if the block was found in index + bool update; // move record to newest chapter if found + + /* + * The remainder of this structure is private to the UDS implementation. + */ + FunnelQueueEntry requestQueueLink; // for lock-free request queue + Request *nextRequest; + IndexRouter *router; + + // Data for control message requests + ZoneMessage zoneMessage; + bool isControlMessage; + + bool unbatched; // if true, must wake worker when enqueued + bool requeued; + RequestAction action; // the action for the index to perform + unsigned int zoneNumber; // the zone for this request to use + IndexRegion location; // if and where the block was found + + bool slLocationKnown; // slow lane has determined a location + IndexRegion slLocation; // location determined by slowlane +}; + +typedef void (*RequestRestarter)(Request *); + +/** + * Make an asynchronous control message for an index zone and enqueue it for + * processing. + * + * @param action The control action to perform + * @param message The message to send + * @param zone The zone number of the zone to receive the message + * @param router The index router responsible for handling the message + * + * @return UDS_SUCCESS or an error code + **/ +int launchZoneControlMessage(RequestAction action, + ZoneMessage message, + unsigned int zone, + IndexRouter *router) + __attribute__((warn_unused_result)); + +/** + * Free an index request. + * + * @param request The request to free + **/ +void freeRequest(Request *request); + +/** + * Enqueue a request for the next stage of the pipeline. If there is more than + * one possible queue for a stage, this function uses the request to decide + * which queue should handle it. + * + * @param request The request to enqueue + * @param nextStage The next stage of the pipeline to process the request + **/ +void enqueueRequest(Request *request, RequestStage nextStage); + +/** + * A method to restart delayed requests. + * + * @param request The request to restart + **/ +void restartRequest(Request *request); + +/** + * Set the function pointer which is used to restart requests. + * This is needed by albserver code and is used as a test hook by the unit + * tests. + * + * @param restarter The function to call to restart requests. + **/ +void setRequestRestarter(RequestRestarter restarter); + +/** + * Enter the callback stage of processing for a request, notifying the waiting + * thread if the request is synchronous, freeing the request if it is an + * asynchronous control message, or placing it on the callback queue if it is + * an asynchronous client request. + * + * @param request the request which has completed execution + **/ +void enterCallbackStage(Request *request); + +/** + * Update the context statistics to reflect the successful completion of a + * client request. + * + * @param request a client request that has successfully completed execution + **/ +void updateRequestContextStats(Request *request); + +/** + * Compute the CacheProbeType value reflecting the request and page type. 
+ * + * @param request The request being processed, or NULL + * @param isIndexPage Whether the cache probe will be for an index page + * + * @return the cache probe type enumeration + **/ +static INLINE CacheProbeType cacheProbeType(Request *request, + bool isIndexPage) +{ + if ((request != NULL) && request->requeued) { + return isIndexPage ? CACHE_PROBE_INDEX_RETRY : CACHE_PROBE_RECORD_RETRY; + } else { + return isIndexPage ? CACHE_PROBE_INDEX_FIRST : CACHE_PROBE_RECORD_FIRST; + } +} +#endif /* REQUEST_H */ diff --git a/source/uds/requestQueue.h b/source/uds/requestQueue.h new file mode 100644 index 0000000..5bf7ef6 --- /dev/null +++ b/source/uds/requestQueue.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/requestQueue.h#1 $ + */ + +#ifndef REQUEST_QUEUE_H +#define REQUEST_QUEUE_H + +#include "opaqueTypes.h" +#include "typeDefs.h" + +/* void return value because this function will process its own errors */ +typedef void RequestQueueProcessor(Request *); + +/** + * Allocate a new request processing queue and start a worker thread to + * consume and service requests in the queue. + * + * @param queueName the name of the queue and the worker thread + * @param processOne the function the worker will invoke on each request + * @param queuePtr a pointer to receive the new queue + * + * @return UDS_SUCCESS or an error code + **/ +int makeRequestQueue(const char *queueName, + RequestQueueProcessor *processOne, + RequestQueue **queuePtr) + __attribute__((warn_unused_result)); + +/** + * Add a request to the end of the queue for processing by the worker thread. + * If the requeued flag is set on the request, it will be processed before + * any non-requeued requests under most circumstances. + * + * @param queue the request queue that should process the request + * @param request the request to be processed on the queue's worker thread + **/ +void requestQueueEnqueue(RequestQueue *queue, Request *request); + +/** + * Shut down the request queue worker thread, then destroy and free the queue. + * + * @param queue the queue to shut down and free + **/ +void requestQueueFinish(RequestQueue *queue); + +#endif /* REQUEST_QUEUE_H */ diff --git a/source/uds/requestQueueKernel.c b/source/uds/requestQueueKernel.c new file mode 100644 index 0000000..a53ff12 --- /dev/null +++ b/source/uds/requestQueueKernel.c @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
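
A minimal usage sketch of the requestQueue.h interface above; exampleWorker and the "exampleW" thread name are illustrative stand-ins, and the processor returns void because it is expected to handle its own errors:

static void exampleWorker(Request *request)
{
  // ... service the request, then hand it to the next pipeline stage ...
  enterCallbackStage(request);
}

static int startExampleQueue(RequestQueue **queuePtr)
{
  int result = makeRequestQueue("exampleW", exampleWorker, queuePtr);
  if (result != UDS_SUCCESS) {
    return result;
  }
  // Requests are later added with requestQueueEnqueue(*queuePtr, request),
  // and the queue is torn down with requestQueueFinish(*queuePtr).
  return UDS_SUCCESS;
}
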
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/requestQueueKernel.c#3 $ + */ + +#include "requestQueue.h" + +#include + +#include "atomicDefs.h" +#include "compiler.h" +#include "logger.h" +#include "request.h" +#include "memoryAlloc.h" +#include "threads.h" +#include "util/funnelQueue.h" + +/* + * Ordering: + * + * Multiple retry requests or multiple non-retry requests enqueued from + * a single producer thread will be processed in the order enqueued. + * + * Retry requests will generally be processed before normal requests. + * + * HOWEVER, a producer thread can enqueue a retry request (generally given + * higher priority) and then enqueue a normal request, and they can get + * processed in the reverse order. The checking of the two internal queues is + * very simple and there's a potential race with the producer regarding the + * "priority" handling. If an ordering guarantee is needed, it can be added + * without much difficulty, it just makes the code a bit more complicated. + * + * If requests are enqueued while the processing of another request is + * happening, and the enqueuing operations complete while the request + * processing is still in progress, then the retry request(s) *will* + * get processed next. (This is used for testing.) + */ + +/** + * Time constants, all in units of nanoseconds. + **/ +enum { + ONE_NANOSECOND = 1, + ONE_MICROSECOND = 1000 * ONE_NANOSECOND, + ONE_MILLISECOND = 1000 * ONE_MICROSECOND, + ONE_SECOND = 1000 * ONE_MILLISECOND, + + /** The initial time to wait after waiting with no timeout */ + DEFAULT_WAIT_TIME = 20 * ONE_MICROSECOND, + + /** The minimum time to wait when waiting with a timeout */ + MINIMUM_WAIT_TIME = DEFAULT_WAIT_TIME / 2, + + /** The maximimum time to wait when waiting with a timeout */ + MAXIMUM_WAIT_TIME = ONE_MILLISECOND +}; + +/** + * Batch size tuning constants. These are compared to the number of requests + * that have been processed since the worker thread last woke up. + **/ +enum { + MINIMUM_BATCH = 32, // wait time increases if batches are smaller than this + MAXIMUM_BATCH = 64 // wait time decreases if batches are larger than this +}; + +struct requestQueue { + /* Wait queue for synchronizing producers and consumer */ + struct wait_queue_head wqhead; + /* function to process 1 request */ + RequestQueueProcessor *processOne; + /* new incoming requests */ + FunnelQueue *mainQueue; + /* old requests to retry first */ + FunnelQueue *retryQueue; + /* thread id of the worker thread */ + Thread thread; + /* true if the worker was started */ + bool started; + /* when true, requests can be enqueued */ + bool alive; + /* A flag set when the worker is waiting without a timeout */ + atomic_t dormant; +}; + +/*****************************************************************************/ +/** + * Poll the underlying lock-free queues for a request to process. Must only be + * called by the worker thread. 
+ * + * @param queue the RequestQueue being serviced + * + * @return a dequeued request, or NULL if no request was available + **/ +static INLINE Request *pollQueues(RequestQueue *queue) +{ + // The retry queue has higher priority. + FunnelQueueEntry *entry = funnelQueuePoll(queue->retryQueue); + if (entry != NULL) { + return container_of(entry, Request, requestQueueLink); + } + + // The main queue has lower priority. + entry = funnelQueuePoll(queue->mainQueue); + if (entry != NULL) { + return container_of(entry, Request, requestQueueLink); + } + + // No entry found. + return NULL; +} + +/*****************************************************************************/ +/** + * Check if the underlying lock-free queues appear not just not to have any + * requests available right now, but also not to be in the intermediate state + * of getting requests added. Must only be called by the worker thread. + * + * @param queue the RequestQueue being serviced + * + * @return true iff both funnel queues are idle + **/ +static INLINE bool areQueuesIdle(RequestQueue *queue) +{ + return (isFunnelQueueIdle(queue->retryQueue) && + isFunnelQueueIdle(queue->mainQueue)); +} + +/*****************************************************************************/ +/** + * Remove the next request to be processed from the queue. Must only be called + * by the worker thread. + * + * @param queue the queue from which to remove an entry + * @param requestPtr the next request is returned here, or a NULL pointer to + * indicate that there will be no more requests + * @param waitedPtr return a boolean to indicate that we need to wait + * + * @return True when there is a next request, or when we know that there will + * never be another request. False when we must wait for a request. + **/ +static INLINE bool dequeueRequest(RequestQueue *queue, + Request **requestPtr, + bool *waitedPtr) +{ + // Because of batching, we expect this to be the most common code path. + Request *request = pollQueues(queue); + if (request != NULL) { + // Return because we found a request + *requestPtr = request; + return true; + } + + if (!READ_ONCE(queue->alive)) { + // Return because we see that shutdown is happening + *requestPtr = NULL; + return true; + } + + // Return indicating that we need to wait. + *requestPtr = NULL; + *waitedPtr = true; + return false; +} + +/*****************************************************************************/ +static void requestQueueWorker(void *arg) +{ + RequestQueue *queue = (RequestQueue *) arg; + unsigned long timeBatch = DEFAULT_WAIT_TIME; + bool dormant = atomic_read(&queue->dormant); + long currentBatch = 0; + + for (;;) { + Request *request; + bool waited = false; + if (dormant) { + /* + * Sleep/wakeup protocol: + * + * The enqueue operation updates "newest" in the + * funnel queue via xchg which is a memory barrier, + * and later checks "dormant" to decide whether to do + * a wakeup of the worker thread. + * + * The worker thread, when deciding to go to sleep, + * sets "dormant" and then examines "newest" to decide + * if the funnel queue is idle. In dormant mode, the + * last examination of "newest" before going to sleep + * is done inside the wait_event_interruptible macro, + * after a point where (one or more) memory barriers + * have been issued. (Preparing to sleep uses spin + * locks.) Even if the "next" field update isn't + * visible yet to make the entry accessible, its + * existence will kick the worker thread out of + * dormant mode and back into timer-based mode. 
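
That handshake can be condensed to the following sketch, which uses the same helpers as this file; producerSide() and workerGoesDormant() are illustrative names only, and the unbatched case and the timeout bookkeeping are omitted:

static void producerSide(RequestQueue *queue, FunnelQueueEntry *entry)
{
  funnelQueuePut(queue->mainQueue, entry);   // the xchg inside acts as a fence
  if (atomic_read(&queue->dormant)) {
    smp_mb();
    if (waitqueue_active(&queue->wqhead)) {
      wake_up(&queue->wqhead);               // rouse the dormant worker
    }
  }
}

static void workerGoesDormant(RequestQueue *queue)
{
  atomic_set(&queue->dormant, true);         // advertise: wake me on enqueue
  wait_event_interruptible(queue->wqhead,
                           !areQueuesIdle(queue) || !READ_ONCE(queue->alive));
}
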
+ * + * So the two threads should agree on the ordering of + * the updating of the two fields. + */ + wait_event_interruptible(queue->wqhead, + dequeueRequest(queue, &request, &waited) || + !areQueuesIdle(queue)); + } else { + wait_event_interruptible_hrtimeout(queue->wqhead, + dequeueRequest(queue, &request, + &waited), + ns_to_ktime(timeBatch)); + } + + if (likely(request != NULL)) { + // We got a request. + currentBatch++; + queue->processOne(request); + } else if (!READ_ONCE(queue->alive)) { + // We got no request and we know we are shutting down. + break; + } + + if (dormant) { + // We've been roused from dormancy. Clear the flag so enqueuers can stop + // broadcasting (no fence needed for this transition). + atomic_set(&queue->dormant, false); + dormant = false; + // Reset the timeout back to the default since we don't know how long + // we've been asleep and we also want to be responsive to a new burst. + timeBatch = DEFAULT_WAIT_TIME; + } else if (waited) { + // We waited for this request to show up. Adjust the wait time if the + // last batch of requests was too small or too large.. + if (currentBatch < MINIMUM_BATCH) { + // Adjust the wait time if the last batch of requests was too small. + timeBatch += timeBatch / 4; + if (timeBatch >= MAXIMUM_WAIT_TIME) { + // The timeout is getting long enough that we need to switch into + // dormant mode. + atomic_set(&queue->dormant, true); + dormant = true; + } + } else if (currentBatch > MAXIMUM_BATCH) { + // Adjust the wait time if the last batch of requests was too large. + timeBatch -= timeBatch / 4; + if (timeBatch < MINIMUM_WAIT_TIME) { + // But if the producer is very fast or the scheduler doesn't wake up + // up promptly, waiting for very short times won't make the batches + // smaller. + timeBatch = MINIMUM_WAIT_TIME; + } + } + // And we must now start a new batch count + currentBatch = 0; + } + } + + /* + * Ensure that we see any requests that were guaranteed to have been fully + * enqueued before shutdown was flagged. The corresponding write barrier + * is in requestQueueFinish. + */ + smp_rmb(); + + // Process every request that is still in the queue, and never wait for any + // new requests to show up. 
+ for (;;) { + Request *request = pollQueues(queue); + if (request == NULL) { + break; + } + queue->processOne(request); + } +} + +/**********************************************************************/ +int makeRequestQueue(const char *queueName, + RequestQueueProcessor *processOne, + RequestQueue **queuePtr) +{ + RequestQueue *queue; + int result = ALLOCATE(1, RequestQueue, __func__, &queue); + if (result != UDS_SUCCESS) { + return result; + } + queue->processOne = processOne; + queue->alive = true; + atomic_set(&queue->dormant, false); + init_waitqueue_head(&queue->wqhead); + + result = makeFunnelQueue(&queue->mainQueue); + if (result != UDS_SUCCESS) { + requestQueueFinish(queue); + return result; + } + + result = makeFunnelQueue(&queue->retryQueue); + if (result != UDS_SUCCESS) { + requestQueueFinish(queue); + return result; + } + + result = createThread(requestQueueWorker, queue, queueName, &queue->thread); + if (result != UDS_SUCCESS) { + requestQueueFinish(queue); + return result; + } + + queue->started = true; + smp_mb(); + *queuePtr = queue; + return UDS_SUCCESS; +} + +/**********************************************************************/ +static INLINE void wakeUpWorker(RequestQueue *queue) +{ + // This is the code sequence recommended in + smp_mb(); + if (waitqueue_active(&queue->wqhead)) { + wake_up(&queue->wqhead); + } +} + +/**********************************************************************/ +void requestQueueEnqueue(RequestQueue *queue, Request *request) +{ + bool unbatched = request->unbatched; + funnelQueuePut(request->requeued ? queue->retryQueue : queue->mainQueue, + &request->requestQueueLink); + + /* + * We must wake the worker thread when it is dormant (waiting with no + * timeout). An atomic load (read fence) isn't needed here since we know the + * queue operation acts as one. + */ + if (atomic_read(&queue->dormant) || unbatched) { + wakeUpWorker(queue); + } +} + +/**********************************************************************/ +void requestQueueFinish(RequestQueue *queue) +{ + if (queue == NULL) { + return; + } + + /* + * This memory barrier ensures that any requests we queued will be seen. The + * point is that when dequeueRequest sees the following update to the alive + * flag, it will also be able to see any change we made to a next field in + * the FunnelQueue entry. The corresponding read barrier is in + * requestQueueWorker. + */ + smp_wmb(); + + // Mark the queue as dead. + WRITE_ONCE(queue->alive, false); + + if (queue->started) { + // Wake the worker so it notices that it should exit. + wakeUpWorker(queue); + + // Wait for the worker thread to finish processing any additional pending + // work and exit. + int result = joinThreads(queue->thread); + if (result != UDS_SUCCESS) { + logWarningWithStringError(result, "Failed to join worker thread"); + } + } + + freeFunnelQueue(queue->mainQueue); + freeFunnelQueue(queue->retryQueue); + FREE(queue); +} diff --git a/source/uds/searchList.c b/source/uds/searchList.c new file mode 100644 index 0000000..ec2ef70 --- /dev/null +++ b/source/uds/searchList.c @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/searchList.c#2 $ + */ + +#include "searchList.h" + +#include "errors.h" +#include "logger.h" +#include "memoryAlloc.h" + +/**********************************************************************/ +int makeSearchList(unsigned int capacity, + SearchList **listPtr) +{ + if (capacity == 0) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "search list must have entries"); + } + if (capacity > UINT8_MAX) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "search list capacity must fit in 8 bits"); + } + + // We need three temporary entry arrays for purgeSearchList(). Allocate them + // contiguously with the main array. + unsigned int bytes = (sizeof(SearchList) + (4 * capacity * sizeof(uint8_t))); + SearchList *list; + int result = allocateCacheAligned(bytes, "search list", &list); + if (result != UDS_SUCCESS) { + return result; + } + + list->capacity = capacity; + list->firstDeadEntry = 0; + + // Fill in the indexes of the chapter index cache entries. These will be + // only ever be permuted as the search list is used. + uint8_t i; + for (i = 0; i < capacity; i++) { + list->entries[i] = i; + } + + *listPtr = list; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeSearchList(SearchList **listPtr) +{ + FREE(*listPtr); + *listPtr = NULL; +} + +/**********************************************************************/ +void purgeSearchList(SearchList *searchList, + const CachedChapterIndex chapters[], + uint64_t oldestVirtualChapter) +{ + if (searchList->firstDeadEntry == 0) { + // There are no live entries in the list to purge. + return; + } + + /* + * Partition the previously-alive entries in the list into three temporary + * lists, keeping the current LRU search order within each list. The element + * array was allocated with enough space for all four lists. + */ + uint8_t *entries = &searchList->entries[0]; + uint8_t *alive = &entries[searchList->capacity]; + uint8_t *skipped = &alive[searchList->capacity]; + uint8_t *dead = &skipped[searchList->capacity]; + unsigned int nextAlive = 0; + unsigned int nextSkipped = 0; + unsigned int nextDead = 0; + + int i; + for (i = 0; i < searchList->firstDeadEntry; i++) { + uint8_t entry = entries[i]; + const CachedChapterIndex *chapter = &chapters[entry]; + if ((chapter->virtualChapter < oldestVirtualChapter) + || (chapter->virtualChapter == UINT64_MAX)) { + dead[nextDead++] = entry; + } else if (chapter->skipSearch) { + skipped[nextSkipped++] = entry; + } else { + alive[nextAlive++] = entry; + } + } + + // Copy the temporary lists back to the search list so we wind up with + // [ alive, alive, skippable, new-dead, new-dead, old-dead, old-dead ] + memcpy(entries, alive, nextAlive); + entries += nextAlive; + + memcpy(entries, skipped, nextSkipped); + entries += nextSkipped; + + memcpy(entries, dead, nextDead); + // The first dead entry is now the start of the copied dead list. 
+ searchList->firstDeadEntry = (nextAlive + nextSkipped); +} diff --git a/source/uds/searchList.h b/source/uds/searchList.h new file mode 100644 index 0000000..25d99e9 --- /dev/null +++ b/source/uds/searchList.h @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/searchList.h#1 $ + */ + +#ifndef SEARCH_LIST_H +#define SEARCH_LIST_H + +#include "cachedChapterIndex.h" +#include "compiler.h" +#include "stringUtils.h" +#include "typeDefs.h" + +/** + * A SearchList represents the permutations of the sparse chapter index cache + * entry array. Those permutations express an ordering on the chapter indexes, + * from most recently accessed to least recently accessed, which is the order + * in which the indexes should be searched and the reverse order in which they + * should be evicted from the cache (LRU cache replacement policy). + * + * Cache entries that are dead (virtualChapter == UINT64_MAX) are kept as a + * suffix of the list, avoiding the need to even iterate over them to search, + * and ensuring that dead entries are replaced before any live entries are + * evicted. + * + * The search list is intended to be instantated for each zone thread, + * avoiding any need for synchronization. The structure is allocated on a + * cache boundary to avoid false sharing of memory cache lines between zone + * threads. + **/ +typedef struct searchList { + /** The number of cached chapter indexes and search list entries */ + uint8_t capacity; + + /** The index in the entries array of the first dead cache entry */ + uint8_t firstDeadEntry; + + /** The chapter array indexes representing the chapter search order */ + uint8_t entries[]; +} SearchList; + +/** + * SearchListIterator captures the fields needed to iterate over the live + * entries in a search list and return the CachedChapterIndex pointers that + * the search code actually wants to deal with. + **/ +typedef struct { + /** The search list defining the chapter search iteration order */ + SearchList *list; + + /** The index of the next entry to return from the search list */ + unsigned int nextEntry; + + /** The cached chapters that are referenced by the search list */ + CachedChapterIndex *chapters; +} SearchListIterator; + +/** + * Allocate and initialize a new chapter cache search list with the same + * capacity as the cache. The index of each entry in the cache will appear + * exactly once in the array. All the chapters in the cache are assumed to be + * initially dead, so firstDeadEntry will be zero and no chapters will be + * returned when the search list is iterated. 
+ * + * @param [in] capacity the number of entries in the search list + * @param [out] listPtr a pointer in which to return the new search list + **/ +int makeSearchList(unsigned int capacity, + SearchList **listPtr) + __attribute__((warn_unused_result)); + +/** + * Free a search list and null out the reference to it. + * + * @param listPtr the reference to the search list to free + **/ +void freeSearchList(SearchList **listPtr); + +/** + * Copy the contents of one search list to another. + * + * @param source the list to copy + * @param target the list to replace + **/ +static INLINE void copySearchList(const SearchList *source, + SearchList *target) +{ + *target = *source; + memcpy(target->entries, source->entries, source->capacity); +} + +/** + * Prepare to iterate over the live cache entries a search list. + * + * @param list the list defining the live chapters and the search order + * @param chapters the chapter index entries to return from getNextChapter() + * + * @return an iterator positioned at the start of the search list + **/ +static INLINE SearchListIterator +iterateSearchList(SearchList *list, CachedChapterIndex chapters[]) +{ + SearchListIterator iterator = { + .list = list, + .nextEntry = 0, + .chapters = chapters, + }; + return iterator; +} + +/** + * Check if the search list iterator has another entry to return. + * + * @param iterator the search list iterator + * + * @return true if getNextChapter() may be called + **/ +static INLINE bool hasNextChapter(const SearchListIterator *iterator) +{ + return (iterator->nextEntry < iterator->list->firstDeadEntry); +} + +/** + * Return a pointer to the next live chapter in the search list iteration and + * advance the iterator. This must only be called when hasNextChapter() + * returns true. + * + * @param iterator the search list iterator + * + * @return a pointer to the next live chapter index in the search list order + **/ +static INLINE CachedChapterIndex *getNextChapter(SearchListIterator *iterator) +{ + return &iterator->chapters[iterator->list->entries[iterator->nextEntry++]]; +} + +/** + * Rotate the pointers in a prefix of a search list downwards by one item, + * pushing elements deeper into the list and moving a new chapter to the start + * of the search list. This is the "make most recent" operation on the search + * list. + * + * If the search list provided is [ 0 1 2 3 4 ] and the prefix + * length is 4, then 3 is being moved to the front. + * The search list after the call will be [ 3 0 1 2 4 ] and the + * function will return 3. + * + * @param searchList the chapter index search list to rotate + * @param prefixLength the length of the prefix of the list to rotate + * + * @return the array index of the chapter cache entry that is now at the front + * of the search list + **/ +static INLINE uint8_t rotateSearchList(SearchList *searchList, + uint8_t prefixLength) +{ + // Grab the value of the last entry in the list prefix. + uint8_t mostRecent = searchList->entries[prefixLength - 1]; + + if (prefixLength > 1) { + // Push the first N-1 entries down by one entry, overwriting the entry + // we just grabbed. + memmove(&searchList->entries[1], + &searchList->entries[0], + prefixLength - 1); + + // We now have a hole at the front of the list in which we can place the + // rotated entry. + searchList->entries[0] = mostRecent; + } + + // This function is also used to move a dead chapter to the front of the + // list, in which case the suffix of dead chapters was pushed down too. 
+ if (searchList->firstDeadEntry < prefixLength) { + searchList->firstDeadEntry += 1; + } + + return mostRecent; +} + +/** + * Purge invalid cache entries, marking them as dead and moving them to the + * end of the search list, then push any chapters that have skipSearch set + * down so they follow all the remaining live, valid chapters in the search + * list. This effectively sorts the search list into three regions--active, + * skippable, and dead--while maintaining the LRU ordering that already + * existed (a stable sort). + * + * This operation must only be called during the critical section in + * updateSparseCache() since it effectively changes cache membership. + * + * @param searchList the chapter index search list to purge + * @param chapters the chapter index cache entries + * @param oldestVirtualChapter the oldest virtual chapter + **/ +void purgeSearchList(SearchList *searchList, + const CachedChapterIndex chapters[], + uint64_t oldestVirtualChapter); + +#endif /* SEARCH_LIST_H */ diff --git a/source/uds/sparseCache.c b/source/uds/sparseCache.c new file mode 100644 index 0000000..f816d12 --- /dev/null +++ b/source/uds/sparseCache.c @@ -0,0 +1,535 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/sparseCache.c#3 $ + */ + +/** + * The sparse chapter index cache is implemented as a simple array of cache + * entries. Since the cache is small (seven chapters by default), searching + * for a specific virtual chapter is implemented as a linear search. The cache + * replacement policy is least-recently-used (LRU). Again, size of the cache + * allows the LRU order to be maintained by shifting entries in an array list. + * + * The most important property of this cache is the absence of synchronization + * for read operations. Safe concurrent access to the cache by the zone + * threads is controlled by the triage queue and the barrier requests it + * issues to the zone queues. The set of cached chapters does not and must not + * change between the carefully coordinated calls to updateSparseCache() from + * the zone threads. + * + * The critical invariant for that coordination is the cache membership must + * not change between those updates; the calls to sparseCacheContains() from + * the zone threads must all receive the same results for any virtual chapter + * number. To ensure that critical invariant, state changes such as "that + * virtual chapter is no longer in the volume" and "skip searching that + * chapter because it has had too many cache misses" are represented + * separately from the cache membership information (the virtual chapter + * number). 
+ * + * As a result of this invariant, we have the guarantee that every zone thread + * will call updateSparseCache() once and exactly once to request a chapter + * that is not in the cache, and the serialization of the barrier requests + * from the triage queue ensures they will all request the same chapter + * number. This means the only synchronization we need can be provided by a + * pair of thread barriers used only in the updateSparseCache() call, + * providing a critical section where a single zone thread can drive the cache + * update while all the other zone threads are known to be blocked, waiting in + * the second barrier. Outside that critical section, all the zone threads + * implicitly hold a shared lock. Inside it, the "captain" (the thread that + * was uniquely flagged when passing through the first barrier) holds an + * exclusive lock. No other threads may access or modify the cache, except for + * accessing cache statistics and similar queries. + * + * Cache statistics must only be modified by a single thread, conventionally + * the zone zero thread. All fields that might be frequently updated by that + * thread are kept in separate cache-aligned structures so they will not cause + * cache contention via "false sharing" with the fields that are frequently + * accessed by all of the zone threads. + * + * LRU order is kept independently by each zone thread, and each zone uses its + * own list for searching and cache membership queries. The zone zero list is + * used to decide which chapter to evict when the cache is updated, and its + * search list is copied to the other threads at that time. + * + * The virtual chapter number field of the cache entry is the single field + * indicating whether a chapter is a member of the cache or not. The value + * UINT64_MAX is used to represent a null, undefined, or wildcard + * chapter number. When present in the virtual chapter number field + * CachedChapterIndex, it indicates that the cache entry is dead, and all + * the other fields of that entry (other than immutable pointers to cache + * memory) are undefined and irrelevant. Any cache entry that is not marked as + * dead is fully defined and a member of the cache--sparseCacheContains() + * must always return true for any virtual chapter number that appears in any + * of the cache entries. + * + * A chapter index that is a member of the cache may be marked for different + * treatment (disabling search) between calls to updateSparseCache() in two + * different ways. When a chapter falls off the end of the volume, its virtual + * chapter number will be less that the oldest virtual chapter number. Since + * that chapter is no longer part of the volume, there's no point in continuing + * to search that chapter index. Once invalidated, that virtual chapter will + * still be considered a member of the cache, but it will no longer be searched + * for matching chunk names. + * + * The second mechanism for disabling search is the heuristic based on keeping + * track of the number of consecutive search misses in a given chapter index. + * Once that count exceeds a threshold, the skipSearch flag will be set to + * true, causing the chapter to be skipped in the fallback search of the + * entire cache, but still allowing it to be found when searching for a hook + * in that specific chapter. Finding a hook will clear the skipSearch flag, + * once again allowing the non-hook searches to use the cache entry. 
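
The two-barrier critical section described above reduces to the following sketch; coordinatedCacheUpdate, isZoneZero, and updateFn are illustrative stand-ins for the real logic in updateSparseCache():

static int coordinatedCacheUpdate(SparseCache *cache,
                                  bool         isZoneZero,
                                  int        (*updateFn)(SparseCache *cache))
{
  int result = UDS_SUCCESS;

  // Every zone thread must arrive before anyone touches the shared cache.
  enterBarrier(&cache->beginCacheUpdate, NULL);

  if (isZoneZero) {
    // Only the "captain" modifies the cache state.
    result = updateFn(cache);
  }

  // The other zone threads wait here until the captain has finished.
  enterBarrier(&cache->endCacheUpdate, NULL);
  return result;
}
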
Again, + * regardless of the state of the skipSearch flag, the virtual chapter must + * still considered to be a member of the cache for sparseCacheContains(). + * + * Barrier requests and the sparse chapter index cache are also described in + * + * https://intranet.permabit.com/wiki/Chapter_Index_Cache_supports_concurrent_access + * + * and in a message to the albireo mailing list on 5/28/2011 titled "true + * barriers with a hook resolution queue". + **/ + +#include "sparseCache.h" + +#include "cachedChapterIndex.h" +#include "chapterIndex.h" +#include "common.h" +#include "index.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "searchList.h" +#include "threads.h" +#include "zone.h" + +enum { + /** The number of consecutive search misses that will disable searching */ + SKIP_SEARCH_THRESHOLD = 20000, + + /** a named constant to use when identifying zone zero */ + ZONE_ZERO = 0 +}; + +/** + * These counter values are essentially fields of the SparseCache, but are + * segregated into this structure because they are frequently modified. We + * group them and align them to keep them on different cache lines from the + * cache fields that are accessed far more often than they are updated. + **/ +typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) sparseCacheCounters { + /** the total number of virtual chapter probes that succeeded */ + uint64_t chapterHits; + + /** the total number of virtual chapter probes that failed */ + uint64_t chapterMisses; + + /** the total number of cache searches that found a possible match */ + uint64_t searchHits; + + /** the total number of cache searches that found no matches */ + uint64_t searchMisses; + + /** the number of cache entries that fell off the end of the volume */ + uint64_t invalidations; + + /** the number of cache entries that were evicted while still valid */ + uint64_t evictions; +} SparseCacheCounters; + +/** + * This is the private structure definition of a SparseCache. + **/ +struct sparseCache { + /** the number of cache entries, which is the size of the chapters array */ + unsigned int capacity; + + /** the number of zone threads using the cache */ + unsigned int zoneCount; + + /** the geometry governing the volume */ + const Geometry *geometry; + + /** the number of search misses in zone zero that will disable searching */ + unsigned int skipSearchThreshold; + + /** pointers to the cache-aligned chapter search order for each zone */ + SearchList *searchLists[MAX_ZONES]; + + /** the thread barriers used to synchronize the zone threads for update */ + Barrier beginCacheUpdate; + Barrier endCacheUpdate; + + /** frequently-updated counter fields (cache-aligned) */ + SparseCacheCounters counters; + + /** the counted array of chapter index cache entries (cache-aligned) */ + CachedChapterIndex chapters[]; +}; + +/** + * Initialize a sparse chapter index cache. 
+ * + * @param cache the sparse cache to initialize + * @param geometry the geometry governing the volume + * @param capacity the number of chapters the cache will hold + * @param zoneCount the number of zone threads using the cache + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int initializeSparseCache(SparseCache *cache, + const Geometry *geometry, + unsigned int capacity, + unsigned int zoneCount) +{ + cache->geometry = geometry; + cache->capacity = capacity; + cache->zoneCount = zoneCount; + + // Scale down the skip threshold by the number of zones since we count the + // chapter search misses only in zone zero. + cache->skipSearchThreshold = (SKIP_SEARCH_THRESHOLD / zoneCount); + + int result = initializeBarrier(&cache->beginCacheUpdate, zoneCount); + if (result != UDS_SUCCESS) { + return result; + } + result = initializeBarrier(&cache->endCacheUpdate, zoneCount); + if (result != UDS_SUCCESS) { + return result; + } + unsigned int i; + for (i = 0; i < capacity; i++) { + result = initializeCachedChapterIndex(&cache->chapters[i], geometry); + if (result != UDS_SUCCESS) { + return result; + } + } + + // Allocate each zone's independent LRU order. + for (i = 0; i < zoneCount; i++) { + result = makeSearchList(capacity, &cache->searchLists[i]); + if (result != UDS_SUCCESS) { + return result; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int makeSparseCache(const Geometry *geometry, + unsigned int capacity, + unsigned int zoneCount, + SparseCache **cachePtr) +{ + unsigned int bytes + = (sizeof(SparseCache) + (capacity * sizeof(CachedChapterIndex))); + + SparseCache *cache; + int result = allocateCacheAligned(bytes, "sparse cache", &cache); + if (result != UDS_SUCCESS) { + return result; + } + + result = initializeSparseCache(cache, geometry, capacity, zoneCount); + if (result != UDS_SUCCESS) { + freeSparseCache(cache); + return result; + } + + *cachePtr = cache; + return UDS_SUCCESS; +} + +/**********************************************************************/ +size_t getSparseCacheMemorySize(const SparseCache *cache) +{ + // Count the DeltaIndexPage as cache memory, but ignore all other overhead. + size_t pageSize = (sizeof(DeltaIndexPage) + cache->geometry->bytesPerPage); + size_t chapterSize = (pageSize * cache->geometry->indexPagesPerChapter); + return (cache->capacity * chapterSize); +} + +/** + * Update counters to reflect a chapter access hit and clear the skipSearch + * flag on the chapter, if set. + * + * @param cache the cache to update + * @param chapter the cache entry to update + **/ +static void scoreChapterHit(SparseCache *cache, + CachedChapterIndex *chapter) +{ + cache->counters.chapterHits += 1; + setSkipSearch(chapter, false); +} + +/** + * Update counters to reflect a chapter access miss. + * + * @param cache the cache to update + **/ +static void scoreChapterMiss(SparseCache *cache) +{ + cache->counters.chapterMisses += 1; +} + +/** + * Check if the cache entry that is about to be replaced is already dead, and + * if it's not, add to tally of evicted or invalidated cache entries. 
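
As a worked example of getSparseCacheMemorySize() above, using purely hypothetical numbers (the real values come from the volume's Geometry): with 4096-byte pages, 6 index pages per chapter, and a 7-chapter cache,

  pageSize    = sizeof(DeltaIndexPage) + 4096
  chapterSize = pageSize * 6
  total       = chapterSize * 7

which is roughly 172 kB of cached page data plus the per-page DeltaIndexPage overhead.
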
+ * + * @param zone the zone used to find the oldest chapter + * @param cache the cache to update + * @param chapter the cache entry about to be replaced + **/ +static void scoreEviction(IndexZone *zone, + SparseCache *cache, + CachedChapterIndex *chapter) +{ + if (chapter->virtualChapter == UINT64_MAX) { + return; + } + if (chapter->virtualChapter < zone->oldestVirtualChapter) { + cache->counters.invalidations += 1; + } else { + cache->counters.evictions += 1; + } +} + +/** + * Update counters to reflect a cache search hit. This bumps the hit + * count, clears the miss count, and clears the skipSearch flag. + * + * @param cache the cache to update + * @param chapter the cache entry to update + **/ +static void scoreSearchHit(SparseCache *cache, + CachedChapterIndex *chapter) +{ + cache->counters.searchHits += 1; + chapter->counters.searchHits += 1; + chapter->counters.consecutiveMisses = 0; + setSkipSearch(chapter, false); +} + +/** + * Update counters to reflect a cache search miss. This bumps the consecutive + * miss count, and if it goes over skipSearchThreshold, sets the skipSearch + * flag on the chapter. + * + * @param cache the cache to update + * @param chapter the cache entry to update + **/ +static void scoreSearchMiss(SparseCache *cache, + CachedChapterIndex *chapter) +{ + cache->counters.searchMisses += 1; + chapter->counters.searchMisses += 1; + chapter->counters.consecutiveMisses += 1; + if (chapter->counters.consecutiveMisses > cache->skipSearchThreshold) { + setSkipSearch(chapter, true); + } +} + +/**********************************************************************/ +void freeSparseCache(SparseCache *cache) +{ + if (cache == NULL) { + return; + } + + unsigned int i; + for (i = 0; i < cache->zoneCount; i++) { + freeSearchList(&cache->searchLists[i]); + } + + for (i = 0; i < cache->capacity; i++) { + CachedChapterIndex *chapter = &cache->chapters[i]; + destroyCachedChapterIndex(chapter); + } + + destroyBarrier(&cache->beginCacheUpdate); + destroyBarrier(&cache->endCacheUpdate); + FREE(cache); +} + + +/**********************************************************************/ +bool sparseCacheContains(SparseCache *cache, + uint64_t virtualChapter, + unsigned int zoneNumber) +{ + /* + * The correctness of the barriers depends on the invariant that between + * calls to updateSparseCache(), the answers this function returns must + * never vary--the result for a given chapter must be identical across + * zones. That invariant must be maintained even if the chapter falls off + * the end of the volume, or if searching it is disabled because of too many + * search misses. + */ + + // Get the chapter search order for this zone thread. + SearchListIterator iterator + = iterateSearchList(cache->searchLists[zoneNumber], cache->chapters); + while (hasNextChapter(&iterator)) { + CachedChapterIndex *chapter = getNextChapter(&iterator); + if (virtualChapter == chapter->virtualChapter) { + if (zoneNumber == ZONE_ZERO) { + scoreChapterHit(cache, chapter); + } + + // Move the chapter to the front of the search list. + rotateSearchList(iterator.list, iterator.nextEntry); + return true; + } + } + + // The specified virtual chapter isn't cached. 
+  if (zoneNumber == ZONE_ZERO) {
+    scoreChapterMiss(cache);
+  }
+  return false;
+}
+
+/**********************************************************************/
+int updateSparseCache(IndexZone *zone, uint64_t virtualChapter)
+{
+  const Index *index = zone->index;
+  SparseCache *cache = index->volume->sparseCache;
+
+  // If the chapter is already in the cache, we don't need to do a thing
+  // except update the search list order, which this check does.
+  if (sparseCacheContains(cache, virtualChapter, zone->id)) {
+    return UDS_SUCCESS;
+  }
+
+  // Wait for every zone thread to have reached its corresponding barrier
+  // request and invoked this function before starting to modify the cache.
+  enterBarrier(&cache->beginCacheUpdate, NULL);
+
+  /*
+   * This is the start of the critical section: the zone zero thread is
+   * captain, effectively holding an exclusive lock on the sparse cache. All
+   * the other zone threads must do nothing between the two barriers. They
+   * will wait at the endCacheUpdate barrier for the captain to finish the
+   * update.
+   */
+
+  int result = UDS_SUCCESS;
+  if (zone->id == ZONE_ZERO) {
+    // Purge invalid chapters from the LRU search list.
+    SearchList *zoneZeroList = cache->searchLists[ZONE_ZERO];
+    purgeSearchList(zoneZeroList, cache->chapters, zone->oldestVirtualChapter);
+
+    // First check that the desired chapter is still in the volume. If it's
+    // not, the hook fell out of the index and there's nothing to do for it.
+    if (virtualChapter >= index->oldestVirtualChapter) {
+      // Evict the least recently used live chapter, or replace a dead cache
+      // entry, all by rotating the last list entry to the front.
+      CachedChapterIndex *victim
+        = &cache->chapters[rotateSearchList(zoneZeroList, cache->capacity)];
+
+      // Check if the victim is already dead, and if it's not, add to the
+      // tally of evicted or invalidated cache entries.
+      scoreEviction(zone, cache, victim);
+
+      // Read the index page bytes and initialize the page array.
+      result = cacheChapterIndex(victim, virtualChapter, index->volume);
+    }
+
+    // Copy the new search list state to all the other zone threads so they'll
+    // get the result of pruning and see the new chapter.
+    unsigned int z;
+    for (z = 1; z < cache->zoneCount; z++) {
+      copySearchList(zoneZeroList, cache->searchLists[z]);
+    }
+  }
+
+  // This is the end of the critical section. All cache invariants must have
+  // been restored--it will be shared/read-only again beyond the barrier.
+
+  enterBarrier(&cache->endCacheUpdate, NULL);
+  return result;
+}
+
+
+/**********************************************************************/
+int searchSparseCache(IndexZone          *zone,
+                      const UdsChunkName *name,
+                      uint64_t           *virtualChapterPtr,
+                      int                *recordPagePtr)
+{
+  Volume *volume = zone->index->volume;
+  SparseCache *cache = volume->sparseCache;
+  unsigned int zoneNumber = zone->id;
+  // If the caller did not specify a virtual chapter, search the entire cache.
+  bool searchAll = (*virtualChapterPtr == UINT64_MAX);
+  unsigned int chaptersSearched = 0;
+
+  // Get the chapter search order for this zone thread, searching the chapters
+  // from most recently hit to least recently hit.
+  SearchListIterator iterator
+    = iterateSearchList(cache->searchLists[zoneNumber], cache->chapters);
+  while (hasNextChapter(&iterator)) {
+    CachedChapterIndex *chapter = getNextChapter(&iterator);
+
+    // Skip chapters no longer cached, or that have too many search misses.
+ if (shouldSkipChapterIndex(zone, chapter, *virtualChapterPtr)) { + continue; + } + + int result = searchCachedChapterIndex(chapter, cache->geometry, + volume->indexPageMap, name, + recordPagePtr); + if (result != UDS_SUCCESS) { + return result; + } + chaptersSearched += 1; + + // Did we find an index entry for the name? + if (*recordPagePtr != NO_CHAPTER_INDEX_ENTRY) { + if (zoneNumber == ZONE_ZERO) { + scoreSearchHit(cache, chapter); + } + + // Move the chapter to the front of the search list. + rotateSearchList(iterator.list, iterator.nextEntry); + + // Return a matching entry as soon as it is found. It might be a false + // collision that has a true match in another chapter, but that's a very + // rare case and not worth the extra search cost or complexity. + *virtualChapterPtr = chapter->virtualChapter; + return UDS_SUCCESS; + } + + if (zoneNumber == ZONE_ZERO) { + scoreSearchMiss(cache, chapter); + } + + if (!searchAll) { + // We just searched the virtual chapter the caller specified and there + // was no match, so we're done. + break; + } + } + + // The name was not found in the cache. + *recordPagePtr = NO_CHAPTER_INDEX_ENTRY; + return UDS_SUCCESS; +} diff --git a/source/uds/sparseCache.h b/source/uds/sparseCache.h new file mode 100644 index 0000000..09c4a1c --- /dev/null +++ b/source/uds/sparseCache.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/sparseCache.h#1 $ + */ + +#ifndef SPARSE_CACHE_H +#define SPARSE_CACHE_H + +#include "cacheCounters.h" +#include "geometry.h" +#include "indexZone.h" +#include "typeDefs.h" + +/** + * SparseCache is the cache of entire chapter indexes from sparse chapters + * used for searching for chunks after all other search paths have failed. It + * contains only complete chapter indexes; record pages from sparse chapters + * and single index pages used for resolving hooks are kept in the volume page + * cache. + * + * Searching the cache is an unsynchronized operation. Changing the contents + * of the cache is a coordinated process requiring the coordinated + * participation of all zone threads via the careful use of barrier messages + * sent to all the index zones by the triage queue worker thread. + **/ +typedef struct sparseCache SparseCache; + +// Bare declaration to avoid include dependency loops. +struct index; + +/** + * Allocate and initialize a sparse chapter index cache. 
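+ *
+ * Illustrative usage sketch (not from the original source; the geometry,
+ * capacity, and zone count shown here are hypothetical):
+ *
+ *   SparseCache *cache = NULL;
+ *   int result = makeSparseCache(geometry, 10, 2, &cache);
+ *   if (result != UDS_SUCCESS) {
+ *     return result;
+ *   }
+ *   // ... search and update the cache from the zone threads ...
+ *   freeSparseCache(cache);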
+ * + * @param [in] geometry the geometry governing the volume + * @param [in] capacity the number of chapters the cache will hold + * @param [in] zoneCount the number of zone threads using the cache + * @param [out] cachePtr a pointer in which to return the new cache + * + * @return UDS_SUCCESS or an error code + **/ +int makeSparseCache(const Geometry *geometry, + unsigned int capacity, + unsigned int zoneCount, + SparseCache **cachePtr) + __attribute__((warn_unused_result)); + +/** + * Destroy and free a sparse chapter index cache. + * + * @param cache the cache to free + **/ +void freeSparseCache(SparseCache *cache); + +/** + * Get the number of bytes of memory used by a sparse chapter cache. + * + * @param cache the cache to measure + **/ +size_t getSparseCacheMemorySize(const SparseCache *cache); + + +/** + * Check whether a sparse chapter index is present in the chapter cache. This + * is only intended for use by the zone threads. + * + * @param cache the cache to search for the virtual chapter + * @param virtualChapter the virtual chapter number of the chapter index + * @param zoneNumber the zone number of the calling thread + * + * @return true iff the sparse chapter index is cached + **/ +bool sparseCacheContains(SparseCache *cache, + uint64_t virtualChapter, + unsigned int zoneNumber); + +/** + * Update the sparse cache to contain a chapter index. + * + * This function must be called by all the zone threads with the same chapter + * numbers to correctly enter the thread barriers used to synchronize the + * cache updates. + * + * @param zone the index zone + * @param virtualChapter the virtual chapter number of the chapter index + * + * @return UDS_SUCCESS or an error code if the chapter index could not be + * read or decoded + **/ +int updateSparseCache(IndexZone *zone, uint64_t virtualChapter) + __attribute__((warn_unused_result)); + + +/** + * Search the cached sparse chapter indexes for a chunk name, returning a + * virtual chapter number and record page number that may contain the name. + * + * @param [in] zone the zone containing the volume, sparse + * chapter index cache and the index page + * number map + * @param [in] name the chunk name to search for + * @param [in,out] virtualChapterPtr If UINT64_MAX on input, + * search all cached chapters, else search + * the specified virtual chapter, if cached. + * On output, if a match was found, set to + * the virtual chapter number of the match, + * otherwise set to UINT64_MAX on a miss. + * @param [out] recordPagePtr the record page number of a match, else + * NO_CHAPTER_INDEX_ENTRY if nothing matched + * + * @return UDS_SUCCESS or an error code + **/ +int searchSparseCache(IndexZone *zone, + const UdsChunkName *name, + uint64_t *virtualChapterPtr, + int *recordPagePtr) + __attribute__((warn_unused_result)); + +#endif /* SPARSE_CACHE_H */ diff --git a/source/uds/stringLinuxKernel.c b/source/uds/stringLinuxKernel.c new file mode 100644 index 0000000..bf0a255 --- /dev/null +++ b/source/uds/stringLinuxKernel.c @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/stringLinuxKernel.c#1 $ + */ + +#include + +#include "errors.h" +#include "logger.h" +#include "stringUtils.h" + +/**********************************************************************/ +int stringToSignedLong(const char *nptr, long *num) +{ + while (*nptr == ' ') { + nptr++; + } + return kstrtol(nptr, 10, num) ? UDS_INVALID_ARGUMENT : UDS_SUCCESS; +} + +/**********************************************************************/ +int stringToUnsignedLong(const char *nptr, unsigned long *num) +{ + while (*nptr == ' ') { + nptr++; + } + if (*nptr == '+') { + nptr++; + } + return kstrtoul(nptr, 10, num) ? UDS_INVALID_ARGUMENT : UDS_SUCCESS; +} + +/*****************************************************************************/ +char *nextToken(char *str, const char *delims, char **state) +{ + char *sp = str ? str : *state; + while (*sp && strchr(delims, *sp)) { + ++sp; + } + if (!*sp) { + return NULL; + } + char *ep = sp; + while (*ep && !strchr(delims, *ep)) { + ++ep; + } + if (*ep) { + *ep++ = '\0'; + } + *state = ep; + return sp; +} + +/*****************************************************************************/ +int parseUint64(const char *str, uint64_t *num) +{ + unsigned long value = *num; + int result = stringToUnsignedLong(str, &value); + *num = value; + return result; +} diff --git a/source/uds/stringUtils.c b/source/uds/stringUtils.c new file mode 100644 index 0000000..93d7da1 --- /dev/null +++ b/source/uds/stringUtils.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/stringUtils.c#2 $ + */ + +#include "stringUtils.h" + +#include "errors.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "uds.h" + +/*****************************************************************************/ +int allocSprintf(const char *what, char **strp, const char *fmt, ...) +{ + if (strp == NULL) { + return UDS_INVALID_ARGUMENT; + } + va_list args; +#ifdef __KERNEL__ + // We want the memory allocation to use our own ALLOCATE/FREE wrappers. + va_start(args, fmt); + int count = vsnprintf(NULL, 0, fmt, args) + 1; + va_end(args); + int result = ALLOCATE(count, char, what, strp); + if (result == UDS_SUCCESS) { + va_start(args, fmt); + vsnprintf(*strp, count, fmt, args); + va_end(args); + } +#else + va_start(args, fmt); + int result = vasprintf(strp, fmt, args) == -1 ? 
ENOMEM : UDS_SUCCESS; + va_end(args); +#endif + if ((result != UDS_SUCCESS) && (what != NULL)) { + logError("cannot allocate %s", what); + } + return result; +} + +/*****************************************************************************/ +int wrapVsnprintf(const char *what, char *buf, size_t bufSize, + int error, const char *fmt, va_list ap, size_t *needed) +{ + if (buf == NULL) { + static char nobuf[1]; + buf = nobuf; + bufSize = 0; + } + int n = vsnprintf(buf, bufSize, fmt, ap); + if (n < 0) { + return logErrorWithStringError(UDS_UNEXPECTED_RESULT, + "%s: vsnprintf failed", what); + } + if (needed) { + *needed = n; + } + if (((size_t) n >= bufSize) && (buf != NULL) && (error != UDS_SUCCESS)) { + return logErrorWithStringError(error, "%s: string too long", what); + } + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int fixedSprintf(const char *what, + char *buf, + size_t bufSize, + int error, + const char *fmt, + ...) +{ + if (buf == NULL) { + return UDS_INVALID_ARGUMENT; + } + va_list args; + va_start(args, fmt); + int result = wrapVsnprintf(what, buf, bufSize, error, fmt, args, NULL); + va_end(args); + return result; +} + +/*****************************************************************************/ +char *vAppendToBuffer(char *buffer, + char *bufEnd, + const char *fmt, + va_list args) +{ + size_t n = vsnprintf(buffer, bufEnd - buffer, fmt, args); + if (n >= (size_t) (bufEnd - buffer)) { + buffer = bufEnd; + } else { + buffer += n; + } + return buffer; +} + +/*****************************************************************************/ +char *appendToBuffer(char *buffer, char *bufEnd, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + char *pos = vAppendToBuffer(buffer, bufEnd, fmt, ap); + va_end(ap); + return pos; +} + +/*****************************************************************************/ +int stringToSignedInt(const char *nptr, int *num) +{ + long value; + int result = stringToSignedLong(nptr, &value); + if (result != UDS_SUCCESS) { + return result; + } + if ((value < INT_MIN) || (value > INT_MAX)) { + return ERANGE; + } + *num = (int) value; + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int stringToUnsignedInt(const char *nptr, unsigned int *num) +{ + unsigned long value; + int result = stringToUnsignedLong(nptr, &value); + if (result != UDS_SUCCESS) { + return result; + } + if (value > UINT_MAX) { + return ERANGE; + } + *num = (unsigned int) value; + return UDS_SUCCESS; +} diff --git a/source/uds/stringUtils.h b/source/uds/stringUtils.h new file mode 100644 index 0000000..bd685bb --- /dev/null +++ b/source/uds/stringUtils.h @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/src/uds/stringUtils.h#2 $ + */ + +#ifndef STRING_UTILS_H +#define STRING_UTILS_H + +#include +#ifdef __KERNEL__ +#include +#include +#else +#include // for vsnprintf +#include // for strtol +#include +#include +#endif + +#include "compiler.h" +#include "typeDefs.h" + +/** + * Convert a boolean value to its corresponding "true" or "false" string. + * + * @param value The boolean value to convert + * + * @return "true" if value is true, "false" otherwise. + **/ +static INLINE const char *boolToString(bool value) +{ + return (value ? "true" : "false"); +} + +/** + * Allocate a string built according to format (our version of asprintf). + * + * @param [in] what A description of what is being allocated, for error + * logging; if NULL doesn't log anything. + * @param [out] strp The pointer in which to store the allocated string. + * @param [in] fmt The sprintf format parameter. + * + * @return UDS_SUCCESS, or the appropriately translated asprintf error + **/ +int allocSprintf(const char *what, char **strp, const char *fmt, ...) + __attribute__((format(printf, 3, 4), warn_unused_result)); + +/** + * Write a printf-style string into a fixed-size buffer, returning + * errors if it would not fit. (our version of snprintf) + * + * @param [in] what A description of what is being written, for error + * logging; if NULL doesn't log anything. + * @param [out] buf The target buffer + * @param [in] bufSize The size of buf + * @param [in] error Error code to return on overflow + * @param [in] fmt The sprintf format parameter. + * @return UDS_SUCCESS or error + **/ +int fixedSprintf(const char *what, char *buf, size_t bufSize, + int error, const char *fmt, ...) + __attribute__((format(printf, 5, 6), warn_unused_result)); + +/** + * Write printf-style string into an existing buffer, returning a specified + * error code if it would not fit, and setting ``needed`` to the amount of + * space that would be required. + * + * @param [in] what A description of what is being written, for logging. + * @param [in] buf The buffer in which to write the string, or NULL to + * merely determine the required space. + * @param [in] bufSize The size of buf. + * @param [in] error The error code to return for exceeding the specified + * space, UDS_SUCCESS if no logging required. + * @param [in] fmt The sprintf format specification. + * @param [in] ap The variable argument pointer (see ). + * @param [out] needed If non-NULL, the actual amount of string space + * required, which may be smaller or larger than bufSize. + * + * @return UDS_SUCCESS if the string fits, the value of the error parameter if + * the string does not fit and a buffer was supplied, or + * UDS_UNEXPECTED_RESULT if vsnprintf fails in some other undocumented + * way. + **/ +int wrapVsnprintf(const char *what, char *buf, size_t bufSize, + int error, const char *fmt, va_list ap, size_t *needed) + __attribute__((format(printf, 5, 0), warn_unused_result)); + +/** + * Helper to append a string to a buffer. + * + * @param buffer the place at which to append the string + * @param bufEnd pointer to the end of the buffer + * @param fmt a printf format string + * + * @return the updated buffer position after the append + * + * if insufficient space is available, the contents are silently truncated + **/ +char *appendToBuffer(char *buffer, char *bufEnd, const char *fmt, ...) + __attribute__((format(printf, 3, 4))); + +/** + * Variable-arglist helper to append a string to a buffer. 
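+ *
+ * Illustrative usage sketch of the append-to-buffer pattern (not from the
+ * original source; the buffer size and the values being formatted are
+ * hypothetical):
+ *
+ *   char buf[64];
+ *   char *pos = buf;
+ *   char *end = buf + sizeof(buf);
+ *   pos = appendToBuffer(pos, end, "chapter %u", chapterNumber);
+ *   pos = appendToBuffer(pos, end, " of %u", chapterCount);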
+ *
+ * @param buffer    the place at which to append the string
+ * @param bufEnd    pointer to the end of the buffer
+ * @param fmt       a printf format string
+ * @param args      printf arguments
+ *
+ * @return the updated buffer position after the append
+ *
+ * if insufficient space is available, the contents are silently truncated
+ **/
+char *vAppendToBuffer(char       *buffer,
+                      char       *bufEnd,
+                      const char *fmt,
+                      va_list     args)
+  __attribute__((format(printf, 3, 0)));
+
+/**
+ * Our version of strtok_r, since some platforms apparently don't define it.
+ *
+ * @param str       On first call, the string to tokenize. On subsequent
+ *                  calls, NULL.
+ * @param delims    The set of delimiter characters.
+ * @param statePtr  The address of a variable which holds the state of
+ *                  the tokenization between calls to nextToken.
+ *
+ * @return the next token if any, or NULL
+ **/
+char *nextToken(char *str, const char *delims, char **statePtr);
+
+/**
+ * Parse a string representing a decimal uint64_t.
+ *
+ * @param str  The string.
+ * @param num  Where to put the number.
+ *
+ * @return UDS_SUCCESS or the error UDS_INVALID_ARGUMENT if the string
+ *         is not in the correct format.
+ **/
+int parseUint64(const char *str, uint64_t *num)
+  __attribute__((warn_unused_result));
+
+/**
+ * Attempt to convert a string to an integer (base 10)
+ *
+ * @param nptr  Pointer to string to convert
+ * @param num   The resulting integer
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int stringToSignedInt(const char *nptr, int *num)
+  __attribute__((warn_unused_result));
+
+/**
+ * Attempt to convert a string to a long integer (base 10)
+ *
+ * @param nptr  Pointer to string to convert
+ * @param num   The resulting long integer
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int stringToSignedLong(const char *nptr, long *num)
+  __attribute__((warn_unused_result));
+
+/**
+ * Attempt to convert a string to an unsigned integer (base 10).
+ *
+ * @param nptr  Pointer to string to convert
+ * @param num   The resulting unsigned integer
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int stringToUnsignedInt(const char *nptr, unsigned int *num)
+  __attribute__((warn_unused_result));
+
+/**
+ * Attempt to convert a string to an unsigned long integer (base 10).
+ *
+ * @param nptr  Pointer to string to convert
+ * @param num   The resulting unsigned long integer
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int stringToUnsignedLong(const char *nptr, unsigned long *num)
+  __attribute__((warn_unused_result));
+
+#endif /* STRING_UTILS_H */
diff --git a/source/uds/sysfs.c b/source/uds/sysfs.c
new file mode 100644
index 0000000..b2009d7
--- /dev/null
+++ b/source/uds/sysfs.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/kernelLinux/uds/sysfs.c#4 $
+ */
+
+#include "sysfs.h"
+
+#include <linux/kobject.h>
+#include <linux/module.h>
+#include
+
+#include "logger.h"
+#include "memoryAlloc.h"
+#include "stringUtils.h"
+#include "uds.h"
+
+static struct {
+  struct kobject kobj;           // /sys/uds
+  struct kobject parameterKobj;  // /sys/uds/parameter
+  // These flags are used to ensure a clean shutdown
+  bool flag;           // /sys/uds
+  bool parameterFlag;  // /sys/uds/parameter
+} objectRoot;
+
+/**********************************************************************/
+static char *bufferToString(const char *buf, size_t length)
+{
+  char *string;
+  if (ALLOCATE(length + 1, char, __func__, &string) != UDS_SUCCESS) {
+    return NULL;
+  }
+  memcpy(string, buf, length);
+  string[length] = '\0';
+  if (string[length - 1] == '\n') {
+    string[length - 1] = '\0';
+  }
+  return string;
+}
+
+/**********************************************************************/
+// This is the code for a directory in the /sys/<module_name> tree that
+// contains no regular files (only subdirectories).
+/**********************************************************************/
+
+/**********************************************************************/
+static void emptyRelease(struct kobject *kobj)
+{
+  // Many of our sysfs objects share this release function, which does nothing.
+}
+
+/**********************************************************************/
+static ssize_t emptyShow(struct kobject   *kobj,
+                         struct attribute *attr,
+                         char             *buf)
+{
+  return 0;
+}
+
+/**********************************************************************/
+static ssize_t emptyStore(struct kobject   *kobj,
+                          struct attribute *attr,
+                          const char       *buf,
+                          size_t            length)
+{
+  return length;
+}
+
+static struct sysfs_ops emptyOps = {
+  .show  = emptyShow,
+  .store = emptyStore,
+};
+
+static struct attribute *emptyAttrs[] = {
+  NULL,
+};
+
+static struct kobj_type emptyObjectType = {
+  .release       = emptyRelease,
+  .sysfs_ops     = &emptyOps,
+  .default_attrs = emptyAttrs,
+};
+
+
+/**********************************************************************/
+// This is the code for the /sys/<module_name>/parameter directory.
+//
+//        /log_level                 UDS_LOG_LEVEL
+//
+/**********************************************************************/
+
+typedef struct {
+  struct attribute attr;
+  const char *(*showString)(void);
+  void (*storeString)(const char *);
+} ParameterAttribute;
+
+/**********************************************************************/
+static ssize_t parameterShow(struct kobject   *kobj,
+                             struct attribute *attr,
+                             char             *buf)
+{
+  ParameterAttribute *pa = container_of(attr, ParameterAttribute, attr);
+  if (pa->showString != NULL) {
+    return sprintf(buf, "%s\n", pa->showString());
+  } else {
+    return -EINVAL;
+  }
+}
+
+/**********************************************************************/
+static ssize_t parameterStore(struct kobject   *kobj,
+                              struct attribute *attr,
+                              const char       *buf,
+                              size_t            length)
+{
+  ParameterAttribute *pa = container_of(attr, ParameterAttribute, attr);
+  char *string = bufferToString(buf, length);
+  if (string == NULL) {
+    return -ENOMEM;
+  }
+  int result = UDS_SUCCESS;
+  if (pa->storeString != NULL) {
+    pa->storeString(string);
+  } else {
+    return -EINVAL;
+  }
+  FREE(string);
+  return result == UDS_SUCCESS ? length : result;
}
+
+/**********************************************************************/
+
+static const char *parameterShowLogLevel(void)
+{
+  return priorityToString(getLogLevel());
+}
+
+/**********************************************************************/
+
+static void parameterStoreLogLevel(const char *string)
+{
+  setLogLevel(stringToPriority(string));
+}
+
+/**********************************************************************/
+
+static ParameterAttribute logLevelAttr = {
+  .attr        = { .name = "log_level", .mode = 0600 },
+  .showString  = parameterShowLogLevel,
+  .storeString = parameterStoreLogLevel,
+};
+
+static struct attribute *parameterAttrs[] = {
+  &logLevelAttr.attr,
+  NULL,
+};
+
+static struct sysfs_ops parameterOps = {
+  .show  = parameterShow,
+  .store = parameterStore,
+};
+
+static struct kobj_type parameterObjectType = {
+  .release       = emptyRelease,
+  .sysfs_ops     = &parameterOps,
+  .default_attrs = parameterAttrs,
+};
+
+/**********************************************************************/
+int initSysfs(void)
+{
+  memset(&objectRoot, 0, sizeof(objectRoot));
+  kobject_init(&objectRoot.kobj, &emptyObjectType);
+  int result = kobject_add(&objectRoot.kobj, NULL, THIS_MODULE->name);
+  if (result == 0) {
+    objectRoot.flag = true;
+    kobject_init(&objectRoot.parameterKobj, &parameterObjectType);
+    result = kobject_add(&objectRoot.parameterKobj, &objectRoot.kobj,
+                         "parameter");
+    if (result == 0) {
+      objectRoot.parameterFlag = true;
+    }
+  }
+  if (result != 0) {
+    putSysfs();
+  }
+  return result;
+}
+
+/**********************************************************************/
+void putSysfs()
+{
+  if (objectRoot.parameterFlag) {
+    kobject_put(&objectRoot.parameterKobj);
+  }
+  if (objectRoot.flag) {
+    kobject_put(&objectRoot.kobj);
+  }
+}
diff --git a/source/uds/sysfs.h b/source/uds/sysfs.h
new file mode 100644
index 0000000..d5f9ccf
--- /dev/null
+++ b/source/uds/sysfs.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/sysfs.h#1 $ + */ + +#ifndef SYSFS_H +#define SYSFS_H + +/** + * Called when the module is loaded to initialize the /sys/\ + * tree. + * + * @return 0 on success, or non-zero on error + **/ +int initSysfs(void); + +/** + * Called when the module is being unloaded to terminate the + * /sys/\ tree. + **/ +void putSysfs(void); + +#endif /* SYSFS_H */ diff --git a/source/uds/threadCondVarLinuxKernel.c b/source/uds/threadCondVarLinuxKernel.c new file mode 100644 index 0000000..e3c1517 --- /dev/null +++ b/source/uds/threadCondVarLinuxKernel.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/threadCondVarLinuxKernel.c#2 $ + */ + +#include "threads.h" +#include "timeUtils.h" +#include "uds-error.h" + +/**********************************************************************/ +int initCond(CondVar *cv) +{ + cv->eventCount = NULL; + return makeEventCount(&cv->eventCount); +} + +/**********************************************************************/ +int signalCond(CondVar *cv) +{ + eventCountBroadcast(cv->eventCount); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int broadcastCond(CondVar *cv) +{ + eventCountBroadcast(cv->eventCount); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int waitCond(CondVar *cv, Mutex *mutex) +{ + EventToken token = eventCountPrepare(cv->eventCount); + unlockMutex(mutex); + eventCountWait(cv->eventCount, token, NULL); + lockMutex(mutex); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int timedWaitCond(CondVar *cv, Mutex *mutex, RelTime timeout) +{ + EventToken token = eventCountPrepare(cv->eventCount); + unlockMutex(mutex); + bool happened = eventCountWait(cv->eventCount, token, &timeout); + lockMutex(mutex); + return happened ? 
UDS_SUCCESS : ETIMEDOUT; +} + +/**********************************************************************/ +int destroyCond(CondVar *cv) +{ + freeEventCount(cv->eventCount); + cv->eventCount = NULL; + return UDS_SUCCESS; +} diff --git a/source/uds/threadOnce.c b/source/uds/threadOnce.c new file mode 100644 index 0000000..62149ca --- /dev/null +++ b/source/uds/threadOnce.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/threadOnce.c#1 $ + */ + +#include "errors.h" +#include "threads.h" + +enum { + ONCE_NOT_DONE = 0, + ONCE_IN_PROGRESS = 1, + ONCE_COMPLETE = 2, +}; + +/*****************************************************************************/ +int performOnce(OnceState *once, void (*function)(void)) +{ + for (;;) { + switch (atomic_cmpxchg(once, ONCE_NOT_DONE, ONCE_IN_PROGRESS)) { + case ONCE_NOT_DONE: + function(); + atomic_set_release(once, ONCE_COMPLETE); + return UDS_SUCCESS; + case ONCE_IN_PROGRESS: + yieldScheduler(); + break; + case ONCE_COMPLETE: + return UDS_SUCCESS; + default: + return UDS_BAD_STATE; + } + } +} diff --git a/source/uds/threadOnce.h b/source/uds/threadOnce.h new file mode 100644 index 0000000..58b6da3 --- /dev/null +++ b/source/uds/threadOnce.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/threadOnce.h#1 $ + */ + +#ifndef THREAD_ONCE_H +#define THREAD_ONCE_H + +#include "atomicDefs.h" + +#define ONCE_STATE_INITIALIZER ATOMIC_INIT(0) + +typedef atomic_t OnceState; + +/** + * Thread safe once only initialization. 
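+ *
+ * Illustrative usage sketch (not from the original source; the initializer
+ * function named here is hypothetical):
+ *
+ *   static OnceState initialized = ONCE_STATE_INITIALIZER;
+ *
+ *   static void doOneTimeSetup(void)
+ *   {
+ *     // one-time initialization goes here
+ *   }
+ *
+ *   int result = performOnce(&initialized, doOneTimeSetup);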
+ *
+ * @param onceState     pointer to object to record that initialization
+ *                      has been performed
+ * @param initFunction  called if onceState does not indicate
+ *                      initialization has been performed
+ *
+ * @return UDS_SUCCESS or error code
+ *
+ * @note Generally the following declaration of onceState is performed
+ *       at file scope:
+ *
+ *       static OnceState onceState = ONCE_STATE_INITIALIZER;
+ **/
+int performOnce(OnceState *onceState, void (*initFunction) (void));
+
+#endif /* THREAD_ONCE_H */
diff --git a/source/uds/threadRegistry.c b/source/uds/threadRegistry.c
new file mode 100644
index 0000000..c37e77a
--- /dev/null
+++ b/source/uds/threadRegistry.c
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/kernelLinux/uds/threadRegistry.c#1 $
+ */
+
+#include "threadRegistry.h"
+
+#include
+#include
+
+#include "permassert.h"
+
+/*
+ * We need to be careful when using other facilities that may use
+ * threadRegistry functions in their normal operation. For example,
+ * we do not want to invoke the logger while holding a lock.
+ */
+
+/*****************************************************************************/
+void registerThread(ThreadRegistry   *registry,
+                    RegisteredThread *newThread,
+                    const void       *pointer)
+{
+  INIT_LIST_HEAD(&newThread->links);
+  newThread->pointer = pointer;
+  newThread->task = current;
+
+  bool foundIt = false;
+  RegisteredThread *thread;
+  write_lock(&registry->lock);
+  list_for_each_entry(thread, &registry->links, links) {
+    if (thread->task == current) {
+      // This should not have been there.
+      // We'll complain after releasing the lock.
+      list_del_init(&thread->links);
+      foundIt = true;
+      break;
+    }
+  }
+  list_add_tail(&newThread->links, &registry->links);
+  write_unlock(&registry->lock);
+  ASSERT_LOG_ONLY(!foundIt, "new thread not already in registry");
+}
+
+/*****************************************************************************/
+void unregisterThread(ThreadRegistry *registry)
+{
+  bool foundIt = false;
+  RegisteredThread *thread;
+  write_lock(&registry->lock);
+  list_for_each_entry(thread, &registry->links, links) {
+    if (thread->task == current) {
+      list_del_init(&thread->links);
+      foundIt = true;
+      break;
+    }
+  }
+  write_unlock(&registry->lock);
+  ASSERT_LOG_ONLY(foundIt, "thread found in registry");
+}
+
+/*****************************************************************************/
+void initializeThreadRegistry(ThreadRegistry *registry)
+{
+  INIT_LIST_HEAD(&registry->links);
+  rwlock_init(&registry->lock);
+}
+
+/*****************************************************************************/
+const void *lookupThread(ThreadRegistry *registry)
+{
+  const void *result = NULL;
+  read_lock(&registry->lock);
+  RegisteredThread *thread;
+  list_for_each_entry(thread, &registry->links, links) {
+    if (thread->task == current) {
+      result = thread->pointer;
+      break;
+    }
+  }
+  read_unlock(&registry->lock);
+  return result;
+}
diff --git a/source/uds/threadRegistry.h b/source/uds/threadRegistry.h
new file mode 100644
index 0000000..ec1832d
--- /dev/null
+++ b/source/uds/threadRegistry.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/kernelLinux/uds/threadRegistry.h#1 $
+ */
+
+#ifndef THREAD_REGISTRY_H
+#define THREAD_REGISTRY_H 1
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+/*
+ * We don't expect this set to ever get really large, so a linked list
+ * is adequate.
+ */
+
+typedef struct threadRegistry {
+  struct list_head links;
+  rwlock_t         lock;
+} ThreadRegistry;
+
+typedef struct registeredThread {
+  struct list_head    links;
+  const void         *pointer;
+  struct task_struct *task;
+} RegisteredThread;
+
+/*****************************************************************************/
+
+/**
+ * Initialize a registry of threads and associated data pointers.
+ *
+ * @param registry  The registry to initialize
+ **/
+void initializeThreadRegistry(ThreadRegistry *registry);
+
+/**
+ * Register the current thread and associate it with a data pointer.
+ *
+ * This call will log messages if the thread is already registered.
+ *
+ * @param registry   The thread registry
+ * @param newThread  RegisteredThread structure to use for the current thread
+ * @param pointer    The value to associate with the current thread
+ **/
+void registerThread(ThreadRegistry   *registry,
+                    RegisteredThread *newThread,
+                    const void       *pointer);
+
+/**
+ * Remove the registration for the current thread.
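+ *
+ * Illustrative usage sketch for the registry as a whole (not from the
+ * original source; the registry variable and context pointer are
+ * hypothetical):
+ *
+ *   static ThreadRegistry registry;
+ *   initializeThreadRegistry(&registry);
+ *
+ *   RegisteredThread self;
+ *   registerThread(&registry, &self, context);
+ *   // ... later, from the same thread ...
+ *   const void *found = lookupThread(&registry);
+ *   unregisterThread(&registry);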
+ * + * A message may be logged if the thread was not registered. + * + * @param registry The thread registry + **/ +void unregisterThread(ThreadRegistry *registry); + +/** + * Fetch a pointer that may have been registered for the current + * thread. If the thread is not registered, a null pointer is + * returned. + * + * @param registry The thread registry + * + * @return the registered pointer, if any, or NULL + **/ +const void *lookupThread(ThreadRegistry *registry); + +#endif /* THREAD_REGISTRY_H */ diff --git a/source/uds/threads.h b/source/uds/threads.h new file mode 100644 index 0000000..793355c --- /dev/null +++ b/source/uds/threads.h @@ -0,0 +1,453 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/threads.h#4 $ + */ + +#ifndef THREADS_H +#define THREADS_H + +#include "compiler.h" +#include "threadOnce.h" +#include "timeUtils.h" +#include "uds-error.h" + +#ifdef __KERNEL__ +#include +#include +#include +#include "util/eventCount.h" +#else +#include +#include +#include +#endif + +#ifdef __KERNEL__ +typedef struct { EventCount *eventCount; } CondVar; +typedef struct mutex Mutex; +typedef struct semaphore Semaphore; +typedef struct kernelThread *Thread; +typedef pid_t ThreadId; + +typedef struct { + Semaphore mutex; // Mutex for this barrier object + Semaphore wait; // Semaphore for threads waiting at the barrier + int arrived; // Number of threads which have arrived + int threadCount; // Total number of threads using this barrier +} Barrier; +#else +typedef pthread_barrier_t Barrier; +typedef pthread_cond_t CondVar; +typedef pthread_mutex_t Mutex; +typedef sem_t Semaphore; +typedef pthread_t Thread; +typedef pid_t ThreadId; + +#ifndef NDEBUG +#define MUTEX_INITIALIZER PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP +#else +#define MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER +#endif + +extern const bool DO_ASSERTIONS; +#endif + +#ifdef __KERNEL__ +/** + * Apply a function to every thread that we have created. + * + * @param applyFunc The function to apply + * @param argument The first argument to applyFunc + * + **/ +void applyToThreads(void applyFunc(void *, struct task_struct *), + void *argument); +#endif + +/** + * Create a thread, logging any cause of failure. + * + * @param threadFunc function to run in new thread + * @param threadData private data for new thread + * @param name name of the new thread + * @param newThread where to store the new thread id + * + * @return success or failure indication + **/ +int createThread(void (*threadFunc)(void *), + void *threadData, + const char *name, + Thread *newThread) + __attribute__((warn_unused_result)); + +/** + * Retrieve the current numbers of cores. + * + * This is either the total number or the number of cores that this + * process has been limited to. 
+ * + * @return number of cores + **/ +unsigned int getNumCores(void); + +/** + * Return the id of the current thread. + * + * @return the thread id + **/ +ThreadId getThreadId(void) __attribute__((warn_unused_result)); + +#ifndef __KERNEL__ +/** + * Get the name of the current thread. + * + * @param name a buffer of size at least 16 to write the name to + **/ +void getThreadName(char *name); +#endif + +/** + * Wait for termination of another thread. + * + * + * @param th The thread for which to wait. + * + * @return UDS_SUCCESS or error code + **/ +int joinThreads(Thread th); + +#ifdef __KERNEL__ +/** + * Exit the current thread. This is a kernel-only function that is intended to + * be an alternative to using BUG() or BUG_ON(). + **/ +__attribute__((noreturn)) +void exitThread(void); +#endif + +/** + * Initialize a thread synchronization barrier (also known as a rendezvous). + * + * @param barrier the barrier to initialize + * @param threadCount the number of threads that must enter the barrier before + * any threads are permitted to leave it + * + * @return UDS_SUCCESS or an error code + **/ +int initializeBarrier(Barrier *barrier, unsigned int threadCount) + __attribute__((warn_unused_result)); + +/** + * Destroy a thread synchronization barrier. + * + * @param barrier the barrier to destroy + * + * @return UDS_SUCCESS or an error code + **/ +int destroyBarrier(Barrier *barrier); + +/** + * Enter a thread synchronization barrier, waiting for the configured number + * of threads to have entered before exiting the barrier. Exactly one thread + * will be arbitrarily selected to be flagged as the "winner" of a barrier. + * + * @param barrier the barrier to enter + * @param winner if non-NULL, a pointer to the flag indicating whether the + * calling thread was the unique winner + * + * @return UDS_SUCCESS or an error code + **/ +int enterBarrier(Barrier *barrier, bool *winner); + +/** + * Initialize a condition variable with default attributes. + * + * @param cond condition variable to init + * + * @return UDS_SUCCESS or error code + **/ +int initCond(CondVar *cond) __attribute__((warn_unused_result)); + +/** + * Signal a condition variable. + * + * @param cond condition variable to signal + * + * @return UDS_SUCCESS or error code + **/ +int signalCond(CondVar *cond); + +/** + * Broadcast a condition variable. + * + * @param cond condition variable to broadcast + * + * @return UDS_SUCCESS or error code + **/ +int broadcastCond(CondVar *cond); + +/** + * Wait on a condition variable. + * + * @param cond condition variable to wait on + * @param mutex mutex to release while waiting + * + * @return UDS_SUCCESS or error code + **/ +int waitCond(CondVar *cond, Mutex *mutex); + +/** + * Wait on a condition variable with a timeout. + * + * @param cond condition variable to wait on + * @param mutex mutex to release while waiting + * @param timeout the relative time until the timeout expires + * + * @return error code (ETIMEDOUT if the deadline is hit) + **/ +int timedWaitCond(CondVar *cond, Mutex *mutex, RelTime timeout); + +/** + * Destroy a condition variable. + * + * @param cond condition variable to destroy + * + * @return UDS_SUCCESS or error code + **/ +int destroyCond(CondVar *cond); + +#ifndef __KERNEL__ +/** + * Initialize a mutex, optionally asserting if the mutex initialization fails. + * This function should only be called directly in places where making + * assertions is not safe. 
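+ *
+ * Illustrative usage sketch of the mutex API declared in this header (not
+ * from the original source):
+ *
+ *   Mutex mutex;
+ *   int result = initMutex(&mutex);
+ *   if (result == UDS_SUCCESS) {
+ *     lockMutex(&mutex);
+ *     // ... critical section ...
+ *     unlockMutex(&mutex);
+ *     destroyMutex(&mutex);
+ *   }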
+ * + * @param mutex the mutex to initialize + * @param assertOnError if true, an error initializing the + * mutex will make an assertion + * + * @return UDS_SUCCESS or an error code + **/ +int initializeMutex(Mutex *mutex, bool assertOnError); +#endif + +/** + * Initialize the default type (error-checking during development) mutex. + * + * @param mutex the mutex to initialize + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +#ifdef __KERNEL__ +static INLINE int initMutex(Mutex *mutex) +{ + mutex_init(mutex); + return UDS_SUCCESS; +} +#else +int initMutex(Mutex *mutex); +#endif + +/** + * Destroy a mutex (with error checking during development). + * + * @param mutex mutex to destroy + * + * @return UDS_SUCCESS or error code + **/ +#ifdef __KERNEL__ +static INLINE int destroyMutex(Mutex *mutex) +{ + return UDS_SUCCESS; +} +#else +int destroyMutex(Mutex *mutex); +#endif + +/** + * Lock a mutex, with optional error checking during development. + * + * @param mutex mutex to lock + **/ +#ifdef __KERNEL__ +static INLINE void lockMutex(Mutex *mutex) +{ + mutex_lock(mutex); +} +#else +void lockMutex(Mutex *mutex); +#endif + +/** + * Unlock a mutex, with optional error checking during development. + * + * @param mutex mutex to unlock + **/ +#ifdef __KERNEL__ +static INLINE void unlockMutex(Mutex *mutex) +{ + mutex_unlock(mutex); +} +#else +void unlockMutex(Mutex *mutex); +#endif + +/** + * Initialize a semaphore used among threads in the same process. + * + * @param semaphore the semaphore to initialize + * @param value the initial value of the semaphore + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +#ifdef __KERNEL__ +static INLINE int initializeSemaphore(Semaphore *semaphore, unsigned int value) +{ + sema_init(semaphore, value); + return UDS_SUCCESS; +} +#else +int initializeSemaphore(Semaphore *semaphore, unsigned int value); +#endif + +/** + * Destroy a semaphore used among threads in the same process. + * + * @param semaphore the semaphore to destroy + * + * @return UDS_SUCCESS or an error code + **/ +#ifdef __KERNEL__ +static INLINE int destroySemaphore(Semaphore *semaphore) +{ + return UDS_SUCCESS; +} +#else +int destroySemaphore(Semaphore *semaphore); +#endif + +/** + * Acquire a permit from a semaphore, waiting if none are currently available. + * + * @param semaphore the semaphore to acquire + **/ +#ifdef __KERNEL__ +static INLINE void acquireSemaphore(Semaphore *semaphore) +{ + // Do not use down(semaphore). Instead use down_interruptible so that we do + // not get 120 second stall messages in kern.log. + while (down_interruptible(semaphore) != 0) { + } +} +#else +void acquireSemaphore(Semaphore *semaphore); +#endif + +/** + * Attempt to acquire a permit from a semaphore. + * + * If a permit is available, it is claimed and the function immediately + * returns true. If a timeout is zero or negative, the function immediately + * returns false. Otherwise, this will wait either a permit to become + * available (returning true) or the relative timeout to expire (returning + * false). + * + * @param semaphore the semaphore to decrement + * @param timeout the relative time until the timeout expires + * + * @return true if a permit was acquired, otherwise false + **/ +__attribute__((warn_unused_result)) +#ifdef __KERNEL__ +static INLINE bool attemptSemaphore(Semaphore *semaphore, RelTime timeout) +{ + if (timeout <= 0) { + // No timeout, just try to grab the semaphore. 
+ return down_trylock(semaphore) == 0; + } else { + unsigned int jiffies = usecs_to_jiffies(relTimeToMicroseconds(timeout)); + return down_timeout(semaphore, jiffies) == 0; + } +} +#else +bool attemptSemaphore(Semaphore *semaphore, RelTime timeout); +#endif + +/** + * Release a semaphore, incrementing the number of available permits. + * + * @param semaphore the semaphore to increment + **/ +#ifdef __KERNEL__ +static INLINE void releaseSemaphore(Semaphore *semaphore) +{ + up(semaphore); +} +#else +void releaseSemaphore(Semaphore *semaphore); +#endif + +/** + * Yield the time slice in the given thread. + * + * @return UDS_SUCCESS or an error code + **/ +int yieldScheduler(void); + +#ifndef __KERNEL__ +/** + * Allocate a thread specific key for thread specific data. + * + * @param key points to location for new key + * @param destr_function destructor function called when thread exits + * + * @return UDS_SUCCESS or error code + **/ +int createThreadKey(pthread_key_t *key, void (*destr_function) (void *)); + +/** + * Delete a thread specific key for thread specific data. + * + * @param key key to delete + * + * @return UDS_SUCCESS or error code + **/ +int deleteThreadKey(pthread_key_t key); + +/** + * Set pointer for thread specific data. + * + * @param key key to be associated with pointer + * @param pointer data associated with key + * + * @return UDS_SUCCESS or error code + **/ +int setThreadSpecific(pthread_key_t key, const void *pointer); + +/** + * Get pointer for thread specific data. + * + * @param key key identifying the thread specific data + **/ +void *getThreadSpecific(pthread_key_t key); +#endif + +#endif /* THREADS_H */ diff --git a/source/uds/threadsLinuxKernel.c b/source/uds/threadsLinuxKernel.c new file mode 100644 index 0000000..7ac972d --- /dev/null +++ b/source/uds/threadsLinuxKernel.c @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/threadsLinuxKernel.c#4 $ + */ + +#include +#include +#include + +#include "memoryAlloc.h" +#include "logger.h" +#include "threads.h" +#include "uds-error.h" + +static struct hlist_head kernelThreadList; +static struct mutex kernelThreadMutex; +static OnceState kernelThreadOnce; + +typedef struct kernelThread { + void (*threadFunc)(void *); + void *threadData; + struct hlist_node threadLinks; + struct task_struct *threadTask; + struct completion threadDone; +} KernelThread; + +/**********************************************************************/ +static void kernelThreadInit(void) +{ + mutex_init(&kernelThreadMutex); +} + +/**********************************************************************/ +static int threadStarter(void *arg) +{ + KernelThread *kt = arg; + kt->threadTask = current; + performOnce(&kernelThreadOnce, kernelThreadInit); + mutex_lock(&kernelThreadMutex); + hlist_add_head(&kt->threadLinks, &kernelThreadList); + mutex_unlock(&kernelThreadMutex); + RegisteredThread allocatingThread; + registerAllocatingThread(&allocatingThread, NULL); + kt->threadFunc(kt->threadData); + unregisterAllocatingThread(); + complete(&kt->threadDone); + return 0; +} + +/**********************************************************************/ +int createThread(void (*threadFunc)(void *), + void *threadData, + const char *name, + Thread *newThread) +{ + char *nameColon = strchr(name, ':'); + char *myNameColon = strchr(current->comm, ':'); + KernelThread *kt; + int result = ALLOCATE(1, KernelThread, __func__, &kt); + if (result != UDS_SUCCESS) { + logWarning("Error allocating memory for %s", name); + return result; + } + kt->threadFunc = threadFunc; + kt->threadData = threadData; + init_completion(&kt->threadDone); + struct task_struct *thread; + /* + * Start the thread, with an appropriate thread name. + * + * If the name supplied contains a colon character, use that name. This + * causes uds module threads to have names like "uds:callbackW" and the main + * test runner thread to be named "zub:runtest". + * + * Otherwise if the current thread has a name containing a colon character, + * prefix the name supplied with the name of the current thread up to (and + * including) the colon character. Thus when the "kvdo0:dedupeQ" thread + * opens an index session, all the threads associated with that index will + * have names like "kvdo0:foo". + * + * Otherwise just use the name supplied. This should be a rare occurrence. 
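+ *
+ * (Illustrative note, not from the original source: for example, if the
+ * current thread is named "kvdo0:dedupeQ" and the supplied name is
+ * "indexW", the new thread would be named "kvdo0:indexW"; if the supplied
+ * name is "uds:callbackW", that name is used as-is.)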
+ */ + if ((nameColon == NULL) && (myNameColon != NULL)) { + thread = kthread_run(threadStarter, kt, "%.*s:%s", + (int) (myNameColon - current->comm), current->comm, + name); + } else { + thread = kthread_run(threadStarter, kt, "%s", name); + } + if (IS_ERR(thread)) { + FREE(kt); + return UDS_ENOTHREADS; + } + *newThread = kt; + return UDS_SUCCESS; +} +/**********************************************************************/ +int joinThreads(Thread kt) +{ + while (wait_for_completion_interruptible(&kt->threadDone) != 0) { + } + mutex_lock(&kernelThreadMutex); + hlist_del(&kt->threadLinks); + mutex_unlock(&kernelThreadMutex); + FREE(kt); + return UDS_SUCCESS; +} + +/**********************************************************************/ +void applyToThreads(void applyFunc(void *, struct task_struct *), + void *argument) +{ + KernelThread *kt; + performOnce(&kernelThreadOnce, kernelThreadInit); + mutex_lock(&kernelThreadMutex); + hlist_for_each_entry(kt, &kernelThreadList, threadLinks) { + applyFunc(argument, kt->threadTask); + } + mutex_unlock(&kernelThreadMutex); +} + +/**********************************************************************/ +void exitThread(void) +{ + KernelThread *kt; + struct completion *completion = NULL; + performOnce(&kernelThreadOnce, kernelThreadInit); + mutex_lock(&kernelThreadMutex); + hlist_for_each_entry(kt, &kernelThreadList, threadLinks) { + if (kt->threadTask == current) { + completion = &kt->threadDone; + break; + } + } + mutex_unlock(&kernelThreadMutex); + unregisterAllocatingThread(); + complete_and_exit(completion, 1); +} + +/**********************************************************************/ +ThreadId getThreadId(void) +{ + return current->pid; +} + +/**********************************************************************/ +unsigned int getNumCores(void) +{ + return num_online_cpus(); +} + +/**********************************************************************/ +int initializeBarrier(Barrier *barrier, unsigned int threadCount) +{ + barrier->arrived = 0; + barrier->threadCount = threadCount; + int result = initializeSemaphore(&barrier->mutex, 1); + if (result != UDS_SUCCESS) { + return result; + } + return initializeSemaphore(&barrier->wait, 0); +} + +/**********************************************************************/ +int destroyBarrier(Barrier *barrier) +{ + int result = destroySemaphore(&barrier->mutex); + if (result != UDS_SUCCESS) { + return result; + } + return destroySemaphore(&barrier->wait); +} + +/**********************************************************************/ +int enterBarrier(Barrier *barrier, bool *winner) +{ + acquireSemaphore(&barrier->mutex); + bool lastThread = ++barrier->arrived == barrier->threadCount; + if (lastThread) { + // This is the last thread to arrive, so wake up the others + int i; + for (i = 1; i < barrier->threadCount; i++) { + releaseSemaphore(&barrier->wait); + } + // Then reinitialize for the next cycle + barrier->arrived = 0; + releaseSemaphore(&barrier->mutex); + } else { + // This is NOT the last thread to arrive, so just wait + releaseSemaphore(&barrier->mutex); + acquireSemaphore(&barrier->wait); + } + if (winner != NULL) { + *winner = lastThread; + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int yieldScheduler(void) +{ + yield(); + return UDS_SUCCESS; +} diff --git a/source/uds/timeUtils.c b/source/uds/timeUtils.c new file mode 100644 index 0000000..ddf3b2b --- /dev/null +++ b/source/uds/timeUtils.c @@ -0,0 +1,144 @@ +/* + * Copyright (c) 
2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/timeUtils.c#4 $ + */ + +#include "stringUtils.h" +#include "timeUtils.h" + +#ifdef __KERNEL__ +#include +#include // for getnstimeofday on Vivid +#else +#include +#endif + +#ifndef __KERNEL__ +static const struct timespec invalidTime = { + .tv_sec = -1, + .tv_nsec = LONG_MAX +}; + +static const long BILLION = 1000 * 1000 * 1000; +#endif + +#ifndef __KERNEL__ +/*****************************************************************************/ +AbsTime currentTime(clockid_t clock) +{ + struct timespec ts; + if (clock_gettime(clock, &ts) != 0) { + ts = invalidTime; + } + return ts; +} +#endif + +#ifndef __KERNEL__ +/*****************************************************************************/ +/** + * Return a time offset from the specified time. + * + * @param time A time. + * @param reltime The relative time + * + * @return the sum of the time and the offset, possibly rounded up to the + * next representable instant. + * + * @note timeDifference(a, deltaTime(a, n)) may only be approx == -n + * depending on the system-specific time resolution + **/ +static AbsTime deltaTime(AbsTime time, RelTime reltime) +{ + if (!isValidTime(time)) { + return time; + } + if ((reltime >= 0) && (reltime < 10 * BILLION)) { + reltime += time.tv_nsec; + while (reltime >= BILLION) { + reltime -= BILLION; + time.tv_sec++; + } + time.tv_nsec = reltime; + return time; + } + // may not be accurate for times before the Epoch... + // (is the ns time positive or negative for negative time_t?) 
+ int64_t ns = time.tv_sec * BILLION + time.tv_nsec; + if ((ns < INT64_MIN / 2) || + (ns > INT64_MAX / 2) || + (reltime < INT64_MIN / 2) || + (reltime > INT64_MAX / 2)) { + return invalidTime; + } + ns += reltime; + return (AbsTime) { .tv_sec = ns / BILLION, .tv_nsec = ns % BILLION }; +} +#endif + +#ifndef __KERNEL__ +/*****************************************************************************/ +AbsTime futureTime(clockid_t clock, RelTime reltime) +{ + return deltaTime(currentTime(clock), reltime); +} +#endif + +#ifndef __KERNEL__ +/*****************************************************************************/ +bool isValidTime(AbsTime time) +{ + if (time.tv_nsec < 0 || time.tv_nsec >= BILLION) { + return false; + } + return true; +} +#endif + +/*****************************************************************************/ +uint64_t nowUsec(void) +{ +#ifdef __KERNEL__ + static const AbsTime epoch = 0; +#else + static const AbsTime epoch = { 0, 0 }; +#endif + return relTimeToMicroseconds(timeDifference(currentTime(CLOCK_REALTIME), + epoch)); +} + + + +#ifndef __KERNEL__ +/*****************************************************************************/ +RelTime timeDifference(AbsTime a, AbsTime b) +{ + if (isValidTime(a) && isValidTime(b)) { + int64_t ans = a.tv_sec * BILLION + a.tv_nsec; + int64_t bns = b.tv_sec * BILLION + b.tv_nsec; + return ans - bns; + } else if (isValidTime(a)) { + return INT64_MAX; + } else if (isValidTime(b)) { + return INT64_MIN; + } else { + return 0; + } +} +#endif diff --git a/source/uds/timeUtils.h b/source/uds/timeUtils.h new file mode 100644 index 0000000..8d159f4 --- /dev/null +++ b/source/uds/timeUtils.h @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/timeUtils.h#5 $ + */ + +#ifndef TIME_UTILS_H +#define TIME_UTILS_H + +#include "compiler.h" +#include "typeDefs.h" + +#ifdef __KERNEL__ +#include +#include +#else +#include +#include +#endif + +// Absolute time. +#ifdef __KERNEL__ +typedef int64_t AbsTime; +#else +typedef struct timespec AbsTime; +#endif + +// Relative time, the length of a time interval, or the difference between +// two times. A signed 64-bit number of nanoseconds. +typedef int64_t RelTime; + +#ifndef __KERNEL__ +/** + * Return true if the time is valid. + * + * @param time a time + * + * @return true if the time is valid + * + * @note an invalid time is generally returned from a failed attempt + * to get the time from the system + **/ +bool isValidTime(AbsTime time); +#endif + +/** + * Return the current time according to the specified clock type. 
+ * + * @param clock Either CLOCK_REALTIME or CLOCK_MONOTONIC + * + * @return the current time according to the clock in question + * + * @note the precision of the clock is system specific + **/ +#ifdef __KERNEL__ +static INLINE AbsTime currentTime(clockid_t clock) +{ + // clock is always a constant, so gcc reduces this to a single call + return clock == CLOCK_MONOTONIC ? ktime_get_ns() : ktime_get_real_ns(); +} +#else +AbsTime currentTime(clockid_t clock); +#endif + +#ifndef __KERNEL__ +/** + * Return the timestamp a certain number of nanoseconds in the future. + * + * @param clock Either CLOCK_REALTIME or CLOCK_MONOTONIC + * @param reltime The relative time to the clock value + * + * @return the timestamp for that time (potentially rounded to the next + * representable instant for the system in question) + **/ +AbsTime futureTime(clockid_t clock, RelTime reltime); +#endif + +/** + * Return the difference between two timestamps. + * + * @param a A time + * @param b Another time, based on the same clock as a. + * + * @return the relative time between the two timestamps + **/ +#ifdef __KERNEL__ +static INLINE RelTime timeDifference(AbsTime a, AbsTime b) +{ + return a - b; +} +#else +RelTime timeDifference(AbsTime a, AbsTime b); +#endif + + + +/** + * Convert seconds to a RelTime value + * + * @param seconds A number of seconds + * + * @return the equivalent number of seconds as a RelTime + **/ +static INLINE RelTime secondsToRelTime(int64_t seconds) +{ + return (RelTime) seconds * (1000 * 1000 * 1000); +} + +/** + * Convert milliseconds to a RelTime value + * + * @param milliseconds A number of milliseconds + * + * @return the equivalent number of milliseconds as a RelTime + **/ +static INLINE RelTime millisecondsToRelTime(int64_t milliseconds) +{ + return (RelTime) milliseconds * (1000 * 1000); +} + +/** + * Convert microseconds to a RelTime value + * + * @param microseconds A number of microseconds + * + * @return the equivalent number of microseconds as a RelTime + **/ +static INLINE RelTime microsecondsToRelTime(int64_t microseconds) +{ + return (RelTime) microseconds * 1000; +} + +/** + * Convert nanoseconds to a RelTime value + * + * @param nanoseconds A number of nanoseconds + * + * @return the equivalent number of nanoseconds as a RelTime + **/ +static INLINE RelTime nanosecondsToRelTime(int64_t nanoseconds) +{ + return (RelTime) nanoseconds; +} + +/** + * Convert a RelTime value to milliseconds + * + * @param reltime The relative time + * + * @return the equivalent number of milliseconds + **/ +static INLINE int64_t relTimeToSeconds(RelTime reltime) +{ + return reltime / (1000 * 1000 * 1000); +} + +/** + * Convert a RelTime value to milliseconds + * + * @param reltime The relative time + * + * @return the equivalent number of milliseconds + **/ +static INLINE int64_t relTimeToMilliseconds(RelTime reltime) +{ + return reltime / (1000 * 1000); +} + +/** + * Convert a RelTime value to microseconds + * + * @param reltime The relative time + * + * @return the equivalent number of microseconds + **/ +static INLINE int64_t relTimeToMicroseconds(RelTime reltime) +{ + return reltime / 1000; +} + +/** + * Convert a RelTime value to nanoseconds + * + * @param reltime The relative time + * + * @return the equivalent number of nanoseconds + **/ +static INLINE int64_t relTimeToNanoseconds(RelTime reltime) +{ + return reltime; +} + +/** + * Return the wall clock time in microseconds. 
The actual value is time + * since the epoch (see "man gettimeofday"), but the typical use is to call + * this twice and compute the difference, giving the elapsed time between + * the two calls. + * + * @return the time in microseconds + **/ +uint64_t nowUsec(void) __attribute__((warn_unused_result)); + +/** + * Convert from an AbsTime to a time_t + * + * @param time an AbsTime time + * + * @return a time_t time + **/ +static INLINE time_t asTimeT(AbsTime time) +{ +#ifdef __KERNEL__ + return time / 1000000000; +#else + return time.tv_sec; +#endif +} + +/** + * Convert from a time_t to an AbsTime, + * + * @param time a time_t time + * + * @return an AbsTime time + **/ +static INLINE AbsTime fromTimeT(time_t time) +{ +#ifdef __KERNEL__ + return time * 1000000000; +#else + AbsTime abs; + abs.tv_sec = time; + abs.tv_nsec = 0; + return abs; +#endif +} + +#ifndef __KERNEL__ +/** + * Convert from an AbsTime to a struct timespec + * + * @param time an AbsTime time + * + * @return a time_t time + **/ +static INLINE struct timespec asTimeSpec(AbsTime time) +{ + return time; +} +#endif + +#ifndef __KERNEL__ +/** + * Convert from an AbsTime to a struct timeval + * + * @param time an AbsTime time + * + * @return a time_t time + **/ +static INLINE struct timeval asTimeVal(AbsTime time) +{ + struct timeval tv = { time.tv_sec, time.tv_nsec / 1000 }; + return tv; +} +#endif + +#endif /* TIME_UTILS_H */ diff --git a/source/uds/typeDefs.h b/source/uds/typeDefs.h new file mode 100644 index 0000000..927bd23 --- /dev/null +++ b/source/uds/typeDefs.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/typeDefs.h#1 $ + */ + +#ifndef LINUX_KERNEL_TYPE_DEFS_H +#define LINUX_KERNEL_TYPE_DEFS_H + +/* + * General system type definitions. This file is parallel to the other + * typeDefs.h files in this project. We pick up what we can from the system + * include files, and explicitly define the other things we need. + */ + +#include +#include +#include + +#define CHAR_BIT 8 + +#define INT64_MAX (9223372036854775807L) +#define UCHAR_MAX ((unsigned char)~0ul) +#define UINT8_MAX ((uint8_t)~0ul) +#define UINT16_MAX ((uint16_t)~0ul) +#define UINT64_MAX ((uint64_t)~0ul) + +// Some recent versions of define this for us +#ifndef SIZE_MAX +#define SIZE_MAX ((size_t)~0ul) +#endif + +#define PRId64 "lld" +#define PRIu16 "u" +#define PRIu32 "u" +#define PRIu64 "llu" + +typedef unsigned long uintmax_t; +#define PRIuMAX "lu" + +typedef unsigned char byte; + +#endif /* LINUX_KERNEL_TYPE_DEFS_H */ diff --git a/source/uds/uds-block.h b/source/uds/uds-block.h new file mode 100644 index 0000000..e1b8e61 --- /dev/null +++ b/source/uds/uds-block.h @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/uds-block.h#1 $ + */ + +/** + * @file + * @brief Definitions for the UDS block interface + **/ +#ifndef UDS_BLOCK_H +#define UDS_BLOCK_H + +#include "uds.h" + +/** General UDS block constants. */ +enum { + /** The maximum metadata size for a block. */ + UDS_MAX_BLOCK_DATA_SIZE = UDS_MAX_METADATA_SIZE +}; + +/** + * Metadata to associate with a blockName. + **/ +struct udsChunkData { + unsigned char data[UDS_MAX_BLOCK_DATA_SIZE]; +}; + +/** + * Represents a block address on disk. + * + * #UdsBlockAddress objects allow the Application Software and UDS + * to refer to specific disk blocks. It might be, for instance, the + * logical block address divided by the block size. + * + * These objects are stored persistently in the index and are also cached. + * Therefore, make every effort to ensure that these objects are as small as + * possible. + **/ +typedef void *UdsBlockAddress; + +/** @{ */ +/** @name Deduplication */ + +typedef struct udsRequest UdsRequest; + +/** + * Callback function invoked to inform the Application Software that an + * operation started by #udsStartChunkOperation has completed. + * + * @param [in] request The operation that finished. When the callback + * function is called, this UdsRequest structure can be + * reused or freed. + **/ +typedef void UdsChunkCallback(UdsRequest *request); + +/** + * Request structure passed to #udsStartChunkOperation to begin an operation, + * and returned to the Application Software when the callback function is + * invoked. + **/ +struct udsRequest { + /* + * The name of the block. + * Set before starting an operation. + * Unchanged at time of callback. + */ + UdsChunkName chunkName; + /* + * The metadata found in the index that was associated with the block + * (sometimes called the canonical address). + * Set before the callback. + */ + struct udsChunkData oldMetadata; + /* + * The new metadata to associate with the name of the block (sometimes called + * the duplicate address). + * Set before starting a #UDS_POST or #UDS_QUERY operation. + * Unchanged at time of callback. + */ + struct udsChunkData newMetadata; + /* + * The callback method to be invoked when the operation finishes. + * Set before starting an operation. + * Unchanged at time of callback. + */ + UdsChunkCallback *callback; + /* + * The index session. + * Set before starting an operation. + * Unchanged at time of callback. + */ + struct uds_index_session *session; + /* + * The operation type, which is one of #UDS_DELETE, #UDS_POST, #UDS_QUERY or + * #UDS_UPDATE. + * Set before starting an operation. + * Unchanged at time of callback. + */ + UdsCallbackType type; + /* + * The operation status, which is either #UDS_SUCCESS or an error code. + * Set before the callback. 
+ */ + int status; + /* + * If true, the name of the block was found in the index. + * Set before the callback. + */ + bool found; + /* + * If true, move the entry to the end of the deduplication window. + * Set before starting a #UDS_QUERY operation. + * Unchanged at time of callback. + */ + bool update; + long private[25]; +}; + +/** + * Start a UDS index chunk operation. The request type field must + * be set to the type of operation. This is an asynchronous interface to the + * block-oriented UDS API. The callback is invoked upon completion. + * + * The #UDS_DELETE operation type deletes the mapping for a particular block. + * #UDS_DELETE is typically used when UDS provides invalid advice. + * + * The #UDS_POST operation type indexes a block name and associates it with a + * particular address. The caller provides the block's name. UDS then checks + * this name against its index. + *
+ *   - If the block is new, it is stored in the index.
+ *   - If the block is a duplicate of an indexed block, UDS returns the
+ *     canonical block address via the callback.
+ *
+ * The #UDS_QUERY operation type checks to see if a block name exists in the
+ * index.  The caller provides the block's name.  UDS then checks
+ * this name against its index.
+ *
+ *   - If the block is new, no action is taken.
+ *   - If the block is a duplicate of an indexed block, UDS returns the
+ *     canonical block address via the callback.  If the update
+ *     field is set, the entry is moved to the end of the deduplication
+ *     window.
+ * + * The #UDS_UPDATE operation type updates the mapping for a particular block. + * #UDS_UPDATE is typically used if the callback function provides invalid + * advice. + * + * @param [in] request The operation. The type, + * chunkName, newMetadata, + * context, callback, and + * update fields must be set. At callback + * time, the oldMetadata, + * status, and found fields will + * be set. + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsStartChunkOperation(UdsRequest *request); +/** @} */ + +#endif /* UDS_BLOCK_H */ diff --git a/source/uds/uds-error.h b/source/uds/uds-error.h new file mode 100644 index 0000000..7658982 --- /dev/null +++ b/source/uds/uds-error.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/uds-error.h#3 $ + */ + +/** + * @file + * @brief UDS error code definitions + **/ +#ifndef UDS_ERROR_H +#define UDS_ERROR_H + + +/** + * Valid return status codes for API routines. + **/ +enum udsStatusCodes { + /** Successful return */ + UDS_SUCCESS = 0, + + /** Used as a base value for reporting errors */ + UDS_ERROR_CODE_BASE = 1024, + /** The UDS library is not initialized */ + UDS_UNINITIALIZED = UDS_ERROR_CODE_BASE + 0, + /** The UDS library is shutting down */ + UDS_SHUTTINGDOWN = UDS_ERROR_CODE_BASE + 1, + /** Could not load scanner modules */ + UDS_EMODULE_LOAD = UDS_ERROR_CODE_BASE + 2, + /** Could not create a new thread */ + UDS_ENOTHREADS = UDS_ERROR_CODE_BASE + 3, + /** Could not find the specified library context */ + UDS_NOCONTEXT = UDS_ERROR_CODE_BASE + 4, + /** The specified library context is disabled */ + UDS_DISABLED = UDS_ERROR_CODE_BASE + 5, + /** Some saved index component is corrupt */ + UDS_CORRUPT_COMPONENT = UDS_ERROR_CODE_BASE + 6, + UDS_CORRUPT_FILE = UDS_CORRUPT_COMPONENT, + /** Unknown error */ + UDS_UNKNOWN_ERROR = UDS_ERROR_CODE_BASE + 7, + /** Unused */ + UDS_UNUSED_CODE_8 = UDS_ERROR_CODE_BASE + 8, + /** Unused */ + UDS_UNUSED_CODE_9 = UDS_ERROR_CODE_BASE + 9, + /** The index configuration or volume format is no longer supported */ + UDS_UNSUPPORTED_VERSION = UDS_ERROR_CODE_BASE + 10, + /** Index session not available */ + UDS_NO_INDEXSESSION = UDS_ERROR_CODE_BASE + 11, + /** Index data in memory is corrupt */ + UDS_CORRUPT_DATA = UDS_ERROR_CODE_BASE + 12, + /** Short read due to truncated file */ + UDS_SHORT_READ = UDS_ERROR_CODE_BASE + 13, + /** Unused */ + UDS_UNUSED_CODE_14 = UDS_ERROR_CODE_BASE + 14, + /** Internal resource limits exceeded */ + UDS_RESOURCE_LIMIT_EXCEEDED = UDS_ERROR_CODE_BASE + 15, + /** Memory overflow due to storage failure */ + UDS_VOLUME_OVERFLOW = UDS_ERROR_CODE_BASE + 16, + /** Unused */ + UDS_UNUSED_CODE_17 = UDS_ERROR_CODE_BASE + 17, + /** Unused */ + UDS_UNUSED_CODE_18 = 
UDS_ERROR_CODE_BASE + 18, + /** Unused */ + UDS_UNUSED_CODE_19 = UDS_ERROR_CODE_BASE + 19, + /** Configuration pointer required */ + UDS_CONF_PTR_REQUIRED = UDS_ERROR_CODE_BASE + 20, + /** Index stats pointer required */ + UDS_INDEX_STATS_PTR_REQUIRED = UDS_ERROR_CODE_BASE + 21, + /** Context stats pointer required */ + UDS_CONTEXT_STATS_PTR_REQUIRED = UDS_ERROR_CODE_BASE + 22, + /** Unused */ + UDS_UNUSED_CODE_23 = UDS_ERROR_CODE_BASE + 23, + /** Unused */ + UDS_UNUSED_CODE_24 = UDS_ERROR_CODE_BASE + 24, + /** Unused */ + UDS_UNUSED_CODE_25 = UDS_ERROR_CODE_BASE + 25, + /** Unused */ + UDS_UNUSED_CODE_26 = UDS_ERROR_CODE_BASE + 26, + /** Unused */ + UDS_UNUSED_CODE_27 = UDS_ERROR_CODE_BASE + 27, + /** Memory configuration not supported */ + UDS_INVALID_MEMORY_SIZE = UDS_ERROR_CODE_BASE + 28, + /** Unused */ + UDS_UNUSED_CODE_29 = UDS_ERROR_CODE_BASE + 29, + /** Index name required */ + UDS_INDEX_NAME_REQUIRED = UDS_ERROR_CODE_BASE + 30, + /** Configuration required */ + UDS_CONF_REQUIRED = UDS_ERROR_CODE_BASE + 31, + /** Unused */ + UDS_UNUSED_CODE_32 = UDS_ERROR_CODE_BASE + 32, + /** Unused */ + UDS_UNUSED_CODE_33 = UDS_ERROR_CODE_BASE + 33, + /** Unused */ + UDS_UNUSED_CODE_34 = UDS_ERROR_CODE_BASE + 34, + /** Unused */ + UDS_UNUSED_CODE_35 = UDS_ERROR_CODE_BASE + 35, + /** Unused */ + UDS_UNUSED_CODE_36 = UDS_ERROR_CODE_BASE + 36, + /** Essential files for index not found */ + UDS_NO_INDEX = UDS_ERROR_CODE_BASE + 37, + /** Checkpoint frequency out of range */ + UDS_BAD_CHECKPOINT_FREQUENCY = UDS_ERROR_CODE_BASE + 38, + /** Wrong type of index configuration */ + UDS_WRONG_INDEX_CONFIG = UDS_ERROR_CODE_BASE + 39, + /** Unused */ + UDS_UNUSED_CODE_40 = UDS_ERROR_CODE_BASE + 40, + /** Unused */ + UDS_UNUSED_CODE_41 = UDS_ERROR_CODE_BASE + 41, + /** Unused */ + UDS_UNUSED_CODE_42 = UDS_ERROR_CODE_BASE + 42, + /** Unused */ + UDS_UNUSED_CODE_43 = UDS_ERROR_CODE_BASE + 43, + /** Premature end of file in scanned file */ + UDS_END_OF_FILE = UDS_ERROR_CODE_BASE + 44, + /** Attempt to access unsaved index */ + UDS_INDEX_NOT_SAVED_CLEANLY = UDS_ERROR_CODE_BASE + 45, + /** Unused */ + UDS_UNUSED_CODE_46 = UDS_ERROR_CODE_BASE + 46, + /** There is not sufficient space to create the index */ + UDS_INSUFFICIENT_INDEX_SPACE = UDS_ERROR_CODE_BASE + 47, + /** Unused */ + UDS_UNUSED_CODE_48 = UDS_ERROR_CODE_BASE + 48, + /** Unused */ + UDS_UNUSED_CODE_49 = UDS_ERROR_CODE_BASE + 49, + /** Index is suspended */ + UDS_SUSPENDED = UDS_ERROR_CODE_BASE + 50, + /** Unused */ + UDS_UNUSED_CODE_51 = UDS_ERROR_CODE_BASE + 51, + /** Index session is already initialized */ + UDS_INDEXSESSION_IN_USE = UDS_ERROR_CODE_BASE + 52, + /** Callback required */ + UDS_CALLBACK_REQUIRED = UDS_ERROR_CODE_BASE + 53, + /** Wrong operation type */ + UDS_INVALID_OPERATION_TYPE = UDS_ERROR_CODE_BASE + 54, + /** One more than the last UDS_ERROR_CODE */ + UDS_ERROR_CODE_LAST, + /** One more than this block can use */ + UDS_ERROR_CODE_BLOCK_END = UDS_ERROR_CODE_BASE + 1024 +}; + +#endif /* UDS_ERROR_H */ diff --git a/source/uds/uds-platform.h b/source/uds/uds-platform.h new file mode 100644 index 0000000..0df39ef --- /dev/null +++ b/source/uds/uds-platform.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/uds-platform.h#1 $ + */ + +/** + * @file + * @brief Platform definitions for albireo + **/ +#ifndef UDS_PLATFORM_H +#define UDS_PLATFORM_H + + +#ifdef __KERNEL__ +#include +#else +#include +#include +#include +#include +#include +#endif + +#endif /* UDS_PLATFORM_H */ diff --git a/source/uds/uds.h b/source/uds/uds.h new file mode 100644 index 0000000..42e2863 --- /dev/null +++ b/source/uds/uds.h @@ -0,0 +1,528 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/uds.h#2 $ + */ + +/** + * @mainpage UDS API Reference + *
Copyright (c) 2020 Red Hat, Inc.
+ **/ + +/** + * @file + * @brief General UDS definitions + **/ +#ifndef UDS_H +#define UDS_H + +#include "uds-platform.h" + +#ifdef UDS_DISABLE_ATTR_WARN_UNUSED_RESULT +#define UDS_ATTR_WARN_UNUSED_RESULT +#else +#define UDS_ATTR_WARN_UNUSED_RESULT __attribute__((warn_unused_result)) +#endif + +/** + * Valid request types as described in callbacks. + **/ +typedef enum { + /** + * Callback type for operations that post mappings to the UDS + * index. When the chunk-hash being added already exists, the + * existing metadata is not overwritten. Regardless, the + * recency of the chunk is updated. + **/ + UDS_POST, + + /** + * Callback type for operations that update mappings in the UDS + * index. If the indicated entry does not have any mapping in the + * index, one is created. In either case, the recency of + * the chunk is updated. + **/ + UDS_UPDATE, + + /** + * Callback type for operations that delete mappings from the + * UDS index. */ + UDS_DELETE, + + /** + * Callback type for operations that query mappings in the UDS + * index. When a mapping is found, the recency of the mapping + * is updated unless it's the no-update call. + **/ + UDS_QUERY +} UdsCallbackType; + +/** + * Valid types for opening an index. + **/ +typedef enum { + /** + * Load an existing index. If the index was not saved cleanly, try to + * recover and rebuild the index. + **/ + UDS_LOAD = 0, + + /** + * Create a new index. + **/ + UDS_CREATE = 1, + + /** + * Load an existing index, but only if it was cleanly saved. + **/ + UDS_NO_REBUILD = 2, +} UdsOpenIndexType; + +/** General UDS constants. */ +enum { + /** The chunk name size in bytes (128 bits = 16 bytes). */ + UDS_CHUNK_NAME_SIZE = 16, + /** The maximum metadata size in bytes. */ + UDS_MAX_METADATA_SIZE = 16, +}; + +/** + * Type representing memory configuration which is either a positive + * integer number of gigabytes or one of the three special constants + * for configurations which are smaller than 1 gigabyte. + **/ +typedef unsigned int UdsMemoryConfigSize; + +extern const UdsMemoryConfigSize UDS_MEMORY_CONFIG_256MB; +extern const UdsMemoryConfigSize UDS_MEMORY_CONFIG_512MB; +extern const UdsMemoryConfigSize UDS_MEMORY_CONFIG_768MB; + +/** + * The maximum configurable amount of memory. + **/ +extern const UdsMemoryConfigSize UDS_MEMORY_CONFIG_MAX; + +/** The name (hash) of a chunk. */ +typedef struct udsChunkName { + /** The name (hash) of a chunk. */ + unsigned char name[UDS_CHUNK_NAME_SIZE]; +} UdsChunkName; + +/** + * An active index session. + **/ +struct uds_index_session; + +/** + * The data used to configure a new index. + **/ +typedef struct udsConfiguration *UdsConfiguration; +typedef uint64_t UdsNonce; + +/** + * The data used to configure a new index session. + **/ +struct uds_parameters { + // Tne number of threads used to process index requests. + int zone_count; + // The number of threads used to read volume pages. + int read_threads; + // The number of chapters to write between checkpoints. + int checkpoint_frequency; +}; +#define UDS_PARAMETERS_INITIALIZER { \ + .zone_count = 0, \ + .read_threads = 2, \ + .checkpoint_frequency = 0, \ + } + +/** + * Index statistics + * + * These statistics capture the current index characteristics, + * including resource usage. 
+ **/ +typedef struct udsIndexStats { + /** The total number of chunk names stored in the index */ + uint64_t entriesIndexed; + /** An estimate of the index's memory usage */ + uint64_t memoryUsed; + /** The number of collisions recorded in the master index */ + uint64_t collisions; + /** The number of entries discarded from the index since index startup */ + uint64_t entriesDiscarded; + /** The number of checkpoints done this session */ + uint64_t checkpoints; +} UdsIndexStats; + +/** + * Context statistics + * + * These statistics capture a library context's characteristics either since + * it was initialized or since its statistics were last reset, whichever + * is more recent. + **/ +typedef struct udsContextStats { + /** The time at which context statistics were last fetched */ + time_t currentTime; + /** + * The number of post calls since context statistics were last reset that + * found an existing entry + **/ + uint64_t postsFound; + /** + * The number of post calls since context statistics were last reset that + * added an entry + **/ + uint64_t postsNotFound; + /** + * The number of post calls since context statistics were last reset that + * found an existing entry is current enough to only exist in memory and not + * have been commited to disk yet. + **/ + uint64_t inMemoryPostsFound; + /** + * The number of post calls since context statistics were last reset that + * found an existing entry in the dense portion of the index. + **/ + uint64_t densePostsFound; + /** + * The number of post calls since context statistics were last reset that + * found an existing entry in the sparse portion of the index (if one + * exists). + **/ + uint64_t sparsePostsFound; + /** + * The number of update calls since context statistics were last reset that + * updated an existing entry + **/ + uint64_t updatesFound; + /** + * The number of update calls since context statistics were last reset that + * added a new entry + **/ + uint64_t updatesNotFound; + /** + * The number of delete requests since context statistics were last reset + * that deleted an existing entry + **/ + uint64_t deletionsFound; + /** + * The number of delete requests since context statistics were last reset + * that did nothing. + **/ + uint64_t deletionsNotFound; + /** + * The number of query calls since context statistics were last reset that + * found existing entry + **/ + uint64_t queriesFound; + /** + * The number of query calls since context statistics were last reset that + * did not find an entry + **/ + uint64_t queriesNotFound; + /** + * The total number of library requests (the sum of posts, updates, + * deletions, and queries) since context + * statistics were last reset + **/ + uint64_t requests; +} UdsContextStats; + +/** + * Initializes an index configuration. + * + * @param [out] conf The new configuration + * @param [in] memGB The maximum memory allocation, in GB + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsInitializeConfiguration(UdsConfiguration *conf, + UdsMemoryConfigSize memGB); + +/** + * Sets or clears an index configuration's sparse indexing settings. + * + * @param [in,out] conf The configuration to change + * @param [in] sparse If true, request a sparse + * index; if false, request + * a default index. + * + **/ +void udsConfigurationSetSparse(UdsConfiguration conf, bool sparse); + +/** + * Tests whether an index configuration specifies sparse indexing. 
+ * + * @param [in] conf The configuration to check + * + * @return Returns true if the configuration + * is sparse, or false if not + **/ +UDS_ATTR_WARN_UNUSED_RESULT +bool udsConfigurationGetSparse(UdsConfiguration conf); + +/** + * Sets an index configuration's nonce. + * + * @param [in,out] conf The configuration to change + * @param [in] nonce The 64 bit nonce. + * + **/ +void udsConfigurationSetNonce(UdsConfiguration conf, UdsNonce nonce); + +/** + * Gets an index configuration's nonce. + * + * @param [in] conf The configuration to check + * + * @return The 64 bit nonce. + **/ +UDS_ATTR_WARN_UNUSED_RESULT +UdsNonce udsConfigurationGetNonce(UdsConfiguration conf); + +/** + * Fetches a configuration's maximum memory allocation. + * + * @param [in] conf The configuration to check + * + * @return The amount of memory allocated, in GB + **/ +UDS_ATTR_WARN_UNUSED_RESULT +UdsMemoryConfigSize udsConfigurationGetMemory(UdsConfiguration conf); + +/** + * Fetches a configuration's chapters per volume value. + * + * @param [in] conf The configuration to check + * + * @return The number of chapters per volume + **/ +UDS_ATTR_WARN_UNUSED_RESULT +unsigned int udsConfigurationGetChaptersPerVolume(UdsConfiguration conf); + +/** + * Frees memory used by a configuration. + * + * @param [in,out] conf The configuration for which memory is being freed + **/ +void udsFreeConfiguration(UdsConfiguration conf); + +/** + * Compute the size required to store the index on persistent storage. This + * size is valid for any index stored in a single file or on a single block + * device. This size should be used when configuring a block device on which + * to store an index. + * + * @param [in] config A UdsConfiguration for an index. + * @param [in] numCheckpoints The maximum number of checkpoints. + * @param [out] indexSize The number of bytes required to store + * the index. + * + * @return UDS_SUCCESS or an error code. + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsComputeIndexSize(const UdsConfiguration config, + unsigned int numCheckpoints, + uint64_t *indexSize); + +/** + * Opens an index session. + * + * Creates a session for an index. #udsOpenIndex must be called before + * the index can be used. + * + * Destroy the session with #udsDestroyIndexSession. + * + * @param [out] session A pointer to the new session + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsCreateIndexSession(struct uds_index_session **session); + +/** + * Fetches the UDS library version. + * + * @return The library version + **/ +UDS_ATTR_WARN_UNUSED_RESULT +const char *udsGetVersion(void); + +#ifdef __KERNEL__ +/** + * The name argument to #udsOpenIndex is a text string that names the index. + * The name should have the form "path", where path is the name of the block + * device. The path should not contain white space. The names can optionally + * contain size and/or offset options which give the number of bytes in the + * index and the byte offset to the start of the index. For example, the name + * "/dev/sda8 offset=409600 size=2048000000" is an index that is stored in + * 2040000000 bytes of /dev/sda8 starting at byte 409600. + **/ +#else +/** + * The name argument to #udsOpenIndex is a text string that names the index. + * The name should have the form "path", where path is the name of the file or + * block device. The path should not contain white space. 
The name can + * optionally contain size and/or offset options which give the number of bytes + * in the index and the byte offset to the start of the index. For example, + * the name "/dev/sda8 offset=409600 size=2048000000" is an index that is + * stored in 2040000000 bytes of /dev/sda8 starting at byte 409600. + **/ +#endif + +/** + * Opens an index with an existing session. This operation will fail if the + * index session is suspended, or if there is already an open index. + * + * The index should be closed with #udsCloseIndex. + * + * @param openType The type of open, which is one of #UDS_LOAD, #UDS_CREATE, + * or #UDS_NO_REBUILD. + * @param name The name of the index + * @param params The index session parameters. If NULL, the default + * session parameters will be used. + * @param conf The index configuration + * @param session The index session + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsOpenIndex(UdsOpenIndexType openType, + const char *name, + const struct uds_parameters *params, + UdsConfiguration conf, + struct uds_index_session *session); + +/** + * Waits until all callbacks for index operations are complete, and prevents + * new index operations from starting. Index operations will return + * UDS_SUSPENDED until #udsResumeIndexSession is called. Optionally saves all + * index data before returning. + * + * @param session The session to suspend + * @param save Whether to save index data + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsSuspendIndexSession(struct uds_index_session *session, bool save); + +/** + * Allows new index operations for an index, whether it was suspended or not. + * + * @param session The session to resume + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsResumeIndexSession(struct uds_index_session *session); + +/** + * Waits until all callbacks for index operations are complete. + * + * @param [in] session The session to flush + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsFlushIndexSession(struct uds_index_session *session); + +/** + * Closes an index. This operation will fail if the index session is + * suspended. + * + * Saves changes to the index so that #udsOpenIndex can re-open it. + * + * @param [in] session The session containing the index to close + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsCloseIndex(struct uds_index_session *session); + +/** + * Destroys an index session. + * + * Saves changes to the index and closes the index if one is open. + * Use #udsDestroyIndexSession for index sessions created by + * #udsCreateIndexSession. + * + * @param [in] session The session to destroy + * + * @return Either #UDS_SUCCESS or an error code + **/ +int udsDestroyIndexSession(struct uds_index_session *session); + +/** + * Returns the configuration for the given index session. + * + * @param [in] session The session + * @param [out] conf The index configuration + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsGetIndexConfiguration(struct uds_index_session *session, + UdsConfiguration *conf); + +/** + * Fetches index statistics for the given index session. 
+ * + * @param [in] session The session + * @param [out] stats The index statistics structure to fill + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsGetIndexStats(struct uds_index_session *session, UdsIndexStats *stats); + +/** + * Fetches index session statistics for the given index session. + * + * @param [in] session The session + * @param [out] stats The context statistics structure to fill + * + * @return Either #UDS_SUCCESS or an error code + **/ +UDS_ATTR_WARN_UNUSED_RESULT +int udsGetIndexSessionStats(struct uds_index_session *session, + UdsContextStats *stats); + +/** + * Convert an error code to a string. + * + * @param errnum The error code + * @param buf The buffer to hold the error string + * @param buflen The length of the buffer + * + * @return A pointer to buf + **/ +UDS_ATTR_WARN_UNUSED_RESULT +const char *udsStringError(int errnum, char *buf, size_t buflen); + +/** + * Suggested buffer size for udsStringError. + **/ +enum { + UDS_STRING_ERROR_BUFSIZE = 128 +}; + +#endif /* UDS_H */ diff --git a/source/uds/udsMain.c b/source/uds/udsMain.c new file mode 100644 index 0000000..8d4f411 --- /dev/null +++ b/source/uds/udsMain.c @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/udsMain.c#12 $ + */ + +#include "uds.h" + +#include "config.h" +#include "geometry.h" +#include "indexLayout.h" +#include "indexRouter.h" +#include "indexSession.h" +#include "loadType.h" +#include "logger.h" +#include "memoryAlloc.h" + +const UdsMemoryConfigSize UDS_MEMORY_CONFIG_MAX = 1024; +const UdsMemoryConfigSize UDS_MEMORY_CONFIG_256MB = (UdsMemoryConfigSize) -256; +const UdsMemoryConfigSize UDS_MEMORY_CONFIG_512MB = (UdsMemoryConfigSize) -512; +const UdsMemoryConfigSize UDS_MEMORY_CONFIG_768MB = (UdsMemoryConfigSize) -768; + +/* + * =========================================================================== + * UDS system management + * =========================================================================== + */ + +/**********************************************************************/ +int udsInitializeConfiguration(UdsConfiguration *userConfig, + UdsMemoryConfigSize memGB) +{ + if (userConfig == NULL) { + return logErrorWithStringError(UDS_CONF_PTR_REQUIRED, + "received a NULL config pointer"); + } + + /* Set the configuration parameters that change with memory size. If you + * change these values, you should also: + * + * Change Configuration_x1, which tests these values and expects to see them + * + * Bump the index configuration version number. This bump ensures that + * the test infrastructure will be forced to test the new configuration. 
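+ *
+ * As the code below shows, the sub-1GB configurations (256MB, 512MB, and
+ * 768MB) keep the default chapter count and scale the record pages per
+ * chapter, while configurations of 1GB and larger scale the chapter count
+ * and keep the default record pages per chapter.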
+ */ + + unsigned int chaptersPerVolume, recordPagesPerChapter; + if (memGB == UDS_MEMORY_CONFIG_256MB) { + chaptersPerVolume = DEFAULT_CHAPTERS_PER_VOLUME; + recordPagesPerChapter = SMALL_RECORD_PAGES_PER_CHAPTER; + } else if (memGB == UDS_MEMORY_CONFIG_512MB) { + chaptersPerVolume = DEFAULT_CHAPTERS_PER_VOLUME; + recordPagesPerChapter = 2 * SMALL_RECORD_PAGES_PER_CHAPTER; + } else if (memGB == UDS_MEMORY_CONFIG_768MB) { + chaptersPerVolume = DEFAULT_CHAPTERS_PER_VOLUME; + recordPagesPerChapter = 3 * SMALL_RECORD_PAGES_PER_CHAPTER; + } else if (memGB == 1) { + chaptersPerVolume = DEFAULT_CHAPTERS_PER_VOLUME; + recordPagesPerChapter = DEFAULT_RECORD_PAGES_PER_CHAPTER; + } else if ((memGB > 1) && (memGB <= UDS_MEMORY_CONFIG_MAX)) { + chaptersPerVolume = memGB * DEFAULT_CHAPTERS_PER_VOLUME; + recordPagesPerChapter = DEFAULT_RECORD_PAGES_PER_CHAPTER; + } else { + return UDS_INVALID_MEMORY_SIZE; + } + + int result = ALLOCATE(1, struct udsConfiguration, "udsConfiguration", + userConfig); + if (result != UDS_SUCCESS) { + return result; + } + + (*userConfig)->recordPagesPerChapter = recordPagesPerChapter; + (*userConfig)->chaptersPerVolume = chaptersPerVolume; + (*userConfig)->sparseChaptersPerVolume = DEFAULT_SPARSE_CHAPTERS_PER_VOLUME; + (*userConfig)->cacheChapters = DEFAULT_CACHE_CHAPTERS; + (*userConfig)->checkpointFrequency = DEFAULT_CHECKPOINT_FREQUENCY; + (*userConfig)->masterIndexMeanDelta = DEFAULT_MASTER_INDEX_MEAN_DELTA; + (*userConfig)->bytesPerPage = DEFAULT_BYTES_PER_PAGE; + (*userConfig)->sparseSampleRate = DEFAULT_SPARSE_SAMPLE_RATE; + (*userConfig)->nonce = 0; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void udsConfigurationSetSparse(UdsConfiguration userConfig, bool sparse) +{ + bool prevSparse = (userConfig->sparseChaptersPerVolume != 0); + if (sparse == prevSparse) { + // nothing to do + return; + } + + unsigned int prevChaptersPerVolume = userConfig->chaptersPerVolume; + if (sparse) { + // Index 10TB with 4K blocks, 95% sparse, fit in dense (1TB) footprint + userConfig->chaptersPerVolume = 10 * prevChaptersPerVolume; + userConfig->sparseChaptersPerVolume = 9 * prevChaptersPerVolume + + prevChaptersPerVolume / 2; + userConfig->sparseSampleRate = 32; + } else { + userConfig->chaptersPerVolume = prevChaptersPerVolume / 10; + userConfig->sparseChaptersPerVolume = 0; + userConfig->sparseSampleRate = 0; + } +} + +/**********************************************************************/ +bool udsConfigurationGetSparse(UdsConfiguration userConfig) +{ + return userConfig->sparseChaptersPerVolume > 0; +} + +/**********************************************************************/ +void udsConfigurationSetNonce(UdsConfiguration userConfig, UdsNonce nonce) +{ + userConfig->nonce = nonce; +} + +/**********************************************************************/ +UdsNonce udsConfigurationGetNonce(UdsConfiguration userConfig) +{ + return userConfig->nonce; +} + +/**********************************************************************/ +unsigned int udsConfigurationGetMemory(UdsConfiguration userConfig) +{ + enum { + CHAPTERS = DEFAULT_CHAPTERS_PER_VOLUME, + SMALL_PAGES = CHAPTERS * SMALL_RECORD_PAGES_PER_CHAPTER, + LARGE_PAGES = CHAPTERS * DEFAULT_RECORD_PAGES_PER_CHAPTER + }; + unsigned int pages = (userConfig->chaptersPerVolume + * userConfig->recordPagesPerChapter); + if (userConfig->sparseChaptersPerVolume != 0) { + pages /= 10; + } + switch (pages) { + case SMALL_PAGES: return UDS_MEMORY_CONFIG_256MB; + case 2 * SMALL_PAGES: 
return UDS_MEMORY_CONFIG_512MB; + case 3 * SMALL_PAGES: return UDS_MEMORY_CONFIG_768MB; + default: return pages / LARGE_PAGES; + } +} + +/**********************************************************************/ +unsigned int +udsConfigurationGetChaptersPerVolume(UdsConfiguration userConfig) +{ + return userConfig->chaptersPerVolume; +} + +/**********************************************************************/ +void udsFreeConfiguration(UdsConfiguration userConfig) +{ + FREE(userConfig); +} + +/**********************************************************************/ +int udsCreateIndexSession(struct uds_index_session **session) +{ + if (session == NULL) { + return UDS_NO_INDEXSESSION; + } + + struct uds_index_session *indexSession = NULL; + int result = makeEmptyIndexSession(&indexSession); + if (result != UDS_SUCCESS) { + return result; + } + + *session = indexSession; + return UDS_SUCCESS; +} + +/**********************************************************************/ +static +int initializeIndexSessionWithLayout(struct uds_index_session *indexSession, + IndexLayout *layout, + const struct uds_parameters *userParams, + LoadType loadType) +{ + int result = ((loadType == LOAD_CREATE) + ? writeIndexConfig(layout, &indexSession->userConfig) + : verifyIndexConfig(layout, &indexSession->userConfig)); + if (result != UDS_SUCCESS) { + return result; + } + + Configuration *indexConfig; + result = makeConfiguration(&indexSession->userConfig, &indexConfig); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "Failed to allocate config"); + return result; + } + + // Zero the stats for the new index. + memset(&indexSession->stats, 0, sizeof(indexSession->stats)); + + result = makeIndexRouter(layout, indexConfig, userParams, loadType, + &indexSession->loadContext, enterCallbackStage, + &indexSession->router); + freeConfiguration(indexConfig); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "Failed to make router"); + return result; + } + + logUdsConfiguration(&indexSession->userConfig); + return UDS_SUCCESS; +} + +/**********************************************************************/ +static int initializeIndexSession(struct uds_index_session *indexSession, + const char *name, + const struct uds_parameters *userParams, + LoadType loadType) +{ + IndexLayout *layout; + int result = makeIndexLayout(name, loadType == LOAD_CREATE, + &indexSession->userConfig, &layout); + if (result != UDS_SUCCESS) { + return result; + } + + result = initializeIndexSessionWithLayout(indexSession, layout, userParams, + loadType); + putIndexLayout(&layout); + return result; +} + +/**********************************************************************/ +int udsOpenIndex(UdsOpenIndexType openType, + const char *name, + const struct uds_parameters *userParams, + UdsConfiguration userConfig, + struct uds_index_session *session) +{ + if (name == NULL) { + return UDS_INDEX_NAME_REQUIRED; + } + if (userConfig == NULL) { + return UDS_CONF_REQUIRED; + } + if (session == NULL) { + return UDS_NO_INDEXSESSION; + } + + int result = startLoadingIndexSession(session); + if (result != UDS_SUCCESS) { + return result; + } + + session->userConfig = *userConfig; + + // Map the external openType to the internal loadType + LoadType loadType = openType == UDS_CREATE ? LOAD_CREATE + : openType == UDS_NO_REBUILD ? 
LOAD_LOAD + : LOAD_REBUILD; + logNotice("%s: %s", getLoadType(loadType), name); + + result = initializeIndexSession(session, name, userParams, loadType); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "Failed %s", getLoadType(loadType)); + saveAndFreeIndex(session); + } + + finishLoadingIndexSession(session, result); + return sansUnrecoverable(result); +} + +/**********************************************************************/ +const char *udsGetVersion(void) +{ +#ifdef UDS_VERSION + return UDS_VERSION; +#else + return "internal version"; +#endif +} + +/**********************************************************************/ +const char *udsStringError(int errnum, char *buf, size_t buflen) +{ + if (buf == NULL) { + return NULL; + } + + return stringError(errnum, buf, buflen); +} diff --git a/source/uds/udsModule.c b/source/uds/udsModule.c new file mode 100644 index 0000000..007f1a8 --- /dev/null +++ b/source/uds/udsModule.c @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/kernelLinux/uds/udsModule.c#32 $ + */ + +#include + +#include "buffer.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "murmur/MurmurHash3.h" +#include "sysfs.h" +#include "timeUtils.h" +#include "uds.h" +#include "uds-block.h" +#include "util/funnelQueue.h" + +/**********************************************************************/ +static int __init dedupeInit(void) +{ + memoryInit(); + logInfo("loaded version %s", UDS_VERSION); + initSysfs(); + return 0; +} + +/**********************************************************************/ +static void __exit dedupeExit(void) +{ + putSysfs(); + memoryExit(); + logInfo("unloaded version %s", UDS_VERSION); +} + +/**********************************************************************/ +module_init(dedupeInit); +module_exit(dedupeExit); + +EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_256MB); +EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_512MB); +EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_768MB); +EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_MAX); +EXPORT_SYMBOL_GPL(udsInitializeConfiguration); +EXPORT_SYMBOL_GPL(udsComputeIndexSize); +EXPORT_SYMBOL_GPL(udsConfigurationSetNonce); +EXPORT_SYMBOL_GPL(udsConfigurationGetNonce); +EXPORT_SYMBOL_GPL(udsConfigurationSetSparse); +EXPORT_SYMBOL_GPL(udsConfigurationGetSparse); +EXPORT_SYMBOL_GPL(udsConfigurationGetMemory); +EXPORT_SYMBOL_GPL(udsConfigurationGetChaptersPerVolume); +EXPORT_SYMBOL_GPL(udsFreeConfiguration); +EXPORT_SYMBOL_GPL(udsGetVersion); +EXPORT_SYMBOL_GPL(udsCreateIndexSession); +EXPORT_SYMBOL_GPL(udsOpenIndex); +EXPORT_SYMBOL_GPL(udsSuspendIndexSession); +EXPORT_SYMBOL_GPL(udsResumeIndexSession); +EXPORT_SYMBOL_GPL(udsCloseIndex); +EXPORT_SYMBOL_GPL(udsDestroyIndexSession); +EXPORT_SYMBOL_GPL(udsFlushIndexSession); 
+EXPORT_SYMBOL_GPL(udsGetIndexConfiguration); +EXPORT_SYMBOL_GPL(udsGetIndexStats); +EXPORT_SYMBOL_GPL(udsGetIndexSessionStats); +EXPORT_SYMBOL_GPL(udsStringError); +EXPORT_SYMBOL_GPL(udsStartChunkOperation); + +EXPORT_SYMBOL_GPL(allocSprintf); +EXPORT_SYMBOL_GPL(allocateMemory); +EXPORT_SYMBOL_GPL(allocateMemoryNowait); +EXPORT_SYMBOL_GPL(assertionFailed); +EXPORT_SYMBOL_GPL(assertionFailedLogOnly); +EXPORT_SYMBOL_GPL(availableSpace); +EXPORT_SYMBOL_GPL(bufferLength); +EXPORT_SYMBOL_GPL(bufferUsed); +EXPORT_SYMBOL_GPL(clearBuffer); +EXPORT_SYMBOL_GPL(compactBuffer); +EXPORT_SYMBOL_GPL(contentLength); +EXPORT_SYMBOL_GPL(copyBytes); +EXPORT_SYMBOL_GPL(currentTime); +EXPORT_SYMBOL_GPL(duplicateString); +EXPORT_SYMBOL_GPL(ensureAvailableSpace); +EXPORT_SYMBOL_GPL(equalBuffers); +EXPORT_SYMBOL_GPL(fixedSprintf); +EXPORT_SYMBOL_GPL(freeBuffer); +EXPORT_SYMBOL_GPL(freeFunnelQueue); +EXPORT_SYMBOL_GPL(freeMemory); +EXPORT_SYMBOL_GPL(funnelQueuePoll); +EXPORT_SYMBOL_GPL(getBoolean); +EXPORT_SYMBOL_GPL(getBufferContents); +EXPORT_SYMBOL_GPL(getByte); +EXPORT_SYMBOL_GPL(getBytesFromBuffer); +EXPORT_SYMBOL_GPL(getMemoryStats); +EXPORT_SYMBOL_GPL(getUInt16BEFromBuffer); +EXPORT_SYMBOL_GPL(getUInt16LEFromBuffer); +EXPORT_SYMBOL_GPL(getUInt16LEsFromBuffer); +EXPORT_SYMBOL_GPL(getUInt32BEFromBuffer); +EXPORT_SYMBOL_GPL(getUInt32BEsFromBuffer); +EXPORT_SYMBOL_GPL(getUInt32LEFromBuffer); +EXPORT_SYMBOL_GPL(getUInt64BEsFromBuffer); +EXPORT_SYMBOL_GPL(getUInt64LEFromBuffer); +EXPORT_SYMBOL_GPL(getUInt64LEsFromBuffer); +EXPORT_SYMBOL_GPL(growBuffer); +EXPORT_SYMBOL_GPL(hasSameBytes); +EXPORT_SYMBOL_GPL(isFunnelQueueEmpty); +EXPORT_SYMBOL_GPL(makeBuffer); +EXPORT_SYMBOL_GPL(makeFunnelQueue); +EXPORT_SYMBOL_GPL(MurmurHash3_x64_128); +EXPORT_SYMBOL_GPL(nowUsec); +EXPORT_SYMBOL_GPL(peekByte); +EXPORT_SYMBOL_GPL(putBoolean); +EXPORT_SYMBOL_GPL(putBuffer); +EXPORT_SYMBOL_GPL(putByte); +EXPORT_SYMBOL_GPL(putBytes); +EXPORT_SYMBOL_GPL(putInt64LEIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt16BEIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt16LEIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt16LEsIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt32BEIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt32BEsIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt32LEIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt64BEsIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt64LEIntoBuffer); +EXPORT_SYMBOL_GPL(putUInt64LEsIntoBuffer); +EXPORT_SYMBOL_GPL(reallocateMemory); +EXPORT_SYMBOL_GPL(registerAllocatingThread); +EXPORT_SYMBOL_GPL(reportMemoryUsage); +EXPORT_SYMBOL_GPL(resetBufferEnd); +EXPORT_SYMBOL_GPL(rewindBuffer); +EXPORT_SYMBOL_GPL(skipForward); +EXPORT_SYMBOL_GPL(uncompactedAmount); +EXPORT_SYMBOL_GPL(unregisterAllocatingThread); +EXPORT_SYMBOL_GPL(wrapBuffer); +EXPORT_SYMBOL_GPL(zeroBytes); + +/**********************************************************************/ + + +/**********************************************************************/ + +MODULE_DESCRIPTION("deduplication engine"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); +MODULE_VERSION(UDS_VERSION); diff --git a/source/uds/util/eventCount.c b/source/uds/util/eventCount.c new file mode 100644 index 0000000..7efeac6 --- /dev/null +++ b/source/uds/util/eventCount.c @@ -0,0 +1,317 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/util/eventCount.c#2 $ + */ + +/** + * This EventCount implementation uses a posix semaphore for portability, + * although a futex would be slightly superior to use and easy to substitute. + * It is designed to make signalling as cheap as possible, since that is the + * code path likely triggered on most updates to a lock-free data structure. + * Waiters are likely going to sleep, so optimizing for that case isn't + * necessary. + * + * The critical field is the state, which is really two fields that can be + * atomically updated in unison: an event counter and a waiter count. Every + * call to eventCountPrepare() issues a wait token by atomically incrementing + * the waiter count. The key invariant is a strict accounting of the number of + * tokens issued. Every token returned by eventCountPrepare() is a contract + * that the caller will call acquireSemaphore() and a signaller will call + * releaseSemaphore(), each exactly once. Atomic updates to the state field + * ensure that each token is counted once and that tokens are not lost. + * Cancelling a token attempts to take a fast-path by simply decrementing the + * waiters field, but if the token has already been claimed by a signaller, + * the canceller must still wait on the semaphore to consume the transferred + * token. + * + * The state field is 64 bits, partitioned into a 16-bit waiter field and a + * 48-bit counter. We are unlikely to have 2^16 threads, much less 2^16 + * threads waiting on any single event transition. 2^48 microseconds is + * several years, so a token holder would have to wait that long for the + * counter to wrap around, and then call eventCountWait() at the exact right + * time to see the re-used counter, in order to lose a wakeup due to counter + * wrap-around. Using a 32-bit state field would greatly increase that chance, + * but if forced to do so, the implementation could likely tolerate it since + * callers are supposed to hold tokens for miniscule periods of time. + * Fortunately, x64 has 64-bit compare-and-swap, and the performance of + * interlocked 64-bit operations appears to be about the same as for 32-bit + * ones, so being paranoid and using 64 bits costs us nothing. + * + * Here are some sequences of calls and state transitions: + * + * action postcondition + * counter waiters semaphore + * initialized 0 0 0 + * prepare 0 1 0 + * wait (blocks) 0 1 0 + * signal 1 0 1 + * wait (unblocks) 1 0 0 + * + * signal (fast-path) 1 0 0 + * signal (fast-path) 1 0 0 + * + * prepare A 1 1 0 + * prepare B 1 2 0 + * signal 2 0 2 + * wait B (fast-path) 2 0 1 + * wait A (fast-path) 2 0 0 + * + * prepare 2 1 0 + * cancel (fast-path) 2 0 0 + * + * prepare 2 1 0 + * signal 3 0 1 + * cancel (must wait) 3 0 0 + * + * The EventCount structure is aligned, sized, and allocated to cache line + * boundaries to avoid any false sharing between the EventCount and other + * shared state. The state field and semaphore should fit on a single cache + * line. 
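 *
 * To make the encoding concrete (ONE_EVENT and WAITERS_MASK are defined
 * below): with an event counter of 5 and two prepared waiters, the state
 * word holds (5 * ONE_EVENT) + 2. A broadcast replaces it with
 * (6 * ONE_EVENT) + 0 in a single compare-and-swap and then posts the
 * semaphore twice, once for each claimed wait token.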
The instrumentation counters increase the size of the structure so it + * rounds up to use two (64-byte x86) cache lines. + * + * XXX Need interface to access or display instrumentation counters. + **/ + +#include "eventCount.h" + +#include "atomicDefs.h" +#include "common.h" +#include "compiler.h" +#include "cpu.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "threads.h" + +enum { + ONE_WAITER = 1, // value used to increment the waiters field + ONE_EVENT = (1 << 16), // value used to increment the event counter + WAITERS_MASK = (ONE_EVENT - 1), // bit mask to access the waiters field + EVENTS_MASK = ~WAITERS_MASK, // bit mask to access the event counter +}; + +struct eventCount { + // Atomically mutable state: + // low 16 bits: the number of wait tokens not posted to the semaphore + // high 48 bits: current event counter + atomic64_t state; + + // Semaphore used to block threads when waiting is required. + Semaphore semaphore; + + // Instrumentation counters. + + // Declare alignment so we don't share a cache line. +} __attribute__((aligned(CACHE_LINE_BYTES))); + +/** + * Test the event field in two tokens for equality. + * + * @return true iff the tokens contain the same event field value + **/ +static INLINE bool sameEvent(EventToken token1, EventToken token2) +{ + return ((token1 & EVENTS_MASK) == (token2 & EVENTS_MASK)); +} + +/**********************************************************************/ +void eventCountBroadcast(EventCount *ec) +{ + + // Even if there are no waiters (yet), we will need a memory barrier. + smp_mb(); + + uint64_t waiters; + uint64_t state = atomic64_read(&ec->state); + uint64_t oldState = state; + do { + // Check if there are any tokens that have not yet been been transferred + // to the semaphore. This is the fast no-waiters path. + waiters = (state & WAITERS_MASK); + if (waiters == 0) { + // Fast path first time through--no need to signal or post if there are + // no observers. + return; + } + + /* + * Attempt to atomically claim all the wait tokens and bump the event count + * using an atomic compare-and-swap. This operation contains a memory + * barrier. + */ + EventToken newState = ((state & ~WAITERS_MASK) + ONE_EVENT); + oldState = state; + state = atomic64_cmpxchg(&ec->state, oldState, newState); + // The cmpxchg fails when we lose a race with a new waiter or another + // signaller, so try again. + } while (unlikely(state != oldState)); + + + /* + * Wake the waiters by posting to the semaphore. This effectively transfers + * the wait tokens to the semaphore. There's sadly no bulk post for posix + * semaphores, so we've got to loop to do them all. + */ + while (waiters-- > 0) { + releaseSemaphore(&ec->semaphore); + } +} + +/** + * Attempt to cancel a prepared wait token by decrementing the + * number of waiters in the current state. This can only be done + * safely if the event count hasn't been bumped. + * + * @param ec the event count on which the wait token was issued + * @param token the wait to cancel + * + * @return true if the wait was cancelled, false if the caller must + * still wait on the semaphore + **/ +static INLINE bool fastCancel(EventCount *ec, EventToken token) +{ + EventToken currentToken = atomic64_read(&ec->state); + while (sameEvent(currentToken, token)) { + // Try to decrement the waiter count via compare-and-swap as if we had + // never prepared to wait. 
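    // Subtracting 1 decrements the waiters field directly, since it occupies
    // the low-order bits of the packed state word.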
+ EventToken et = atomic64_cmpxchg(&ec->state, currentToken, + currentToken - 1); + if (et == currentToken) { + return true; + } + currentToken = et; + } + return false; +} + +/** + * Consume a token from the semaphore, waiting (with an optional timeout) if + * one is not currently available. Also attempts to count the number of times + * we'll actually have to wait because there are no tokens (permits) available + * in the semaphore, and the number of times the wait times out. + * + * @param ec the event count instance + * @param timeout an optional timeout value to pass to attemptSemaphore() + * + * @return true if a token was consumed, otherwise false only if a timeout + * was specified and we timed out + **/ +static bool consumeWaitToken(EventCount *ec, const RelTime *timeout) +{ + // Try to grab a token without waiting. + if (attemptSemaphore(&ec->semaphore, 0)) { + return true; + } + + + if (timeout == NULL) { + acquireSemaphore(&ec->semaphore); + } else if (!attemptSemaphore(&ec->semaphore, *timeout)) { + return false; + } + return true; +} + +/**********************************************************************/ +int makeEventCount(EventCount **ecPtr) +{ + // The event count will be allocated on a cache line boundary so there will + // not be false sharing of the line with any other data structure. + EventCount *ec = NULL; + int result = ALLOCATE(1, EventCount, "event count", &ec); + if (result != UDS_SUCCESS) { + return result; + } + + atomic64_set(&ec->state, 0); + result = initializeSemaphore(&ec->semaphore, 0); + if (result != UDS_SUCCESS) { + FREE(ec); + return result; + } + + *ecPtr = ec; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeEventCount(EventCount *ec) +{ + if (ec == NULL) { + return; + } + destroySemaphore(&ec->semaphore); + FREE(ec); +} + +/**********************************************************************/ +EventToken eventCountPrepare(EventCount *ec) +{ + return atomic64_add_return(ONE_WAITER, &ec->state); +} + +/**********************************************************************/ +void eventCountCancel(EventCount *ec, EventToken token) +{ + // Decrement the waiter count if the event hasn't been signalled. + if (fastCancel(ec, token)) { + return; + } + // A signaller has already transferred (or promised to transfer) our token + // to the semaphore, so we must consume it from the semaphore by waiting. + eventCountWait(ec, token, NULL); +} + +/**********************************************************************/ +bool eventCountWait(EventCount *ec, EventToken token, const RelTime *timeout) +{ + + for (;;) { + // Wait for a signaller to transfer our wait token to the semaphore. + if (!consumeWaitToken(ec, timeout)) { + // The wait timed out, so we must cancel the token instead. Try to + // decrement the waiter count if the event hasn't been signalled. + if (fastCancel(ec, token)) { + return false; + } + /* + * We timed out, but a signaller came in before we could cancel the + * wait. We have no choice but to wait for the semaphore to be posted. + * Since signaller has promised to do it, the wait will be short. The + * timeout and the signal happened at about the same time, so either + * outcome could be returned. It's simpler to ignore the timeout. + */ + timeout = NULL; + continue; + } + + // A wait token has now been consumed from the semaphore. + + // Stop waiting if the count has changed since the token was acquired. 
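    // (sameEvent() compares only the event-counter bits, so a change in the
    // waiter count alone does not end the wait.)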
+ if (!sameEvent(token, atomic64_read(&ec->state))) { + return true; + } + + // We consumed someone else's wait token. Put it back in the semaphore, + // which will wake another waiter, hopefully one who can stop waiting. + releaseSemaphore(&ec->semaphore); + + // Attempt to give an earlier waiter a shot at the semaphore. + yieldScheduler(); + } +} diff --git a/source/uds/util/eventCount.h b/source/uds/util/eventCount.h new file mode 100644 index 0000000..e3f2a33 --- /dev/null +++ b/source/uds/util/eventCount.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/util/eventCount.h#1 $ + */ + +#ifndef EVENT_COUNT_H +#define EVENT_COUNT_H + +#include "timeUtils.h" +#include "typeDefs.h" + +/** + * An EventCount is a lock-free equivalent of a condition variable. + * + * Using an EventCount, a lock-free producer/consumer can wait for a state + * change (adding an item to an empty queue, for example) without spinning or + * falling back on the use of mutex-based locks. Signalling is cheap when + * there are no waiters (a memory fence), and preparing to wait is + * also inexpensive (an atomic add instruction). + * + * A lock-free producer should call eventCountBroadcast() after any mutation + * to the lock-free data structure that a consumer might be waiting on. The + * consumers should poll for work like this: + * + * for (;;) { + * // Fast path--no additional cost to consumer. + * if (lockFreeDequeue(&item)) { + * return item; + * } + * // Two-step wait: get current token and poll state, either cancelling + * // the wait or waiting for the token to be signalled. + * EventToken token = eventCountPrepare(ec); + * if (lockFreeDequeue(&item)) { + * eventCountCancel(ec, token); + * return item; + * } + * eventCountWait(ec, token, NULL); + * // State has changed, but must check condition again, so loop. + * } + * + * Once eventCountPrepare() is called, the caller should neither dally, sleep, + * nor perform long-running or blocking actions before passing the token to + * eventCountCancel() or eventCountWait(). The implementation is optimized for + * a short polling window, and will not perform well if there are outstanding + * tokens that have been signalled but not waited upon. + **/ + +typedef struct eventCount EventCount; + +typedef unsigned int EventToken; + +/** + * Allocate and initialize an EventCount. + * + * @param ecPtr a pointer to hold the new EventCount + **/ +__attribute__((warn_unused_result)) +int makeEventCount(EventCount **ecPtr); + +/** + * Free an EventCount. It must no longer be in use. + * + * @param ec the EventCount to free + **/ +void freeEventCount(EventCount *ec); + +/** + * Wake all threads that are waiting for the next event. 
+ * + * @param ec the EventCount to signal + **/ +void eventCountBroadcast(EventCount *ec); + +/** + * Prepare to wait for the EventCount to change by capturing a token of its + * current state. The caller MUST eventually either call eventCountWait() or + * eventCountCancel() exactly once for each token obtained. + * + * @param ec the EventCount on which to prepare to wait + * + * @return an EventToken to be passed to the next eventCountWait() call + **/ +EventToken eventCountPrepare(EventCount *ec) + __attribute__((warn_unused_result)); + +/** + * Cancel a wait token that has been prepared but not waited upon. This must + * be called after eventCountPrepare() when eventCountWait() is not going to + * be invoked on the token. + * + * @param ec the EventCount from which a wait token was obtained + * @param token the wait token that will never be passed to eventCountWait() + **/ +void eventCountCancel(EventCount *ec, EventToken token); + +/** + * Check if the current event count state corresponds to the provided token, + * and if it is, wait for a signal that the state has changed. If an optional + * timeout is provided, the wait will terminate after the timeout has elapsed. + * Timing out automatically cancels the wait token, so callers must not + * attempt to cancel the token on timeout. + * + * @param ec the EventCount on which to wait + * @param token the EventToken returned by eventCountPrepare() + * @param timeout either NULL or a relative timeout for the wait operation + * + * @return true if the state has already changed or if signalled, otherwise + * false if a timeout was provided and the wait timed out + **/ +bool eventCountWait(EventCount *ec, EventToken token, const RelTime *timeout); + +#endif /* EVENT_COUNT_H */ diff --git a/source/uds/util/funnelQueue.c b/source/uds/util/funnelQueue.c new file mode 100644 index 0000000..017e405 --- /dev/null +++ b/source/uds/util/funnelQueue.c @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/util/funnelQueue.c#2 $ + */ + +#include "funnelQueue.h" + +#include "memoryAlloc.h" +#include "permassert.h" +#include "uds.h" + +/**********************************************************************/ +int makeFunnelQueue(FunnelQueue **queuePtr) +{ + // Allocate the queue on a cache line boundary so the producer and consumer + // fields in the structure will land on separate cache lines. + FunnelQueue *queue; + int result = ALLOCATE(1, FunnelQueue, "funnel queue", &queue); + if (result != UDS_SUCCESS) { + return result; + } + + // Initialize the stub entry and put it in the queue, establishing the + // invariant that queue->newest and queue->oldest are never null. 
+ queue->stub.next = NULL; + queue->newest = &queue->stub; + queue->oldest = &queue->stub; + + *queuePtr = queue; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeFunnelQueue(FunnelQueue *queue) +{ + FREE(queue); +} + +/**********************************************************************/ +static FunnelQueueEntry *getOldest(FunnelQueue *queue) +{ + /* + * Barrier requirements: We need a read barrier between reading a "next" + * field pointer value and reading anything it points to. There's an + * accompanying barrier in funnelQueuePut between its caller setting up the + * entry and making it visible. + */ + FunnelQueueEntry *oldest = queue->oldest; + FunnelQueueEntry *next = oldest->next; + + if (oldest == &queue->stub) { + // When the oldest entry is the stub and it has no successor, the queue is + // logically empty. + if (next == NULL) { + return NULL; + } + // The stub entry has a successor, so the stub can be dequeued and ignored + // without breaking the queue invariants. + oldest = next; + queue->oldest = oldest; + smp_read_barrier_depends(); + next = oldest->next; + } + + // We have a non-stub candidate to dequeue. If it lacks a successor, we'll + // need to put the stub entry back on the queue first. + if (next == NULL) { + FunnelQueueEntry *newest = queue->newest; + if (oldest != newest) { + // Another thread has already swung queue->newest atomically, but not + // yet assigned previous->next. The queue is really still empty. + return NULL; + } + + // Put the stub entry back on the queue, ensuring a successor will + // eventually be seen. + funnelQueuePut(queue, &queue->stub); + + // Check again for a successor. + next = oldest->next; + if (next == NULL) { + // We lost a race with a producer who swapped queue->newest before we + // did, but who hasn't yet updated previous->next. Try again later. + return NULL; + } + } + return oldest; +} + +/**********************************************************************/ +FunnelQueueEntry *funnelQueuePoll(FunnelQueue *queue) +{ + FunnelQueueEntry *oldest = getOldest(queue); + if (oldest == NULL) { + return oldest; + } + + /* + * Dequeue the oldest entry and return it. Only one consumer thread may call + * this function, so no locking, atomic operations, or fences are needed; + * queue->oldest is owned by the consumer and oldest->next is never used by + * a producer thread after it is swung from NULL to non-NULL. + */ + queue->oldest = oldest->next; + /* + * Make sure the caller sees the proper stored data for this entry. + * + * Since we've already fetched the entry pointer we stored in + * "queue->oldest", this also ensures that on entry to the next call we'll + * properly see the dependent data. + */ + smp_rmb(); + /* + * If "oldest" is a very light-weight work item, we'll be looking + * for the next one very soon, so prefetch it now. + */ + prefetchAddress(queue->oldest, true); + oldest->next = NULL; + return oldest; +} + +/**********************************************************************/ +bool isFunnelQueueEmpty(FunnelQueue *queue) +{ + return getOldest(queue) == NULL; +} + +/**********************************************************************/ +bool isFunnelQueueIdle(FunnelQueue *queue) +{ + /* + * Oldest is not the stub, so there's another entry, though if next is + * NULL we can't retrieve it yet. 
+ */ + if (queue->oldest != &queue->stub) { + return false; + } + + /* + * Oldest is the stub, but newest has been updated by _put(); either + * there's another, retrievable entry in the list, or the list is + * officially empty but in the intermediate state of having an entry + * added. + * + * Whether anything is retrievable depends on whether stub.next has + * been updated and become visible to us, but for idleness we don't + * care. And due to memory ordering in _put(), the update to newest + * would be visible to us at the same time or sooner. + */ + if (queue->newest != &queue->stub) { + return false; + } + + // Otherwise, we're idle. + return true; +} diff --git a/source/uds/util/funnelQueue.h b/source/uds/util/funnelQueue.h new file mode 100644 index 0000000..083d00b --- /dev/null +++ b/source/uds/util/funnelQueue.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/util/funnelQueue.h#2 $ + */ + +#ifndef FUNNEL_QUEUE_H +#define FUNNEL_QUEUE_H + +#include "atomicDefs.h" +#include "compiler.h" +#include "cpu.h" +#include "typeDefs.h" + +/** + * A FunnelQueue is a simple lock-free (almost) queue that accepts entries + * from multiple threads (multi-producer) and delivers them to a single thread + * (single-consumer). "Funnel" is an attempt to evoke the image of requests + * from more than one producer being "funneled down" to a single consumer. + * + * This is an unsynchronized but thread-safe data structure when used as + * intended. There is no mechanism to ensure that only one thread is consuming + * from the queue, so if that is done mistakenly, it will not be trapped, and + * the resulting behavior is undefined. Clients must not directly access or + * manipulate the internals, which are only exposed for the purpose of + * allowing the very simple enqueue operation to be in-lined. + * + * The implementation requires that a FunnelQueueEntry structure (a link + * pointer) be embedded in the queue entries, and pointers to those structures + * are used exclusively by the queue. No macros are defined to template the + * queue, so the offset of the FunnelQueueEntry in the records placed in the + * queue must all have a fixed offset so the client can derive their structure + * pointer from the entry pointer returned by funnelQueuePoll(). + * + * Callers are wholly responsible for allocating and freeing the entries. + * Entries may be freed as soon as they are returned since this queue is not + * susceptible to the "ABA problem" present in many lock-free data structures. + * The queue is dynamically allocated to ensure cache-line alignment, but no + * other dynamic allocation is used. + * + * The algorithm is not actually 100% lock-free. 
There is a single point in
+ * funnelQueuePut() at which a pre-empted producer will prevent the consumers
+ * from seeing items added to the queue by later producers, and only if the
+ * queue is short enough or the consumer fast enough for it to reach what was
+ * the end of the queue at the time of the pre-empt.
+ *
+ * The consumer function, funnelQueuePoll(), will return NULL when the queue
+ * is empty. To wait for data to consume, spin (if safe) or combine the queue
+ * with an EventCount to signal the presence of new entries.
+ **/
+
+/**
+ * The queue link structure that must be embedded in client entries.
+ **/
+typedef struct funnelQueueEntry {
+  // The next (newer) entry in the queue.
+  struct funnelQueueEntry * volatile next;
+} FunnelQueueEntry;
+
+/**
+ * The dynamically allocated queue structure, which is aligned to a cache line
+ * boundary when allocated. This should be considered opaque; it is exposed
+ * here so funnelQueuePut() can be in-lined.
+ **/
+typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) funnelQueue {
+  // The producers' end of the queue--an atomically exchanged pointer that
+  // will never be NULL.
+  FunnelQueueEntry * volatile newest;
+
+  // The consumer's end of the queue. Owned by the consumer and never NULL.
+  FunnelQueueEntry *oldest __attribute__((aligned(CACHE_LINE_BYTES)));
+
+  // A re-usable dummy entry used to provide the non-NULL invariants above.
+  FunnelQueueEntry stub;
+} FunnelQueue;
+
+/**
+ * Construct and initialize a new, empty queue.
+ *
+ * @param queuePtr a pointer in which to store the queue
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int makeFunnelQueue(FunnelQueue **queuePtr)
+  __attribute__((warn_unused_result));
+
+/**
+ * Free a queue.
+ *
+ * This will not free any entries in the queue. The caller must ensure that
+ * either the queue will be empty or that any entries in the queue will not be
+ * leaked by dropping the references from the queue.
+ *
+ * @param queue the queue to free
+ **/
+void freeFunnelQueue(FunnelQueue *queue);
+
+/**
+ * Put an entry on the end of the queue.
+ *
+ * The entry pointer must point to the FunnelQueueEntry embedded in the
+ * caller's data structure. The caller must be able to derive the address of
+ * the start of their data structure from the pointer passed in here, so every
+ * entry in the queue must have the FunnelQueueEntry at the same offset within
+ * the client's structure.
+ *
+ * @param queue the queue on which to place the entry
+ * @param entry the entry to be added to the queue
+ **/
+static INLINE void funnelQueuePut(FunnelQueue *queue, FunnelQueueEntry *entry)
+{
+  /*
+   * Barrier requirements: All stores relating to the entry ("next" pointer,
+   * containing data structure fields) must happen before the previous->next
+   * store making it visible to the consumer. Also, the entry's "next" field
+   * initialization to NULL must happen before any other producer threads can
+   * see the entry (the xchg) and try to update the "next" field.
+   *
+   * xchg implements a full barrier.
+   */
+  entry->next = NULL;
+  /*
+   * The xchg macro in the PPC kernel calls a function that takes a void*
+   * argument, triggering a warning about dropping the volatile qualifier.
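A minimal sketch of the client pattern the comments above describe: embedding
the FunnelQueueEntry in a record, pairing the queue with an EventCount, and
recovering the record from the entry returned by funnelQueuePoll(). The
Message type, the helper names, and the use of the usual kernel container_of()
macro are illustrative assumptions, not part of UDS; funnelQueue.h and
eventCount.h are assumed to be included.

typedef struct {
  FunnelQueueEntry queueEntry;  // embedded link; same offset in every record
  uint64_t         payload;
} Message;

// Producer: publish the record, then wake any sleeping consumer.
static void postMessage(FunnelQueue *queue, EventCount *ec, Message *message)
{
  funnelQueuePut(queue, &message->queueEntry);
  eventCountBroadcast(ec);
}

// Consumer: the two-step prepare/cancel/wait pattern from eventCount.h.
static Message *takeMessage(FunnelQueue *queue, EventCount *ec)
{
  for (;;) {
    FunnelQueueEntry *entry = funnelQueuePoll(queue);
    if (entry != NULL) {
      return container_of(entry, Message, queueEntry);
    }
    EventToken token = eventCountPrepare(ec);
    entry = funnelQueuePoll(queue);
    if (entry != NULL) {
      eventCountCancel(ec, token);
      return container_of(entry, Message, queueEntry);
    }
    eventCountWait(ec, token, NULL);
  }
}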
+ */ +#pragma GCC diagnostic push +#if __GNUC__ >= 5 +#pragma GCC diagnostic ignored "-Wdiscarded-qualifiers" +#endif + FunnelQueueEntry *previous = xchg(&queue->newest, entry); +#pragma GCC diagnostic pop + // Pre-empts between these two statements hide the rest of the queue from + // the consumer, preventing consumption until the following assignment runs. + previous->next = entry; +} + +/** + * Poll a queue, removing the oldest entry if the queue is not empty. This + * function must only be called from a single consumer thread. + * + * @param queue the queue from which to remove an entry + * + * @return the oldest entry in the queue, or NULL if the queue is empty. + **/ +FunnelQueueEntry *funnelQueuePoll(FunnelQueue *queue) + __attribute__((warn_unused_result)); + +/** + * Check whether the funnel queue is empty or not. This function must only be + * called from a single consumer thread, as with funnelQueuePoll. + * + * If the queue is in a transition state with one or more entries being added + * such that the list view is incomplete, it may not be possible to retrieve an + * entry with the funnelQueuePoll() function. In such states this function will + * report an empty indication. + * + * @param queue the queue which to check for entries. + * + * @return true iff queue contains no entry which can be retrieved + **/ +bool isFunnelQueueEmpty(FunnelQueue *queue) + __attribute__((warn_unused_result)); + +/** + * Check whether the funnel queue is idle or not. This function must only be + * called from a single consumer thread, as with funnel_queue_poll. + * + * If the queue has entries available to be retrieved, it is not idle. If the + * queue is in a transition state with one or more entries being added such + * that the list view is incomplete, it may not be possible to retrieve an + * entry with the funnel_queue_poll() function, but the queue will not be + * considered idle. + * + * @param queue the queue which to check for entries. + * + * @return true iff queue contains no entry which can be retrieved nor is + * known to be having an entry added + **/ +bool isFunnelQueueIdle(FunnelQueue *queue) + __attribute__((warn_unused_result)); + +#endif /* FUNNEL_QUEUE_H */ diff --git a/source/uds/util/radixSort.c b/source/uds/util/radixSort.c new file mode 100644 index 0000000..cae4f90 --- /dev/null +++ b/source/uds/util/radixSort.c @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/util/radixSort.c#2 $ + */ + +/* + * Radix sort is implemented using an American Flag sort, an unstable, + * in-place 8-bit radix exchange sort. + * + * Adapted from the algorithm in the paper by Peter M. McIlroy, Keith Bostic, + * and M. Douglas McIlroy, "Engineering Radix Sort". 
+ * http://www.usenix.org/publications/compsystems/1993/win_mcilroy.pdf
+ */
+
+#include "radixSort.h"
+
+#include "compiler.h"
+#include "memoryAlloc.h"
+#include "stringUtils.h"
+#include "typeDefs.h"
+#include "uds.h"
+
+enum {
+  // Piles smaller than this are handled with a simple insertion sort.
+  INSERTION_SORT_THRESHOLD = 12
+};
+
+// Sort keys are pointers to immutable fixed-length arrays of bytes.
+typedef const uint8_t * Key;
+
+/**
+ * The keys are separated into piles based on the byte in each
+ * key at the current offset, so the number of keys with each
+ * byte must be counted.
+ **/
+typedef struct {
+  uint16_t used;       // number of non-empty bins
+  uint16_t first;      // index (key byte) of the first non-empty bin
+  uint16_t last;       // index (key byte) of the last non-empty bin
+  uint32_t size[256];  // size[byte] == # of occurrences of byte
+} Histogram;
+
+/**
+ * Sub-tasks are manually managed on a stack, both for performance
+ * and to put a logarithmic bound on the stack space needed.
+ **/
+typedef struct {
+  Key *firstKey;     // Pointers to first and last keys to sort, inclusive.
+  Key *lastKey;
+  uint16_t offset;   // The offset into the key at which to continue sorting.
+  uint16_t length;   // The number of bytes remaining in the sort keys.
+} Task;
+
+struct radixSorter {
+  unsigned int count;
+  Histogram bins;
+  Key *pile[256];
+  Task *endOfStack;
+  Task isList[256];
+  Task stack[];
+};
+
+/**
+ * Compare a segment of two fixed-length keys starting at an offset.
+ *
+ * @param key1   the first key
+ * @param key2   the second key
+ * @param offset the offset into the keys of the first byte to compare
+ * @param length the number of bytes remaining in each key
+ **/
+static INLINE int compare(Key key1, Key key2, uint16_t offset, uint16_t length)
+{
+  return memcmp(&key1[offset], &key2[offset], length);
+}
+
+/**
+ * Insert the next unsorted key into an array of sorted keys.
+ *
+ * @param task the description of the keys being sorted
+ * @param next the pointer to the unsorted key to insert into
+ *             the array of sorted key pointers preceding it
+ **/
+static INLINE void insertKey(const Task task, Key *next)
+{
+  // Pull the unsorted key out, freeing up the array slot.
+  Key unsorted = *next;
+  // Compare the key to the preceding sorted entries, shifting
+  // down the ones that are larger.
+  while ((--next >= task.firstKey)
+         && (compare(unsorted, next[0], task.offset, task.length) < 0)) {
+    next[1] = next[0];
+  }
+  // Insert the key into the last slot that was cleared, sorting it.
+  next[1] = unsorted;
+}
+
+/**
+ * Sort a range of key segments using an insertion sort. This simple sort is
+ * faster than the 256-way radix sort when the number of keys to sort is
+ * small.
+ *
+ * @param task the description of the keys to sort
+ **/
+static INLINE void insertionSort(const Task task)
+{
+  // (firstKey .. firstKey) is trivially sorted. Repeatedly insert the next
+  // key into the sorted list of keys preceding it, and voila!
+  Key *next;
+  for (next = task.firstKey + 1; next <= task.lastKey; next++) {
+    insertKey(task, next);
+  }
+}
+
+/**
+ * Push a sorting task onto the task stack, increasing the stack pointer.
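 *
 * @param stackPointer pointer to the current top of the task stack; it is
 *                     advanced past the newly pushed task
 * @param firstKey the first key of the pile to be sorted
 * @param count the number of keys in the pile
 * @param offset the offset into the keys at which to continue sorting
 * @param length the number of bytes remaining in the sort keys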
+ **/
+static INLINE void pushTask(Task **stackPointer,
+                            Key *firstKey,
+                            uint32_t count,
+                            uint16_t offset,
+                            uint16_t length)
+{
+  Task *task = (*stackPointer)++;
+  task->firstKey = firstKey;
+  task->lastKey = &firstKey[count - 1];
+  task->offset = offset;
+  task->length = length;
+}
+
+/**********************************************************************/
+static INLINE void swapKeys(Key *a, Key *b)
+{
+  Key c = *a;
+  *a = *b;
+  *b = c;
+}
+
+/**
+ * Count the number of times each byte value appears in the arrays of keys
+ * to sort at the current offset, keeping track of the number of non-empty
+ * bins, and the index of the first and last non-empty bin.
+ *
+ * @param task the description of the keys to sort
+ * @param bins the histogram bins receiving the counts
+ **/
+static INLINE void measureBins(const Task task, Histogram *bins)
+{
+  // Set bogus values that will be replaced by min and max, respectively.
+  bins->first = UINT8_MAX;
+  bins->last = 0;
+
+  // Subtle invariant: bins->used and bins->size[] are zero because the
+  // sorting code clears it all out as it goes. Even though this structure is
+  // re-used, we don't need to pay to zero it before starting a new tally.
+
+  Key *keyPtr;
+  for (keyPtr = task.firstKey; keyPtr <= task.lastKey; keyPtr++) {
+    // Increment the count for the byte in the key at the current offset.
+    uint8_t bin = (*keyPtr)[task.offset];
+    uint32_t size = ++bins->size[bin];
+
+    // Track non-empty bins when the count transitions from zero to one.
+    if (size == 1) {
+      bins->used += 1;
+      if (bin < bins->first) {
+        bins->first = bin;
+      }
+      if (bin > bins->last) {
+        bins->last = bin;
+      }
+    }
+  }
+}
+
+/**
+ * Convert the bin sizes to pointers to where each pile goes.
+ *
+ * pile[0] = firstKey + bin->size[0],
+ * pile[1] = pile[0] + bin->size[1], etc.
+ *
+ * After the keys are moved to the appropriate pile, we'll need to sort
+ * each of the piles by the next radix position. A new task is put on the
+ * stack for each pile containing lots of keys, or a new task is put on
+ * the list for each pile containing few keys.
+ *
+ * @param stack pointer to the top of the stack
+ * @param endOfStack the end of the stack
+ * @param list pointer to the head of the list
+ * @param pile array that will be filled with pointers to the end of each pile
+ * @param bins the histogram of the sizes of each pile
+ * @param firstKey the first key of the stack
+ * @param offset the next radix position to sort by
+ * @param length the number of bytes remaining in the sort keys
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+static INLINE int pushBins(Task **stack,
+                           Task *endOfStack,
+                           Task **list,
+                           Key *pile[],
+                           Histogram *bins,
+                           Key *firstKey,
+                           uint16_t offset,
+                           uint16_t length)
+{
+  Key *pileStart = firstKey;
+  int bin;
+  for (bin = bins->first; ; bin++) {
+    uint32_t size = bins->size[bin];
+    // Skip empty piles.
+    if (size == 0) {
+      continue;
+    }
+    // There's no need to sort empty keys.
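    // (A length of zero means every key byte has been consumed, so the keys
    // in this pile are identical and already sorted.)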
+ if (length > 0) { + if (size > INSERTION_SORT_THRESHOLD) { + if (*stack >= endOfStack) { + return UDS_BAD_STATE; + } + pushTask(stack, pileStart, size, offset, length); + } else if (size > 1) { + pushTask(list, pileStart, size, offset, length); + } + } + pileStart += size; + pile[bin] = pileStart; + if (--bins->used == 0) { + break; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int makeRadixSorter(unsigned int count, RadixSorter **sorter) +{ + unsigned int stackSize = count / INSERTION_SORT_THRESHOLD; + RadixSorter *radixSorter; + int result = ALLOCATE_EXTENDED(RadixSorter, stackSize, Task, __func__, + &radixSorter); + if (result != UDS_SUCCESS) { + return result; + } + radixSorter->count = count; + radixSorter->endOfStack = radixSorter->stack + stackSize; + *sorter = radixSorter; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeRadixSorter(RadixSorter *sorter) +{ + FREE(sorter); +} + +/**********************************************************************/ +int radixSort(RadixSorter *sorter, + const unsigned char *keys[], + unsigned int count, + unsigned short length) +{ + // All zero-length keys are identical and therefore already sorted. + if ((count == 0) || (length == 0)) { + return UDS_SUCCESS; + } + + // The initial task is to sort the entire length of all the keys. + Task start = { + .firstKey = keys, + .lastKey = &keys[count - 1], + .offset = 0, + .length = length, + }; + + if (count <= INSERTION_SORT_THRESHOLD) { + insertionSort(start); + return UDS_SUCCESS; + } + + if (count > sorter->count) { + return UDS_INVALID_ARGUMENT; + } + + Histogram *bins = &sorter->bins; + Key **pile = sorter->pile; + Task *sp = sorter->stack; + + /* + * Repeatedly consume a sorting task from the stack and process it, pushing + * new sub-tasks onto to the stack for each radix-sorted pile. When all + * tasks and sub-tasks have been processed, the stack will be empty and all + * the keys in the starting task will be fully sorted. + */ + for (*sp = start; sp >= sorter->stack; sp--) { + const Task task = *sp; + measureBins(task, bins); + + // Now that we know how large each bin is, generate pointers for each of + // the piles and push a new task to sort each pile by the next radix byte. + Task *lp = sorter->isList; + int result = pushBins(&sp, sorter->endOfStack, &lp, pile, bins, + task.firstKey, task.offset + 1, task.length - 1); + if (result != UDS_SUCCESS) { + memset(bins, 0, sizeof(*bins)); + return result; + } + // Now bins->used is zero again. + + // Don't bother processing the last pile--when piles 0..N-1 are all in + // place, then pile N must also be in place. + Key *end = task.lastKey - bins->size[bins->last]; + bins->size[bins->last] = 0; + + Key *fence; + for (fence = task.firstKey; fence <= end; ) { + uint8_t bin; + Key key = *fence; + // The radix byte of the key tells us which pile it belongs in. Swap it + // for an unprocessed item just below that pile, and repeat. + while (--pile[bin = key[task.offset]] > fence) { + swapKeys(pile[bin], &key); + } + // The pile reached the fence. Put the key at the bottom of that pile. + // completing it, and advance the fence to the next pile. + *fence = key; + fence += bins->size[bin]; + bins->size[bin] = 0; + } + // Now bins->size[] is all zero again. + + // When the number of keys in a task gets small enough, its faster to use + // an insertion sort than to keep subdividing into tiny piles. 
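    // Drain the list of small piles that pushBins() queued, sorting each one
    // with the insertion sort.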
+ while (--lp >= sorter->isList) { + insertionSort(*lp); + } + } + return UDS_SUCCESS; +} diff --git a/source/uds/util/radixSort.h b/source/uds/util/radixSort.h new file mode 100644 index 0000000..55f19ba --- /dev/null +++ b/source/uds/util/radixSort.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/util/radixSort.h#1 $ + */ + +#ifndef RADIX_SORT_H +#define RADIX_SORT_H + +/* + * The implementation uses one large object allocated on the heap. This + * large object can be reused as many times as desired. There is no + * further heap usage by the sorting. + */ +typedef struct radixSorter RadixSorter; + +/** + * Reserve the heap storage needed by the radixSort routine. The amount of + * heap space is logarithmically proportional to the number of keys. + * + * @param count The maximum number of keys to be sorted + * @param sorter The RadixSorter object is returned here + * + * @return UDS_SUCCESS or an error code + **/ +int makeRadixSorter(unsigned int count, RadixSorter **sorter) + __attribute__((warn_unused_result)); + +/** + * Free the heap storage needed by the radixSort routine. + * + * @param sorter The RadixSorter object to free + **/ +void freeRadixSorter(RadixSorter *sorter); + +/** + * Sort pointers to fixed-length keys (arrays of bytes) using a radix sort. + * + * The sort implementation is unstable--relative ordering of equal keys is not + * preserved. The implementation does not use any heap allocation. + * + * @param [in] sorter the heap storage used by the sorting + * @param keys the array of key pointers to sort (modified in place) + * @param [in] count the number of keys + * @param [in] length the length of every key, in bytes + * + * @return UDS_SUCCESS or an error code + **/ +int radixSort(RadixSorter *sorter, + const unsigned char *keys[], + unsigned int count, + unsigned short length) + __attribute__((warn_unused_result)); + +#endif /* RADIX_SORT_H */ diff --git a/source/uds/volume.c b/source/uds/volume.c new file mode 100644 index 0000000..4f320c5 --- /dev/null +++ b/source/uds/volume.c @@ -0,0 +1,1383 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/volume.c#23 $ + */ + +#include "volume.h" + +#include "cacheCounters.h" +#include "chapterIndex.h" +#include "compiler.h" +#include "errors.h" +#include "geometry.h" +#include "hashUtils.h" +#include "indexConfig.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "recordPage.h" +#include "request.h" +#include "sparseCache.h" +#include "stringUtils.h" +#include "threads.h" + +enum { + MAX_BAD_CHAPTERS = 100, // max number of contiguous bad chapters + DEFAULT_VOLUME_READ_THREADS = 2, // Default number of reader threads + MAX_VOLUME_READ_THREADS = 16, // Maximum number of reader threads +}; + +/**********************************************************************/ +static unsigned int getReadThreads(const struct uds_parameters *userParams) +{ + unsigned int readThreads = (userParams == NULL + ? DEFAULT_VOLUME_READ_THREADS + : userParams->read_threads); + if (readThreads < 1) { + readThreads = 1; + } + if (readThreads > MAX_VOLUME_READ_THREADS) { + readThreads = MAX_VOLUME_READ_THREADS; + } + return readThreads; +} + +/**********************************************************************/ +static INLINE unsigned int mapToPageNumber(Geometry *geometry, + unsigned int physicalPage) +{ + return ((physicalPage - 1) % geometry->pagesPerChapter); +} + +/**********************************************************************/ +static INLINE unsigned int mapToChapterNumber(Geometry *geometry, + unsigned int physicalPage) +{ + return ((physicalPage - 1) / geometry->pagesPerChapter); +} + +/**********************************************************************/ +static INLINE bool isRecordPage(Geometry *geometry, unsigned int physicalPage) +{ + return (((physicalPage - 1) % geometry->pagesPerChapter) + >= geometry->indexPagesPerChapter); +} + +/**********************************************************************/ +static INLINE unsigned int getZoneNumber(Request *request) +{ + return (request == NULL) ? 0 : request->zoneNumber; +} + +/**********************************************************************/ +int mapToPhysicalPage(const Geometry *geometry, int chapter, int page) +{ + // Page zero is the header page, so the first index page in the + // first chapter is physical page one. + return (1 + (geometry->pagesPerChapter * chapter) + page); +} + +/**********************************************************************/ +static void waitForReadQueueNotFull(Volume *volume, Request *request) +{ + unsigned int zoneNumber = getZoneNumber(request); + InvalidateCounter invalidateCounter = getInvalidateCounter(volume->pageCache, + zoneNumber); + if (searchPending(invalidateCounter)) { + // Increment the invalidate counter to avoid deadlock where the reader + // threads cannot make progress because they are waiting on the counter + // and the index thread cannot because the read queue is full. + endPendingSearch(volume->pageCache, zoneNumber); + } + + while (readQueueIsFull(volume->pageCache)) { + logDebug("Waiting until read queue not full"); + signalCond(&volume->readThreadsCond); + waitCond(&volume->readThreadsReadDoneCond, &volume->readThreadsMutex); + } + + if (searchPending(invalidateCounter)) { + // Increment again so we get back to an odd value. 
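    // (This re-enters the pending-search state that endPendingSearch() exited
    // above.)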
+ beginPendingSearch(volume->pageCache, pageBeingSearched(invalidateCounter), + zoneNumber); + } +} + +/**********************************************************************/ +int enqueuePageRead(Volume *volume, Request *request, int physicalPage) +{ + // Don't allow new requests if we are shutting down, but make sure + // to process any requests that are still in the pipeline. + if ((volume->readerState & READER_STATE_EXIT) != 0) { + logInfo("failed to queue read while shutting down"); + return UDS_SHUTTINGDOWN; + } + + // Mark the page as queued in the volume cache, for chapter invalidation to + // be able to cancel a read. + // If we are unable to do this because the queues are full, flush them first + int result; + while ((result = enqueueRead(volume->pageCache, request, physicalPage)) + == UDS_SUCCESS) { + logDebug("Read queues full, waiting for reads to finish"); + waitForReadQueueNotFull(volume, request); + } + + if (result == UDS_QUEUED) { + /* signal a read thread */ + signalCond(&volume->readThreadsCond); + } + + return result; +} + +/**********************************************************************/ +static INLINE void waitToReserveReadQueueEntry(Volume *volume, + unsigned int *queuePos, + Request **requestList, + unsigned int *physicalPage, + bool *invalid) +{ + while (((volume->readerState & READER_STATE_EXIT) == 0) + && (((volume->readerState & READER_STATE_STOP) != 0) + || !reserveReadQueueEntry(volume->pageCache, queuePos, + requestList, physicalPage, invalid))) { + waitCond(&volume->readThreadsCond, &volume->readThreadsMutex); + } +} + +/**********************************************************************/ +static int initChapterIndexPage(const Volume *volume, + byte *indexPage, + unsigned int chapter, + unsigned int indexPageNumber, + DeltaIndexPage *chapterIndexPage) +{ + Geometry *geometry = volume->geometry; + + int result = initializeChapterIndexPage(chapterIndexPage, geometry, + indexPage, volume->nonce); + if (volume->lookupMode == LOOKUP_FOR_REBUILD) { + return result; + } + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "Reading chapter index page for chapter %u" + " page %u", + chapter, indexPageNumber); + } + + IndexPageBounds bounds; + result = getListNumberBounds(volume->indexPageMap, chapter, + indexPageNumber, &bounds); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t ciVirtual = chapterIndexPage->virtualChapterNumber; + unsigned int ciChapter = mapToPhysicalChapter(geometry, ciVirtual); + if ((chapter == ciChapter) + && (bounds.lowestList == chapterIndexPage->lowestListNumber) + && (bounds.highestList == chapterIndexPage->highestListNumber)) { + return UDS_SUCCESS; + } + + logWarning("Index page map updated to %llu", + getLastUpdate(volume->indexPageMap)); + logWarning("Page map expects that chapter %u page %u has range %u to %u, " + "but chapter index page has chapter %" PRIu64 + " with range %u to %u", + chapter, indexPageNumber, bounds.lowestList, bounds.highestList, + ciVirtual, chapterIndexPage->lowestListNumber, + chapterIndexPage->highestListNumber); + return ASSERT_WITH_ERROR_CODE(false, + UDS_CORRUPT_DATA, + "index page map mismatch with chapter index"); +} + +/**********************************************************************/ +static int initializeIndexPage(const Volume *volume, + unsigned int physicalPage, + CachedPage *page) +{ + unsigned int chapter = mapToChapterNumber(volume->geometry, physicalPage); + unsigned int indexPageNumber = mapToPageNumber(volume->geometry, + physicalPage); + int 
result = initChapterIndexPage(volume, getPageData(&page->cp_pageData), + chapter, indexPageNumber, + &page->cp_indexPage); + return result; +} + +/**********************************************************************/ +static void readThreadFunction(void *arg) +{ + Volume *volume = arg; + unsigned int queuePos; + Request *requestList; + unsigned int physicalPage; + bool invalid = false; + + logDebug("reader starting"); + lockMutex(&volume->readThreadsMutex); + while (true) { + waitToReserveReadQueueEntry(volume, &queuePos, &requestList, &physicalPage, + &invalid); + if ((volume->readerState & READER_STATE_EXIT) != 0) { + break; + } + + volume->busyReaderThreads++; + + bool recordPage = isRecordPage(volume->geometry, physicalPage); + + CachedPage *page = NULL; + int result = UDS_SUCCESS; + if (!invalid) { + // Find a place to put the read queue page we reserved above. + result = selectVictimInCache(volume->pageCache, &page); + if (result == UDS_SUCCESS) { + unlockMutex(&volume->readThreadsMutex); + result = readVolumePage(&volume->volumeStore, physicalPage, + &page->cp_pageData); + if (result != UDS_SUCCESS) { + logWarning("Error reading page %u from volume", physicalPage); + cancelPageInCache(volume->pageCache, physicalPage, page); + } + lockMutex(&volume->readThreadsMutex); + } else { + logWarning("Error selecting cache victim for page read"); + } + + if (result == UDS_SUCCESS) { + if (!volume->pageCache->readQueue[queuePos].invalid) { + if (!recordPage) { + result = initializeIndexPage(volume, physicalPage, page); + if (result != UDS_SUCCESS) { + logWarning("Error initializing chapter index page"); + cancelPageInCache(volume->pageCache, physicalPage, page); + } + } + + if (result == UDS_SUCCESS) { + result = putPageInCache(volume->pageCache, physicalPage, page); + if (result != UDS_SUCCESS) { + logWarning("Error putting page %u in cache", physicalPage); + cancelPageInCache(volume->pageCache, physicalPage, page); + } + } + } else { + logWarning("Page %u invalidated after read", physicalPage); + cancelPageInCache(volume->pageCache, physicalPage, page); + invalid = true; + } + } + } else { + logDebug("Requeuing requests for invalid page"); + } + + if (invalid) { + result = UDS_SUCCESS; + page = NULL; + } + + while (requestList != NULL) { + Request *request = requestList; + requestList = request->nextRequest; + + /* + * If we've read in a record page, we're going to do an immediate search, + * in an attempt to speed up processing when we requeue the request, so + * that it doesn't have to go back into the getRecordFromZone code again. + * However, if we've just read in an index page, we don't want to search. + * We want the request to be processed again and getRecordFromZone to be + * run. We have added new fields in request to allow the index code to + * know whether it can stop processing before getRecordFromZone is called + * again. 
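       * (Those fields are slLocation and slLocationKnown, set just below.)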
+ */ + if ((result == UDS_SUCCESS) && (page != NULL) && recordPage) { + if (searchRecordPage(getPageData(&page->cp_pageData), + &request->chunkName, volume->geometry, + &request->oldMetadata)) { + request->slLocation = LOC_IN_DENSE; + } else { + request->slLocation = LOC_UNAVAILABLE; + } + request->slLocationKnown = true; + } + + // reflect any read failures in the request status + request->status = result; + restartRequest(request); + } + + releaseReadQueueEntry(volume->pageCache, queuePos); + + volume->busyReaderThreads--; + broadcastCond(&volume->readThreadsReadDoneCond); + } + unlockMutex(&volume->readThreadsMutex); + logDebug("reader done"); +} + +/**********************************************************************/ +static int readPageLocked(Volume *volume, + Request *request, + unsigned int physicalPage, + bool syncRead, + CachedPage **pagePtr) +{ + syncRead |= ((volume->lookupMode == LOOKUP_FOR_REBUILD) + || (request == NULL) + || (request->session == NULL)); + + int result = UDS_SUCCESS; + + CachedPage *page = NULL; + if (syncRead) { + // Find a place to put the page. + result = selectVictimInCache(volume->pageCache, &page); + if (result != UDS_SUCCESS) { + logWarning("Error selecting cache victim for page read"); + return result; + } + result = readVolumePage(&volume->volumeStore, physicalPage, + &page->cp_pageData); + if (result != UDS_SUCCESS) { + logWarning("Error reading page %u from volume", physicalPage); + cancelPageInCache(volume->pageCache, physicalPage, page); + return result; + } + if (!isRecordPage(volume->geometry, physicalPage)) { + result = initializeIndexPage(volume, physicalPage, page); + if (result != UDS_SUCCESS) { + if (volume->lookupMode != LOOKUP_FOR_REBUILD) { + logWarning("Corrupt index page %u", physicalPage); + } + cancelPageInCache(volume->pageCache, physicalPage, page); + return result; + } + } + result = putPageInCache(volume->pageCache, physicalPage, page); + if (result != UDS_SUCCESS) { + logWarning("Error putting page %u in cache", physicalPage); + cancelPageInCache(volume->pageCache, physicalPage, page); + return result; + } + } else { + result = enqueuePageRead(volume, request, physicalPage); + if (result != UDS_SUCCESS) { + return result; + } + } + + *pagePtr = page; + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getPageLocked(Volume *volume, + Request *request, + unsigned int physicalPage, + CacheProbeType probeType, + CachedPage **pagePtr) +{ + CachedPage *page = NULL; + int result = getPageFromCache(volume->pageCache, physicalPage, probeType, + &page); + if (result != UDS_SUCCESS) { + return result; + } + if (page == NULL) { + result = readPageLocked(volume, request, physicalPage, true, &page); + if (result != UDS_SUCCESS) { + return result; + } + } else if (getZoneNumber(request) == 0) { + // Only 1 zone is responsible for updating LRU + makePageMostRecent(volume->pageCache, page); + } + + *pagePtr = page; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getPageProtected(Volume *volume, + Request *request, + unsigned int physicalPage, + CacheProbeType probeType, + CachedPage **pagePtr) +{ + CachedPage *page = NULL; + int result = getPageFromCache(volume->pageCache, physicalPage, + probeType | CACHE_PROBE_IGNORE_FAILURE, + &page); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int zoneNumber = getZoneNumber(request); + // If we didn't find a page we need to enqueue a read for it, in which + // case we need 
to grab the mutex. + if (page == NULL) { + endPendingSearch(volume->pageCache, zoneNumber); + lockMutex(&volume->readThreadsMutex); + + /* + * Do the lookup again while holding the read mutex (no longer the fast + * case so this should be ok to repeat). We need to do this because an + * page may have been added to the page map by the reader thread between + * the time searched above and the time we went to actually try to enqueue + * it below. This could result in us enqueuing another read for an page + * which is already in the cache, which would mean we end up with two + * entries in the cache for the same page. + */ + result + = getPageFromCache(volume->pageCache, physicalPage, probeType, &page); + if (result != UDS_SUCCESS) { + /* + * In non-success cases (anything not UDS_SUCCESS, meaning both + * UDS_QUEUED and "real" errors), the caller doesn't get a + * handle on a cache page, so it can't continue the search, and + * we don't need to prevent other threads from messing with the + * cache. + * + * However, we do need to set the "search pending" flag because + * the callers expect it to always be set on return, even if + * they can't actually do the search. + * + * Doing the calls in this order ought to be faster, since we + * let other threads have the reader thread mutex (which can + * require a syscall) ASAP, and set the "search pending" state + * that can block the reader thread as the last thing. + */ + unlockMutex(&volume->readThreadsMutex); + beginPendingSearch(volume->pageCache, physicalPage, zoneNumber); + return result; + } + + // If we found the page now, we can release the mutex and proceed + // as if this were the fast case. + if (page != NULL) { + /* + * If we found a page (*pagePtr != NULL and return + * UDS_SUCCESS), then we're telling the caller where to look for + * the cache page, and need to switch to "reader thread + * unlocked" and "search pending" state in careful order so no + * other thread can mess with the data before our caller gets to + * look at it. + */ + beginPendingSearch(volume->pageCache, physicalPage, zoneNumber); + unlockMutex(&volume->readThreadsMutex); + } + } + + if (page == NULL) { + result = readPageLocked(volume, request, physicalPage, false, &page); + if (result != UDS_SUCCESS) { + /* + * This code path is used frequently in the UDS_QUEUED case, so + * the performance gain from unlocking first, while "search + * pending" mode is off, turns out to be significant in some + * cases. + */ + unlockMutex(&volume->readThreadsMutex); + beginPendingSearch(volume->pageCache, physicalPage, zoneNumber); + return result; + } + + // See above re: ordering requirement. + beginPendingSearch(volume->pageCache, physicalPage, zoneNumber); + unlockMutex(&volume->readThreadsMutex); + } else { + if (getZoneNumber(request) == 0 ) { + // Only 1 zone is responsible for updating LRU + makePageMostRecent(volume->pageCache, page); + } + } + + *pagePtr = page; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int getPage(Volume *volume, + unsigned int chapter, + unsigned int pageNumber, + CacheProbeType probeType, + byte **dataPtr, + DeltaIndexPage **indexPagePtr) +{ + unsigned int physicalPage + = mapToPhysicalPage(volume->geometry, chapter, pageNumber); + + lockMutex(&volume->readThreadsMutex); + CachedPage *page = NULL; + int result = getPageLocked(volume, NULL, physicalPage, probeType, &page); + unlockMutex(&volume->readThreadsMutex); + + if (dataPtr != NULL) { + *dataPtr = (page != NULL) ? 
getPageData(&page->cp_pageData) : NULL; + } + if (indexPagePtr != NULL) { + *indexPagePtr = (page != NULL) ? &page->cp_indexPage : NULL; + } + return result; +} + +/** + * Search for a chunk name in a cached index page or chapter index, returning + * the record page number from a chapter index match. + * + * @param volume the volume containing the index page to search + * @param request the request originating the search (may be NULL for + * a direct query from volume replay) + * @param name the name of the block or chunk + * @param chapter the chapter to search + * @param indexPageNumber the index page number of the page to search + * @param recordPageNumber pointer to return the chapter record page number + * (value will be NO_CHAPTER_INDEX_ENTRY if the name + * was not found) + * + * @return UDS_SUCCESS or an error code + **/ +static int searchCachedIndexPage(Volume *volume, + Request *request, + const UdsChunkName *name, + unsigned int chapter, + unsigned int indexPageNumber, + int *recordPageNumber) +{ + unsigned int zoneNumber = getZoneNumber(request); + unsigned int physicalPage + = mapToPhysicalPage(volume->geometry, chapter, indexPageNumber); + + /* + * Make sure the invalidate counter is updated before we try and read from + * the page map. This prevents this thread from reading a page in the + * page map which has already been marked for invalidation by the reader + * thread, before the reader thread has noticed that the invalidateCounter + * has been incremented. + */ + beginPendingSearch(volume->pageCache, physicalPage, zoneNumber); + + CachedPage *page = NULL; + int result = getPageProtected(volume, request, physicalPage, + cacheProbeType(request, true), &page); + if (result != UDS_SUCCESS) { + endPendingSearch(volume->pageCache, zoneNumber); + return result; + } + + result + = ASSERT_LOG_ONLY(searchPending(getInvalidateCounter(volume->pageCache, + zoneNumber)), + "Search is pending for zone %u", zoneNumber); + if (result != UDS_SUCCESS) { + return result; + } + + result = searchChapterIndexPage(&page->cp_indexPage, volume->geometry, name, + recordPageNumber); + endPendingSearch(volume->pageCache, zoneNumber); + return result; +} + +/**********************************************************************/ +int searchCachedRecordPage(Volume *volume, + Request *request, + const UdsChunkName *name, + unsigned int chapter, + int recordPageNumber, + UdsChunkData *duplicate, + bool *found) +{ + *found = false; + + if (recordPageNumber == NO_CHAPTER_INDEX_ENTRY) { + // No record for that name can exist in the chapter. + return UDS_SUCCESS; + } + + Geometry *geometry = volume->geometry; + int result = ASSERT(((recordPageNumber >= 0) + && ((unsigned int) recordPageNumber + < geometry->recordPagesPerChapter)), + "0 <= %d <= %u", + recordPageNumber, geometry->recordPagesPerChapter); + if (result != UDS_SUCCESS) { + return result; + } + + unsigned int pageNumber = geometry->indexPagesPerChapter + recordPageNumber; + + unsigned int zoneNumber = getZoneNumber(request); + int physicalPage + = mapToPhysicalPage(volume->geometry, chapter, pageNumber); + + /* + * Make sure the invalidate counter is updated before we try and read from + * the page map. This prevents this thread from reading a page in the page + * map which has already been marked for invalidation by the reader thread, + * before the reader thread has noticed that the invalidateCounter has been + * incremented. 
+ */ + beginPendingSearch(volume->pageCache, physicalPage, zoneNumber); + + CachedPage *recordPage; + result = getPageProtected(volume, request, physicalPage, + cacheProbeType(request, false), &recordPage); + if (result != UDS_SUCCESS) { + endPendingSearch(volume->pageCache, zoneNumber); + return result; + } + + if (searchRecordPage(getPageData(&recordPage->cp_pageData), name, geometry, + duplicate)) { + *found = true; + } + endPendingSearch(volume->pageCache, zoneNumber); + return UDS_SUCCESS; +} + +/**********************************************************************/ +int readChapterIndexFromVolume(const Volume *volume, + uint64_t virtualChapter, + struct volume_page volumePages[], + DeltaIndexPage indexPages[]) +{ + const Geometry *geometry = volume->geometry; + unsigned int physicalChapter = mapToPhysicalChapter(geometry, + virtualChapter); + int physicalPage = mapToPhysicalPage(geometry, physicalChapter, 0); + prefetchVolumePages(&volume->volumeStore, physicalPage, + geometry->indexPagesPerChapter); + + unsigned int i; + struct volume_page volumePage; + int result = initializeVolumePage(geometry, &volumePage); + for (i = 0; i < geometry->indexPagesPerChapter; i++) { + int result = readVolumePage(&volume->volumeStore, physicalPage + i, + &volumePages[i]); + if (result != UDS_SUCCESS) { + break; + } + byte *indexPage = getPageData(&volumePages[i]); + result = initChapterIndexPage(volume, indexPage, physicalChapter, i, + &indexPages[i]); + if (result != UDS_SUCCESS) { + break; + } + } + destroyVolumePage(&volumePage); + return result; +} + +/**********************************************************************/ +int searchVolumePageCache(Volume *volume, + Request *request, + const UdsChunkName *name, + uint64_t virtualChapter, + UdsChunkData *metadata, + bool *found) +{ + unsigned int physicalChapter + = mapToPhysicalChapter(volume->geometry, virtualChapter); + unsigned int indexPageNumber; + int result = findIndexPageNumber(volume->indexPageMap, name, physicalChapter, + &indexPageNumber); + if (result != UDS_SUCCESS) { + return result; + } + + int recordPageNumber; + result = searchCachedIndexPage(volume, request, name, physicalChapter, + indexPageNumber, &recordPageNumber); + if (result == UDS_SUCCESS) { + result = searchCachedRecordPage(volume, request, name, physicalChapter, + recordPageNumber, metadata, found); + } + + return result; +} + +/**********************************************************************/ +int forgetChapter(Volume *volume, + uint64_t virtualChapter, + InvalidationReason reason) +{ + logDebug("forgetting chapter %llu", virtualChapter); + unsigned int physicalChapter + = mapToPhysicalChapter(volume->geometry, virtualChapter); + lockMutex(&volume->readThreadsMutex); + int result + = invalidatePageCacheForChapter(volume->pageCache, physicalChapter, + volume->geometry->pagesPerChapter, + reason); + unlockMutex(&volume->readThreadsMutex); + return result; +} + +/** + * Donate index page data to the page cache for an index page that was just + * written to the volume. The caller must already hold the reader thread + * mutex. 
+ * + * @param volume the volume + * @param physicalChapter the physical chapter number of the index page + * @param indexPageNumber the chapter page number of the index page + * @param scratchPage the index page data + **/ +static int donateIndexPageLocked(Volume *volume, + unsigned int physicalChapter, + unsigned int indexPageNumber, + struct volume_page *scratchPage) +{ + unsigned int physicalPage + = mapToPhysicalPage(volume->geometry, physicalChapter, indexPageNumber); + + // Find a place to put the page. + CachedPage *page = NULL; + int result = selectVictimInCache(volume->pageCache, &page); + if (result != UDS_SUCCESS) { + return result; + } + + // Exchange the scratch page with the cache page + swapVolumePages(&page->cp_pageData, scratchPage); + + result = initChapterIndexPage(volume, getPageData(&page->cp_pageData), + physicalChapter, indexPageNumber, + &page->cp_indexPage); + if (result != UDS_SUCCESS) { + logWarning("Error initialize chapter index page"); + cancelPageInCache(volume->pageCache, physicalPage, page); + return result; + } + + result = putPageInCache(volume->pageCache, physicalPage, page); + if (result != UDS_SUCCESS) { + logWarning("Error putting page %u in cache", physicalPage); + cancelPageInCache(volume->pageCache, physicalPage, page); + return result; + } + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int writeIndexPages(Volume *volume, + int physicalPage, + OpenChapterIndex *chapterIndex, + byte **pages) +{ + Geometry *geometry = volume->geometry; + unsigned int physicalChapterNumber + = mapToPhysicalChapter(geometry, chapterIndex->virtualChapterNumber); + unsigned int deltaListNumber = 0; + + unsigned int indexPageNumber; + for (indexPageNumber = 0; + indexPageNumber < geometry->indexPagesPerChapter; + indexPageNumber++) { + int result = prepareToWriteVolumePage(&volume->volumeStore, + physicalPage + indexPageNumber, + &volume->scratchPage); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, "failed to prepare index page"); + } + + // Pack as many delta lists into the index page as will fit. + unsigned int listsPacked; + bool lastPage = ((indexPageNumber + 1) == geometry->indexPagesPerChapter); + result = packOpenChapterIndexPage(chapterIndex, + getPageData(&volume->scratchPage), + deltaListNumber, lastPage, &listsPacked); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, "failed to pack index page"); + } + + result = writeVolumePage(&volume->volumeStore, + physicalPage + indexPageNumber, + &volume->scratchPage); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to write chapter index page"); + } + + if (pages != NULL) { + memcpy(pages[indexPageNumber], getPageData(&volume->scratchPage), + geometry->bytesPerPage); + } + + // Tell the index page map the list number of the last delta list that was + // packed into the index page. + if (listsPacked == 0) { + logDebug("no delta lists packed on chapter %u page %u", + physicalChapterNumber, indexPageNumber); + } else { + deltaListNumber += listsPacked; + } + result = updateIndexPageMap(volume->indexPageMap, + chapterIndex->virtualChapterNumber, + physicalChapterNumber, + indexPageNumber, deltaListNumber - 1); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, + "failed to update index page map"); + } + + // Donate the page data for the index page to the page cache. 
+ lockMutex(&volume->readThreadsMutex); + result = donateIndexPageLocked(volume, physicalChapterNumber, + indexPageNumber, &volume->scratchPage); + unlockMutex(&volume->readThreadsMutex); + if (result != UDS_SUCCESS) { + return result; + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int writeRecordPages(Volume *volume, + int physicalPage, + const UdsChunkRecord records[], + byte **pages) +{ + Geometry *geometry = volume->geometry; + // Skip over the index pages, which come before the record pages + physicalPage += geometry->indexPagesPerChapter; + // The record array from the open chapter is 1-based. + const UdsChunkRecord *nextRecord = &records[1]; + + unsigned int recordPageNumber; + for (recordPageNumber = 0; + recordPageNumber < geometry->recordPagesPerChapter; + recordPageNumber++) { + int result = prepareToWriteVolumePage(&volume->volumeStore, + physicalPage + recordPageNumber, + &volume->scratchPage); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to prepare record page"); + } + + // Sort the next page of records and copy them to the record page as a + // binary tree stored in heap order. + result = encodeRecordPage(volume, nextRecord, + getPageData(&volume->scratchPage)); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to encode record page %u", + recordPageNumber); + } + nextRecord += geometry->recordsPerPage; + + result = writeVolumePage(&volume->volumeStore, + physicalPage + recordPageNumber, + &volume->scratchPage); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "failed to write chapter record page"); + } + + if (pages != NULL) { + memcpy(pages[recordPageNumber], getPageData(&volume->scratchPage), + geometry->bytesPerPage); + } + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int writeChapter(Volume *volume, + OpenChapterIndex *chapterIndex, + const UdsChunkRecord records[]) +{ + // Determine the position of the virtual chapter in the volume file. + Geometry *geometry = volume->geometry; + unsigned int physicalChapterNumber + = mapToPhysicalChapter(geometry, chapterIndex->virtualChapterNumber); + int physicalPage = mapToPhysicalPage(geometry, physicalChapterNumber, 0); + + // Pack and write the delta chapter index pages to the volume. + int result = writeIndexPages(volume, physicalPage, chapterIndex, NULL); + if (result != UDS_SUCCESS) { + return result; + } + // Sort and write the record pages to the volume. + result = writeRecordPages(volume, physicalPage, records, NULL); + if (result != UDS_SUCCESS) { + return result; + } + releaseVolumePage(&volume->scratchPage); + // Flush the data to permanent storage. 
+ return syncVolumeStore(&volume->volumeStore); +} + +/**********************************************************************/ +size_t getCacheSize(Volume *volume) +{ + size_t size = getPageCacheSize(volume->pageCache); + if (isSparse(volume->geometry)) { + size += getSparseCacheMemorySize(volume->sparseCache); + } + return size; +} + +/**********************************************************************/ +static int probeChapter(Volume *volume, + unsigned int chapterNumber, + uint64_t *virtualChapterNumber) +{ + const Geometry *geometry = volume->geometry; + unsigned int expectedListNumber = 0; + uint64_t lastVCN = UINT64_MAX; + + prefetchVolumePages(&volume->volumeStore, + mapToPhysicalPage(geometry, chapterNumber, 0), + geometry->indexPagesPerChapter); + + unsigned int i; + for (i = 0; i < geometry->indexPagesPerChapter; ++i) { + DeltaIndexPage *page; + int result = getPage(volume, chapterNumber, i, CACHE_PROBE_INDEX_FIRST, + NULL, &page); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t vcn = page->virtualChapterNumber; + if (lastVCN == UINT64_MAX) { + lastVCN = vcn; + } else if (vcn != lastVCN) { + logError("inconsistent chapter %u index page %u: expected vcn %" + PRIu64 ", got vcn %llu", + chapterNumber, i, lastVCN, vcn); + return UDS_CORRUPT_COMPONENT; + } + + if (expectedListNumber != page->lowestListNumber) { + logError("inconsistent chapter %u index page %u: expected list number %u" + ", got list number %u", + chapterNumber, i, expectedListNumber, page->lowestListNumber); + return UDS_CORRUPT_COMPONENT; + } + expectedListNumber = page->highestListNumber + 1; + + result = validateChapterIndexPage(page, geometry); + if (result != UDS_SUCCESS) { + return result; + } + } + + if (lastVCN == UINT64_MAX) { + logError("no chapter %u virtual chapter number determined", chapterNumber); + return UDS_CORRUPT_COMPONENT; + } + if (chapterNumber != lastVCN % geometry->chaptersPerVolume) { + logError("chapter %u vcn %llu is out of phase (%u)", + chapterNumber, lastVCN, geometry->chaptersPerVolume); + return UDS_CORRUPT_COMPONENT; + } + *virtualChapterNumber = lastVCN; + return UDS_SUCCESS; +} + +/**********************************************************************/ +static int probeWrapper(void *aux, + unsigned int chapterNumber, + uint64_t *virtualChapterNumber) +{ + Volume *volume = aux; + int result = probeChapter(volume, chapterNumber, virtualChapterNumber); + if ((result == UDS_CORRUPT_COMPONENT) || (result == UDS_CORRUPT_DATA)) { + *virtualChapterNumber = UINT64_MAX; + return UDS_SUCCESS; + } + return result; +} + +/**********************************************************************/ +static int findRealEndOfVolume(Volume *volume, + unsigned int limit, + unsigned int *limitPtr) +{ + /* + * Start checking from the end of the volume. As long as we hit corrupt + * data, start skipping larger and larger amounts until we find real data. + * If we find real data, reduce the span and try again until we find + * the exact boundary. + */ + unsigned int span = 1; + unsigned int tries = 0; + while (limit > 0) { + unsigned int chapter = (span > limit) ? 
0 : limit - span; + uint64_t vcn = 0; + int result = probeChapter(volume, chapter, &vcn); + if (result == UDS_SUCCESS) { + if (span == 1) { + break; + } + span /= 2; + tries = 0; + } else if (result == UDS_CORRUPT_COMPONENT) { + limit = chapter; + if (++tries > 1) { + span *= 2; + } + } else { + return logErrorWithStringError(result, "cannot determine end of volume"); + } + } + + if (limitPtr != NULL) { + *limitPtr = limit; + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +int findVolumeChapterBoundaries(Volume *volume, + uint64_t *lowestVCN, + uint64_t *highestVCN, + bool *isEmpty) +{ + unsigned int chapterLimit = volume->geometry->chaptersPerVolume; + + int result = findRealEndOfVolume(volume, chapterLimit, &chapterLimit); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "cannot find end of volume"); + } + + if (chapterLimit == 0) { + *lowestVCN = 0; + *highestVCN = 0; + *isEmpty = true; + return UDS_SUCCESS; + } + + *isEmpty = false; + return findVolumeChapterBoundariesImpl(chapterLimit, MAX_BAD_CHAPTERS, + lowestVCN, highestVCN, probeWrapper, + volume); +} + +/**********************************************************************/ +int findVolumeChapterBoundariesImpl(unsigned int chapterLimit, + unsigned int maxBadChapters, + uint64_t *lowestVCN, + uint64_t *highestVCN, + int (*probeFunc)(void *aux, + unsigned int chapter, + uint64_t *vcn), + void *aux) +{ + if (chapterLimit == 0) { + *lowestVCN = 0; + *highestVCN = 0; + return UDS_SUCCESS; + } + + /* + * This method assumes there is at most one run of contiguous bad chapters + * caused by unflushed writes. Either the bad spot is at the beginning and + * end, or somewhere in the middle. Wherever it is, the highest and lowest + * VCNs are adjacent to it. Otherwise the volume is cleanly saved and + * somewhere in the middle of it the highest VCN immediately preceeds the + * lowest one. + */ + + uint64_t firstVCN = UINT64_MAX; + + // doesn't matter if this results in a bad spot (UINT64_MAX) + int result = (*probeFunc)(aux, 0, &firstVCN); + if (result != UDS_SUCCESS) { + return UDS_SUCCESS; + } + + /* + * Binary search for end of the discontinuity in the monotonically + * increasing virtual chapter numbers; bad spots are treated as a span of + * UINT64_MAX values. In effect we're searching for the index of the + * smallest value less than firstVCN. In the case we go off the end it means + * that chapter 0 has the lowest vcn. + */ + + unsigned int leftChapter = 0; + unsigned int rightChapter = chapterLimit; + + while (leftChapter < rightChapter) { + unsigned int chapter = (leftChapter + rightChapter) / 2; + uint64_t probeVCN; + + result = (*probeFunc)(aux, chapter, &probeVCN); + if (result != UDS_SUCCESS) { + return result; + } + if (firstVCN <= probeVCN) { + leftChapter = chapter + 1; + } else { + rightChapter = chapter; + } + } + + uint64_t lowest = UINT64_MAX; + uint64_t highest = UINT64_MAX; + + result = ASSERT(leftChapter == rightChapter, "leftChapter == rightChapter"); + if (result != UDS_SUCCESS) { + return result; + } + + leftChapter %= chapterLimit; // in case we're at the end + + // At this point, leftChapter is the chapter with the lowest virtual chapter + // number. 
+ + result = (*probeFunc)(aux, leftChapter, &lowest); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT((lowest != UINT64_MAX), "invalid lowest chapter"); + if (result != UDS_SUCCESS) { + return result; + } + + // We now circularly scan backwards, moving over any bad chapters until we + // find the chapter with the highest vcn (the first good chapter we + // encounter). + + unsigned int badChapters = 0; + + for (;;) { + rightChapter = (rightChapter + chapterLimit - 1) % chapterLimit; + result = (*probeFunc)(aux, rightChapter, &highest); + if (result != UDS_SUCCESS) { + return result; + } + if (highest != UINT64_MAX) { + break; + } + if (++badChapters >= maxBadChapters) { + logError("too many bad chapters in volume: %u", badChapters); + return UDS_CORRUPT_COMPONENT; + } + } + + *lowestVCN = lowest; + *highestVCN = highest; + return UDS_SUCCESS; +} + +/** + * Allocate a volume. + * + * @param config The configuration to use + * @param layout The index layout + * @param readQueueMaxSize The maximum size of the read queue + * @param zoneCount The number of zones to use + * @param newVolume A pointer to hold the new volume + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int allocateVolume(const Configuration *config, + IndexLayout *layout, + unsigned int readQueueMaxSize, + unsigned int zoneCount, + Volume **newVolume) +{ + Volume *volume; + int result = ALLOCATE(1, Volume, "volume", &volume); + if (result != UDS_SUCCESS) { + return result; + } + volume->nonce = getVolumeNonce(layout); + // It is safe to call freeVolume now to clean up and close the volume + + result = copyGeometry(config->geometry, &volume->geometry); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return logWarningWithStringError(result, + "failed to allocate geometry: error"); + } + + // Need a buffer for each entry in the page cache + unsigned int reservedBuffers + = config->cacheChapters * config->geometry->recordPagesPerChapter; + // And a buffer for the chapter writer + reservedBuffers += 1; + // And a buffer for each entry in the sparse cache + if (isSparse(volume->geometry)) { + reservedBuffers + += config->cacheChapters * config->geometry->indexPagesPerChapter; + } + result = openVolumeStore(&volume->volumeStore, layout, reservedBuffers, + config->geometry->bytesPerPage); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + result = initializeVolumePage(config->geometry, &volume->scratchPage); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + + result = makeRadixSorter(config->geometry->recordsPerPage, + &volume->radixSorter); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + + result = ALLOCATE(config->geometry->recordsPerPage, const UdsChunkRecord *, + "record pointers", &volume->recordPointers); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + + if (isSparse(volume->geometry)) { + result = makeSparseCache(volume->geometry, config->cacheChapters, + zoneCount, &volume->sparseCache); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + } + result = makePageCache(volume->geometry, config->cacheChapters, + readQueueMaxSize, zoneCount, &volume->pageCache); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + result = makeIndexPageMap(volume->geometry, &volume->indexPageMap); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + + *newVolume = volume; + return UDS_SUCCESS; +} + 
+/**********************************************************************/ +int makeVolume(const Configuration *config, + IndexLayout *layout, + const struct uds_parameters *userParams, + unsigned int readQueueMaxSize, + unsigned int zoneCount, + Volume **newVolume) +{ + unsigned int volumeReadThreads = getReadThreads(userParams); + + if (readQueueMaxSize <= volumeReadThreads) { + logError("Number of read threads must be smaller than read queue"); + return UDS_INVALID_ARGUMENT; + } + + Volume *volume = NULL; + int result = allocateVolume(config, layout, readQueueMaxSize, zoneCount, + &volume); + if (result != UDS_SUCCESS) { + return result; + } + result = initMutex(&volume->readThreadsMutex); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + result = initCond(&volume->readThreadsReadDoneCond); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + result = initCond(&volume->readThreadsCond); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + + // Start the reader threads. If this allocation succeeds, freeVolume knows + // that it needs to try and stop those threads. + result = ALLOCATE(volumeReadThreads, Thread, "reader threads", + &volume->readerThreads); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + unsigned int i; + for (i = 0; i < volumeReadThreads; i++) { + result = createThread(readThreadFunction, (void *) volume, "reader", + &volume->readerThreads[i]); + if (result != UDS_SUCCESS) { + freeVolume(volume); + return result; + } + // We only stop as many threads as actually got started. + volume->numReadThreads = i + 1; + } + + *newVolume = volume; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void freeVolume(Volume *volume) +{ + if (volume == NULL) { + return; + } + + // If readerThreads is NULL, then we haven't set up the reader threads. + if (volume->readerThreads != NULL) { + // Stop the reader threads. It is ok if there aren't any of them. + lockMutex(&volume->readThreadsMutex); + volume->readerState |= READER_STATE_EXIT; + broadcastCond(&volume->readThreadsCond); + unlockMutex(&volume->readThreadsMutex); + unsigned int i; + for (i = 0; i < volume->numReadThreads; i++) { + joinThreads(volume->readerThreads[i]); + } + FREE(volume->readerThreads); + volume->readerThreads = NULL; + } + + // Must close the volume store AFTER freeing the scratch page and the caches + destroyVolumePage(&volume->scratchPage); + freePageCache(volume->pageCache); + freeSparseCache(volume->sparseCache); + closeVolumeStore(&volume->volumeStore); + + destroyCond(&volume->readThreadsCond); + destroyCond(&volume->readThreadsReadDoneCond); + destroyMutex(&volume->readThreadsMutex); + freeIndexPageMap(volume->indexPageMap); + freeRadixSorter(volume->radixSorter); + FREE(volume->geometry); + FREE(volume->recordPointers); + FREE(volume); +} diff --git a/source/uds/volume.h b/source/uds/volume.h new file mode 100644 index 0000000..82aef00 --- /dev/null +++ b/source/uds/volume.h @@ -0,0 +1,426 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/volume.h#14 $ + */ + +#ifndef VOLUME_H +#define VOLUME_H + +#include "cacheCounters.h" +#include "common.h" +#include "chapterIndex.h" +#include "indexConfig.h" +#include "indexLayout.h" +#include "indexPageMap.h" +#include "pageCache.h" +#include "request.h" +#include "sparseCache.h" +#include "uds.h" +#include "util/radixSort.h" +#include "volumeStore.h" + +typedef enum { + READER_STATE_RUN = 1, + READER_STATE_EXIT = 2, + READER_STATE_STOP = 4 +} ReaderState; + +typedef enum indexLookupMode { + /* Always do lookups in all chapters normally. */ + LOOKUP_NORMAL, + /* + * Don't do lookups in closed chapters; assume records not in the + * open chapter are always new. You don't want this normally; it's + * for programs like albfill. (Even then, with multiple runs using + * the same tag, we may actually duplicate older records, but if + * it's in a separate chapter it won't really matter.) + */ + LOOKUP_CURRENT_CHAPTER_ONLY, + /* + * Only do a subset of lookups needed when rebuilding an index. + * This cannot be set externally. + */ + LOOKUP_FOR_REBUILD +} IndexLookupMode; + +typedef struct volume { + /* The layout of the volume */ + Geometry *geometry; + /* The configuration of the volume */ + Configuration *config; + /* The access to the volume's backing store */ + struct volume_store volumeStore; + /* A single page used for writing to the volume */ + struct volume_page scratchPage; + /* The nonce used to save the volume */ + uint64_t nonce; + /* A single page's records, for sorting */ + const UdsChunkRecord **recordPointers; + /* For sorting record pages */ + RadixSorter *radixSorter; + /* The sparse chapter index cache */ + SparseCache *sparseCache; + /* The page cache */ + PageCache *pageCache; + /* The index page map maps delta list numbers to index page numbers */ + IndexPageMap *indexPageMap; + /* Mutex to sync between read threads and index thread */ + Mutex readThreadsMutex; + /* Condvar to indicate when read threads should start working */ + CondVar readThreadsCond; + /* Condvar to indicate when a read thread has finished a read */ + CondVar readThreadsReadDoneCond; + /* Threads to read data from disk */ + Thread *readerThreads; + /* Number of threads busy with reads */ + unsigned int busyReaderThreads; + /* The state of the reader threads */ + ReaderState readerState; + /* The lookup mode for the index */ + IndexLookupMode lookupMode; + /* Number of read threads to use (run-time parameter) */ + unsigned int numReadThreads; +} Volume; + +/** + * Create a volume. + * + * @param config The configuration to use. + * @param layout The index layout + * @param userParams The index session parameters. If NULL, the default + * session parameters will be used. + * @param readQueueMaxSize The maximum size of the read queue. + * @param zoneCount The number of zones to use. + * @param newVolume A pointer to hold a pointer to the new volume. 
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int makeVolume(const Configuration         *config,
+               IndexLayout                 *layout,
+               const struct uds_parameters *userParams,
+               unsigned int                 readQueueMaxSize,
+               unsigned int                 zoneCount,
+               Volume                      **newVolume)
+  __attribute__((warn_unused_result));
+
+/**
+ * Clean up a volume and its memory.
+ *
+ * @param volume The volume to destroy.
+ **/
+void freeVolume(Volume *volume);
+
+/**
+ * Enqueue a page read.
+ *
+ * @param volume        the volume
+ * @param request       the request that is waiting on the read
+ * @param physicalPage  the page number to read
+ *
+ * @return UDS_QUEUED if successful, or an error code
+ **/
+int enqueuePageRead(Volume *volume, Request *request, int physicalPage)
+  __attribute__((warn_unused_result));
+
+/**
+ * Find the lowest and highest contiguous chapters and determine their
+ * virtual chapter numbers.
+ *
+ * @param [in]  volume      The volume to probe.
+ * @param [out] lowestVCN   Pointer for lowest virtual chapter number.
+ * @param [out] highestVCN  Pointer for highest virtual chapter number.
+ * @param [out] isEmpty     Pointer to a bool indicating whether or not the
+ *                          volume is empty.
+ *
+ * @return UDS_SUCCESS, or an error code.
+ *
+ * @note This routine does something similar to a binary search to find
+ *       the location in the volume file where the discontinuity of
+ *       chapter numbers occurs. In a good save, the discontinuity is
+ *       a sharp cliff, but if write failures occurred during saving
+ *       there may be one or more chapters which are partially written.
+ *
+ * @note This method takes advantage of the fact that the physical
+ *       chapter number in which the index pages are found should have
+ *       headers which state that the virtual chapter numbers are all
+ *       identical and maintain the invariant that
+ *       pcn == vcn % chaptersPerVolume.
+ **/
+int findVolumeChapterBoundaries(Volume   *volume,
+                                uint64_t *lowestVCN,
+                                uint64_t *highestVCN,
+                                bool     *isEmpty)
+  __attribute__((warn_unused_result));
+
+/**
+ * Find any matching metadata for the given name within a given physical
+ * chapter.
+ *
+ * @param volume          The volume.
+ * @param request         The request originating the search.
+ * @param name            The block name of interest.
+ * @param virtualChapter  The number of the chapter to search.
+ * @param metadata        The old metadata for the name.
+ * @param found           A pointer which will be set to
+ *                        true if a match was found.
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int searchVolumePageCache(Volume             *volume,
+                          Request            *request,
+                          const UdsChunkName *name,
+                          uint64_t            virtualChapter,
+                          UdsChunkData       *metadata,
+                          bool               *found)
+  __attribute__((warn_unused_result));
+
+/**
+ * Fetch a record page from the cache or read it from the volume and search it
+ * for a chunk name.
+ *
+ * If a match is found, optionally returns the metadata from the stored
+ * record. If the requested record page is not cached, the page fetch may be
+ * asynchronously completed on the slow lane, in which case UDS_QUEUED will be
+ * returned and the request will be requeued for continued processing after
+ * the page is read and added to the cache.
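+ *
+ * A sketch of the expected calling pattern (illustrative only; "request",
+ * "name", "chapter", and "recordPageNumber" are assumed to come from the
+ * caller's context):
+ *
+ *   UdsChunkData duplicate;
+ *   bool found = false;
+ *   int result = searchCachedRecordPage(volume, request, name, chapter,
+ *                                       recordPageNumber, &duplicate, &found);
+ *   if (result == UDS_QUEUED) {
+ *     // The request was handed to a reader thread and will be restarted
+ *     // once the page has been read into the cache.
+ *     return UDS_QUEUED;
+ *   }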
+ * + * @param volume the volume containing the record page to search + * @param request the request originating the search (may be NULL for + * a direct query from volume replay) + * @param name the name of the block or chunk + * @param chapter the chapter to search + * @param recordPageNumber the record page number of the page to search + * @param duplicate an array in which to place the metadata of the + * duplicate, if one was found + * @param found a (bool *) which will be set to true if the chunk + * was found + * + * @return UDS_SUCCESS, UDS_QUEUED, or an error code + **/ +int searchCachedRecordPage(Volume *volume, + Request *request, + const UdsChunkName *name, + unsigned int chapter, + int recordPageNumber, + UdsChunkData *duplicate, + bool *found) + __attribute__((warn_unused_result)); + +/** + * Forget the contents of a chapter. Invalidates any cached state for the + * specified chapter. + * + * @param volume the volume containing the chapter + * @param chapter the virtual chapter number + * @param reason the reason for invalidation + * + * @return UDS_SUCCESS or an error code + **/ +int forgetChapter(Volume *volume, + uint64_t chapter, + InvalidationReason reason) + __attribute__((warn_unused_result)); + +/** + * Write a chapter's worth of index pages to a volume + * + * @param volume the volume containing the chapter + * @param physicalPage the page number in the volume for the chapter + * @param chapterIndex the populated delta chapter index + * @param pages pointer to array of page pointers. Used only in testing + * to return what data has been written to disk. + * + * @return UDS_SUCCESS or an error code + **/ +int writeIndexPages(Volume *volume, + int physicalPage, + OpenChapterIndex *chapterIndex, + byte **pages) +__attribute__((warn_unused_result)); + +/** + * Write a chapter's worth of record pages to a volume + * + * @param volume the volume containing the chapter + * @param physicalPage the page number in the volume for the chapter + * @param records a 1-based array of chunk records in the chapter + * @param pages pointer to array of page pointers. Used only in testing + * to return what data has been written to disk. + * + * @return UDS_SUCCESS or an error code + **/ +int writeRecordPages(Volume *volume, + int physicalPage, + const UdsChunkRecord records[], + byte **pages) +__attribute__((warn_unused_result)); + +/** + * Write the index and records from the most recently filled chapter to the + * volume. + * + * @param volume the volume containing the chapter + * @param chapterIndex the populated delta chapter index + * @param records a 1-based array of chunk records in the chapter + * + * @return UDS_SUCCESS or an error code + **/ +int writeChapter(Volume *volume, + OpenChapterIndex *chapterIndex, + const UdsChunkRecord records[]) + __attribute__((warn_unused_result)); + +/** + * Read all the index pages for a chapter from the volume and initialize an + * array of ChapterIndexPages to represent them. 
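+ *
+ * For example, a caller is expected to size both arrays by the chapter
+ * geometry (a sketch only; allocation and cleanup of the arrays are
+ * omitted):
+ *
+ *   unsigned int pages = volume->geometry->indexPagesPerChapter;
+ *   // volumePages and indexPages must each provide "pages" entries.
+ *   int result = readChapterIndexFromVolume(volume, virtualChapter,
+ *                                           volumePages, indexPages);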
+ *
+ * @param [in]  volume          the volume containing the chapter
+ * @param [in]  virtualChapter  the virtual chapter number of the index to read
+ * @param [out] volumePages     an array to receive the raw index page data
+ * @param [out] indexPages      an array of ChapterIndexPages to initialize
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int readChapterIndexFromVolume(const Volume       *volume,
+                               uint64_t            virtualChapter,
+                               struct volume_page  volumePages[],
+                               DeltaIndexPage      indexPages[])
+  __attribute__((warn_unused_result));
+
+/**
+ * Retrieve a page either from the cache (if we can) or from disk. If a read
+ * from disk is required, this is done immediately in the same thread and the
+ * page is then returned.
+ *
+ * The caller of this function must be holding the volume read mutex before
+ * calling this function.
+ *
+ * As a side-effect, the retrieved page will become the most recent page in
+ * the cache.
+ *
+ * This function is only exposed for the use of unit tests.
+ *
+ * @param volume        The volume containing the page
+ * @param request       The request originating the search
+ * @param physicalPage  The physical page number
+ * @param probeType     The type of cache access being done
+ * @param entryPtr      A pointer to hold the retrieved cached entry
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int getPageLocked(Volume         *volume,
+                  Request        *request,
+                  unsigned int    physicalPage,
+                  CacheProbeType  probeType,
+                  CachedPage    **entryPtr)
+  __attribute__((warn_unused_result));
+
+/**
+ * Retrieve a page either from the cache (if we can) or from disk. If a read
+ * from disk is required, the read request is enqueued for later processing
+ * by another thread. When that thread finally reads the page into the cache,
+ * a callback function is called to inform the caller that the read is
+ * complete.
+ *
+ * The caller of this function should not be holding the volume read lock.
+ * Instead, the caller must call beginPendingSearch() for the given zone
+ * the request is being processed in. That state will be maintained or
+ * restored when the call returns, at which point the caller should call
+ * endPendingSearch().
+ *
+ * As a side-effect, the retrieved page will become the most recent page in
+ * the cache.
+ *
+ * This function is only exposed for the use of unit tests.
+ *
+ * @param volume        The volume containing the page
+ * @param request       The request originating the search
+ * @param physicalPage  The physical page number
+ * @param probeType     The type of cache access being done
+ * @param entryPtr      A pointer to hold the retrieved cached entry
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int getPageProtected(Volume         *volume,
+                     Request        *request,
+                     unsigned int    physicalPage,
+                     CacheProbeType  probeType,
+                     CachedPage    **entryPtr)
+  __attribute__((warn_unused_result));
+
+/**
+ * Retrieve a page either from the cache (if we can) or from disk. If a read
+ * from disk is required, this is done immediately in the same thread and the
+ * page is then returned.
+ *
+ * The caller of this function must not be holding the volume read lock before
+ * calling this function. This method will grab that lock and release it
+ * when it returns.
+ *
+ * As a side-effect, the retrieved page will become the most recent page in
+ * the cache.
+ *
+ * This function should only be called by areas of the code that do not use
+ * multi-threading to access the volume. These include rebuild, volume
+ * explorer, and certain unit tests.
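+ *
+ * A typical call from such single-threaded code looks roughly like this
+ * (illustrative only; chapter and pageNumber are assumed to be in range for
+ * the volume geometry):
+ *
+ *   byte *data = NULL;
+ *   DeltaIndexPage *indexPage = NULL;
+ *   int result = getPage(volume, chapter, pageNumber,
+ *                        CACHE_PROBE_INDEX_FIRST, &data, &indexPage);
+ *   if (result != UDS_SUCCESS) {
+ *     return result;
+ *   }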
+ *
+ * @param volume        The volume containing the page
+ * @param chapter       The number of the chapter containing the page
+ * @param pageNumber    The number of the page
+ * @param probeType     The type of cache access being done
+ * @param dataPtr       Pointer to hold the retrieved page, NULL if not wanted
+ * @param indexPagePtr  Pointer to hold the retrieved chapter index page, or
+ *                      NULL if not wanted
+ *
+ * @return UDS_SUCCESS or an error code
+ **/
+int getPage(Volume          *volume,
+            unsigned int     chapter,
+            unsigned int     pageNumber,
+            CacheProbeType   probeType,
+            byte           **dataPtr,
+            DeltaIndexPage **indexPagePtr)
+  __attribute__((warn_unused_result));
+
+/**********************************************************************/
+size_t getCacheSize(Volume *volume) __attribute__((warn_unused_result));
+
+/**********************************************************************/
+int findVolumeChapterBoundariesImpl(unsigned int  chapterLimit,
+                                    unsigned int  maxBadChapters,
+                                    uint64_t     *lowestVCN,
+                                    uint64_t     *highestVCN,
+                                    int (*probeFunc)(void         *aux,
+                                                     unsigned int  chapter,
+                                                     uint64_t     *vcn),
+                                    void *aux)
+  __attribute__((warn_unused_result));
+
+/**
+ * Map a chapter number and page number to a physical volume page number.
+ *
+ * @param geometry  the layout of the volume
+ * @param chapter   the chapter number of the desired page
+ * @param page      the chapter page number of the desired page
+ *
+ * @return the physical page number
+ **/
+int mapToPhysicalPage(const Geometry *geometry, int chapter, int page)
+  __attribute__((warn_unused_result));
+
+#endif /* VOLUME_H */
diff --git a/source/uds/volumeStore.c b/source/uds/volumeStore.c
new file mode 100644
index 0000000..8b9f820
--- /dev/null
+++ b/source/uds/volumeStore.c
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ * + * $Id: //eng/uds-releases/jasper/src/uds/volumeStore.c#2 $ + */ + +#include "geometry.h" +#include "indexLayout.h" +#include "logger.h" +#include "uds-error.h" +#include "volumeStore.h" + + +/*****************************************************************************/ +void closeVolumeStore(struct volume_store *volumeStore) +{ +#ifdef __KERNEL__ + if (volumeStore->vs_client != NULL) { + dm_bufio_client_destroy(volumeStore->vs_client); + volumeStore->vs_client = NULL; + } +#else + if (volumeStore->vs_region != NULL) { + putIORegion(volumeStore->vs_region); + volumeStore->vs_region = NULL; + } +#endif +} + +/*****************************************************************************/ +void destroyVolumePage(struct volume_page *volumePage) +{ +#ifdef __KERNEL__ + releaseVolumePage(volumePage); +#else + FREE(volumePage->vp_data); + volumePage->vp_data = NULL; +#endif +} + +/*****************************************************************************/ +int initializeVolumePage(const struct geometry *geometry, + struct volume_page *volumePage) +{ +#ifdef __KERNEL__ + volumePage->vp_buffer = NULL; + return UDS_SUCCESS; +#else + return ALLOCATE_IO_ALIGNED(geometry->bytesPerPage, byte, __func__, + &volumePage->vp_data); +#endif +} + +/*****************************************************************************/ +int openVolumeStore(struct volume_store *volumeStore, + IndexLayout *layout, + unsigned int reservedBuffers __attribute__((unused)), + size_t bytesPerPage) +{ +#ifdef __KERNEL__ + return openVolumeBufio(layout, bytesPerPage, reservedBuffers, + &volumeStore->vs_client); +#else + volumeStore->vs_bytesPerPage = bytesPerPage; + return openVolumeRegion(layout, &volumeStore->vs_region); +#endif +} + +/*****************************************************************************/ +void prefetchVolumePages(const struct volume_store *vs __attribute__((unused)), + unsigned int physicalPage __attribute__((unused)), + unsigned int pageCount __attribute__((unused))) +{ +#ifdef __KERNEL__ + dm_bufio_prefetch(vs->vs_client, physicalPage, pageCount); +#else + // Nothing to do in user mode +#endif +} + +/*****************************************************************************/ +int prepareToWriteVolumePage(const struct volume_store *volumeStore + __attribute__((unused)), + unsigned int physicalPage + __attribute__((unused)), + struct volume_page *volumePage + __attribute__((unused))) +{ +#ifdef __KERNEL__ + releaseVolumePage(volumePage); + struct dm_buffer *buffer = NULL; + byte *data = dm_bufio_new(volumeStore->vs_client, physicalPage, &buffer); + if (IS_ERR(data)) { + return -PTR_ERR(data); + } + volumePage->vp_buffer = buffer; +#else + // Nothing to do in user mode +#endif + return UDS_SUCCESS; +} + +/*****************************************************************************/ +int readVolumePage(const struct volume_store *volumeStore, + unsigned int physicalPage, + struct volume_page *volumePage) +{ +#ifdef __KERNEL__ + releaseVolumePage(volumePage); + byte *data = dm_bufio_read(volumeStore->vs_client, physicalPage, + &volumePage->vp_buffer); + if (IS_ERR(data)) { + return logWarningWithStringError(-PTR_ERR(data), + "error reading physical page %u", + physicalPage); + } +#else + off_t offset = (off_t) physicalPage * volumeStore->vs_bytesPerPage; + int result = readFromRegion(volumeStore->vs_region, offset, + getPageData(volumePage), + volumeStore->vs_bytesPerPage, NULL); + if (result != UDS_SUCCESS) { + return logWarningWithStringError(result, + "error reading physical page %u", + 
                                     physicalPage);
+  }
+#endif
+  return UDS_SUCCESS;
+}
+
+/*****************************************************************************/
+void releaseVolumePage(struct volume_page *volumePage __attribute__((unused)))
+{
+#ifdef __KERNEL__
+  if (volumePage->vp_buffer != NULL) {
+    dm_bufio_release(volumePage->vp_buffer);
+    volumePage->vp_buffer = NULL;
+  }
+#else
+  // Nothing to do in user mode
+#endif
+}
+
+/*****************************************************************************/
+void swapVolumePages(struct volume_page *volumePage1,
+                     struct volume_page *volumePage2)
+{
+  struct volume_page temp = *volumePage1;
+  *volumePage1 = *volumePage2;
+  *volumePage2 = temp;
+}
+
+/*****************************************************************************/
+int syncVolumeStore(const struct volume_store *volumeStore)
+{
+#ifdef __KERNEL__
+  int result = -dm_bufio_write_dirty_buffers(volumeStore->vs_client);
+#else
+  int result = syncRegionContents(volumeStore->vs_region);
+#endif
+  if (result != UDS_SUCCESS) {
+    return logErrorWithStringError(result, "cannot sync chapter to volume");
+  }
+  return UDS_SUCCESS;
+}
+
+/*****************************************************************************/
+int writeVolumePage(const struct volume_store *volumeStore,
+                    unsigned int physicalPage,
+                    struct volume_page *volumePage)
+{
+#ifdef __KERNEL__
+  dm_bufio_mark_buffer_dirty(volumePage->vp_buffer);
+  return UDS_SUCCESS;
+#else
+  off_t offset = (off_t) physicalPage * volumeStore->vs_bytesPerPage;
+  return writeToRegion(volumeStore->vs_region, offset, getPageData(volumePage),
+                       volumeStore->vs_bytesPerPage,
+                       volumeStore->vs_bytesPerPage);
+#endif
+}
diff --git a/source/uds/volumeStore.h b/source/uds/volumeStore.h
new file mode 100644
index 0000000..f475427
--- /dev/null
+++ b/source/uds/volumeStore.h
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/uds-releases/jasper/src/uds/volumeStore.h#2 $
+ */
+
+#ifndef VOLUME_STORE_H
+#define VOLUME_STORE_H
+
+#include "common.h"
+#include "compiler.h"
+#include "memoryAlloc.h"
+
+#ifdef __KERNEL__
+#include <linux/dm-bufio.h>
+#else
+#include "ioRegion.h"
+#endif
+
+struct geometry;
+struct indexLayout;
+
+
+struct volume_store {
+#ifdef __KERNEL__
+  struct dm_bufio_client *vs_client;
+#else
+  IORegion               *vs_region;
+  size_t                  vs_bytesPerPage;
+#endif
+};
+
+
+struct volume_page {
+#ifdef __KERNEL__
+  struct dm_buffer *vp_buffer;
+#else
+  byte             *vp_data;
+#endif
+};
+
+/**
+ * Close a volume store.
+ *
+ * @param volumeStore  The volume store
+ **/
+void closeVolumeStore(struct volume_store *volumeStore);
+
+/**
+ * Uninitialize a volume page buffer.
+ *
+ * @param volumePage  The volume page buffer
+ **/
+void destroyVolumePage(struct volume_page *volumePage);
+
+/**
+ * Get a pointer to the data contained in a volume page buffer.
+ * + * @param volumePage The volume page buffer + * + * @return the address of the data + **/ +__attribute__((warn_unused_result)) +static INLINE byte *getPageData(const struct volume_page *volumePage) +{ +#ifdef __KERNEL__ + return dm_bufio_get_block_data(volumePage->vp_buffer); +#else + return volumePage->vp_data; +#endif +} + +/** + * Initialize a volume page buffer. + * + * @param geometry The volume geometry + * @param volumePage The volume page buffer + * + * @return UDS_SUCCESS or an error status + **/ +int initializeVolumePage(const struct geometry *geometry, + struct volume_page *volumePage) + __attribute__((warn_unused_result)); + +/** + * Open a volume store. + * + * @param volumeStore The volume store + * @param layout The index layout + * @param reservedBuffers The number of buffers that can be reserved + * @param bytesPerPage The number of bytes in a volume page + **/ +int openVolumeStore(struct volume_store *volumeStore, + struct indexLayout *layout, + unsigned int reservedBuffers, + size_t bytesPerPage) + __attribute__((warn_unused_result)); + +/** + * Prefetch volume pages into memory. + * + * @param volumeStore The volume store + * @param physicalPage The volume page number of the first desired page + * @param pageCount The number of volume pages to prefetch + **/ +void prefetchVolumePages(const struct volume_store *volumeStore, + unsigned int physicalPage, + unsigned int pageCount); + +/** + * Prepare a buffer to write a page to the volume. + * + * @param volumeStore The volume store + * @param physicalPage The volume page number of the desired page + * @param volumePage The volume page buffer + * + * @return UDS_SUCCESS or an error code + **/ +int prepareToWriteVolumePage(const struct volume_store *volumeStore, + unsigned int physicalPage, + struct volume_page *volumePage) + __attribute__((warn_unused_result)); + +/** + * Read a page from a volume store. + * + * @param volumeStore The volume store + * @param physicalPage The volume page number of the desired page + * @param volumePage The volume page buffer + * + * @return UDS_SUCCESS or an error code + **/ +int readVolumePage(const struct volume_store *volumeStore, + unsigned int physicalPage, + struct volume_page *volumePage) + __attribute__((warn_unused_result)); + +/** + * Release a volume page buffer, because it will no longer be accessed before a + * call to readVolumePage or prepareToWriteVolumePage. + * + * @param volumePage The volume page buffer + **/ +void releaseVolumePage(struct volume_page *volumePage); + +/** + * Swap volume pages. This is used to put the contents of a newly written + * index page (in the scratch page) into the page cache. + * + * @param volumePage1 The volume page buffer + * @param volumePage2 The volume page buffer + **/ +void swapVolumePages(struct volume_page *volumePage1, + struct volume_page *volumePage2); + +/** + * Sync the volume store to storage. + * + * @param volumeStore The volume store + * + * @return UDS_SUCCESS or an error code + **/ +int syncVolumeStore(const struct volume_store *volumeStore) + __attribute__((warn_unused_result)); + +/** + * Write a page to a volume store. 
+ * + * @param volumeStore The volume store + * @param physicalPage The volume page number of the desired page + * @param volumePage The volume page buffer + * + * @return UDS_SUCCESS or an error code + **/ +int writeVolumePage(const struct volume_store *volumeStore, + unsigned int physicalPage, + struct volume_page *volumePage) + __attribute__((warn_unused_result)); + +#endif /* VOLUME_STORE_H */ diff --git a/source/uds/zone.c b/source/uds/zone.c new file mode 100644 index 0000000..cc07674 --- /dev/null +++ b/source/uds/zone.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/zone.c#4 $ + */ + +#include "zone.h" + +#include "logger.h" +#include "threads.h" + +/**********************************************************************/ +unsigned int getZoneCount(const struct uds_parameters *userParams) +{ + unsigned int zoneCount = (userParams == NULL) ? 0 : userParams->zone_count; + if (zoneCount == 0) { + zoneCount = getNumCores() / 2; + } + if (zoneCount < 1) { + zoneCount = 1; + } + if (zoneCount > MAX_ZONES) { + zoneCount = MAX_ZONES; + } + logInfo("Using %u indexing zone%s for concurrency.", zoneCount, + zoneCount == 1 ? "" : "s"); + return zoneCount; +} diff --git a/source/uds/zone.h b/source/uds/zone.h new file mode 100644 index 0000000..99daf40 --- /dev/null +++ b/source/uds/zone.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/uds-releases/jasper/src/uds/zone.h#2 $ + */ + +#ifndef ZONE_H +#define ZONE_H + +#include "uds.h" + +enum { + MAX_ZONES = 16, +}; + +/** + * Return the number of zones. + * + * @param userParams the index session parameters. If NULL, the default + * session parameters will be used. 
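+ *
+ * As a worked example (the core count is only an illustrative assumption):
+ * with a zone_count of 0 on a machine where getNumCores() returns 12, the
+ * count defaults to 12 / 2 = 6 zones; a requested zone_count of 40 would be
+ * clamped to MAX_ZONES (16); and a single-core machine still gets 1 zone.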
+ * + * @return the number of zones + **/ +unsigned int getZoneCount(const struct uds_parameters *userParams) + __attribute__((warn_unused_result)); + +#endif /* ZONE_H */ diff --git a/source/vdo/Makefile b/source/vdo/Makefile new file mode 100644 index 0000000..816c219 --- /dev/null +++ b/source/vdo/Makefile @@ -0,0 +1,31 @@ +VDO_VERSION = 6.2.4.26 + +VDO_VERSION_MAJOR = $(word 1,$(subst ., ,$(VDO_VERSION))) +VDO_VERSION_MINOR = $(word 2,$(subst ., ,$(VDO_VERSION))) +VDO_VERSION_MICRO = $(word 3,$(subst ., ,$(VDO_VERSION))) + +SOURCES = $(addprefix base/,$(notdir $(wildcard $(src)/base/*.c))) +SOURCES += $(addprefix kernel/,$(notdir $(wildcard $(src)/kernel/*.c))) +OBJECTS = $(SOURCES:%.c=%.o) +INCLUDES = -I$(src)/base -I$(src)/kernel -I$(src)/../uds + +EXTRA_CFLAGS = -std=gnu99 \ + -fno-builtin-memset \ + -Werror \ + -Wframe-larger-than=400 \ + -Wno-declaration-after-statement \ + -DVDO_VERSION_MAJOR=$(VDO_VERSION_MAJOR) \ + -DVDO_VERSION_MINOR=$(VDO_VERSION_MINOR) \ + -DVDO_VERSION_MICRO=$(VDO_VERSION_MICRO) \ + -DCURRENT_VERSION=\"$(VDO_VERSION)\" \ + $(INCLUDES) + +CFLAGS_REMOVE_vdoPageCache.o= -std=gnu99 +CFLAGS_REMOVE_vio.o= -std=gnu99 + +CFLAGS_vdoPageCache.o= -std=gnu89 +CFLAGS_vio.o= -std=gnu89 + +obj-m += kvdo.o + +kvdo-objs = $(OBJECTS) diff --git a/source/vdo/base/actionManager.c b/source/vdo/base/actionManager.c new file mode 100644 index 0000000..664131d --- /dev/null +++ b/source/vdo/base/actionManager.c @@ -0,0 +1,399 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/actionManager.c#9 $ + */ + +#include "actionManager.h" + +#include "memoryAlloc.h" + +#include "adminState.h" +#include "completion.h" +#include "types.h" + +/** An action to be performed in each of a set of zones */ +typedef struct action Action; +struct action { + /** Whether this structure is in use */ + bool inUse; + /** The admin operation associated with this action */ + AdminStateCode operation; + /** + * The method to run on the initiator thread before the action is applied to + * each zone. 
+ **/ + ActionPreamble *preamble; + /** The action to be performed in each zone */ + ZoneAction *zoneAction; + /** + * The method to run on the initiator thread after the action has been + * applied to each zone + **/ + ActionConclusion *conclusion; + /** The object to notify when the action is complete */ + VDOCompletion *parent; + /** The action specific context */ + void *context; + /** The action to perform after this one */ + Action *next; +}; + +struct actionManager { + /** The completion for performing actions */ + VDOCompletion completion; + /** The state of this action manager */ + AdminState state; + /** The two action slots*/ + Action actions[2]; + /** The current action slot */ + Action *currentAction; + /** The number of zones in which an action is to be applied */ + ZoneCount zones; + /** A function to schedule a default next action */ + ActionScheduler *scheduler; + /** + * A function to get the id of the thread on which to apply an action to a + * zone + **/ + ZoneThreadGetter *getZoneThreadID; + /** The ID of the thread on which actions may be initiated */ + ThreadID initiatorThreadID; + /** Opaque data associated with this action manager */ + void *context; + /** The zone currently being acted upon */ + ZoneCount actingZone; +}; + +/** + * Convert a generic VDOCompletion to a ActionManager. + * + * @param completion The completion to convert + * + * @return The completion as a ActionManager + **/ +static inline ActionManager *asActionManager(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(ActionManager, completion) == 0); + assertCompletionType(completion->type, ACTION_COMPLETION); + return (ActionManager *) completion; +} + +/** + * An action scheduler which does not schedule an action. + * + *
Implements ActionScheduler. + **/ +static bool noDefaultAction(void *context __attribute__((unused))) +{ + return false; +} + +/** + * A default preamble which does nothing. + * + *
Implements ActionPreamble + **/ +static void noPreamble(void *context __attribute__((unused)), + VDOCompletion *completion) +{ + completeCompletion(completion); +} + +/** + * A default conclusion which does nothing. + * + *
Implements ActionConclusion. + **/ +static int noConclusion(void *context __attribute__((unused))) { + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeActionManager(ZoneCount zones, + ZoneThreadGetter *getZoneThreadID, + ThreadID initiatorThreadID, + void *context, + ActionScheduler *scheduler, + PhysicalLayer *layer, + ActionManager **managerPtr) +{ + ActionManager *manager; + int result = ALLOCATE(1, ActionManager, __func__, &manager); + if (result != VDO_SUCCESS) { + return result; + } + + *manager = (ActionManager) { + .zones = zones, + .scheduler = ((scheduler == NULL) ? noDefaultAction : scheduler), + .getZoneThreadID = getZoneThreadID, + .initiatorThreadID = initiatorThreadID, + .context = context, + }; + + manager->actions[0].next = &manager->actions[1]; + manager->currentAction = manager->actions[1].next = &manager->actions[0]; + + result = initializeEnqueueableCompletion(&manager->completion, + ACTION_COMPLETION, layer); + if (result != VDO_SUCCESS) { + freeActionManager(&manager); + return result; + } + + *managerPtr = manager; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeActionManager(ActionManager **managerPtr) +{ + ActionManager *manager = *managerPtr; + if (manager == NULL) { + return; + } + + destroyEnqueueable(&manager->completion); + FREE(manager); + *managerPtr = NULL; +} + +/**********************************************************************/ +AdminStateCode getCurrentManagerOperation(ActionManager *manager) +{ + return manager->state.state; +} + +/**********************************************************************/ +void *getCurrentActionContext(ActionManager *manager) +{ + return (manager->currentAction->inUse + ? manager->currentAction->context : NULL); +} + +/**********************************************************************/ +static void finishActionCallback(VDOCompletion *completion); +static void applyToZone(VDOCompletion *completion); + +/** + * Get the thread ID for the current zone. + * + * @param manager The action manager + * + * @return The ID of the thread on which to run actions for the current zone + **/ +static ThreadID getActingZoneThreadID(ActionManager *manager) +{ + return manager->getZoneThreadID(manager->context, manager->actingZone); +} + +/** + * Prepare the manager's completion to run on the next zone. + * + * @param manager The action manager + **/ +static void prepareForNextZone(ActionManager *manager) +{ + prepareForRequeue(&manager->completion, applyToZone, + preserveErrorAndContinue, getActingZoneThreadID(manager), + manager->currentAction->parent); +} + +/** + * Prepare the manager's completion to run the conclusion on the initiator + * thread. + * + * @param manager The action manager + **/ +static void prepareForConclusion(ActionManager *manager) +{ + prepareForRequeue(&manager->completion, finishActionCallback, + preserveErrorAndContinue, manager->initiatorThreadID, + manager->currentAction->parent); +} + +/** + * Perform an action on the next zone if there is one. + * + * @param completion The action completion + **/ +static void applyToZone(VDOCompletion *completion) +{ + ActionManager *manager = asActionManager(completion); + ASSERT_LOG_ONLY((getCallbackThreadID() == getActingZoneThreadID(manager)), + "applyToZone() called on acting zones's thread"); + + ZoneCount zone = manager->actingZone++; + if (manager->actingZone == manager->zones) { + // We are about to apply to the last zone. 
Once that is finished, + // we're done, so go back to the initiator thread and finish up. + prepareForConclusion(manager); + } else { + // Prepare to come back on the next zone + prepareForNextZone(manager); + } + + manager->currentAction->zoneAction(manager->context, zone, completion); +} + +/** + * The error handler for preamble errors. + * + * @param completion The manager completion + **/ +static void handlePreambleError(VDOCompletion *completion) +{ + // Skip the zone actions since the preamble failed. + completion->callback = finishActionCallback; + preserveErrorAndContinue(completion); +} + +/** + * Launch the current action. + * + * @param manager The action manager + **/ +static void launchCurrentAction(ActionManager *manager) +{ + Action *action = manager->currentAction; + int result = startOperation(&manager->state, action->operation); + if (result != VDO_SUCCESS) { + if (action->parent != NULL) { + setCompletionResult(action->parent, result); + } + + // We aren't going to run the preamble, so don't run the conclusion + action->conclusion = noConclusion; + finishActionCallback(&manager->completion); + return; + } + + if (action->zoneAction == NULL) { + prepareForConclusion(manager); + } else { + manager->actingZone = 0; + prepareForRequeue(&manager->completion, applyToZone, handlePreambleError, + getActingZoneThreadID(manager), + manager->currentAction->parent); + } + + action->preamble(manager->context, &manager->completion); +} + +/**********************************************************************/ +bool scheduleDefaultAction(ActionManager *manager) +{ + // Don't schedule a default action if we are operating or not in normal + // operation. + return ((manager->state.state == ADMIN_STATE_NORMAL_OPERATION) + && manager->scheduler(manager->context)); +} + +/** + * Finish an action now that it has been applied to all zones. This + * callback is registered in applyToZone(). + * + * @param completion The action manager completion + **/ +static void finishActionCallback(VDOCompletion *completion) +{ + ActionManager *manager = asActionManager(completion); + Action action = *(manager->currentAction); + manager->currentAction->inUse = false; + manager->currentAction = manager->currentAction->next; + + // We need to check this now to avoid use-after-free issues if running the + // conclusion or notifying the parent results in the manager being freed. 
+ bool hasNextAction = (manager->currentAction->inUse + || scheduleDefaultAction(manager)); + int result = action.conclusion(manager->context); + finishOperation(&manager->state); + if (action.parent != NULL) { + finishCompletion(action.parent, result); + } + + if (hasNextAction) { + launchCurrentAction(manager); + } +} + +/**********************************************************************/ +bool scheduleAction(ActionManager *manager, + ActionPreamble *preamble, + ZoneAction *zoneAction, + ActionConclusion *conclusion, + VDOCompletion *parent) +{ + return scheduleOperation(manager, ADMIN_STATE_OPERATING, preamble, + zoneAction, conclusion, parent); +} + +/**********************************************************************/ +bool scheduleOperation(ActionManager *manager, + AdminStateCode operation, + ActionPreamble *preamble, + ZoneAction *zoneAction, + ActionConclusion *conclusion, + VDOCompletion *parent) +{ + return scheduleOperationWithContext(manager, operation, preamble, zoneAction, + conclusion, NULL, parent); +} + +/**********************************************************************/ +bool scheduleOperationWithContext(ActionManager *manager, + AdminStateCode operation, + ActionPreamble *preamble, + ZoneAction *zoneAction, + ActionConclusion *conclusion, + void *context, + VDOCompletion *parent) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() == manager->initiatorThreadID), + "action initiated from correct thread"); + Action *action; + if (!manager->currentAction->inUse) { + action = manager->currentAction; + } else if (!manager->currentAction->next->inUse) { + action = manager->currentAction->next; + } else { + if (parent != NULL) { + finishCompletion(parent, VDO_COMPONENT_BUSY); + } + + return false; + } + + *action = (Action) { + .inUse = true, + .operation = operation, + .preamble = (preamble == NULL) ? noPreamble : preamble, + .zoneAction = zoneAction, + .conclusion = (conclusion == NULL) ? noConclusion : conclusion, + .context = context, + .parent = parent, + .next = action->next, + }; + + if (action == manager->currentAction) { + launchCurrentAction(manager); + } + + return true; +} diff --git a/source/vdo/base/actionManager.h b/source/vdo/base/actionManager.h new file mode 100644 index 0000000..2e0ef13 --- /dev/null +++ b/source/vdo/base/actionManager.h @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/actionManager.h#6 $ + */ + +#ifndef ACTION_MANAGER_H +#define ACTION_MANAGER_H + +#include "adminState.h" +#include "completion.h" +#include "types.h" + +/** + * ActionManager provides a generic mechanism for applying actions to + * multi-zone entities (such as the block map or slab depot). Each action + * manager is tied to a specific context for which it manages actions. 
The + * manager ensures that only one action is active on that context at a time, + * and supports at most one pending action. Calls to schedule an action when + * there is already a pending action will result in VDO_COMPONENT_BUSY errors. + * Actions may only be submitted to the action manager from a single thread + * (which thread is determined when the action manager is constructed). + * + * A scheduled action consists of four components: + * preamble: an optional method to be run on the initator thread before + * applying the action to all zones + * zoneAction: an optional method to be applied to each of the zones + * conclusion: an optional method to be run on the initiator thread once the + * per-zone method has been applied to all zones + * parent: an optional completion to be finished once the conclusion + * is done + * + * At least one of the three methods must be provided. + **/ + +/** + * A function which is to be applied asynchronously to a set of zones. + * + * @param context The object which holds the per-zone context for the + * action + * @param zoneNumber The number of zone to which the action is being applied + * @param parent The object to notify when the action is complete + **/ +typedef void ZoneAction(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent); + +/** + * A function which is to be applied asynchronously on an action manager's + * initiator thread as the preamble of an action. + * + * @param context The object which holds the per-zone context for the action + * @param parent The object to notify when the action is complete + **/ +typedef void ActionPreamble(void *context, VDOCompletion *parent); + +/** + * A function which will run on the action manager's initiator thread as the + * conclusion of an action. + * + * @param context The object which holds the per-zone context for the action + * + * @return VDO_SUCCESS or an error + **/ +typedef int ActionConclusion(void *context); + +/** + * A function to schedule an action. + * + * @param context The object which holds the per-zone context for the action + * + * @return true if an action was scheduled + **/ +typedef bool ActionScheduler(void *context); + +/** + * Get the id of the thread associated with a given zone. + * + * @param context The action context + * @param zoneNumber The number of the zone for which the thread ID is desired + **/ +typedef ThreadID ZoneThreadGetter(void *context, ZoneCount zoneNumber); + +/** + * Make an action manager. + * + * @param [in] zones The number of zones to which actions will be + * applied + * @param [in] getZoneThreadID A function to get the thread id associated + * with a zone + * @param [in] initiatorThreadID The thread on which actions may initiated + * @param [in] context The object which holds the per-zone context + * for the action + * @param [in] scheduler A function to schedule a next action after an + * action concludes if there is no pending + * action (may be NULL) + * @param [in] layer The layer used to make completions + * @param [out] managerPtr A pointer to hold the new action manager + * + * @return VDO_SUCCESS or an error code + **/ +int makeActionManager(ZoneCount zones, + ZoneThreadGetter *getZoneThreadID, + ThreadID initiatorThreadID, + void *context, + ActionScheduler *scheduler, + PhysicalLayer *layer, + ActionManager **managerPtr) + __attribute__((warn_unused_result)); + +/** + * Destroy an action manager and null out the reference to it. 
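+ * A hypothetical lifecycle sketch (the zone-thread getter, context, layer,
+ * and zone action named here are illustrative, not part of this change):
+ *
+ *   ActionManager *manager;
+ *   int result = makeActionManager(zoneCount, getZoneThread, adminThreadID,
+ *                                  myContext, NULL, layer, &manager);
+ *   if (result == VDO_SUCCESS) {
+ *     scheduleAction(manager, NULL, applyToEachZone, NULL, parentCompletion);
+ *     // ... once all scheduled actions have concluded ...
+ *     freeActionManager(&manager);
+ *   }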
+ * + * @param managerPtr The reference to the manager to destroy + **/ +void freeActionManager(ActionManager **managerPtr); + +/** + * Get the current operation an action manager is performing. + * + * @param manager The manager to query + * + * @return The manager's current operation + **/ +AdminStateCode getCurrentManagerOperation(ActionManager *manager) + __attribute__((warn_unused_result)); + +/** + * Get the action-specific context for the operation an action manager is + * currently performing. + * + * @param manager The manager to query + * + * @return The action-specific context for the manager's current action or + * NULL if there is no context or no current action + **/ +void *getCurrentActionContext(ActionManager *manager) + __attribute__((warn_unused_result)); + +/** + * Attempt to schedule the default action. If the manager is not operating + * normally, the action will not be scheduled. + * + * @param manager The action manager + * + * @return true if an action was scheduled. + **/ +bool scheduleDefaultAction(ActionManager *manager); + +/** + * Schedule an action to be applied to all zones. The action will be launched + * immediately if there is no current action, or as soon as the current action + * completes. If there is already a pending action, this action will not be + * scheduled, and, if it has a parent, that parent will be notified. At least + * one of the preamble, zoneAction, or conclusion must not be NULL. + * + * @param manager The action manager to schedule the action on + * @param preamble A method to be invoked on the initiator thread once this + * action is started but before applying to each zone; may + * be NULL + * @param zoneAction The action to apply to each zone; may be NULL + * @param conclusion A method to be invoked back on the initiator thread once + * the action has been applied to all zones; may be NULL + * @param parent The object to notify once the action is complete or if + * the action can not be scheduled; may be NULL + * + * @return true if the action was scheduled + **/ +bool scheduleAction(ActionManager *manager, + ActionPreamble *preamble, + ZoneAction *zoneAction, + ActionConclusion *conclusion, + VDOCompletion *parent); + +/** + * Schedule an operation to be applied to all zones. The operation's action + * will be launched immediately if there is no current action, or as soon as + * the current action completes. If there is already a pending action, this + * operation will not be scheduled, and, if it has a parent, that parent will + * be notified. At least one of the preamble, zoneAction, or conclusion must + * not be NULL. + * + * @param manager The action manager to schedule the action on + * @param operation The operation this action will perform + * @param preamble A method to be invoked on the initiator thread once this + * action is started but before applying to each zone; may + * be NULL + * @param zoneAction The action to apply to each zone; may be NULL + * @param conclusion A method to be invoked back on the initiator thread once + * the action has been applied to all zones; may be NULL + * @param parent The object to notify once the action is complete or if + * the action can not be scheduled; may be NULL + * + * @return true if the action was scheduled + **/ +bool scheduleOperation(ActionManager *manager, + AdminStateCode operation, + ActionPreamble *preamble, + ZoneAction *zoneAction, + ActionConclusion *conclusion, + VDOCompletion *parent); + +/** + * Schedule an operation to be applied to all zones. 
The operation's action + * will be launched immediately if there is no current action, or as soon as + * the current action completes. If there is already a pending action, this + * operation will not be scheduled, and, if it has a parent, that parent will + * be notified. At least one of the preamble, zoneAction, or conclusion must + * not be NULL. + * + * @param manager The action manager to schedule the action on + * @param operation The operation this action will perform + * @param preamble A method to be invoked on the initiator thread once this + * action is started but before applying to each zone; may + * be NULL + * @param zoneAction The action to apply to each zone; may be NULL + * @param conclusion A method to be invoked back on the initiator thread once + * the action has been applied to all zones; may be NULL + * @param context An action-specific context which may be retrieved via + * getCurrentActionContext(); may be NULL + * @param parent The object to notify once the action is complete or if + * the action can not be scheduled; may be NULL + * + * @return true if the action was scheduled + **/ +bool scheduleOperationWithContext(ActionManager *manager, + AdminStateCode operation, + ActionPreamble *preamble, + ZoneAction *zoneAction, + ActionConclusion *conclusion, + void *context, + VDOCompletion *parent); + +#endif // ACTION_MANAGER_H diff --git a/source/vdo/base/adminCompletion.c b/source/vdo/base/adminCompletion.c new file mode 100644 index 0000000..5c5ed26 --- /dev/null +++ b/source/vdo/base/adminCompletion.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/adminCompletion.c#4 $ + */ + +#include "adminCompletion.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +#include "atomic.h" +#include "completion.h" +#include "types.h" +#include "vdoInternal.h" + +/**********************************************************************/ +void assertAdminOperationType(AdminCompletion *completion, + AdminOperationType expected) +{ + ASSERT_LOG_ONLY(completion->type == expected, + "admin operation type is %u instead of %u", + completion->type, expected); +} + +/**********************************************************************/ +AdminCompletion *adminCompletionFromSubTask(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(AdminCompletion, completion) == 0); + assertCompletionType(completion->type, SUB_TASK_COMPLETION); + VDOCompletion *parent = completion->parent; + assertCompletionType(parent->type, ADMIN_COMPLETION); + return (AdminCompletion *) parent; +} + +/**********************************************************************/ +void assertAdminPhaseThread(AdminCompletion *adminCompletion, + const char *what, + const char *phaseNames[]) +{ + ThreadID expected = adminCompletion->getThreadID(adminCompletion); + ASSERT_LOG_ONLY((getCallbackThreadID() == expected), + "%s on correct thread for %s", + what, phaseNames[adminCompletion->phase]); +} + +/**********************************************************************/ +VDO *vdoFromAdminSubTask(VDOCompletion *completion, + AdminOperationType expected) +{ + AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); + assertAdminOperationType(adminCompletion, expected); + return adminCompletion->completion.parent; +} + +/**********************************************************************/ +int initializeAdminCompletion(VDO *vdo, AdminCompletion *adminCompletion) +{ + int result = initializeEnqueueableCompletion(&adminCompletion->completion, + ADMIN_COMPLETION, vdo->layer); + if (result != VDO_SUCCESS) { + return result; + } + + result = initializeEnqueueableCompletion(&adminCompletion->subTaskCompletion, + SUB_TASK_COMPLETION, vdo->layer); + if (result != VDO_SUCCESS) { + uninitializeAdminCompletion(adminCompletion); + return result; + } + + atomicStoreBool(&adminCompletion->busy, false); + + return VDO_SUCCESS; +} + +/**********************************************************************/ +void uninitializeAdminCompletion(AdminCompletion *adminCompletion) +{ + destroyEnqueueable(&adminCompletion->subTaskCompletion); + destroyEnqueueable(&adminCompletion->completion); +} + +/**********************************************************************/ +VDOCompletion *resetAdminSubTask(VDOCompletion *completion) +{ + AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); + resetCompletion(completion); + completion->callbackThreadID = adminCompletion->getThreadID(adminCompletion); + return completion; +} + +/**********************************************************************/ +void prepareAdminSubTaskOnThread(VDO *vdo, + VDOAction *callback, + VDOAction *errorHandler, + ThreadID threadID) +{ + prepareForRequeue(&vdo->adminCompletion.subTaskCompletion, callback, + errorHandler, threadID, &vdo->adminCompletion); +} + +/**********************************************************************/ +void prepareAdminSubTask(VDO *vdo, + VDOAction *callback, + VDOAction *errorHandler) +{ + AdminCompletion *adminCompletion = &vdo->adminCompletion; + prepareAdminSubTaskOnThread(vdo, callback, 
errorHandler, + adminCompletion->completion.callbackThreadID); +} + +/** + * Callback for admin operations which will notify the layer that the operation + * is complete. + * + * @param completion The admin completion + **/ +static void adminOperationCallback(VDOCompletion *completion) +{ + completion->layer->completeAdminOperation(completion->layer); +} + +/**********************************************************************/ +int performAdminOperation(VDO *vdo, + AdminOperationType type, + ThreadIDGetterForPhase *threadIDGetter, + VDOAction *action, + VDOAction *errorHandler) +{ + AdminCompletion *adminCompletion = &vdo->adminCompletion; + if (!compareAndSwapBool(&adminCompletion->busy, false, true)) { + return logErrorWithStringError(VDO_COMPONENT_BUSY, + "Can't start admin operation of type %u, " + "another operation is already in progress", + type); + } + + prepareCompletion(&adminCompletion->completion, adminOperationCallback, + adminOperationCallback, + getAdminThread(getThreadConfig(vdo)), vdo); + adminCompletion->type = type; + adminCompletion->getThreadID = threadIDGetter; + adminCompletion->phase = 0; + prepareAdminSubTask(vdo, action, errorHandler); + + PhysicalLayer *layer = vdo->layer; + layer->enqueue(adminCompletion->subTaskCompletion.enqueueable); + layer->waitForAdminOperation(layer); + int result = adminCompletion->completion.result; + atomicStoreBool(&adminCompletion->busy, false); + return result; +} diff --git a/source/vdo/base/adminCompletion.h b/source/vdo/base/adminCompletion.h new file mode 100644 index 0000000..50eeecd --- /dev/null +++ b/source/vdo/base/adminCompletion.h @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/adminCompletion.h#4 $ + */ + +#ifndef ADMIN_COMPLETION_H +#define ADMIN_COMPLETION_H + +#include "atomic.h" +#include "completion.h" +#include "types.h" + +typedef enum adminOperationType { + ADMIN_OPERATION_UNKNOWN = 0, + ADMIN_OPERATION_GROW_LOGICAL, + ADMIN_OPERATION_GROW_PHYSICAL, + ADMIN_OPERATION_PREPARE_GROW_PHYSICAL, + ADMIN_OPERATION_LOAD, + ADMIN_OPERATION_RESUME, + ADMIN_OPERATION_SAVE, + ADMIN_OPERATION_SUSPEND, +} AdminOperationType; + +typedef struct adminCompletion AdminCompletion; + +/** + * A function which gets the ID of the thread on which the current phase of an + * admin operation should be run. 
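+ * An illustrative getter (this assumes, as performAdminOperation() arranges,
+ * that the admin completion's parent is the VDO; the phase-to-thread policy
+ * shown is only an example):
+ *
+ *   static ThreadID getThreadIDForPhase(AdminCompletion *adminCompletion)
+ *   {
+ *     VDO *vdo = adminCompletion->completion.parent;
+ *     return getAdminThread(getThreadConfig(vdo));
+ *   }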
+ * + * @param adminCompletion The AdminCompletion + * + * @return The ID of the thread on which the current phase should be performed + **/ +typedef ThreadID ThreadIDGetterForPhase(AdminCompletion *adminCompletion); + +struct adminCompletion { + /** The completion */ + VDOCompletion completion; + /** The sub-task completion */ + VDOCompletion subTaskCompletion; + /** Whether this completion is in use */ + AtomicBool busy; + /** The operation type */ + AdminOperationType type; + /** Method to get the ThreadID for the current phase */ + ThreadIDGetterForPhase *getThreadID; + /** The current phase of the operation */ + uint32_t phase; +}; + +/** + * Check that an AdminCompletion's type is as expected. + * + * @param completion The AdminCompletion to check + * @param expected The expected type + **/ +void assertAdminOperationType(AdminCompletion *completion, + AdminOperationType expected); + +/** + * Convert the sub-task completion of an AdminCompletion to an AdminCompletion. + * + * @param completion the AdminCompletion's sub-task completion + * + * @return The sub-task completion as its enclosing AdminCompletion + **/ +AdminCompletion *adminCompletionFromSubTask(VDOCompletion *completion) + __attribute__((warn_unused_result)); + +/** + * Assert that we are operating on the correct thread for the current phase. + * + * @param adminCompletion The AdminCompletion to check + * @param what The method doing the phase check + * @param phaseNames The names of the phases of the current operation + **/ +void assertAdminPhaseThread(AdminCompletion *adminCompletion, + const char *what, + const char *phaseNames[]); + +/** + * Get the VDO from the sub-task completion of its AdminCompletion. + * + * @param completion the sub-task completion + * @param expected the expected operation type of the AdminCompletion + * + * @return The VDO + **/ +VDO *vdoFromAdminSubTask(VDOCompletion *completion, + AdminOperationType expected) + __attribute__((warn_unused_result)); + +/** + * Initialize an admin completion. + * + * @param vdo The VDO which owns the completion + * @param adminCompletion The AdminCompletion to initialize + * + * @return VDO_SUCCESS or an error + **/ +int initializeAdminCompletion(VDO *vdo, AdminCompletion *adminCompletion) + __attribute__((warn_unused_result)); + +/** + * Clean up an admin completion's resources. + * + * @param adminCompletion The AdminCompletion to uninitialize + **/ +void uninitializeAdminCompletion(AdminCompletion *adminCompletion); + +/** + * Reset an AdminCompletion's sub-task completion. + * + * @param completion The AdminCompletion's sub-task completion + * + * @return The sub-task completion for the convenience of callers + **/ +VDOCompletion *resetAdminSubTask(VDOCompletion *completion); + +/** + * Prepare the sub-task completion of a VDO's AdminCompletion + * + * @param vdo The VDO + * @param callback The callback for the sub-task + * @param errorHandler The error handler for the sub-task + * @param threadID The ID of the thread on which to run the callback + **/ +void prepareAdminSubTaskOnThread(VDO *vdo, + VDOAction *callback, + VDOAction *errorHandler, + ThreadID threadID); + +/** + * Prepare the sub-task completion of a VDO's AdminCompletion to run on the + * same thread as the AdminCompletion's main completion. 
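+ * For example (the callback and error-handler names are illustrative):
+ *
+ *   prepareAdminSubTask(vdo, loadCallback, handleLoadError);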
+ * + * @param vdo The VDO + * @param callback The callback for the sub-task + * @param errorHandler The error handler for the sub-task + **/ +void prepareAdminSubTask(VDO *vdo, + VDOAction *callback, + VDOAction *errorHandler); + +/** + * Perform an administrative operation (load, suspend, grow logical, or grow + * physical). This method should not be called from base threads unless it is + * certain the calling thread won't be needed to perform the operation. It may + * (and should) be called from non-base threads. + * + * @param vdo The VDO on which to perform the operation + * @param type The type of operation to perform + * @param threadIDGetter A function for getting the ID of the thread on which + * a given phase should be run + * @param action The action which starts the operation + * @param errorHandler The error handler for the operation + * + * @return The result of the operation + **/ +int performAdminOperation(VDO *vdo, + AdminOperationType type, + ThreadIDGetterForPhase *threadIDGetter, + VDOAction *action, + VDOAction *errorHandler) + __attribute__((warn_unused_result)); + +#endif /* ADMIN_COMPLETION_H */ diff --git a/source/vdo/base/adminState.c b/source/vdo/base/adminState.c new file mode 100644 index 0000000..6b30315 --- /dev/null +++ b/source/vdo/base/adminState.c @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/adminState.c#14 $ + */ + +#include "adminState.h" + +#include "logger.h" +#include "permassert.h" + +#include "completion.h" +#include "types.h" + +/**********************************************************************/ +const char *getAdminStateCodeName(AdminStateCode code) +{ + switch (code) { + case ADMIN_STATE_NORMAL_OPERATION: + return "ADMIN_STATE_NORMAL_OPERATION"; + + case ADMIN_STATE_OPERATING: + return "ADMIN_STATE_OPERATING"; + + case ADMIN_STATE_FORMATTING: + return "ADMIN_STATE_FORMATTING"; + + case ADMIN_STATE_LOADING: + return "ADMIN_STATE_LOADING"; + + case ADMIN_STATE_LOADING_FOR_RECOVERY: + return "ADMIN_STATE_LOADING_FOR_RECOVERY"; + + case ADMIN_STATE_LOADING_FOR_REBUILD: + return "ADMIN_STATE_LOADING_FOR_REBUILD"; + + case ADMIN_STATE_NEW: + return "ADMIN_STATE_NEW"; + + case ADMIN_STATE_WAITING_FOR_RECOVERY: + return "ADMIN_STATE_WAITING_FOR_RECOVERY"; + + case ADMIN_STATE_RECOVERING: + return "ADMIN_STATE_RECOVERING"; + + case ADMIN_STATE_REBUILDING: + return "ADMIN_STATE_REBUILDING"; + + case ADMIN_STATE_SAVING: + return "ADMIN_STATE_SAVING"; + + case ADMIN_STATE_SAVED: + return "ADMIN_STATE_SAVED"; + + case ADMIN_STATE_SCRUBBING: + return "ADMIN_STATE_SCRUBBING"; + + case ADMIN_STATE_SAVE_FOR_SCRUBBING: + return "ADMIN_STATE_SAVE_FOR_SCRUBBING"; + + case ADMIN_STATE_SUSPENDING: + return "ADMIN_STATE_SUSPENDING"; + + case ADMIN_STATE_SUSPENDED: + return "ADMIN_STATE_SUSPENDED"; + + case ADMIN_STATE_SUSPENDED_OPERATION: + return "ADMIN_STATE_SUSPENDED_OPERATION"; + + case ADMIN_STATE_RESUMING: + return "ADMIN_STATE_RESUMING"; + + default: + return "INVALID ADMIN_STATE"; + } +} + +/**********************************************************************/ +const char *getAdminStateName(const AdminState *state) +{ + return getAdminStateCodeName(state->state); +} + +/**********************************************************************/ +static AdminStateCode getNextState(AdminStateCode previousState, + AdminStateCode operation) +{ + if (isQuiescingCode(operation)) { + return ((operation & ADMIN_TYPE_MASK) | ADMIN_FLAG_QUIESCENT); + } + + if (operation == ADMIN_STATE_SUSPENDED_OPERATION) { + return previousState; + } + + return ADMIN_STATE_NORMAL_OPERATION; +} + +/** + * Finish an operation if one is in progress. If there is a waiter, it will be + * notified. + * + * @param state The AdminState + * @param result The result of the operation + * + * @return true if an operation was in progress and has been + * finished. + **/ +static bool endOperation(AdminState *state, int result) +{ + if (!isOperating(state)) { + return false; + } + + if (state->starting) { + state->complete = true; + if (state->waiter != NULL) { + setCompletionResult(state->waiter, result); + } + } else { + state->complete = false; + state->state = state->nextState; + releaseCompletionWithResult(&state->waiter, result); + } + + return true; +} + +/** + * Begin an operation if it may be started given the current state. 
+ * + * @param state The AdminState + * @param operation The operation to begin + * @param waiter A completion to notify when the operation is complete; may + * be NULL + * @param initiator The AdminInitiator to call if the operation may begin; may + * be NULL + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int beginOperation(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator) +{ + int result; + if (isOperating(state) + || (isQuiescent(state) != isQuiescentOperation(operation))) { + result = logErrorWithStringError(VDO_INVALID_ADMIN_STATE, + "Can't start %s from %s", + getAdminStateCodeName(operation), + getAdminStateName(state)); + } else if (state->waiter != NULL) { + result = logErrorWithStringError(VDO_COMPONENT_BUSY, + "Can't start %s with extant waiter", + getAdminStateCodeName(operation)); + } else { + state->waiter = waiter; + state->nextState = getNextState(state->state, operation); + state->state = operation; + if (initiator != NULL) { + state->starting = true; + initiator(state); + state->starting = false; + if (state->complete) { + endOperation(state, VDO_SUCCESS); + } + } + + return VDO_SUCCESS; + } + + if (waiter != NULL) { + finishCompletion(waiter, result); + } + + return result; +} + +/** + * Check the result of a state validation. If the result failed, log an invalid + * state error and, if there is a waiter, notify it. + * + * @param valid true if the code is of an appropriate type + * @param code The code which failed to be of the correct type + * @param what What the code failed to be, for logging + * @param waiter The completion to notify of the error; may be NULL + * + * @return The result of the check + **/ +static bool checkCode(bool valid, + AdminStateCode code, + const char *what, + VDOCompletion *waiter) +{ + if (valid) { + return true; + } + + int result = logErrorWithStringError(VDO_INVALID_ADMIN_STATE, + "%s is not a %s", + getAdminStateCodeName(code), what); + if (waiter != NULL) { + finishCompletion(waiter, result); + } + + return false; +} + +/**********************************************************************/ +bool assertDrainOperation(AdminStateCode operation, VDOCompletion *waiter) +{ + return checkCode(isDrainOperation(operation), operation, "drain operation", + waiter); +} + +/**********************************************************************/ +bool startDraining(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator) +{ + return (assertDrainOperation(operation, waiter) + && (beginOperation(state, operation, waiter, initiator) + == VDO_SUCCESS)); +} + +/**********************************************************************/ +bool finishDraining(AdminState *state) +{ + return finishDrainingWithResult(state, VDO_SUCCESS); +} + +/**********************************************************************/ +bool finishDrainingWithResult(AdminState *state, int result) +{ + return (isDraining(state) && endOperation(state, result)); +} + +/**********************************************************************/ +bool assertLoadOperation(AdminStateCode operation, VDOCompletion *waiter) +{ + return checkCode(isLoadOperation(operation), operation, "load operation", + waiter); +} + +/**********************************************************************/ +bool startLoading(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator) +{ + return (assertLoadOperation(operation, waiter) + 
&& (beginOperation(state, operation, waiter, initiator) + == VDO_SUCCESS)); +} + +/**********************************************************************/ +bool finishLoading(AdminState *state) +{ + return finishLoadingWithResult(state, VDO_SUCCESS); +} + +/**********************************************************************/ +bool finishLoadingWithResult(AdminState *state, int result) +{ + return (isLoading(state) && endOperation(state, result)); +} + +/**********************************************************************/ +bool assertResumeOperation(AdminStateCode operation, VDOCompletion *waiter) +{ + return checkCode(isResumeOperation(operation), operation, "resume operation", + waiter); +} + +/**********************************************************************/ +bool startResuming(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator) +{ + return (assertResumeOperation(operation, waiter) + && (beginOperation(state, operation, waiter, initiator) + == VDO_SUCCESS)); +} + +/**********************************************************************/ +bool finishResuming(AdminState *state) +{ + return finishResumingWithResult(state, VDO_SUCCESS); +} + +/**********************************************************************/ +bool finishResumingWithResult(AdminState *state, int result) +{ + return (isResuming(state) && endOperation(state, result)); +} + +/**********************************************************************/ +int resumeIfQuiescent(AdminState *state) +{ + if (!isQuiescent(state)) { + return VDO_INVALID_ADMIN_STATE; + } + + state->state = ADMIN_STATE_NORMAL_OPERATION; + return VDO_SUCCESS; +} + +/** + * Check whether an AdminStateCode is an operation. + * + * @param code The operation to check + * @param waiter The completion to notify if the code is not an operation; may + * be NULL + * + * @return true if the code is an operation + **/ +static bool assertOperation(AdminStateCode code, VDOCompletion *waiter) +{ + return checkCode(isOperation(code), code, "operation", waiter); +} + +/**********************************************************************/ +int startOperation(AdminState *state, AdminStateCode operation) +{ + return (assertOperation(operation, NULL) + ? beginOperation(state, operation, NULL, NULL) + : VDO_INVALID_ADMIN_STATE); +} + +/**********************************************************************/ +bool startOperationWithWaiter(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator) +{ + return (assertOperation(operation, waiter) + && (beginOperation(state, operation, waiter, initiator) + == VDO_SUCCESS)); +} + +/**********************************************************************/ +bool finishOperation(AdminState *state) +{ + return finishOperationWithResult(state, VDO_SUCCESS); +} + +/**********************************************************************/ +bool finishOperationWithResult(AdminState *state, int result) +{ + return endOperation(state, result); +} diff --git a/source/vdo/base/adminState.h b/source/vdo/base/adminState.h new file mode 100644 index 0000000..5ab13cb --- /dev/null +++ b/source/vdo/base/adminState.h @@ -0,0 +1,666 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/adminState.h#17 $ + */ + +#ifndef ADMIN_STATE_H +#define ADMIN_STATE_H + +#include "completion.h" +#include "types.h" + +/** + * The list of state types. + **/ +typedef enum { + /** Normal operation, DataVIOs may be active */ + ADMIN_TYPE_NORMAL = 0, + /** + * Format: an operation for formatting a new VDO. + **/ + ADMIN_TYPE_FORMAT, + /** + * Recover: a recovery operation. + **/ + ADMIN_TYPE_RECOVER, + /** + * Rebuild: write data necessary for a full rebuild, drain outstanding I/O, + * and return to normal operation. + **/ + ADMIN_TYPE_REBUILD, + /** + * Save: write all dirty metadata thereby restoring the VDO to a clean state, + * drain outstanding I/O, and become quiescent. + **/ + ADMIN_TYPE_SAVE, + /** + * Scrub: load and/or save state necessary to scrub a slab. + **/ + ADMIN_TYPE_SCRUB, + /** + * Suspend: write enough dirty metadata to perform resize transactions, + * drain outstanding I/O, and become quiescent. + **/ + ADMIN_TYPE_SUSPEND, + /** + * Resume: return to normal from a quiescent state + **/ + ADMIN_TYPE_RESUME, + /** The mask for extracting the AdminType from and AdminStateCode */ + ADMIN_TYPE_MASK = 0xff, +} AdminType; + + +/** + * The bit position of flags used to categorize states. + **/ +typedef enum { + ADMIN_FLAG_BIT_START = 8, + /** Flag indicating that I/O is draining */ + ADMIN_FLAG_BIT_DRAINING = ADMIN_FLAG_BIT_START, + /** Flag indicating a load operation */ + ADMIN_FLAG_BIT_LOADING, + /** Flag indicating that the next state will be a quiescent state */ + ADMIN_FLAG_BIT_QUIESCING, + /** Flag indicating that the state is quiescent */ + ADMIN_FLAG_BIT_QUIESCENT, + /** + * Flag indicating that an operation is in progress and so no other + * operation may be started. + **/ + ADMIN_FLAG_BIT_OPERATING, +} AdminFlagBit; + +/** + * The flags themselves. + **/ +typedef enum { + ADMIN_FLAG_DRAINING = (uint32_t) (1 << ADMIN_FLAG_BIT_DRAINING), + ADMIN_FLAG_LOADING = (uint32_t) (1 << ADMIN_FLAG_BIT_LOADING), + ADMIN_FLAG_QUIESCING = (uint32_t) (1 << ADMIN_FLAG_BIT_QUIESCING), + ADMIN_FLAG_QUIESCENT = (uint32_t) (1 << ADMIN_FLAG_BIT_QUIESCENT), + ADMIN_FLAG_OPERATING = (uint32_t) (1 << ADMIN_FLAG_BIT_OPERATING), +} AdminFlag; + +/** + * The state codes. 
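+ *
+ * Each code is an AdminType combined with zero or more AdminFlags, so the
+ * predicates defined below reduce to simple bit tests. For example,
+ * ADMIN_STATE_SAVING is ADMIN_TYPE_SAVE | ADMIN_FLAG_OPERATING
+ * | ADMIN_FLAG_DRAINING | ADMIN_FLAG_QUIESCING, so isDrainOperation() and
+ * isQuiescingCode() both hold for it, while isQuiescentCode() does not.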
+ **/ +typedef enum { + ADMIN_STATE_NORMAL_OPERATION = ADMIN_TYPE_NORMAL, + ADMIN_STATE_OPERATING = (ADMIN_TYPE_NORMAL + | ADMIN_FLAG_OPERATING), + ADMIN_STATE_FORMATTING = (ADMIN_TYPE_FORMAT + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_LOADING), + ADMIN_STATE_LOADING = (ADMIN_TYPE_NORMAL + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_LOADING), + ADMIN_STATE_LOADING_FOR_RECOVERY = (ADMIN_TYPE_RECOVER + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_LOADING), + ADMIN_STATE_LOADING_FOR_REBUILD = (ADMIN_TYPE_REBUILD + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_LOADING), + ADMIN_STATE_WAITING_FOR_RECOVERY = (ADMIN_TYPE_RECOVER + | ADMIN_FLAG_OPERATING), + ADMIN_STATE_NEW = (ADMIN_TYPE_NORMAL + | ADMIN_FLAG_QUIESCENT), + ADMIN_STATE_RECOVERING = (ADMIN_TYPE_RECOVER + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_DRAINING), + ADMIN_STATE_REBUILDING = (ADMIN_TYPE_REBUILD + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_DRAINING), + ADMIN_STATE_SAVING = (ADMIN_TYPE_SAVE + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_DRAINING + | ADMIN_FLAG_QUIESCING), + ADMIN_STATE_SAVED = (ADMIN_TYPE_SAVE + | ADMIN_FLAG_QUIESCENT), + ADMIN_STATE_SCRUBBING = (ADMIN_TYPE_SCRUB + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_DRAINING + | ADMIN_FLAG_LOADING), + ADMIN_STATE_SAVE_FOR_SCRUBBING = (ADMIN_TYPE_SCRUB + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_DRAINING), + ADMIN_STATE_SUSPENDING = (ADMIN_TYPE_SUSPEND + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_DRAINING + | ADMIN_FLAG_QUIESCING), + ADMIN_STATE_SUSPENDED = (ADMIN_TYPE_SUSPEND + | ADMIN_FLAG_QUIESCENT), + ADMIN_STATE_SUSPENDED_OPERATION = (ADMIN_TYPE_SUSPEND + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_QUIESCENT), + ADMIN_STATE_RESUMING = (ADMIN_TYPE_RESUME + | ADMIN_FLAG_OPERATING + | ADMIN_FLAG_QUIESCENT), +} AdminStateCode; + +typedef struct { + /** The current administrative state */ + AdminStateCode state; + /** The next administrative state (when the current operation finishes */ + AdminStateCode nextState; + /** A completion waiting on a state change */ + VDOCompletion *waiter; + /** Whether an operation is being initiated */ + bool starting; + /** Whether an operation has completed in the initiator */ + bool complete; +} AdminState; + +/** + * A method to be called once an admin operation may be initiated. + **/ +typedef void AdminInitiator(AdminState *state); + +/** + * Get the name of an AdminStateCode for logging purposes. + * + * @param code The AdminStateCode + * + * @return The name of the state's code + **/ +const char *getAdminStateCodeName(AdminStateCode code) + __attribute__((warn_unused_result)); + +/** + * Get the name of an AdminState's code for logging purposes. + * + * @param state The AdminState + * + * @return The name of the state's code + **/ +const char *getAdminStateName(const AdminState *state) + __attribute__((warn_unused_result)); + +/** + * Check whether an AdminState is in normal operation. + * + * @param state The AdminState to query + * + * @return true if the state is normal + **/ +__attribute__((warn_unused_result)) +static inline bool isNormal(AdminState *state) +{ + return ((state->state & ADMIN_TYPE_MASK) == ADMIN_TYPE_NORMAL); +} + +/** + * Check whether an AdminStateCode is an operation. + * + * @param code The code to check + * + * @return true if the code is an operation + **/ +__attribute__((warn_unused_result)) +static inline bool isOperation(AdminStateCode code) +{ + return ((code & ADMIN_FLAG_OPERATING) == ADMIN_FLAG_OPERATING); +} + +/** + * Check whether an AdminState is operating. 
+ * + * @param state The AdminState to query + * + * @return true if the state is operating + **/ +__attribute__((warn_unused_result)) +static inline bool isOperating(AdminState *state) +{ + return isOperation(state->state); +} + +/** + * Check whether an AdminState is suspending. + * + * @param state The AdminState to query + * + * @return true if the state is suspending + **/ +__attribute__((warn_unused_result)) +static inline bool isSuspending(AdminState *state) +{ + return (state->state == ADMIN_STATE_SUSPENDING); +} + +/** + * Check whether an AdminState is suspended. + * + * @param state The AdminState to query + * + * @return true if the state is suspended + **/ +__attribute__((warn_unused_result)) +static inline bool isSuspended(AdminState *state) +{ + return (state->state == ADMIN_STATE_SUSPENDED); +} + +/** + * Check whether an AdminState is saving. + * + * @param state The AdminState to query + * + * @return true if the state is saving + **/ +__attribute__((warn_unused_result)) +static inline bool isSaving(AdminState *state) +{ + return (state->state == ADMIN_STATE_SAVING); +} + +/** + * Check whether an AdminState is saved. + * + * @param state The AdminState to query + * + * @return true if the state is saved + **/ +__attribute__((warn_unused_result)) +static inline bool isSaved(AdminState *state) +{ + return (state->state == ADMIN_STATE_SAVED); +} + +/** + * Check whether an AdminStateCode is a drain operation. + * + * @param code The AdminStateCode to check + * + * @return true if the code is for a drain operation + **/ +__attribute__((warn_unused_result)) +static inline bool isDrainOperation(AdminStateCode code) +{ + return ((code & ADMIN_FLAG_DRAINING) == ADMIN_FLAG_DRAINING); +} + +/** + * Check whether an AdminState is draining. + * + * @param state The AdminState to query + * + * @return true if the state is draining + **/ +__attribute__((warn_unused_result)) +static inline bool isDraining(AdminState *state) +{ + return isDrainOperation(state->state); +} + +/** + * Check whether an AdminStateCode is a load operation. + * + * @param code The AdminStateCode to check + * + * @return true if the code is for a load operation + **/ +__attribute__((warn_unused_result)) +static inline bool isLoadOperation(AdminStateCode code) +{ + return ((code & ADMIN_FLAG_LOADING) == ADMIN_FLAG_LOADING); +} + +/** + * Check whether an AdminState is loading. + * + * @param state The AdminState to query + * + * @return true if the state is loading + **/ +__attribute__((warn_unused_result)) +static inline bool isLoading(AdminState *state) +{ + return isLoadOperation(state->state); +} + +/** + * Check whether an AdminStateCode is a resume operation. + * + * @param code The AdminStateCode to check + * + * @return true if the code is for a resume operation + **/ +__attribute__((warn_unused_result)) +static inline bool isResumeOperation(AdminStateCode code) +{ + return ((code & ADMIN_TYPE_MASK) == ADMIN_TYPE_RESUME); +} + +/** + * Check whether an AdminState is resumeing. + * + * @param state The AdminState to query + * + * @return true if the state is resumeing + **/ +__attribute__((warn_unused_result)) +static inline bool isResuming(AdminState *state) +{ + return isResumeOperation(state->state); +} + +/** + * Check whether an AdminState is doing a clean load. 
+ * + * @param state The AdminState to query + * + * @return true if the state is a clean load + **/ +__attribute__((warn_unused_result)) +static inline bool isCleanLoad(AdminState *state) +{ + return ((state->state == ADMIN_STATE_FORMATTING) + || (state->state == ADMIN_STATE_LOADING)); +} + +/** + * Check whether an AdminStateCode is quiescing. + * + * param code The AdminStateCode to check + * + * @return true is the state is quiescing + **/ +__attribute__((warn_unused_result)) +static inline bool isQuiescingCode(AdminStateCode code) +{ + return ((code & ADMIN_FLAG_QUIESCING) == ADMIN_FLAG_QUIESCING); +} + +/** + * Check whether an AdminState is quiescing. + * + * @param state The AdminState to check + * + * @return true if the state is quiescing + **/ +__attribute__((warn_unused_result)) +static inline bool isQuiescing(AdminState *state) +{ + return isQuiescingCode(state->state); +} + +/** + * Check where an AdminStateCode is quiescent. + * + * param code The AdminStateCode to check + * + * @return true is the state is quiescent + **/ +__attribute__((warn_unused_result)) +static inline bool isQuiescentCode(AdminStateCode code) +{ + return ((code & ADMIN_FLAG_QUIESCENT) == ADMIN_FLAG_QUIESCENT); +} + +/** + * Check whether an AdminState is quiescent. + * + * @param state The AdminState to query + * + * @return true is the state is quiescent + **/ +__attribute__((warn_unused_result)) +static inline bool isQuiescent(AdminState *state) +{ + return isQuiescentCode(state->state); +} + +/** + * Check whether an AdminStateCode is a quiescent operation. + * + * @param code The code to check + * + * @return true if the code is a quiescent operation + **/ +__attribute__((warn_unused_result)) +static inline bool isQuiescentOperation(AdminStateCode code) +{ + return (isQuiescentCode(code) && isOperation(code)); +} + +/** + * Check that an operation is a drain. + * + * @param operation The operation to check + * @param waiter The completion to finish with an error if the operation is + * not a drain + * + * @return true if the specified operation is a drain + **/ +bool assertDrainOperation(AdminStateCode operation, VDOCompletion *waiter) + __attribute__((warn_unused_result)); + +/** + * Initiate a drain operation if the current state permits it. + * + * @param state The AdminState + * @param operation The type of drain to initiate + * @param waiter The completion to notify when the drain is complete; may + * be NULL + * @param initiator The AdminInitiator to call if the operation may begin; may + * be NULL + * + * @return true if the drain was initiated, if not the waiter + * will be notified + **/ +bool startDraining(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator); + +/** + * Finish a drain operation if one was in progress. + * + * @param state The AdminState to query + * + * @return true if the state was draining; will notify the waiter + * if so + **/ +bool finishDraining(AdminState *state); + +/** + * Finish a drain operation with a status code. + * + * @param state The AdminState to query + * @param result The result of the drain operation + * + * @return true if the state was draining; will notify the + * waiter if so + **/ +bool finishDrainingWithResult(AdminState *state, int result); + +/** + * Check that an operation is a load. 
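+ *
+ * A hedged sketch of how this can pair with startLoading() below (the
+ * adminState and completion names are placeholders, not taken from this
+ * patch):
+ *
+ *   if (assertLoadOperation(operation, completion)) {
+ *     startLoading(&adminState, operation, completion, NULL);
+ *   }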
+ * + * @param operation The operation to check + * @param waiter The completion to finish with an error if the operation is + * not a load + * + * @return true if the specified operation is a load + **/ +bool assertLoadOperation(AdminStateCode operation, VDOCompletion *waiter) + __attribute__((warn_unused_result)); + +/** + * Initiate a load operation if the current state permits it. + * + * @param state The AdminState + * @param operation The type of load to initiate + * @param waiter The completion to notify when the load is complete; may be + * NULL + * @param initiator The AdminInitiator to call if the operation may begin; may + * be NULL + * + * @return true if the load was initiated, if not the waiter + * will be notified + **/ +bool startLoading(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator); + +/** + * Finish a load operation if one was in progress. + * + * @param state The AdminState to query + * + * @return true if the state was loading; will notify the waiter + * if so + **/ +bool finishLoading(AdminState *state); + +/** + * Finish a load operation with a status code. + * + * @param state The AdminState to query + * @param result The result of the load operation + * + * @return true if the state was loading; will notify the + * waiter if so + **/ +bool finishLoadingWithResult(AdminState *state, int result); + +/** + * Check whether an AdminStateCode is a resume operation. + * + * @param operation The operation to check + * @param waiter The completion to notify if the operation is not a resume + * operation; may be NULL + * + * @return true if the code is a resume operation + **/ +bool assertResumeOperation(AdminStateCode operation, VDOCompletion *waiter); + +/** + * Initiate a resume operation if the current state permits it. + * + * @param state The AdminState + * @param operation The type of resume to start + * @param waiter The completion to notify when the resume is complete; may + * be NULL + * @param initiator The AdminInitiator to call if the operation may begin; may + * be NULL + * + * @return true if the resume was initiated, if not the waiter + * will be notified + **/ +bool startResuming(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator); + +/** + * Finish a resume operation if one was in progress. + * + * @param state The AdminState to query + * + * @return true if the state was resuming; will notify the waiter + * if so + **/ +bool finishResuming(AdminState *state); + +/** + * Finish a resume operation with a status code. + * + * @param state The AdminState to query + * @param result The result of the resume operation + * + * @return true if the state was resuming; will notify the + * waiter if so + **/ +bool finishResumingWithResult(AdminState *state, int result); + +/** + * Change the state to normal operation if the current state is quiescent. + * + * @param state The AdminState to resume + * + * @return VDO_SUCCESS if the state resumed, VDO_INVALID_ADMIN_STATE otherwise + **/ +int resumeIfQuiescent(AdminState *state); + +/** + * Attempt to start an operation. + * + * @param state the AdminState + * @param operation the operation to start + * + * @return VDO_SUCCESS if the operation was started + * VDO_INVALID_ADMIN_STATE if not + **/ +int startOperation(AdminState *state, AdminStateCode operation); + +/** + * Attempt to start an operation. 
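+ *
+ * For comparison, a non-authoritative sketch using the waiter-less
+ * startOperation() declared above and finishOperation() declared below
+ * (the state variable is illustrative only):
+ *
+ *   if (startOperation(&state, ADMIN_STATE_RECOVERING) == VDO_SUCCESS) {
+ *     // ... do the recovery work ...
+ *     finishOperation(&state);
+ *   }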
+ * + * @param state the AdminState + * @param operation the operation to start + * @param waiter the completion to notify when the operation completes or + * fails to start; may be NULL + * @param initiator The AdminInitiator to call if the operation may begin; may + * be NULL + * + * @return true if the operation was started + **/ +bool startOperationWithWaiter(AdminState *state, + AdminStateCode operation, + VDOCompletion *waiter, + AdminInitiator *initiator); + +/** + * Finish the current operation. Will notify the operation waiter if there is + * one. This method should be used for operations started with + * startOperation(). For operations which were started with startDraining(), + * use finishDraining() instead. + * + * @param state The state whose operation is to be finished + * + * @return true if there was an operation to finish + **/ +bool finishOperation(AdminState *state); + +/** + * Finish the current operation with a status code. Will notify the operation + * waiter if there is one. + * + * @param state The state whose operation is to be finished + * @param result The result of the operation + **/ +bool finishOperationWithResult(AdminState *state, int result); + +/** + * Set a result for the current operation. + * + * @param state the AdminState + * @param result the result to set; if there is no waiter, this is a no-op + **/ +static inline void setOperationResult(AdminState *state, int result) +{ + if (state->waiter != NULL) { + setCompletionResult(state->waiter, result); + } +} + +#endif // ADMIN_STATE_H diff --git a/source/vdo/base/allocatingVIO.c b/source/vdo/base/allocatingVIO.c new file mode 100644 index 0000000..4e0ffa8 --- /dev/null +++ b/source/vdo/base/allocatingVIO.c @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/allocatingVIO.c#4 $ + */ + +#include "allocatingVIO.h" + +#include "logger.h" + +#include "allocationSelector.h" +#include "blockAllocator.h" +#include "dataVIO.h" +#include "pbnLock.h" +#include "slabDepot.h" +#include "types.h" +#include "vdoInternal.h" +#include "vioWrite.h" + +/** + * Make a single attempt to acquire a write lock on a newly-allocated PBN. 
+ * + * @param allocatingVIO The AllocatingVIO that wants a write lock for its + * newly allocated block + * + * @return VDO_SUCCESS or an error code + **/ +static int attemptPBNWriteLock(AllocatingVIO *allocatingVIO) +{ + assertInPhysicalZone(allocatingVIO); + + ASSERT_LOG_ONLY(allocatingVIO->allocationLock == NULL, + "must not acquire a lock while already referencing one"); + + PBNLock *lock; + int result = attemptPBNLock(allocatingVIO->zone, allocatingVIO->allocation, + allocatingVIO->writeLockType, &lock); + if (result != VDO_SUCCESS) { + return result; + } + + if (lock->holderCount > 0) { + // This block is already locked, which should be impossible. + return logErrorWithStringError(VDO_LOCK_ERROR, + "Newly allocated block %" PRIu64 + " was spuriously locked (holderCount=%u)", + allocatingVIO->allocation, + lock->holderCount); + } + + // We've successfully acquired a new lock, so mark it as ours. + lock->holderCount += 1; + allocatingVIO->allocationLock = lock; + assignProvisionalReference(lock); + return VDO_SUCCESS; +} + +/** + * Attempt to allocate and lock a physical block. If successful, continue + * along the write path. + * + * @param allocatingVIO The AllocatingVIO which needs an allocation + * + * @return VDO_SUCCESS or an error if a block could not be allocated + **/ +static int allocateAndLockBlock(AllocatingVIO *allocatingVIO) +{ + BlockAllocator *allocator = getBlockAllocator(allocatingVIO->zone); + int result = allocateBlock(allocator, &allocatingVIO->allocation); + if (result != VDO_SUCCESS) { + return result; + } + + result = attemptPBNWriteLock(allocatingVIO); + if (result != VDO_SUCCESS) { + return result; + } + + // We got a block! + VIO *vio = allocatingVIOAsVIO(allocatingVIO); + vio->physical = allocatingVIO->allocation; + allocatingVIO->allocationCallback(allocatingVIO); + return VDO_SUCCESS; +} + +static void allocateBlockForWrite(VDOCompletion *completion); + +/** + * Retry allocating a block for write. + * + * @param waiter The AllocatingVIO that was waiting to allocate + * @param context The context (unused) + **/ +static void +retryAllocateBlockForWrite(Waiter *waiter, + void *context __attribute__((unused))) +{ + AllocatingVIO *allocatingVIO = waiterAsAllocatingVIO(waiter); + allocateBlockForWrite(allocatingVIOAsCompletion(allocatingVIO)); +} + +/** + * Attempt to enqueue an AllocatingVIO to wait for a slab to be scrubbed in the + * current allocation zone. + * + * @param allocatingVIO The AllocatingVIO which wants to allocate a block + * + * @return VDO_SUCCESS if the AllocatingVIO was queued, VDO_NO_SPACE if there + * are no slabs to be scrubbed in the current zone, or some other + * error + **/ +static int waitForCleanSlab(AllocatingVIO *allocatingVIO) +{ + Waiter *waiter = allocatingVIOAsWaiter(allocatingVIO); + waiter->callback = retryAllocateBlockForWrite; + + BlockAllocator *allocator = getBlockAllocator(allocatingVIO->zone); + int result = enqueueForCleanSlab(allocator, waiter); + if (result != VDO_SUCCESS) { + return result; + } + + // We've successfully enqueued, when we come back, pretend like we've + // never tried this allocation before. + allocatingVIO->waitForCleanSlab = false; + allocatingVIO->allocationAttempts = 0; + return VDO_SUCCESS; +} + +/** + * Attempt to allocate a block in an AllocatingVIO's current allocation zone. 
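+ *
+ * In outline (a reading aid, not a normative restatement): each call makes
+ * one allocation attempt in the VIO's current zone and, on VDO_NO_SPACE,
+ * advances round-robin to the next physical zone. Once every zone has been
+ * tried, a second pass additionally tries to queue the VIO to wait for a
+ * scrubbed slab in each zone; only when that also fails everywhere is the
+ * allocation callback invoked with no allocation.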
+ * + * @param allocatingVIO The AllocatingVIO + * + * @return VDO_SUCCESS or an error + **/ +static int allocateBlockInZone(AllocatingVIO *allocatingVIO) +{ + allocatingVIO->allocationAttempts++; + int result = allocateAndLockBlock(allocatingVIO); + if (result != VDO_NO_SPACE) { + return result; + } + + if (allocatingVIO->waitForCleanSlab) { + result = waitForCleanSlab(allocatingVIO); + if (result != VDO_NO_SPACE) { + return result; + } + } + + VDO *vdo = getVDOFromAllocatingVIO(allocatingVIO); + const ThreadConfig *threadConfig = getThreadConfig(vdo); + if (allocatingVIO->allocationAttempts >= threadConfig->physicalZoneCount) { + if (allocatingVIO->waitForCleanSlab) { + // There were no free blocks in any zone, and no zone had slabs to + // scrub. + allocatingVIO->allocationCallback(allocatingVIO); + return VDO_SUCCESS; + } + + allocatingVIO->waitForCleanSlab = true; + allocatingVIO->allocationAttempts = 0; + } + + // Try the next zone + ZoneCount zoneNumber = getPhysicalZoneNumber(allocatingVIO->zone) + 1; + if (zoneNumber == threadConfig->physicalZoneCount) { + zoneNumber = 0; + } + allocatingVIO->zone = vdo->physicalZones[zoneNumber]; + launchPhysicalZoneCallback(allocatingVIO, allocateBlockForWrite, + THIS_LOCATION("$F;cb=allocBlockInZone")); + return VDO_SUCCESS; +} + +/** + * Attempt to allocate a block. This callback is registered in + * allocateDataBlock() and allocateBlockInZone(). + * + * @param completion The AllocatingVIO needing an allocation + **/ +static void allocateBlockForWrite(VDOCompletion *completion) +{ + AllocatingVIO *allocatingVIO = asAllocatingVIO(completion); + assertInPhysicalZone(allocatingVIO); + allocatingVIOAddTraceRecord(allocatingVIO, THIS_LOCATION(NULL)); + int result = allocateBlockInZone(allocatingVIO); + if (result != VDO_SUCCESS) { + setCompletionResult(completion, result); + allocatingVIO->allocationCallback(allocatingVIO); + } +} + +/**********************************************************************/ +void allocateDataBlock(AllocatingVIO *allocatingVIO, + AllocationSelector *selector, + PBNLockType writeLockType, + AllocationCallback *callback) +{ + allocatingVIO->writeLockType = writeLockType; + allocatingVIO->allocationCallback = callback; + allocatingVIO->allocationAttempts = 0; + allocatingVIO->allocation = ZERO_BLOCK; + + VIO *vio = allocatingVIOAsVIO(allocatingVIO); + allocatingVIO->zone + = vio->vdo->physicalZones[getNextAllocationZone(selector)]; + + launchPhysicalZoneCallback(allocatingVIO, allocateBlockForWrite, + THIS_LOCATION("$F;cb=allocDataBlock")); +} + +/**********************************************************************/ +void releaseAllocationLock(AllocatingVIO *allocatingVIO) +{ + assertInPhysicalZone(allocatingVIO); + PhysicalBlockNumber lockedPBN = allocatingVIO->allocation; + if (hasProvisionalReference(allocatingVIO->allocationLock)) { + allocatingVIO->allocation = ZERO_BLOCK; + } + + releasePBNLock(allocatingVIO->zone, lockedPBN, + &allocatingVIO->allocationLock); +} + +/**********************************************************************/ +void resetAllocation(AllocatingVIO *allocatingVIO) +{ + ASSERT_LOG_ONLY(allocatingVIO->allocationLock == NULL, + "must not reset allocation while holding a PBN lock"); + + allocatingVIOAsVIO(allocatingVIO)->physical = ZERO_BLOCK; + allocatingVIO->zone = NULL; + allocatingVIO->allocation = ZERO_BLOCK; + allocatingVIO->allocationAttempts = 0; + allocatingVIO->waitForCleanSlab = false; +} diff --git a/source/vdo/base/allocatingVIO.h b/source/vdo/base/allocatingVIO.h new file mode 
100644 index 0000000..a2f2b7b --- /dev/null +++ b/source/vdo/base/allocatingVIO.h @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/allocatingVIO.h#4 $ + */ + +#ifndef ALLOCATING_VIO_H +#define ALLOCATING_VIO_H + +#include "atomic.h" +#include "pbnLock.h" +#include "physicalZone.h" +#include "types.h" +#include "vio.h" +#include "waitQueue.h" + +typedef void AllocationCallback(AllocatingVIO *allocationVIO); + +/** + * A VIO which can receive an allocation from the block allocator. Currently, + * these are used both for servicing external data requests and for compressed + * block writes. + **/ +struct allocatingVIO { + /** The underlying VIO */ + VIO vio; + + /** The WaitQueue entry structure */ + Waiter waiter; + + /** The physical zone in which to allocate a physical block */ + PhysicalZone *zone; + + /** The block allocated to this VIO */ + PhysicalBlockNumber allocation; + + /** + * If non-NULL, the pooled PBN lock held on the allocated block. Must be a + * write lock until the block has been written, after which it will become a + * read lock. + **/ + PBNLock *allocationLock; + + /** The type of write lock to obtain on the allocated block */ + PBNLockType writeLockType; + + /** The number of zones in which this VIO has attempted an allocation */ + ZoneCount allocationAttempts; + + /** Whether this VIO should wait for a clean slab */ + bool waitForCleanSlab; + + /** The function to call once allocation is complete */ + AllocationCallback *allocationCallback; +}; + +/** + * Convert a VIO to an AllocatingVIO. + * + * @param vio The VIO to convert + * + * @return The VIO as an AllocatingVIO + **/ +static inline AllocatingVIO *vioAsAllocatingVIO(VIO *vio) +{ + STATIC_ASSERT(offsetof(AllocatingVIO, vio) == 0); + ASSERT_LOG_ONLY(((vio->type == VIO_TYPE_DATA) + || (vio->type == VIO_TYPE_COMPRESSED_BLOCK)), + "VIO is an AllocatingVIO"); + return (AllocatingVIO *) vio; +} + +/** + * Convert an AllocatingVIO to a VIO. + * + * @param allocatingVIO The AllocatingVIO to convert + * + * @return The AllocatingVIO as a VIO + **/ +static inline VIO *allocatingVIOAsVIO(AllocatingVIO *allocatingVIO) +{ + return &allocatingVIO->vio; +} + +/** + * Convert a generic VDOCompletion to an AllocatingVIO. + * + * @param completion The completion to convert + * + * @return The completion as an AllocatingVIO + **/ +static inline AllocatingVIO *asAllocatingVIO(VDOCompletion *completion) +{ + return vioAsAllocatingVIO(asVIO(completion)); +} + +/** + * Convert an AllocatingVIO to a generic completion. 
+ * + * @param allocatingVIO The AllocatingVIO to convert + * + * @return The AllocatingVIO as a completion + **/ +static inline +VDOCompletion *allocatingVIOAsCompletion(AllocatingVIO *allocatingVIO) +{ + return vioAsCompletion(allocatingVIOAsVIO(allocatingVIO)); +} + +/** + * Convert an AllocatingVIO to a generic wait queue entry. + * + * @param allocatingVIO The AllocatingVIO to convert + * + * @return The AllocatingVIO as a wait queue entry + **/ +static inline Waiter *allocatingVIOAsWaiter(AllocatingVIO *allocatingVIO) +{ + return &allocatingVIO->waiter; +} + +/** + * Convert an AllocatingVIO's generic wait queue entry back to the + * AllocatingVIO. + * + * @param waiter The wait queue entry to convert + * + * @return The wait queue entry as an AllocatingVIO + **/ +static inline AllocatingVIO *waiterAsAllocatingVIO(Waiter *waiter) +{ + if (waiter == NULL) { + return NULL; + } + + return + (AllocatingVIO *) ((uintptr_t) waiter - offsetof(AllocatingVIO, waiter)); +} + +/** + * Check whether an AllocatingVIO is a compressed block write. + * + * @param allocatingVIO The AllocatingVIO to check + * + * @return true if the AllocatingVIO is a compressed block write + **/ +static inline bool isCompressedWriteAllocatingVIO(AllocatingVIO *allocatingVIO) +{ + return isCompressedWriteVIO(allocatingVIOAsVIO(allocatingVIO)); +} + +/** + * Add a trace record for the current source location. + * + * @param allocatingVIO The AllocatingVIO structure to be updated + * @param location The source-location descriptor to be recorded + **/ +static inline void allocatingVIOAddTraceRecord(AllocatingVIO *allocatingVIO, + TraceLocation location) +{ + vioAddTraceRecord(allocatingVIOAsVIO(allocatingVIO), location); +} + +/** + * Get the VDO from an AllocatingVIO. + * + * @param allocatingVIO The AllocatingVIO from which to get the VDO + * + * @return The VDO to which an AllocatingVIO belongs + **/ +static inline VDO *getVDOFromAllocatingVIO(AllocatingVIO *allocatingVIO) +{ + return allocatingVIOAsVIO(allocatingVIO)->vdo; +} + +/** + * Check that an AllocatingVIO is running on the physical zone thread in + * which it did its allocation. + * + * @param allocatingVIO The AllocatingVIO in question + **/ +static inline void assertInPhysicalZone(AllocatingVIO *allocatingVIO) +{ + ThreadID expected = getPhysicalZoneThreadID(allocatingVIO->zone); + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((expected == threadID), + "AllocatingVIO for allocated physical block %" PRIu64 + " on thread %u, should be on thread %u", + allocatingVIO->allocation, threadID, expected); +} + +/** + * Set a callback as a physical block operation in an AllocatingVIO's allocated + * zone. + * + * @param allocatingVIO The AllocatingVIO + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setPhysicalZoneCallback(AllocatingVIO *allocatingVIO, + VDOAction *callback, + TraceLocation location) +{ + setCallback(allocatingVIOAsCompletion(allocatingVIO), callback, + getPhysicalZoneThreadID(allocatingVIO->zone)); + allocatingVIOAddTraceRecord(allocatingVIO, location); +} + +/** + * Set a callback as a physical block operation in an AllocatingVIO's allocated + * zone and invoke it immediately. 
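+ *
+ * For example, the allocation path in allocatingVIO.c launches its retry
+ * callback this way (mirroring the call in allocateBlockInZone()):
+ *
+ *   launchPhysicalZoneCallback(allocatingVIO, allocateBlockForWrite,
+ *                              THIS_LOCATION("$F;cb=allocBlockInZone"));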
+ * + * @param allocatingVIO The AllocatingVIO + * @param callback The callback to invoke + * @param location The tracing info for the call site + **/ +static inline void launchPhysicalZoneCallback(AllocatingVIO *allocatingVIO, + VDOAction *callback, + TraceLocation location) +{ + setPhysicalZoneCallback(allocatingVIO, callback, location); + invokeCallback(allocatingVIOAsCompletion(allocatingVIO)); +} + +/** + * Allocate a data block to an AllocatingVIO. + * + * @param allocatingVIO The AllocatingVIO which needs an allocation + * @param selector The allocation selector for deciding which physical + * zone to allocate from + * @param writeLockType The type of write lock to obtain on the block + * @param callback The function to call once the allocation is complete + **/ +void allocateDataBlock(AllocatingVIO *allocatingVIO, + AllocationSelector *selector, + PBNLockType writeLockType, + AllocationCallback *callback); + +/** + * Release the PBN lock on the allocated block. If the reference to the locked + * block is still provisional, it will be released as well. + * + * @param allocatingVIO The lock holder + **/ +void releaseAllocationLock(AllocatingVIO *allocatingVIO); + +/** + * Reset an AllocatingVIO after it has done an allocation. + * + * @param allocatingVIO The AllocatingVIO + **/ +void resetAllocation(AllocatingVIO *allocatingVIO); + +#endif // ALLOCATING_VIO_H diff --git a/source/vdo/base/allocationSelector.c b/source/vdo/base/allocationSelector.c new file mode 100644 index 0000000..e703d09 --- /dev/null +++ b/source/vdo/base/allocationSelector.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/allocationSelector.c#1 $ + */ + +#include "allocationSelector.h" +#include "allocationSelectorInternals.h" + +#include "memoryAlloc.h" + +#include "types.h" + +enum { + ALLOCATIONS_PER_ZONE = 128, +}; + +/**********************************************************************/ +int makeAllocationSelector(ZoneCount physicalZoneCount, + ThreadID threadID, + AllocationSelector **selectorPtr) +{ + AllocationSelector *selector; + int result = ALLOCATE(1, AllocationSelector, __func__, &selector); + if (result != VDO_SUCCESS) { + return result; + } + + *selector = (AllocationSelector) { + .nextAllocationZone = threadID % physicalZoneCount, + .lastPhysicalZone = physicalZoneCount - 1, + }; + + *selectorPtr = selector; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeAllocationSelector(AllocationSelector **selectorPtr) +{ + AllocationSelector *selector = *selectorPtr; + if (selector == NULL) { + return; + } + + FREE(selector); + *selectorPtr = NULL; +} + +/**********************************************************************/ +ZoneCount getNextAllocationZone(AllocationSelector *selector) +{ + if (selector->lastPhysicalZone > 0) { + if (selector->allocationCount < ALLOCATIONS_PER_ZONE) { + selector->allocationCount++; + } else { + selector->allocationCount = 1; + if (selector->nextAllocationZone < selector->lastPhysicalZone) { + selector->nextAllocationZone++; + } else { + selector->nextAllocationZone = 0; + } + } + } + + return selector->nextAllocationZone; +} diff --git a/source/vdo/base/allocationSelector.h b/source/vdo/base/allocationSelector.h new file mode 100644 index 0000000..7b922e9 --- /dev/null +++ b/source/vdo/base/allocationSelector.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/allocationSelector.h#1 $ + */ + +#ifndef ALLOCATION_SELECTOR_H +#define ALLOCATION_SELECTOR_H + +#include "completion.h" + +/** + * An AllocationSelector is used by any zone which does data block allocations. + * The selector is used to round-robin allocation requests to different + * physical zones. Currently, 128 allocations will be made to a given physical + * zone before switching to the next. + **/ + +/** + * Make a new allocation selector. 
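+ *
+ * A small usage sketch (error handling elided; the threadConfig, threadID,
+ * and zone variables are illustrative):
+ *
+ *   AllocationSelector *selector;
+ *   int result = makeAllocationSelector(threadConfig->physicalZoneCount,
+ *                                       threadID, &selector);
+ *   // Repeated calls return the same zone for 128 allocations before
+ *   // rotating to the next physical zone.
+ *   ZoneCount zone = getNextAllocationZone(selector);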
+ * + * @param [in] physicalZoneCount The number of physical zones + * @param [in] threadID The ID of the thread using this selector + * @param [out] selectorPtr A pointer to receive the new selector + * + * @return VDO_SUCCESS or an error + **/ +int makeAllocationSelector(ZoneCount physicalZoneCount, + ThreadID threadID, + AllocationSelector **selectorPtr) + __attribute__((warn_unused_result)); + +/** + * Free an AllocationSelector and null out the reference to it. + * + * @param selectorPtr A reference to the selector to free + **/ +void freeAllocationSelector(AllocationSelector **selectorPtr); + +/** + * Get number of the physical zone from which to allocate next. + * + * @param selector The selector to query + * + * @return The number of the physical zone from which to allocate + **/ +ZoneCount getNextAllocationZone(AllocationSelector *selector) + __attribute__((warn_unused_result)); + +#endif /* ALLOCATION_SELECTOR_H */ diff --git a/source/vdo/base/allocationSelectorInternals.h b/source/vdo/base/allocationSelectorInternals.h new file mode 100644 index 0000000..13df50f --- /dev/null +++ b/source/vdo/base/allocationSelectorInternals.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/allocationSelectorInternals.h#1 $ + */ + +#ifndef ALLOCATION_SELECTOR_INTERNALS_H +#define ALLOCATION_SELECTOR_INTERNALS_H + +#include "types.h" + +/** Structure used to select which physical zone to allocate from */ +struct allocationSelector { + /** The number of allocations done in the current zone */ + BlockCount allocationCount; + /** The physical zone to allocate from next */ + ZoneCount nextAllocationZone; + /** The number of the last physical zone */ + ZoneCount lastPhysicalZone; +}; + +#endif /* ALLOCATION_SELECTOR_INTERNALS_H */ diff --git a/source/vdo/base/atomic.h b/source/vdo/base/atomic.h new file mode 100644 index 0000000..93b7318 --- /dev/null +++ b/source/vdo/base/atomic.h @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/atomic.h#2 $ + */ + +#ifndef ATOMIC_H +#define ATOMIC_H + +#include "atomicDefs.h" +#include "compiler.h" +#include "typeDefs.h" + +#define ATOMIC_INITIALIZER(value) { (value) } + +typedef struct { + atomic_t value; +} __attribute__((aligned(4))) Atomic32; + +typedef struct { + atomic64_t value; +} __attribute__((aligned(8))) Atomic64; + +typedef struct { + Atomic32 value; +} __attribute__((aligned(4))) AtomicBool; + +/** + * Memory load operations that precede this fence will be prevented from + * changing order with any that follow this fence, by either the compiler or + * the CPU. This can be used to ensure that the load operations accessing + * the fields of a structure are not re-ordered so they actually take effect + * before a pointer to the structure is resolved. + **/ +static INLINE void loadFence(void) +{ + smp_rmb(); +} + +/** + * Memory store operations that precede this fence will be prevented from + * changing order with any that follow this fence, by either the compiler or + * the CPU. This can be used to ensure that the store operations initializing + * the fields of a structure are not re-ordered so they actually take effect + * after a pointer to the structure is published. + **/ +static INLINE void storeFence(void) +{ + smp_wmb(); +} + +/** + * Generate a full memory fence for the compiler and CPU. Load and store + * operations issued before the fence will not be re-ordered with operations + * issued after the fence. + **/ +static INLINE void memoryFence(void) +{ + smp_mb(); +} + +/** + * Access the value of a 32-bit atomic variable, ensuring that the load is not + * re-ordered by the compiler or CPU with any subsequent load operations. + * + * @param atom a pointer to the atomic variable to access + * + * @return the value that was in the atom at the moment it was accessed + **/ +static INLINE uint32_t atomicLoad32(const Atomic32 *atom) +{ + uint32_t value = atomic_read(&atom->value); + loadFence(); + return value; +} + +/** + * Access the value of a 64-bit atomic variable, ensuring that the memory load + * is not re-ordered by the compiler or CPU with any subsequent load + * operations. + * + * @param atom a pointer to the atomic variable to access + * + * @return the value that was in the atom at the moment it was accessed + **/ +static INLINE uint64_t atomicLoad64(const Atomic64 *atom) +{ + uint64_t value = atomic64_read(&atom->value); + loadFence(); + return value; +} + +/** + * Access the value of a boolean atomic variable, ensuring that the load is not + * re-ordered by the compiler or CPU with any subsequent load operations. + * + * @param atom a pointer to the atomic variable to access + * + * @return the value that was in the atom at the moment it was accessed + **/ +static INLINE bool atomicLoadBool(const AtomicBool *atom) +{ + return (atomicLoad32(&atom->value) > 0); +} + +/** + * Set the value of a 32-bit atomic variable, ensuring that the memory store + * operation is not re-ordered by the compiler or CPU with any preceding store + * operations. + * + * @param atom a pointer to the atomic variable to modify + * @param newValue the value to assign to the atomic variable + **/ +static INLINE void atomicStore32(Atomic32 *atom, uint32_t newValue) +{ + storeFence(); + atomic_set(&atom->value, newValue); +} + +/** + * Set the value of a 64-bit atomic variable, ensuring that the memory store + * operation is not re-ordered by the compiler or CPU with any preceding store + * operations. 
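+ *
+ * A hedged example of the publication pattern this supports (the record and
+ * its fields are hypothetical):
+ *
+ *   record->payload = value;          // plain store of the data
+ *   atomicStore64(&record->ready, 1); // fenced store publishes it
+ *
+ * A reader pairing this with atomicLoad64(&record->ready) should observe the
+ * payload once it sees the flag set.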
+ * + * @param atom a pointer to the atomic variable to modify + * @param newValue the value to assign to the atomic variable + **/ +static INLINE void atomicStore64(Atomic64 *atom, uint64_t newValue) +{ + storeFence(); + atomic64_set(&atom->value, newValue); +} + +/** + * Set the value of a boolean atomic variable, ensuring that the memory store + * operation is not re-ordered by the compiler or CPU with any preceding store + * operations. + * + * @param atom a pointer to the atomic variable to modify + * @param newValue the value to assign to the atomic variable + **/ +static INLINE void atomicStoreBool(AtomicBool *atom, bool newValue) +{ + atomicStore32(&atom->value, (newValue ? 1 : 0)); +} + +/** + * Add a 32-bit signed delta to a 32-bit atomic variable. + * + * @param atom a pointer to the atomic variable + * @param delta the value to be added (or subtracted) from the variable + * + * @return the new value of the atom after the add operation + **/ +static INLINE uint32_t atomicAdd32(Atomic32 *atom, int32_t delta) +{ + return atomic_add_return(delta, &atom->value); +} + +/** + * Add a 64-bit signed delta to a 64-bit atomic variable. + * + * @param atom a pointer to the atomic variable + * @param delta the value to be added (or subtracted) from the variable + * + * @return the new value of the atom after the add operation + **/ +static INLINE uint64_t atomicAdd64(Atomic64 *atom, int64_t delta) +{ + return atomic64_add_return(delta, &atom->value); +} + +/** + * Atomic 32-bit compare-and-swap. If the atom is identical to a required + * value, atomically replace it with the new value and return true, otherwise + * do nothing and return false. + * + * @param atom a pointer to the atomic variable + * @param requiredValue the value that must be present to perform the swap + * @param newValue the value to be swapped for the required value + * + * @return true if the atom was changed, false otherwise + **/ +static INLINE bool compareAndSwap32(Atomic32 *atom, + uint32_t requiredValue, + uint32_t newValue) +{ + /* + * Our initial implementation, for x86, effectively got a full + * memory barrier because of how "lock cmpxchg" operates. The + * atomic_cmpxchg interface provides for a full barrier *if* the + * exchange is done, but not necessarily if it is not. + * + * Do we need the full barrier always? We need to investigate that, + * as part of (eventually) converting to using that API directly. + * For now, play it safe, and ensure the same behavior on other + * architectures too. + */ +#ifndef __x86_64__ + smp_mb(); +#endif + int oldValue = atomic_cmpxchg(&atom->value, requiredValue, newValue); +#ifndef __x86_64__ + smp_mb(); +#endif + return requiredValue == (uint32_t) oldValue; +} + +/** + * Atomic 64-bit compare-and-swap. If the atom is identical to a required + * value, atomically replace it with the new value and return true, otherwise + * do nothing and return false. + * + * @param atom a pointer to the atomic variable + * @param requiredValue the value that must be present to perform the swap + * @param newValue the value to be swapped for the required value + * + * @return true if the atom was changed, false otherwise + **/ +static INLINE bool compareAndSwap64(Atomic64 *atom, + uint64_t requiredValue, + uint64_t newValue) +{ +#ifndef __x86_64__ + smp_mb(); +#endif + long oldValue = atomic64_cmpxchg(&atom->value, requiredValue, newValue); +#ifndef __x86_64__ + smp_mb(); +#endif + return requiredValue == (uint64_t) oldValue; +} + +/** + * Atomic boolean compare-and-swap. 
If the atom is identical to a required + * value, atomically replace it with the new value and return true, otherwise + * do nothing and return false. + * + * @param atom a pointer to the atomic variable + * @param requiredValue the value that must be present to perform the swap + * @param newValue the value to be swapped for the required value + * + * @return true if the atom was changed, false otherwise + **/ +static INLINE bool compareAndSwapBool(AtomicBool *atom, + bool requiredValue, + bool newValue) +{ + return compareAndSwap32(&atom->value, (requiredValue ? 1 : 0), + (newValue ? 1 : 0)); +} + +/** + * Access the value of a 32-bit atomic variable using relaxed memory order, + * without any compiler or CPU fences. + * + * @param atom a pointer to the atomic variable to access + * + * @return the value that was in the atom at the moment it was accessed + **/ +static INLINE uint32_t relaxedLoad32(const Atomic32 *atom) +{ + return atomic_read(&atom->value); +} + +/** + * Access the value of a 64-bit atomic variable using relaxed memory order, + * without any compiler or CPU fences. + * + * @param atom a pointer to the atomic variable to access + * + * @return the value that was in the atom at the moment it was accessed + **/ +static INLINE uint64_t relaxedLoad64(const Atomic64 *atom) +{ + return atomic64_read(&atom->value); +} + +/** + * Access the value of a boolean atomic variable using relaxed memory order, + * without any compiler or CPU fences. + * + * @param atom a pointer to the atomic variable to access + * + * @return the value that was in the atom at the moment it was accessed + **/ +static INLINE bool relaxedLoadBool(const AtomicBool *atom) +{ + return (relaxedLoad32(&atom->value) > 0); +} + +/** + * Set the value of a 32-bit atomic variable using relaxed memory order, + * without any compiler or CPU fences. + * + * @param atom a pointer to the atomic variable to modify + * @param newValue the value to assign to the atomic variable + **/ +static INLINE void relaxedStore32(Atomic32 *atom, uint32_t newValue) +{ + atomic_set(&atom->value, newValue); +} + +/** + * Set the value of a 64-bit atomic variable using relaxed memory order, + * without any compiler or CPU fences. + * + * @param atom a pointer to the atomic variable to modify + * @param newValue the value to assign to the atomic variable + **/ +static INLINE void relaxedStore64(Atomic64 *atom, uint64_t newValue) +{ + atomic64_set(&atom->value, newValue); +} + +/** + * Set the value of a boolean atomic variable using relaxed memory order, + * without any compiler or CPU fences. + * + * @param atom a pointer to the atomic variable to modify + * @param newValue the value to assign to the atomic variable + **/ +static INLINE void relaxedStoreBool(AtomicBool *atom, bool newValue) +{ + relaxedStore32(&atom->value, (newValue ? 1 : 0)); +} + +/** + * Non-atomically add a 32-bit signed delta to a 32-bit atomic variable, + * without any compiler or CPU fences. + * + * @param atom a pointer to the atomic variable + * @param delta the value to be added (or subtracted) from the variable + * + * @return the new value of the atom after the add operation + **/ +static INLINE uint32_t relaxedAdd32(Atomic32 *atom, int32_t delta) +{ + uint32_t newValue = (relaxedLoad32(atom) + delta); + relaxedStore32(atom, newValue); + return newValue; +} + +/** + * Non-atomically add a 64-bit signed delta to a 64-bit atomic variable, + * without any compiler or CPU fences. 
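+ *
+ * This is the flavor used later in this patch for statistics counters that
+ * tolerate relaxed ordering, for example in blockAllocator.c:
+ *
+ *   relaxedAdd64(&allocator->statistics.slabsOpened, 1);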
+ * + * @param atom a pointer to the atomic variable + * @param delta the value to be added (or subtracted) from the variable + * + * @return the new value of the atom after the add operation + **/ +static INLINE uint64_t relaxedAdd64(Atomic64 *atom, int64_t delta) +{ + uint64_t newValue = (relaxedLoad64(atom) + delta); + relaxedStore64(atom, newValue); + return newValue; +} + +#endif /* ATOMIC_H */ diff --git a/source/vdo/base/blockAllocator.c b/source/vdo/base/blockAllocator.c new file mode 100644 index 0000000..a1eaae4 --- /dev/null +++ b/source/vdo/base/blockAllocator.c @@ -0,0 +1,952 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockAllocator.c#22 $ + */ + +#include "blockAllocatorInternals.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "adminState.h" +#include "heap.h" +#include "numUtils.h" +#include "priorityTable.h" +#include "readOnlyNotifier.h" +#include "refCounts.h" +#include "slab.h" +#include "slabDepotInternals.h" +#include "slabIterator.h" +#include "slabJournalEraser.h" +#include "slabJournalInternals.h" +#include "slabScrubber.h" +#include "slabSummary.h" +#include "vdoRecovery.h" +#include "vio.h" +#include "vioPool.h" + +/** + * Assert that a block allocator function was called from the correct thread. + * + * @param threadID The allocator's thread id + * @param functionName The name of the function + **/ +static inline void assertOnAllocatorThread(ThreadID threadID, + const char *functionName) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() == threadID), + "%s called on correct thread", functionName); +} + +/** + * Get the priority for a slab in the allocator's slab queue. Slabs are + * essentially prioritized by an approximation of the number of free blocks in + * the slab so slabs with lots of free blocks with be opened for allocation + * before slabs that have few free blocks. + * + * @param slab The slab whose queue priority is desired + * + * @return the queue priority of the slab + **/ +static unsigned int calculateSlabPriority(Slab *slab) +{ + BlockCount freeBlocks = getSlabFreeBlockCount(slab); + + // Slabs that are completely full must be the only ones with the lowest + // priority: zero. + if (freeBlocks == 0) { + return 0; + } + + /* + * Slabs that have never been opened (empty, newly initialized, never been + * written to) have lower priority than previously opened slabs that have a + * signficant number of free blocks. This ranking causes VDO to avoid + * writing physical blocks for the first time until there are very few free + * blocks that have been previously written to. That policy makes VDO a + * better client of any underlying storage that is thinly-provisioned + * [VDOSTORY-123]. 
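+ *
+ * As a rough illustration (using the 2^23-data-block slab size discussed in
+ * the next comment): an unopened slab receives the reserved priority of
+ * about 23, so it outranks a previously written slab with only 1000 free
+ * blocks (priority 1 + logBaseTwo(1000) = 10), but it is outranked by a
+ * previously written slab that is at least roughly half free (priority 24
+ * or 25).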
+ */ + unsigned int unopenedSlabPriority = slab->allocator->unopenedSlabPriority; + if (isSlabJournalBlank(slab->journal)) { + return unopenedSlabPriority; + } + + /* + * For all other slabs, the priority is derived from the logarithm of the + * number of free blocks. Slabs with the same order of magnitude of free + * blocks have the same priority. With 2^23 blocks, the priority will range + * from 1 to 25. The reserved unopenedSlabPriority divides the range and is + * skipped by the logarithmic mapping. + */ + unsigned int priority = (1 + logBaseTwo(freeBlocks)); + return ((priority < unopenedSlabPriority) ? priority : priority + 1); +} + +/** + * Add a slab to the priority queue of slabs available for allocation. + * + * @param slab The slab to prioritize + **/ +static void prioritizeSlab(Slab *slab) +{ + ASSERT_LOG_ONLY(isRingEmpty(&slab->ringNode), + "a slab must not already be on a ring when prioritizing"); + slab->priority = calculateSlabPriority(slab); + priorityTableEnqueue(slab->allocator->prioritizedSlabs, slab->priority, + &slab->ringNode); +} + +/**********************************************************************/ +void registerSlabWithAllocator(BlockAllocator *allocator, Slab *slab) +{ + allocator->slabCount++; + allocator->lastSlab = slab->slabNumber; +} + +/** + * Get an iterator over all the slabs in the allocator. + * + * @param allocator The allocator + * + * @return An iterator over the allocator's slabs + **/ +static SlabIterator getSlabIterator(const BlockAllocator *allocator) +{ + return iterateSlabs(allocator->depot->slabs, allocator->lastSlab, + allocator->zoneNumber, allocator->depot->zoneCount); +} + +/** + * Notify a block allocator that the VDO has entered read-only mode. + * + * Implements ReadOnlyNotification. + * + * @param listener The block allocator + * @param parent The completion to notify in order to acknowledge the + * notification + **/ +static void notifyBlockAllocatorOfReadOnlyMode(void *listener, + VDOCompletion *parent) +{ + BlockAllocator *allocator = listener; + assertOnAllocatorThread(allocator->threadID, __func__); + SlabIterator iterator = getSlabIterator(allocator); + while (hasNextSlab(&iterator)) { + Slab *slab = nextSlab(&iterator); + abortSlabJournalWaiters(slab->journal); + } + + completeCompletion(parent); +} + +/**********************************************************************/ +int makeAllocatorPoolVIOs(PhysicalLayer *layer, + void *parent, + void *buffer, + VIO **vioPtr) +{ + return createVIO(layer, VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA, parent, + buffer, vioPtr); +} + +/** + * Allocate those component of the block allocator which are needed only at + * load time, not at format time. + * + * @param allocator The allocator + * @param layer The physical layer below this allocator + * @param vioPoolSize The VIO pool size + * + * @return VDO_SUCCESS or an error + **/ +static int allocateComponents(BlockAllocator *allocator, + PhysicalLayer *layer, + BlockCount vioPoolSize) +{ + /* + * If createVIO is NULL, the block allocator is only being used to format + * or audit the VDO. These only require the SuperBlock component, so we can + * just skip allocating all the memory needed for runtime components. 
+ */ + if (layer->createMetadataVIO == NULL) { + return VDO_SUCCESS; + } + + int result = registerReadOnlyListener(allocator->readOnlyNotifier, + allocator, + notifyBlockAllocatorOfReadOnlyMode, + allocator->threadID); + if (result != VDO_SUCCESS) { + return result; + } + + SlabDepot *depot = allocator->depot; + result = initializeEnqueueableCompletion(&allocator->completion, + BLOCK_ALLOCATOR_COMPLETION, layer); + if (result != VDO_SUCCESS) { + return result; + } + + allocator->summary = getSlabSummaryForZone(depot, allocator->zoneNumber); + + result = makeVIOPool(layer, vioPoolSize, allocator->threadID, + makeAllocatorPoolVIOs, NULL, &allocator->vioPool); + if (result != VDO_SUCCESS) { + return result; + } + + BlockCount slabJournalSize = depot->slabConfig.slabJournalBlocks; + result = makeSlabScrubber(layer, slabJournalSize, + allocator->readOnlyNotifier, + &allocator->slabScrubber); + if (result != VDO_SUCCESS) { + return result; + } + + // The number of data blocks is the maximum number of free blocks that could + // be used in calculateSlabPriority(). + BlockCount maxFreeBlocks = depot->slabConfig.dataBlocks; + unsigned int maxPriority = (2 + logBaseTwo(maxFreeBlocks)); + result = makePriorityTable(maxPriority, &allocator->prioritizedSlabs); + if (result != VDO_SUCCESS) { + return result; + } + + /* + * VDOSTORY-123 requires that we try to open slabs that already have + * allocated blocks in preference to slabs that have never been opened. For + * reasons we have not been able to fully understand, performance tests on + * SSD harvards have been very sensitive (50% reduction in test throughput) + * to very slight differences in the timing and locality of block + * allocation. Assigning a low priority to unopened slabs (maxPriority/2, + * say) would be ideal for the story, but anything less than a very high + * threshold (maxPriority - 1) hurts PMI results. + * + * This sets the free block threshold for preferring to open an unopened + * slab to the binary floor of 3/4ths the total number of datablocks in a + * slab, which will generally evaluate to about half the slab size, but + * avoids degenerate behavior in unit tests where the number of data blocks + * is artificially constrained to a power of two. 
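+ *
+ * A worked example (assuming logBaseTwo() is the floor base-2 logarithm and
+ * the 2^23-data-block slab size used as an example in
+ * calculateSlabPriority()): (maxFreeBlocks * 3) / 4 is 6291456, whose binary
+ * floor is 2^22, so unopenedSlabPriority becomes 1 + 22 = 23 against a
+ * maxPriority of 25.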
+ */ + allocator->unopenedSlabPriority = (1 + logBaseTwo((maxFreeBlocks * 3) / 4)); + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeBlockAllocator(SlabDepot *depot, + ZoneCount zoneNumber, + ThreadID threadID, + Nonce nonce, + BlockCount vioPoolSize, + PhysicalLayer *layer, + ReadOnlyNotifier *readOnlyNotifier, + BlockAllocator **allocatorPtr) +{ + + BlockAllocator *allocator; + int result = ALLOCATE(1, BlockAllocator, __func__, &allocator); + if (result != VDO_SUCCESS) { + return result; + } + + allocator->depot = depot; + allocator->zoneNumber = zoneNumber; + allocator->threadID = threadID; + allocator->nonce = nonce; + allocator->readOnlyNotifier = readOnlyNotifier; + initializeRing(&allocator->dirtySlabJournals); + + result = allocateComponents(allocator, layer, vioPoolSize); + if (result != VDO_SUCCESS) { + freeBlockAllocator(&allocator); + return result; + } + + *allocatorPtr = allocator; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeBlockAllocator(BlockAllocator **blockAllocatorPtr) +{ + BlockAllocator *allocator = *blockAllocatorPtr; + if (allocator == NULL) { + return; + } + + freeSlabScrubber(&allocator->slabScrubber); + freeVIOPool(&allocator->vioPool); + freePriorityTable(&allocator->prioritizedSlabs); + destroyEnqueueable(&allocator->completion); + FREE(allocator); + *blockAllocatorPtr = NULL; +} + +/**********************************************************************/ +int replaceVIOPool(BlockAllocator *allocator, + size_t size, + PhysicalLayer *layer) +{ + freeVIOPool(&allocator->vioPool); + return makeVIOPool(layer, size, allocator->threadID, makeAllocatorPoolVIOs, + NULL, &allocator->vioPool); +} + +/** + * Get the maximum number of data blocks that can be allocated. + * + * @param allocator The block allocator to query + * + * @return The number of data blocks that can be allocated + **/ +__attribute__((warn_unused_result)) +static inline BlockCount getDataBlockCount(const BlockAllocator *allocator) +{ + return (allocator->slabCount * allocator->depot->slabConfig.dataBlocks); +} + +/**********************************************************************/ +BlockCount getAllocatedBlocks(const BlockAllocator *allocator) +{ + return relaxedLoad64(&allocator->statistics.allocatedBlocks); +} + +/**********************************************************************/ +BlockCount getUnrecoveredSlabCount(const BlockAllocator *allocator) +{ + return getScrubberSlabCount(allocator->slabScrubber); +} + +/**********************************************************************/ +void queueSlab(Slab *slab) +{ + ASSERT_LOG_ONLY(isRingEmpty(&slab->ringNode), + "a requeued slab must not already be on a ring"); + BlockAllocator *allocator = slab->allocator; + BlockCount freeBlocks = getSlabFreeBlockCount(slab); + int result = ASSERT((freeBlocks <= allocator->depot->slabConfig.dataBlocks), + "rebuilt slab %u must have a valid free block count" + " (has %llu, expected maximum %llu)", + slab->slabNumber, freeBlocks, + allocator->depot->slabConfig.dataBlocks); + if (result != VDO_SUCCESS) { + enterReadOnlyMode(allocator->readOnlyNotifier, result); + return; + } + + if (isUnrecoveredSlab(slab)) { + registerSlabForScrubbing(allocator->slabScrubber, slab, false); + return; + } + + if (!isSlabResuming(slab)) { + // If the slab is resuming, we've already accounted for it here, so don't + // do it again. 
+ relaxedAdd64(&allocator->statistics.allocatedBlocks, -freeBlocks); + if (!isSlabJournalBlank(slab->journal)) { + relaxedAdd64(&allocator->statistics.slabsOpened, 1); + } + } + + // All slabs are kept in a priority queue for allocation. + prioritizeSlab(slab); +} + +/**********************************************************************/ +void adjustFreeBlockCount(Slab *slab, bool increment) +{ + BlockAllocator *allocator = slab->allocator; + // The sense of increment is reversed since allocations are being counted. + relaxedAdd64(&allocator->statistics.allocatedBlocks, (increment ? -1 : 1)); + + // The open slab doesn't need to be reprioritized until it is closed. + if (slab == allocator->openSlab) { + return; + } + + // The slab priority rarely changes; if no change, then don't requeue it. + if (slab->priority == calculateSlabPriority(slab)) { + return; + } + + // Reprioritize the slab to reflect the new free block count by removing it + // from the table and re-enqueuing it with the new priority. + priorityTableRemove(allocator->prioritizedSlabs, &slab->ringNode); + prioritizeSlab(slab); +} + +/** + * Allocate the next free physical block in a slab. + * + * The block allocated will have a provisional reference and the + * reference must be either confirmed with a subsequent call to + * incrementReferenceCount() or vacated with a subsequent call to + * decrementReferenceCount(). + * + * @param [in] slab The slab + * @param [out] blockNumberPtr A pointer to receive the allocated block number + * + * @return UDS_SUCCESS or an error code + **/ +static int allocateSlabBlock(Slab *slab, PhysicalBlockNumber *blockNumberPtr) +{ + PhysicalBlockNumber pbn; + int result = allocateUnreferencedBlock(slab->referenceCounts, &pbn); + if (result != VDO_SUCCESS) { + return result; + } + + adjustFreeBlockCount(slab, false); + + *blockNumberPtr = pbn; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int allocateBlock(BlockAllocator *allocator, + PhysicalBlockNumber *blockNumberPtr) +{ + if (allocator->openSlab != NULL) { + // Try to allocate the next block in the currently open slab. + int result = allocateSlabBlock(allocator->openSlab, blockNumberPtr); + if ((result == VDO_SUCCESS) || (result != VDO_NO_SPACE)) { + return result; + } + + // Put the exhausted open slab back into the priority table. + prioritizeSlab(allocator->openSlab); + } + + // Remove the highest priority slab from the priority table and make it + // the open slab. + allocator->openSlab + = slabFromRingNode(priorityTableDequeue(allocator->prioritizedSlabs)); + + if (isSlabJournalBlank(allocator->openSlab->journal)) { + relaxedAdd64(&allocator->statistics.slabsOpened, 1); + dirtyAllReferenceBlocks(allocator->openSlab->referenceCounts); + } else { + relaxedAdd64(&allocator->statistics.slabsReopened, 1); + } + + // Try allocating again. If we're out of space immediately after opening a + // slab, then every slab must be fully allocated. 
+ return allocateSlabBlock(allocator->openSlab, blockNumberPtr); +} + +/**********************************************************************/ +void releaseBlockReference(BlockAllocator *allocator, + PhysicalBlockNumber pbn, + const char *why) +{ + if (pbn == ZERO_BLOCK) { + return; + } + + Slab *slab = getSlab(allocator->depot, pbn); + ReferenceOperation operation = { + .type = DATA_DECREMENT, + .pbn = pbn, + }; + int result = modifySlabReferenceCount(slab, NULL, operation); + if (result != VDO_SUCCESS) { + logErrorWithStringError(result, + "Failed to release reference to %s " + "physical block %llu", + why, pbn); + } +} + +/** + * This is a HeapComparator function that orders SlabStatuses using the + * 'isClean' field as the primary key and the 'emptiness' field as the + * secondary key. + * + * Slabs need to be pushed onto the rings in the same order they are to be + * popped off. Popping should always get the most empty first, so pushing + * should be from most empty to least empty. Thus, the comparator order is + * the usual sense since Heap returns larger elements before smaller ones. + * + * @param item1 The first item to compare + * @param item2 The second item to compare + * + * @return 1 if the first item is cleaner or emptier than the second; + * 0 if the two items are equally clean and empty; + -1 otherwise + **/ +static int compareSlabStatuses(const void *item1, const void *item2) +{ + const SlabStatus *info1 = (const SlabStatus *) item1; + const SlabStatus *info2 = (const SlabStatus *) item2; + + if (info1->isClean != info2->isClean) { + return (info1->isClean ? 1 : -1); + } + if (info1->emptiness != info2->emptiness) { + return ((info1->emptiness > info2->emptiness) ? 1 : -1); + } + return ((info1->slabNumber < info2->slabNumber) ? 1 : -1); +} + +/** + * Swap two SlabStatus structures. Implements HeapSwapper. + **/ +static void swapSlabStatuses(void *item1, void *item2) +{ + SlabStatus *info1 = item1; + SlabStatus *info2 = item2; + SlabStatus temp = *info1; + *info1 = *info2; + *info2 = temp; +} + +/** + * Inform the allocator that a slab action has finished on some slab. This + * callback is registered in applyToSlabs(). + * + * @param completion The allocator completion + **/ +static void slabActionCallback(VDOCompletion *completion) +{ + BlockAllocator *allocator = container_of(completion, BlockAllocator, + completion); + SlabActor *actor = &allocator->slabActor; + if (--actor->slabActionCount == 0) { + actor->callback(completion); + return; + } + + resetCompletion(completion); +} + +/** + * Preserve the error from part of an administrative action and continue. + * + * @param completion The allocator completion + **/ +static void handleOperationError(VDOCompletion *completion) +{ + BlockAllocator *allocator = (BlockAllocator *) completion; + setOperationResult(&allocator->state, completion->result); + completion->callback(completion); +} + +/** + * Perform an administrative action on each of an allocator's slabs in + * parallel. + * + * @param allocator The allocator + * @param callback The method to call when the action is complete on every + * slab + **/ +static void applyToSlabs(BlockAllocator *allocator, VDOAction *callback) +{ + prepareCompletion(&allocator->completion, slabActionCallback, + handleOperationError, allocator->threadID, NULL); + allocator->completion.requeue = false; + + // Since we are going to dequeue all of the slabs, the open slab will become + // invalid, so clear it. 
+ allocator->openSlab = NULL; + + // Ensure that we don't finish before we're done starting. + allocator->slabActor = (SlabActor) { + .slabActionCount = 1, + .callback = callback, + }; + + SlabIterator iterator = getSlabIterator(allocator); + while (hasNextSlab(&iterator)) { + Slab *slab = nextSlab(&iterator); + unspliceRingNode(&slab->ringNode); + allocator->slabActor.slabActionCount++; + startSlabAction(slab, allocator->state.state, &allocator->completion); + } + + slabActionCallback(&allocator->completion); +} + +/** + * Inform the allocator that all load I/O has finished. + * + * @param completion The allocator completion + **/ +static void finishLoadingAllocator(VDOCompletion *completion) +{ + BlockAllocator *allocator = (BlockAllocator *) completion; + if (allocator->state.state == ADMIN_STATE_LOADING_FOR_RECOVERY) { + void *context = getCurrentActionContext(allocator->depot->actionManager); + replayIntoSlabJournals(allocator, completion, context); + return; + } + + finishLoading(&allocator->state); +} + +/** + * Initiate a load. + * + * Implements AdminInitiator. + **/ +static void initiateLoad(AdminState *state) +{ + BlockAllocator *allocator = container_of(state, BlockAllocator, state); + if (state->state == ADMIN_STATE_LOADING_FOR_REBUILD) { + prepareCompletion(&allocator->completion, finishLoadingAllocator, + handleOperationError, allocator->threadID, NULL); + eraseSlabJournals(allocator->depot, getSlabIterator(allocator), + &allocator->completion); + return; + } + + applyToSlabs(allocator, finishLoadingAllocator); +} + +/**********************************************************************/ +void loadBlockAllocator(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber); + startLoading(&allocator->state, + getCurrentManagerOperation(allocator->depot->actionManager), + parent, initiateLoad); +} + +/**********************************************************************/ +void notifySlabJournalsAreRecovered(BlockAllocator *allocator, int result) +{ + finishLoadingWithResult(&allocator->state, result); +} + +/**********************************************************************/ +int prepareSlabsForAllocation(BlockAllocator *allocator) +{ + relaxedStore64(&allocator->statistics.allocatedBlocks, + getDataBlockCount(allocator)); + + SlabDepot *depot = allocator->depot; + SlabCount slabCount = depot->slabCount; + + SlabStatus *slabStatuses; + int result = ALLOCATE(slabCount, SlabStatus, __func__, &slabStatuses); + if (result != VDO_SUCCESS) { + return result; + } + + getSummarizedSlabStatuses(allocator->summary, slabCount, slabStatuses); + + // Sort the slabs by cleanliness, then by emptiness hint. 
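+  // The heap is built in place over slabStatuses; popMaxHeapElement() then
+  // returns the cleanest, emptiest slabs first (see compareSlabStatuses()),
+  // so slabs are processed from most to least desirable.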
+  Heap heap;
+  initializeHeap(&heap, compareSlabStatuses, swapSlabStatuses,
+                 slabStatuses, slabCount, sizeof(SlabStatus));
+  buildHeap(&heap, slabCount);
+
+  SlabStatus currentSlabStatus;
+  while (popMaxHeapElement(&heap, &currentSlabStatus)) {
+    Slab *slab = depot->slabs[currentSlabStatus.slabNumber];
+    if (slab->allocator != allocator) {
+      continue;
+    }
+
+    if ((depot->loadType == REBUILD_LOAD)
+        || (!mustLoadRefCounts(allocator->summary, slab->slabNumber)
+            && currentSlabStatus.isClean)) {
+      queueSlab(slab);
+      continue;
+    }
+
+    markSlabUnrecovered(slab);
+    bool highPriority
+      = ((currentSlabStatus.isClean && (depot->loadType == NORMAL_LOAD))
+         || requiresScrubbing(slab->journal));
+    registerSlabForScrubbing(allocator->slabScrubber, slab, highPriority);
+  }
+  FREE(slabStatuses);
+
+  return VDO_SUCCESS;
+}
+
+/**********************************************************************/
+void prepareAllocatorToAllocate(void *context,
+                                ZoneCount zoneNumber,
+                                VDOCompletion *parent)
+{
+  BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber);
+  int result = prepareSlabsForAllocation(allocator);
+  if (result != VDO_SUCCESS) {
+    finishCompletion(parent, result);
+    return;
+  }
+
+  scrubHighPrioritySlabs(allocator->slabScrubber,
+                         isPriorityTableEmpty(allocator->prioritizedSlabs),
+                         parent, finishParentCallback, finishParentCallback);
+}
+
+/**********************************************************************/
+void registerNewSlabsForAllocator(void *context,
+                                  ZoneCount zoneNumber,
+                                  VDOCompletion *parent)
+{
+  BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber);
+  SlabDepot *depot = allocator->depot;
+  for (SlabCount i = depot->slabCount; i < depot->newSlabCount; i++) {
+    Slab *slab = depot->newSlabs[i];
+    if (slab->allocator == allocator) {
+      registerSlabWithAllocator(allocator, slab);
+    }
+  }
+  completeCompletion(parent);
+}
+
+/**
+ * Perform a step in draining the allocator. This method is its own callback.
+ *
+ * @param completion The allocator's completion
+ **/
+static void doDrainStep(VDOCompletion *completion)
+{
+  BlockAllocator *allocator = (BlockAllocator *) completion;
+  prepareForRequeue(&allocator->completion, doDrainStep, handleOperationError,
+                    allocator->threadID, NULL);
+  switch (++allocator->drainStep) {
+  case DRAIN_ALLOCATOR_STEP_SCRUBBER:
+    stopScrubbing(allocator->slabScrubber, completion);
+    return;
+
+  case DRAIN_ALLOCATOR_STEP_SLABS:
+    applyToSlabs(allocator, doDrainStep);
+    return;
+
+  case DRAIN_ALLOCATOR_STEP_SUMMARY:
+    drainSlabSummaryZone(allocator->summary, allocator->state.state,
+                         completion);
+    return;
+
+  case DRAIN_ALLOCATOR_STEP_FINISHED:
+    ASSERT_LOG_ONLY(!isVIOPoolBusy(allocator->vioPool), "VIO Pool not busy");
+    finishDrainingWithResult(&allocator->state, completion->result);
+    return;
+
+  default:
+    finishDrainingWithResult(&allocator->state, UDS_BAD_STATE);
+  }
+}
+
+/**
+ * Initiate a drain.
+ *
+ * Implements AdminInitiator.
+ **/ +static void initiateDrain(AdminState *state) +{ + BlockAllocator *allocator = container_of(state, BlockAllocator, state); + allocator->drainStep = DRAIN_ALLOCATOR_START; + doDrainStep(&allocator->completion); +} + +/**********************************************************************/ +void drainBlockAllocator(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber); + startDraining(&allocator->state, + getCurrentManagerOperation(allocator->depot->actionManager), + parent, initiateDrain); +} + +/** + * Perform a step in resuming a quiescent allocator. This method is its own + * callback. + * + * @param completion The allocator's completion + **/ +static void doResumeStep(VDOCompletion *completion) +{ + BlockAllocator *allocator = (BlockAllocator *) completion; + prepareForRequeue(&allocator->completion, doResumeStep, handleOperationError, + allocator->threadID, NULL); + switch (--allocator->drainStep) { + case DRAIN_ALLOCATOR_STEP_SUMMARY: + resumeSlabSummaryZone(allocator->summary, completion); + return; + + case DRAIN_ALLOCATOR_STEP_SLABS: + applyToSlabs(allocator, doResumeStep); + return; + + case DRAIN_ALLOCATOR_STEP_SCRUBBER: + resumeScrubbing(allocator->slabScrubber, completion); + return; + + case DRAIN_ALLOCATOR_START: + finishResumingWithResult(&allocator->state, completion->result); + return; + + default: + finishResumingWithResult(&allocator->state, UDS_BAD_STATE); + } +} + +/** + * Initiate a resume. + * + * Implements AdminInitiator. + **/ +static void initiateResume(AdminState *state) +{ + BlockAllocator *allocator = container_of(state, BlockAllocator, state); + allocator->drainStep = DRAIN_ALLOCATOR_STEP_FINISHED; + doResumeStep(&allocator->completion); +} + +/**********************************************************************/ +void resumeBlockAllocator(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber); + startResuming(&allocator->state, + getCurrentManagerOperation(allocator->depot->actionManager), + parent, initiateResume); +} + +/**********************************************************************/ +void releaseTailBlockLocks(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber); + RingNode *ring = &allocator->dirtySlabJournals; + while (!isRingEmpty(ring)) { + if (!releaseRecoveryJournalLock(slabJournalFromDirtyNode(ring->next), + allocator->depot->activeReleaseRequest)) { + break; + } + } + completeCompletion(parent); +} + +/**********************************************************************/ +SlabSummaryZone *getSlabSummaryZone(const BlockAllocator *allocator) +{ + return allocator->summary; +} + +/**********************************************************************/ +int acquireVIO(BlockAllocator *allocator, Waiter *waiter) +{ + return acquireVIOFromPool(allocator->vioPool, waiter); +} + +/**********************************************************************/ +void returnVIO(BlockAllocator *allocator, VIOPoolEntry *entry) +{ + returnVIOToPool(allocator->vioPool, entry); +} + +/**********************************************************************/ +void scrubAllUnrecoveredSlabsInZone(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber); + scrubSlabs(allocator->slabScrubber, allocator->depot, + 
notifyZoneFinishedScrubbing, noopCallback); + completeCompletion(parent); +} + +/**********************************************************************/ +int enqueueForCleanSlab(BlockAllocator *allocator, Waiter *waiter) +{ + return enqueueCleanSlabWaiter(allocator->slabScrubber, waiter); +} + +/**********************************************************************/ +void increaseScrubbingPriority(Slab *slab) +{ + registerSlabForScrubbing(slab->allocator->slabScrubber, slab, true); +} + +/**********************************************************************/ +void allocateFromAllocatorLastSlab(BlockAllocator *allocator) +{ + ASSERT_LOG_ONLY(allocator->openSlab == NULL, "mustn't have an open slab"); + Slab *lastSlab = allocator->depot->slabs[allocator->lastSlab]; + priorityTableRemove(allocator->prioritizedSlabs, &lastSlab->ringNode); + allocator->openSlab = lastSlab; +} + +/**********************************************************************/ +BlockAllocatorStatistics +getBlockAllocatorStatistics(const BlockAllocator *allocator) +{ + const AtomicAllocatorStatistics *atoms = &allocator->statistics; + return (BlockAllocatorStatistics) { + .slabCount = allocator->slabCount, + .slabsOpened = relaxedLoad64(&atoms->slabsOpened), + .slabsReopened = relaxedLoad64(&atoms->slabsReopened), + }; +} + +/**********************************************************************/ +SlabJournalStatistics getSlabJournalStatistics(const BlockAllocator *allocator) +{ + const AtomicSlabJournalStatistics *atoms = &allocator->slabJournalStatistics; + return (SlabJournalStatistics) { + .diskFullCount = atomicLoad64(&atoms->diskFullCount), + .flushCount = atomicLoad64(&atoms->flushCount), + .blockedCount = atomicLoad64(&atoms->blockedCount), + .blocksWritten = atomicLoad64(&atoms->blocksWritten), + .tailBusyCount = atomicLoad64(&atoms->tailBusyCount), + }; +} + +/**********************************************************************/ +RefCountsStatistics getRefCountsStatistics(const BlockAllocator *allocator) +{ + const AtomicRefCountStatistics *atoms = &allocator->refCountStatistics; + return (RefCountsStatistics) { + .blocksWritten = atomicLoad64(&atoms->blocksWritten), + }; +} + +/**********************************************************************/ +void dumpBlockAllocator(const BlockAllocator *allocator) +{ + unsigned int pauseCounter = 0; + logInfo("BlockAllocator zone %u", allocator->zoneNumber); + SlabIterator iterator = getSlabIterator(allocator); + while (hasNextSlab(&iterator)) { + dumpSlab(nextSlab(&iterator)); + + // Wait for a while after each batch of 32 slabs dumped, allowing the + // kernel log a chance to be flushed instead of being overrun. + if (pauseCounter++ == 31) { + pauseCounter = 0; + pauseForLogger(); + } + } + + dumpSlabScrubber(allocator->slabScrubber); +} diff --git a/source/vdo/base/blockAllocator.h b/source/vdo/base/blockAllocator.h new file mode 100644 index 0000000..cd8eb39 --- /dev/null +++ b/source/vdo/base/blockAllocator.h @@ -0,0 +1,299 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockAllocator.h#12 $ + */ + +#ifndef BLOCK_ALLOCATOR_H +#define BLOCK_ALLOCATOR_H + +#include "completion.h" +#include "fixedLayout.h" +#include "statistics.h" +#include "types.h" +#include "vioPool.h" +#include "waitQueue.h" + +/** + * Create a block allocator. + * + * @param [in] depot The slab depot for this allocator + * @param [in] zoneNumber The physical zone number for this allocator + * @param [in] threadID The thread ID for this allocator's zone + * @param [in] nonce The nonce of the VDO + * @param [in] vioPoolSize The size of the VIO pool + * @param [in] layer The physical layer below this allocator + * @param [in] readOnlyNotifier The context for entering read-only mode + * @param [out] allocatorPtr A pointer to hold the allocator + * + * @return A success or error code + **/ +int makeBlockAllocator(SlabDepot *depot, + ZoneCount zoneNumber, + ThreadID threadID, + Nonce nonce, + BlockCount vioPoolSize, + PhysicalLayer *layer, + ReadOnlyNotifier *readOnlyNotifier, + BlockAllocator **allocatorPtr) + __attribute__((warn_unused_result)); + +/** + * Destroy a block allocator and null out the reference to it. + * + * @param blockAllocatorPtr The reference to the allocator to destroy + **/ +void freeBlockAllocator(BlockAllocator **blockAllocatorPtr); + +/** + * Queue a slab for allocation or scrubbing. + * + * @param slab The slab to queue + **/ +void queueSlab(Slab *slab); + +/** + * Update the block allocator to reflect an increment or decrement of the free + * block count in a slab. This adjusts the allocated block count and + * reprioritizes the slab when appropriate. + * + * @param slab The slab whose free block count changed + * @param increment True if the free block count went up by one, + * false if it went down by one + **/ +void adjustFreeBlockCount(Slab *slab, bool increment); + +/** + * Allocate a physical block. + * + * The block allocated will have a provisional reference and the + * reference must be either confirmed with a subsequent call to + * incrementReferenceCount() or vacated with a subsequent call to + * decrementReferenceCount(). + * + * @param [in] allocator The block allocator + * @param [out] blockNumberPtr A pointer to receive the allocated block number + * + * @return UDS_SUCCESS or an error code + **/ +int allocateBlock(BlockAllocator *allocator, + PhysicalBlockNumber *blockNumberPtr) + __attribute__((warn_unused_result)); + +/** + * Release an unused provisional reference. + * + * @param allocator The block allocator + * @param pbn The block to dereference + * @param why Why the block was referenced (for logging) + **/ +void releaseBlockReference(BlockAllocator *allocator, + PhysicalBlockNumber pbn, + const char *why); + +/** + * Get the number of allocated blocks, which is the total number of + * blocks in all slabs that have a non-zero reference count. + * + * @param allocator The block allocator + * + * @return The number of blocks with a non-zero reference count + **/ +BlockCount getAllocatedBlocks(const BlockAllocator *allocator) + __attribute__((warn_unused_result)); + +/** + * Get the number of unrecovered slabs. 
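+ * An unrecovered slab is one still registered with the allocator's slab
+ * scrubber; it must be scrubbed before it can be prioritized for allocation.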
+ * + * @param allocator The block allocator + * + * @return The number of slabs that are unrecovered + **/ +BlockCount getUnrecoveredSlabCount(const BlockAllocator *allocator) + __attribute__((warn_unused_result)); + +/** + * Load the state of an allocator from disk. + * + *

Implements ZoneAction. + **/ +void loadBlockAllocator(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent); + +/** + * Inform a block allocator that its slab journals have been recovered from the + * recovery journal. + * + * @param allocator The allocator to inform + * @param result The result of the recovery operation + **/ +void notifySlabJournalsAreRecovered(BlockAllocator *allocator, int result); + +/** + * Prepare the block allocator to come online and start allocating blocks. + * + *

Implements ZoneAction. + **/ +void prepareAllocatorToAllocate(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent); + +/** + * Register a slab with the allocator, ready for use. + * + * @param allocator The allocator to use + * @param slab The slab in question + **/ +void registerSlabWithAllocator(BlockAllocator *allocator, Slab *slab); + +/** + * Register the new slabs belonging to this allocator. + * + *

Implements ZoneAction. + **/ +void registerNewSlabsForAllocator(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent); + +/** + * Drain all allocator I/O. Depending upon the type of drain, some or all + * dirty metadata may be written to disk. The type of drain will be determined + * from the state of the allocator's depot. + * + *

Implements ZoneAction. + **/ +void drainBlockAllocator(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent); + +/** + * Resume a quiescent allocator. + * + *

Implements ZoneAction. + **/ +void resumeBlockAllocator(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent); + +/** + * Request a commit of all dirty tail blocks which are locking a given recovery + * journal block. + * + *

Implements ZoneAction. + **/ +void releaseTailBlockLocks(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent); + +/** + * Get the slab summary zone for an allocator. + * + * @param allocator The allocator + * + * @return The SlabSummaryZone for that allocator + **/ +SlabSummaryZone *getSlabSummaryZone(const BlockAllocator *allocator) + __attribute__((warn_unused_result)); + +/** + * Acquire a VIO from a block allocator's VIO pool (asynchronous). + * + * @param allocator The allocator from which to get a VIO + * @param waiter The object requesting the VIO + * + * @return VDO_SUCCESS or an error + **/ +int acquireVIO(BlockAllocator *allocator, Waiter *waiter) + __attribute__((warn_unused_result)); + +/** + * Return a VIO to a block allocator's VIO pool + * + * @param allocator The block allocator which owns the VIO + * @param entry The VIO being returned + **/ +void returnVIO(BlockAllocator *allocator, VIOPoolEntry *entry); + +/** + * Initiate scrubbing all unrecovered slabs. + * + *

Implements ZoneAction. + **/ +void scrubAllUnrecoveredSlabsInZone(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent); + +/** + * Queue a waiter for a clean slab. + * + * @param allocator The allocator to wait on + * @param waiter The waiter + * + * @return VDO_SUCCESS if the waiter was queued, VDO_NO_SPACE if there are no + * slabs to scrub, and some other error otherwise + **/ +int enqueueForCleanSlab(BlockAllocator *allocator, Waiter *waiter) + __attribute__((warn_unused_result)); + +/** + * Increase the scrubbing priority of a slab. + * + * @param slab The slab + **/ +void increaseScrubbingPriority(Slab *slab); + +/** + * Get the statistics for this allocator. + * + * @param allocator The allocator to query + * + * @return A copy of the current statistics for the allocator + **/ +BlockAllocatorStatistics +getBlockAllocatorStatistics(const BlockAllocator *allocator) + __attribute__((warn_unused_result)); + +/** + * Get the aggregated slab journal statistics for the slabs in this allocator. + * + * @param allocator The allocator to query + * + * @return A copy of the current statistics for the allocator + **/ +SlabJournalStatistics getSlabJournalStatistics(const BlockAllocator *allocator) + __attribute__((warn_unused_result)); + +/** + * Get the cumulative RefCounts statistics for the slabs in this allocator. + * + * @param allocator The allocator to query + * + * @return A copy of the current statistics for the allocator + **/ +RefCountsStatistics getRefCountsStatistics(const BlockAllocator *allocator) + __attribute__((warn_unused_result)); + +/** + * Dump information about a block allocator to the log for debugging. + * + * @param allocator The allocator to dump + **/ +void dumpBlockAllocator(const BlockAllocator *allocator); + +#endif // BLOCK_ALLOCATOR_H diff --git a/source/vdo/base/blockAllocatorInternals.h b/source/vdo/base/blockAllocatorInternals.h new file mode 100644 index 0000000..83db684 --- /dev/null +++ b/source/vdo/base/blockAllocatorInternals.h @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockAllocatorInternals.h#11 $ + */ + +#ifndef BLOCK_ALLOCATOR_INTERNALS_H +#define BLOCK_ALLOCATOR_INTERNALS_H + +#include "adminState.h" +#include "atomic.h" +#include "blockAllocator.h" +#include "priorityTable.h" +#include "ringNode.h" +#include "slabScrubber.h" +#include "vioPool.h" + +enum { + /* + * The number of VIOs in the VIO pool is proportional to the throughput of + * the VDO. 
+   */
+  VIO_POOL_SIZE = 128,
+};
+
+typedef enum {
+  DRAIN_ALLOCATOR_START = 0,
+  DRAIN_ALLOCATOR_STEP_SCRUBBER,
+  DRAIN_ALLOCATOR_STEP_SLABS,
+  DRAIN_ALLOCATOR_STEP_SUMMARY,
+  DRAIN_ALLOCATOR_STEP_FINISHED,
+} BlockAllocatorDrainStep;
+
+/**
+ * A sub-structure for applying actions in parallel to all of an allocator's
+ * slabs.
+ **/
+typedef struct {
+  /** The number of slabs performing a slab action */
+  SlabCount slabActionCount;
+  /** The method to call when a slab action has been completed by all slabs */
+  VDOAction *callback;
+} SlabActor;
+
+/**
+ * These fields are only modified by the physical zone thread, but are queried
+ * by other threads.
+ **/
+typedef struct atomicAllocatorStatistics {
+  /** The count of allocated blocks in this zone */
+  Atomic64 allocatedBlocks;
+  /** The number of slabs from which blocks have ever been allocated */
+  Atomic64 slabsOpened;
+  /** The number of times since loading that a slab has been re-opened */
+  Atomic64 slabsReopened;
+} AtomicAllocatorStatistics;
+
+/**
+ * The statistics for all the slab journals in the slabs owned by this
+ * allocator. These fields are all mutated only by the physical zone thread,
+ * but are read by other threads when gathering statistics for the entire
+ * depot.
+ **/
+typedef struct atomicSlabJournalStatistics {
+  /** Number of times the on-disk journal was full */
+  Atomic64 diskFullCount;
+  /** Number of times an entry was added over the flush threshold */
+  Atomic64 flushCount;
+  /** Number of times an entry was added over the block threshold */
+  Atomic64 blockedCount;
+  /** Number of times the tail block was written */
+  Atomic64 blocksWritten;
+  /** Number of times we had to wait for the tail block commit */
+  Atomic64 tailBusyCount;
+} AtomicSlabJournalStatistics;
+
+/**
+ * The statistics for all the RefCounts in the slabs owned by this
+ * allocator. These fields are all mutated only by the physical zone thread,
+ * but are read by other threads when gathering statistics for the entire
+ * depot.
+ **/ +typedef struct atomicRefCountStatistics { + /** Number of blocks written */ + Atomic64 blocksWritten; +} AtomicRefCountStatistics; + +struct blockAllocator { + VDOCompletion completion; + /** The slab depot for this allocator */ + SlabDepot *depot; + /** The slab summary zone for this allocator */ + SlabSummaryZone *summary; + /** The notifier for entering read-only mode */ + ReadOnlyNotifier *readOnlyNotifier; + /** The nonce of the VDO */ + Nonce nonce; + /** The physical zone number of this allocator */ + ZoneCount zoneNumber; + /** The thread ID for this allocator's physical zone */ + ThreadID threadID; + /** The number of slabs in this allocator */ + SlabCount slabCount; + /** The number of the last slab owned by this allocator */ + SlabCount lastSlab; + /** The reduced priority level used to preserve unopened slabs */ + unsigned int unopenedSlabPriority; + /** The state of this allocator */ + AdminState state; + /** The actor for applying an action to all slabs */ + SlabActor slabActor; + + /** The slab from which blocks are currently being allocated */ + Slab *openSlab; + /** A priority queue containing all slabs available for allocation */ + PriorityTable *prioritizedSlabs; + /** The slab scrubber */ + SlabScrubber *slabScrubber; + /** What phase of the close operation the allocator is to perform */ + BlockAllocatorDrainStep drainStep; + /** Statistics for this block allocator */ + AtomicAllocatorStatistics statistics; + /** Cumulative statistics for the slab journals in this zone */ + AtomicSlabJournalStatistics slabJournalStatistics; + /** Cumulative statistics for the RefCounts in this zone */ + AtomicRefCountStatistics refCountStatistics; + + /** + * This is the head of a queue of slab journals which have entries in their + * tail blocks which have not yet started to commit. When the recovery + * journal is under space pressure, slab journals which have uncommitted + * entries holding a lock on the recovery journal head are forced to commit + * their blocks early. This list is kept in order, with the tail containing + * the slab journal holding the most recent recovery journal lock. + **/ + RingNode dirtySlabJournals; + + /** The VIO pool for reading and writing block allocator metadata */ + VIOPool *vioPool; +}; + +/** + * Construct allocator metadata VIOs. Exposed for unit tests. + * + * Implements VIOConstructor + **/ +int makeAllocatorPoolVIOs(PhysicalLayer *layer, + void *parent, + void *buffer, + VIO **vioPtr) + __attribute__((warn_unused_result)); + +/** + * Replace the VIO pool in a block allocator. This method exists for unit + * tests. + * + * @param allocator The block allocator + * @param size The number of entries in the pool + * @param layer The physical layer from which to allocate VIOs + * + * @return VDO_SUCCESS or an error + **/ +int replaceVIOPool(BlockAllocator *allocator, + size_t size, + PhysicalLayer *layer) + __attribute__((warn_unused_result)); + +/** + * Prepare slabs for allocation or scrubbing. This method is exposed for + * testing. + * + * @param allocator The allocator to prepare + * + * @return VDO_SUCCESS or an error code + **/ +int prepareSlabsForAllocation(BlockAllocator *allocator) + __attribute__((warn_unused_result)); + +/** + * Start allocating from the highest numbered slab. 
+ * + * @param allocator The allocator + **/ +void allocateFromAllocatorLastSlab(BlockAllocator *allocator); + +#endif // BLOCK_ALLOCATOR_INTERNALS_H diff --git a/source/vdo/base/blockMap.c b/source/vdo/base/blockMap.c new file mode 100644 index 0000000..9a13c30 --- /dev/null +++ b/source/vdo/base/blockMap.c @@ -0,0 +1,861 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMap.c#24 $ + */ + +#include "blockMap.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +#include "actionManager.h" +#include "adminState.h" +#include "blockMapInternals.h" +#include "blockMapPage.h" +#include "blockMapTree.h" +#include "constants.h" +#include "dataVIO.h" +#include "forest.h" +#include "numUtils.h" +#include "recoveryJournal.h" +#include "statusCodes.h" +#include "types.h" +#include "vdoInternal.h" +#include "vdoPageCache.h" + +typedef struct { + PhysicalBlockNumber flatPageOrigin; + BlockCount flatPageCount; + PhysicalBlockNumber rootOrigin; + BlockCount rootCount; +} __attribute__((packed)) BlockMapState2_0; + +static const Header BLOCK_MAP_HEADER_2_0 = { + .id = BLOCK_MAP, + .version = { + .majorVersion = 2, + .minorVersion = 0, + }, + .size = sizeof(BlockMapState2_0), +}; + +/** + * State associated which each block map page while it is in the VDO page + * cache. + **/ +typedef struct { + /** + * The earliest recovery journal block containing uncommitted updates to the + * block map page associated with this context. A reference (lock) is held + * on that block to prevent it from being reaped. When this value changes, + * the reference on the old value must be released and a reference on the + * new value must be acquired. + **/ + SequenceNumber recoveryLock; +} BlockMapPageContext; + +/** + * Implements VDOPageReadFunction. + **/ +static int validatePageOnRead(void *buffer, + PhysicalBlockNumber pbn, + BlockMapZone *zone, + void *pageContext) +{ + BlockMapPage *page = buffer; + BlockMapPageContext *context = pageContext; + Nonce nonce = zone->blockMap->nonce; + + BlockMapPageValidity validity = validateBlockMapPage(page, nonce, pbn); + if (validity == BLOCK_MAP_PAGE_BAD) { + return logErrorWithStringError(VDO_BAD_PAGE, + "Expected page %" PRIu64 + " but got page %llu instead", + pbn, getBlockMapPagePBN(page)); + } + + if (validity == BLOCK_MAP_PAGE_INVALID) { + formatBlockMapPage(page, nonce, pbn, false); + } + + context->recoveryLock = 0; + return VDO_SUCCESS; +} + +/** + * Handle journal updates and torn write protection. + * + * Implements VDOPageWriteFunction. 
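+ *
+ * @return true if the page must be written out again; false once the page's
+ *         references on the recovery journal have been released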
+ **/ +static bool handlePageWrite(void *rawPage, + BlockMapZone *zone, + void *pageContext) +{ + BlockMapPage *page = rawPage; + BlockMapPageContext *context = pageContext; + + if (markBlockMapPageInitialized(page, true)) { + // Cause the page to be re-written. + return true; + } + + // Release the page's references on the recovery journal. + releaseRecoveryJournalBlockReference(zone->blockMap->journal, + context->recoveryLock, + ZONE_TYPE_LOGICAL, zone->zoneNumber); + context->recoveryLock = 0; + return false; +} + +/**********************************************************************/ +PageCount computeBlockMapPageCount(BlockCount entries) +{ + return computeBucketCount(entries, BLOCK_MAP_ENTRIES_PER_PAGE); +} + +/**********************************************************************/ +int makeBlockMap(BlockCount logicalBlocks, + const ThreadConfig *threadConfig, + BlockCount flatPageCount, + PhysicalBlockNumber rootOrigin, + BlockCount rootCount, + BlockMap **mapPtr) +{ + STATIC_ASSERT(BLOCK_MAP_ENTRIES_PER_PAGE + == ((VDO_BLOCK_SIZE - sizeof(BlockMapPage)) + / sizeof(BlockMapEntry))); + + BlockMap *map; + int result = ALLOCATE_EXTENDED(BlockMap, threadConfig->logicalZoneCount, + BlockMapZone, __func__, &map); + if (result != UDS_SUCCESS) { + return result; + } + + map->flatPageCount = flatPageCount; + map->rootOrigin = rootOrigin; + map->rootCount = rootCount; + map->entryCount = logicalBlocks; + + ZoneCount zoneCount = threadConfig->logicalZoneCount; + for (ZoneCount zone = 0; zone < zoneCount; zone++) { + BlockMapZone *blockMapZone = &map->zones[zone]; + blockMapZone->zoneNumber = zone; + blockMapZone->threadID = getLogicalZoneThread(threadConfig, zone); + blockMapZone->blockMap = map; + map->zoneCount++; + } + + *mapPtr = map; + return VDO_SUCCESS; +} + +/** + * Decode block map component state version 2.0 from a buffer. 
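+ * The encoding is four unsigned 64-bit little-endian fields in order:
+ * flatPageOrigin, flatPageCount, rootOrigin, and rootCount, matching the
+ * packed BlockMapState2_0 layout.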
+ * + * @param buffer A buffer positioned at the start of the encoding + * @param state The state structure to receive the decoded values + * + * @return UDS_SUCCESS or an error code + **/ +static int decodeBlockMapState_2_0(Buffer *buffer, BlockMapState2_0 *state) +{ + size_t initialLength = contentLength(buffer); + + PhysicalBlockNumber flatPageOrigin; + int result = getUInt64LEFromBuffer(buffer, &flatPageOrigin); + if (result != UDS_SUCCESS) { + return result; + } + + BlockCount flatPageCount; + result = getUInt64LEFromBuffer(buffer, &flatPageCount); + if (result != UDS_SUCCESS) { + return result; + } + + PhysicalBlockNumber rootOrigin; + result = getUInt64LEFromBuffer(buffer, &rootOrigin); + if (result != UDS_SUCCESS) { + return result; + } + + BlockCount rootCount; + result = getUInt64LEFromBuffer(buffer, &rootCount); + if (result != UDS_SUCCESS) { + return result; + } + + *state = (BlockMapState2_0) { + .flatPageOrigin = flatPageOrigin, + .flatPageCount = flatPageCount, + .rootOrigin = rootOrigin, + .rootCount = rootCount, + }; + + size_t decodedSize = initialLength - contentLength(buffer); + return ASSERT(BLOCK_MAP_HEADER_2_0.size == decodedSize, + "decoded block map component size must match header size"); +} + +/**********************************************************************/ +int decodeBlockMap(Buffer *buffer, + BlockCount logicalBlocks, + const ThreadConfig *threadConfig, + BlockMap **mapPtr) +{ + Header header; + int result = decodeHeader(buffer, &header); + if (result != VDO_SUCCESS) { + return result; + } + + result = validateHeader(&BLOCK_MAP_HEADER_2_0, &header, true, __func__); + if (result != VDO_SUCCESS) { + return result; + } + + BlockMapState2_0 state; + result = decodeBlockMapState_2_0(buffer, &state); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(state.flatPageOrigin == BLOCK_MAP_FLAT_PAGE_ORIGIN, + "Flat page origin must be %u (recorded as %llu)", + BLOCK_MAP_FLAT_PAGE_ORIGIN, state.flatPageOrigin); + if (result != UDS_SUCCESS) { + return result; + } + + BlockMap *map; + result = makeBlockMap(logicalBlocks, threadConfig, + state.flatPageCount, state.rootOrigin, + state.rootCount, &map); + if (result != VDO_SUCCESS) { + return result; + } + + *mapPtr = map; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int decodeSodiumBlockMap(Buffer *buffer, + BlockCount logicalBlocks, + const ThreadConfig *threadConfig, + BlockMap **mapPtr) +{ + // Sodium uses state version 2.0. + return decodeBlockMap(buffer, logicalBlocks, threadConfig, mapPtr); +} + +/** + * Initialize the per-zone portions of the block map. 
+ * + * @param zone The zone to initialize + * @param layer The physical layer on which the zone resides + * @param readOnlyNotifier The read-only context for the VDO + * @param cacheSize The size of the page cache for the zone + * @param maximumAge The number of journal blocks before a dirtied page + * is considered old and must be written out + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int initializeBlockMapZone(BlockMapZone *zone, + PhysicalLayer *layer, + ReadOnlyNotifier *readOnlyNotifier, + PageCount cacheSize, + BlockCount maximumAge) +{ + zone->readOnlyNotifier = readOnlyNotifier; + int result = initializeTreeZone(zone, layer, maximumAge); + if (result != VDO_SUCCESS) { + return result; + } + + return makeVDOPageCache(layer, cacheSize, validatePageOnRead, + handlePageWrite, sizeof(BlockMapPageContext), + maximumAge, zone, &zone->pageCache); +} + +/**********************************************************************/ +BlockMapZone *getBlockMapZone(BlockMap *map, ZoneCount zoneNumber) +{ + return &map->zones[zoneNumber]; +} + +/** + * Get the ID of the thread on which a given block map zone operates. + * + *

Implements ZoneThreadGetter. + **/ +static ThreadID getBlockMapZoneThreadID(void *context, ZoneCount zoneNumber) +{ + return getBlockMapZone(context, zoneNumber)->threadID; +} + +/** + * Prepare for an era advance. + * + *

Implements ActionPreamble. + **/ +static void prepareForEraAdvance(void *context, VDOCompletion *parent) +{ + BlockMap *map = context; + map->currentEraPoint = map->pendingEraPoint; + completeCompletion(parent); +} + +/** + * Update the progress of the era in a zone. + * + *

Implements ZoneAction. + **/ +static void advanceBlockMapZoneEra(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + BlockMapZone *zone = getBlockMapZone(context, zoneNumber); + advanceVDOPageCachePeriod(zone->pageCache, zone->blockMap->currentEraPoint); + advanceZoneTreePeriod(&zone->treeZone, zone->blockMap->currentEraPoint); + finishCompletion(parent, VDO_SUCCESS); +} + +/** + * Schedule an era advance if necessary. This method should not be called + * directly. Rather, call scheduleDefaultAction() on the block map's action + * manager. + * + *

Implements ActionScheduler. + **/ +static bool scheduleEraAdvance(void *context) +{ + BlockMap *map = context; + if (map->currentEraPoint == map->pendingEraPoint) { + return false; + } + + return scheduleAction(map->actionManager, prepareForEraAdvance, + advanceBlockMapZoneEra, NULL, NULL); +} + +/**********************************************************************/ +int makeBlockMapCaches(BlockMap *map, + PhysicalLayer *layer, + ReadOnlyNotifier *readOnlyNotifier, + RecoveryJournal *journal, + Nonce nonce, + PageCount cacheSize, + BlockCount maximumAge) +{ + int result = ASSERT(cacheSize > 0, "block map cache size is specified"); + if (result != UDS_SUCCESS) { + return result; + } + + map->journal = journal; + map->nonce = nonce; + + result = makeForest(map, map->entryCount); + if (result != VDO_SUCCESS) { + return result; + } + + replaceForest(map); + for (ZoneCount zone = 0; zone < map->zoneCount; zone++) { + result = initializeBlockMapZone(&map->zones[zone], layer, readOnlyNotifier, + cacheSize / map->zoneCount, maximumAge); + if (result != VDO_SUCCESS) { + return result; + } + } + + return makeActionManager(map->zoneCount, getBlockMapZoneThreadID, + getRecoveryJournalThreadID(journal), map, + scheduleEraAdvance, layer, + &map->actionManager); +} + +/** + * Clean up a BlockMapZone. + * + * @param zone The zone to uninitialize + **/ +static void uninitializeBlockMapZone(BlockMapZone *zone) +{ + uninitializeBlockMapTreeZone(&zone->treeZone); + freeVDOPageCache(&zone->pageCache); +} + +/**********************************************************************/ +void freeBlockMap(BlockMap **mapPtr) +{ + BlockMap *map = *mapPtr; + if (map == NULL) { + return; + } + + for (ZoneCount zone = 0; zone < map->zoneCount; zone++) { + uninitializeBlockMapZone(&map->zones[zone]); + } + + abandonBlockMapGrowth(map); + freeForest(&map->forest); + freeActionManager(&map->actionManager); + + FREE(map); + *mapPtr = NULL; +} + +/**********************************************************************/ +size_t getBlockMapEncodedSize(void) +{ + return ENCODED_HEADER_SIZE + sizeof(BlockMapState2_0); +} + +/**********************************************************************/ +int encodeBlockMap(const BlockMap *map, Buffer *buffer) +{ + int result = encodeHeader(&BLOCK_MAP_HEADER_2_0, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + size_t initialLength = contentLength(buffer); + + result = putUInt64LEIntoBuffer(buffer, BLOCK_MAP_FLAT_PAGE_ORIGIN); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, map->flatPageCount); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, map->rootOrigin); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, map->rootCount); + if (result != UDS_SUCCESS) { + return result; + } + + size_t encodedSize = contentLength(buffer) - initialLength; + return ASSERT(BLOCK_MAP_HEADER_2_0.size == encodedSize, + "encoded block map component size must match header size"); +} + +/**********************************************************************/ +void initializeBlockMapFromJournal(BlockMap *map, RecoveryJournal *journal) +{ + map->currentEraPoint = getCurrentJournalSequenceNumber(journal); + map->pendingEraPoint = map->currentEraPoint; + + for (ZoneCount zone = 0; zone < map->zoneCount; zone++) { + setTreeZoneInitialPeriod(&map->zones[zone].treeZone, map->currentEraPoint); + setVDOPageCacheInitialPeriod(map->zones[zone].pageCache, + 
map->currentEraPoint); + } +} + +/**********************************************************************/ +ZoneCount computeLogicalZone(DataVIO *dataVIO) +{ + BlockMap *map = getBlockMap(getVDOFromDataVIO(dataVIO)); + TreeLock *treeLock = &dataVIO->treeLock; + PageNumber pageNumber = computePageNumber(dataVIO->logical.lbn); + treeLock->treeSlots[0].pageIndex = pageNumber; + treeLock->rootIndex = pageNumber % map->rootCount; + return (treeLock->rootIndex % map->zoneCount); +} + +/**********************************************************************/ +void findBlockMapSlotAsync(DataVIO *dataVIO, + VDOAction *callback, + ThreadID threadID) +{ + BlockMap *map = getBlockMap(getVDOFromDataVIO(dataVIO)); + if (dataVIO->logical.lbn >= map->entryCount) { + finishDataVIO(dataVIO, VDO_OUT_OF_RANGE); + return; + } + + TreeLock *treeLock = &dataVIO->treeLock; + BlockMapTreeSlot *slot = &treeLock->treeSlots[0]; + slot->blockMapSlot.slot = computeSlot(dataVIO->logical.lbn); + if (slot->pageIndex < map->flatPageCount) { + slot->blockMapSlot.pbn = slot->pageIndex + BLOCK_MAP_FLAT_PAGE_ORIGIN; + launchCallback(dataVIOAsCompletion(dataVIO), callback, threadID); + return; + } + + treeLock->callback = callback; + treeLock->threadID = threadID; + lookupBlockMapPBN(dataVIO); +} + +/**********************************************************************/ +PageCount getNumberOfFixedBlockMapPages(const BlockMap *map) +{ + return (map->flatPageCount + map->rootCount); +} + +/**********************************************************************/ +BlockCount getNumberOfBlockMapEntries(const BlockMap *map) +{ + return map->entryCount; +} + +/**********************************************************************/ +void advanceBlockMapEra(BlockMap *map, SequenceNumber recoveryBlockNumber) +{ + if (map == NULL) { + return; + } + + map->pendingEraPoint = recoveryBlockNumber; + scheduleDefaultAction(map->actionManager); +} + +/**********************************************************************/ +void checkForDrainComplete(BlockMapZone *zone) +{ + if (isDraining(&zone->state) + && !isTreeZoneActive(&zone->treeZone) + && !isPageCacheActive(zone->pageCache)) { + finishDrainingWithResult(&zone->state, + (isReadOnly(zone->readOnlyNotifier) + ? VDO_READ_ONLY : VDO_SUCCESS)); + } +} + +/** + * Initiate a drain of the trees and page cache of a block map zone. + * + * Implements AdminInitiator + **/ +static void initiateDrain(AdminState *state) +{ + BlockMapZone *zone = container_of(state, BlockMapZone, state); + drainZoneTrees(&zone->treeZone); + drainVDOPageCache(zone->pageCache); + checkForDrainComplete(zone); +} + +/** + * Drain a zone of the block map. + * + *

Implements ZoneAction. + **/ +static void drainZone(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + BlockMapZone *zone = getBlockMapZone(context, zoneNumber); + startDraining(&zone->state, + getCurrentManagerOperation(zone->blockMap->actionManager), + parent, initiateDrain); +} + +/**********************************************************************/ +void drainBlockMap(BlockMap *map, + AdminStateCode operation, + VDOCompletion *parent) +{ + scheduleOperation(map->actionManager, operation, NULL, drainZone, NULL, + parent); +} + +/** + * Resume a zone of the block map. + * + *

Implements ZoneAction. + **/ +static void resumeBlockMapZone(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + BlockMapZone *zone = getBlockMapZone(context, zoneNumber); + finishCompletion(parent, resumeIfQuiescent(&zone->state)); +} + +/**********************************************************************/ +void resumeBlockMap(BlockMap *map, VDOCompletion *parent) +{ + scheduleOperation(map->actionManager, ADMIN_STATE_RESUMING, NULL, + resumeBlockMapZone, NULL, parent); +} + +/**********************************************************************/ +int prepareToGrowBlockMap(BlockMap *map, BlockCount newLogicalBlocks) +{ + if (map->nextEntryCount == newLogicalBlocks) { + return VDO_SUCCESS; + } + + if (map->nextEntryCount > 0) { + abandonBlockMapGrowth(map); + } + + if (newLogicalBlocks < map->entryCount) { + map->nextEntryCount = map->entryCount; + return VDO_SUCCESS; + } + + return makeForest(map, newLogicalBlocks); +} + +/**********************************************************************/ +BlockCount getNewEntryCount(BlockMap *map) +{ + return map->nextEntryCount; +} + +/** + * Grow the block map by replacing the forest with the one which was prepared. + * + * Implements ActionPreamble + **/ +static void growForest(void *context, VDOCompletion *completion) +{ + replaceForest(context); + completeCompletion(completion); +} + +/**********************************************************************/ +void growBlockMap(BlockMap *map, VDOCompletion *parent) +{ + scheduleOperation(map->actionManager, ADMIN_STATE_SUSPENDED_OPERATION, + growForest, NULL, NULL, parent); +} + +/**********************************************************************/ +void abandonBlockMapGrowth(BlockMap *map) +{ + abandonForest(map); +} + +/** + * Finish processing a block map get or put operation. This function releases + * the page completion and then continues the requester. + * + * @param completion The completion for the page fetch + * @param result The result of the block map operation + **/ +static inline void finishProcessingPage(VDOCompletion *completion, int result) +{ + VDOCompletion *parent = completion->parent; + releaseVDOPageCompletion(completion); + continueCompletion(parent, result); +} + +/** + * Handle an error fetching a page from the cache. This error handler is + * registered in setupMappedBlock(). + * + * @param completion The page completion which got an error + **/ +static void handlePageError(VDOCompletion *completion) +{ + finishProcessingPage(completion, completion->result); +} + +/** + * Get the mapping page for a get/put mapped block operation and dispatch to + * the appropriate handler. + * + * @param dataVIO The dataVIO + * @param modifiable Whether we intend to modify the mapping + * @param action The handler to process the mapping page + **/ +static void setupMappedBlock(DataVIO *dataVIO, + bool modifiable, + VDOAction *action) +{ + BlockMapZone *zone = getBlockMapForZone(dataVIO->logical.zone); + if (isDraining(&zone->state)) { + finishDataVIO(dataVIO, VDO_SHUTTING_DOWN); + return; + } + + initVDOPageCompletion(&dataVIO->pageCompletion, zone->pageCache, + dataVIO->treeLock.treeSlots[0].blockMapSlot.pbn, + modifiable, dataVIOAsCompletion(dataVIO), action, + handlePageError); + getVDOPageAsync(&dataVIO->pageCompletion.completion); +} + +/** + * Decode and validate a block map entry and attempt to use it to set the + * mapped location of a DataVIO. 
+ * + * @param dataVIO The DataVIO to update with the map entry + * @param entry The block map entry for the logical block + * + * @return VDO_SUCCESS or VDO_BAD_MAPPING if the map entry is invalid + * or an error code for any other failure + **/ +__attribute__((warn_unused_result)) +static int setMappedEntry(DataVIO *dataVIO, const BlockMapEntry *entry) +{ + // Unpack the PBN for logging purposes even if the entry is invalid. + DataLocation mapped = unpackBlockMapEntry(entry); + + if (isValidLocation(&mapped)) { + int result = setMappedLocation(dataVIO, mapped.pbn, mapped.state); + /* + * Return success and all errors not specifically known to be errors from + * validating the location. Yes, this expression is redundant; it is + * intentional. + */ + if ((result == VDO_SUCCESS) + || ((result != VDO_OUT_OF_RANGE) && (result != VDO_BAD_MAPPING))) { + return result; + } + } + + // Log the corruption even if we wind up ignoring it for write VIOs, + // converting all cases to VDO_BAD_MAPPING. + logErrorWithStringError(VDO_BAD_MAPPING, "PBN %" PRIu64 + " with state %u read from the block map was invalid", + mapped.pbn, mapped.state); + + // A read VIO has no option but to report the bad mapping--reading + // zeros would be hiding known data loss. + if (isReadDataVIO(dataVIO)) { + return VDO_BAD_MAPPING; + } + + // A write VIO only reads this mapping to decref the old block. Treat + // this as an unmapped entry rather than fail the write. + clearMappedLocation(dataVIO); + return VDO_SUCCESS; +} + +/** + * This callback is registered in getMappedBlockAsync(). + **/ +static void getMappingFromFetchedPage(VDOCompletion *completion) +{ + if (completion->result != VDO_SUCCESS) { + finishProcessingPage(completion, completion->result); + return; + } + + const BlockMapPage *page = dereferenceReadableVDOPage(completion); + int result = ASSERT(page != NULL, "page available"); + if (result != VDO_SUCCESS) { + finishProcessingPage(completion, result); + return; + } + + DataVIO *dataVIO = asDataVIO(completion->parent); + BlockMapTreeSlot *treeSlot = &dataVIO->treeLock.treeSlots[0]; + const BlockMapEntry *entry = &page->entries[treeSlot->blockMapSlot.slot]; + + result = setMappedEntry(dataVIO, entry); + finishProcessingPage(completion, result); +} + +/** + * This callback is registered in putMappedBlockAsync(). + **/ +static void putMappingInFetchedPage(VDOCompletion *completion) +{ + if (completion->result != VDO_SUCCESS) { + finishProcessingPage(completion, completion->result); + return; + } + + BlockMapPage *page = dereferenceWritableVDOPage(completion); + int result = ASSERT(page != NULL, "page available"); + if (result != VDO_SUCCESS) { + finishProcessingPage(completion, result); + return; + } + + DataVIO *dataVIO = asDataVIO(completion->parent); + BlockMapPageContext *context = getVDOPageCompletionContext(completion); + SequenceNumber oldLock = context->recoveryLock; + updateBlockMapPage(page, dataVIO, dataVIO->newMapped.pbn, + dataVIO->newMapped.state, &context->recoveryLock); + markCompletedVDOPageDirty(completion, oldLock, context->recoveryLock); + finishProcessingPage(completion, VDO_SUCCESS); +} + +/**********************************************************************/ +void getMappedBlockAsync(DataVIO *dataVIO) +{ + if (dataVIO->treeLock.treeSlots[0].blockMapSlot.pbn == ZERO_BLOCK) { + // We know that the block map page for this LBN has not been allocated, + // so the block must be unmapped. 
+ clearMappedLocation(dataVIO); + continueDataVIO(dataVIO, VDO_SUCCESS); + return; + } + + setupMappedBlock(dataVIO, false, getMappingFromFetchedPage); +} + +/**********************************************************************/ +void putMappedBlockAsync(DataVIO *dataVIO) +{ + setupMappedBlock(dataVIO, true, putMappingInFetchedPage); +} + +/**********************************************************************/ +BlockMapStatistics getBlockMapStatistics(BlockMap *map) +{ + BlockMapStatistics stats; + memset(&stats, 0, sizeof(BlockMapStatistics)); + + for (ZoneCount zone = 0; zone < map->zoneCount; zone++) { + const AtomicPageCacheStatistics *atoms + = getVDOPageCacheStatistics(map->zones[zone].pageCache); + stats.dirtyPages += atomicLoad64(&atoms->counts.dirtyPages); + stats.cleanPages += atomicLoad64(&atoms->counts.cleanPages); + stats.freePages += atomicLoad64(&atoms->counts.freePages); + stats.failedPages += atomicLoad64(&atoms->counts.failedPages); + stats.incomingPages += atomicLoad64(&atoms->counts.incomingPages); + stats.outgoingPages += atomicLoad64(&atoms->counts.outgoingPages); + + stats.cachePressure += atomicLoad64(&atoms->cachePressure); + stats.readCount += atomicLoad64(&atoms->readCount); + stats.writeCount += atomicLoad64(&atoms->writeCount); + stats.failedReads += atomicLoad64(&atoms->failedReads); + stats.failedWrites += atomicLoad64(&atoms->failedWrites); + stats.reclaimed += atomicLoad64(&atoms->reclaimed); + stats.readOutgoing += atomicLoad64(&atoms->readOutgoing); + stats.foundInCache += atomicLoad64(&atoms->foundInCache); + stats.discardRequired += atomicLoad64(&atoms->discardRequired); + stats.waitForPage += atomicLoad64(&atoms->waitForPage); + stats.fetchRequired += atomicLoad64(&atoms->fetchRequired); + stats.pagesLoaded += atomicLoad64(&atoms->pagesLoaded); + stats.pagesSaved += atomicLoad64(&atoms->pagesSaved); + stats.flushCount += atomicLoad64(&atoms->flushCount); + } + + return stats; +} diff --git a/source/vdo/base/blockMap.h b/source/vdo/base/blockMap.h new file mode 100644 index 0000000..48073a9 --- /dev/null +++ b/source/vdo/base/blockMap.h @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMap.h#4 $ + */ + +#ifndef BLOCK_MAP_H +#define BLOCK_MAP_H + +#include "adminState.h" +#include "blockMapEntry.h" +#include "completion.h" +#include "fixedLayout.h" +#include "statistics.h" +#include "types.h" + +/** + * Create a block map. 
+ * + * @param [in] logicalBlocks The number of logical blocks for the VDO + * @param [in] threadConfig The thread configuration of the VDO + * @param [in] flatPageCount The number of flat pages + * @param [in] rootOrigin The absolute PBN of the first root page + * @param [in] rootCount The number of tree roots + * @param [out] mapPtr The pointer to hold the new block map + * + * @return VDO_SUCCESS or an error code + **/ +int makeBlockMap(BlockCount logicalBlocks, + const ThreadConfig *threadConfig, + BlockCount flatPageCount, + PhysicalBlockNumber rootOrigin, + BlockCount rootCount, + BlockMap **mapPtr) + __attribute__((warn_unused_result)); + +/** + * Quiesce all block map I/O, possibly writing out all dirty metadata. + * + * @param map The block map to drain + * @param operation The type of drain to perform + * @param parent The completion to notify when the drain is complete + **/ +void drainBlockMap(BlockMap *map, + AdminStateCode operation, + VDOCompletion *parent); + +/** + * Resume I/O for a quiescent block map. + * + * @param map The block map to resume + * @param parent The completion to notify when the resume is complete + **/ +void resumeBlockMap(BlockMap *map, VDOCompletion *parent); + +/** + * Prepare to grow the block map by allocating an expanded collection of trees. + * + * @param map The block map to grow + * @param newLogicalBlocks The new logical size of the VDO + * + * @return VDO_SUCCESS or an error + **/ +int prepareToGrowBlockMap(BlockMap *map, BlockCount newLogicalBlocks) + __attribute__((warn_unused_result)); + +/** + * Get the logical size to which this block map is prepared to grow. + * + * @param map The block map + * + * @return The new number of entries the block map will be grown to or 0 if + * the block map is not prepared to grow + **/ +BlockCount getNewEntryCount(BlockMap *map) + __attribute__((warn_unused_result)); + +/** + * Grow a block map on which prepareToGrowBlockMap() has already been called. + * + * @param map The block map to grow + * @param parent The object to notify when the growth is complete + **/ +void growBlockMap(BlockMap *map, VDOCompletion *parent); + +/** + * Abandon any preparations which were made to grow this block map. + * + * @param map The map which won't be grown + **/ +void abandonBlockMapGrowth(BlockMap *map); + +/** + * Decode the state of a block map saved in a buffer, without creating page + * caches. + * + * @param [in] buffer A buffer containing the super block state + * @param [in] logicalBlocks The number of logical blocks for the VDO + * @param [in] threadConfig The thread configuration of the VDO + * @param [out] mapPtr The pointer to hold the new block map + * + * @return VDO_SUCCESS or an error code + **/ +int decodeBlockMap(Buffer *buffer, + BlockCount logicalBlocks, + const ThreadConfig *threadConfig, + BlockMap **mapPtr) + __attribute__((warn_unused_result)); + +/** + * Create a block map from the saved state of a Sodium block map, and do any + * necessary upgrade work. + * + * @param [in] buffer A buffer containing the super block state + * @param [in] logicalBlocks The number of logical blocks for the VDO + * @param [in] threadConfig The thread configuration of the VDO + * @param [out] mapPtr The pointer to hold the new block map + * + * @return VDO_SUCCESS or an error code + **/ +int decodeSodiumBlockMap(Buffer *buffer, + BlockCount logicalBlocks, + const ThreadConfig *threadConfig, + BlockMap **mapPtr) + __attribute__((warn_unused_result)); + +/** + * Allocate the page caches for a block map. 
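+ * Each logical zone of the map gets its own VDOPageCache (see the pageCache
+ * field of struct blockMapZone in blockMapInternals.h).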
+ * + * @param map The block map needing caches. + * @param layer The physical layer for the cache + * @param readOnlyNotifier The read only mode context + * @param journal The recovery journal (may be NULL) + * @param nonce The nonce to distinguish initialized pages + * @param cacheSize The block map cache size, in pages + * @param maximumAge The number of journal blocks before a dirtied page + * is considered old and must be written out + * + * @return VDO_SUCCESS or an error code + **/ +int makeBlockMapCaches(BlockMap *map, + PhysicalLayer *layer, + ReadOnlyNotifier *readOnlyNotifier, + RecoveryJournal *journal, + Nonce nonce, + PageCount cacheSize, + BlockCount maximumAge) + __attribute__((warn_unused_result)); + +/** + * Free a block map and null out the reference to it. + * + * @param mapPtr A pointer to the block map to free + **/ +void freeBlockMap(BlockMap **mapPtr); + +/** + * Get the size of the encoded state of a block map. + * + * @return The encoded size of the map's state + **/ +size_t getBlockMapEncodedSize(void) + __attribute__((warn_unused_result)); + +/** + * Encode the state of a block map into a buffer. + * + * @param map The block map to encode + * @param buffer The buffer to encode into + * + * @return UDS_SUCCESS or an error + **/ +int encodeBlockMap(const BlockMap *map, Buffer *buffer) + __attribute__((warn_unused_result)); + +/** + * Obtain any necessary state from the recovery journal that is needed for + * normal block map operation. + * + * @param map The map in question + * @param journal The journal to initialize from + **/ +void initializeBlockMapFromJournal(BlockMap *map, RecoveryJournal *journal); + +/** + * Get the portion of the block map for a given logical zone. + * + * @param map The map + * @param zoneNumber The number of the zone + * + * @return The requested block map zone + **/ +BlockMapZone *getBlockMapZone(BlockMap *map, ZoneCount zoneNumber) + __attribute__((warn_unused_result)); + +/** + * Compute the logical zone on which the entry for a DataVIO + * resides + * + * @param dataVIO The DataVIO + * + * @return The logical zone number for the DataVIO + **/ +ZoneCount computeLogicalZone(DataVIO *dataVIO); + +/** + * Compute the block map slot in which the block map entry for a DataVIO + * resides, and cache that number in the DataVIO. + * + * @param dataVIO The DataVIO + * @param callback The function to call once the slot has been found + * @param threadID The thread on which to run the callback + **/ +void findBlockMapSlotAsync(DataVIO *dataVIO, + VDOAction *callback, + ThreadID threadID); + +/** + * Get number of block map pages at predetermined locations. + * + * @param map The block map + * + * @return The number of fixed pages used by the map + **/ +PageCount getNumberOfFixedBlockMapPages(const BlockMap *map) + __attribute__((warn_unused_result)); + +/** + * Get number of block map entries. + * + * @param map The block map + * + * @return The number of entries stored in the map + **/ +BlockCount getNumberOfBlockMapEntries(const BlockMap *map) + __attribute__((warn_unused_result)); + +/** + * Notify the block map that the recovery journal has finished a new block. + * This method must be called from the journal zone thread. 
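+ * The sequence number is used to advance the map's era point (see the
+ * currentEraPoint and pendingEraPoint fields of struct blockMap), which,
+ * together with the maximumAge passed to makeBlockMapCaches(), determines
+ * when dirty pages are considered old enough to require writing out.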
+ * + * @param map The block map + * @param recoveryBlockNumber The sequence number of the finished recovery + * journal block + **/ +void advanceBlockMapEra(BlockMap *map, SequenceNumber recoveryBlockNumber); + +/** + * Get the block number of the physical block containing the data for the + * specified logical block number. All blocks are mapped to physical block + * zero by default, which is conventionally the zero block. + * + * @param dataVIO The DataVIO of the block to map + **/ +void getMappedBlockAsync(DataVIO *dataVIO); + +/** + * Associate the logical block number for a block represented by a DataVIO + * with the physical block number in its newMapped field. + * + * @param dataVIO The DataVIO of the block to map + **/ +void putMappedBlockAsync(DataVIO *dataVIO); + +/** + * Get the stats for the block map page cache. + * + * @param map The block map containing the cache + * + * @return The block map statistics + **/ +BlockMapStatistics getBlockMapStatistics(BlockMap *map) + __attribute__((warn_unused_result)); + +#endif // BLOCK_MAP_H diff --git a/source/vdo/base/blockMapEntry.h b/source/vdo/base/blockMapEntry.h new file mode 100644 index 0000000..78304e9 --- /dev/null +++ b/source/vdo/base/blockMapEntry.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapEntry.h#4 $ + */ + +#ifndef BLOCK_MAP_ENTRY_H +#define BLOCK_MAP_ENTRY_H + +#include "blockMappingState.h" +#include "constants.h" +#include "numeric.h" +#include "types.h" + +/** + * The entry for each logical block in the block map is encoded into five + * bytes, which saves space in both the on-disk and in-memory layouts. It + * consists of the 36 low-order bits of a PhysicalBlockNumber (addressing 256 + * terabytes with a 4KB block size) and a 4-bit encoding of a + * BlockMappingState. + **/ +typedef union __attribute__((packed)) blockMapEntry { + struct __attribute__((packed)) { + /** + * Bits 7..4: The four highest bits of the 36-bit physical block number + * Bits 3..0: The 4-bit BlockMappingState + **/ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + unsigned mappingState : 4; + unsigned pbnHighNibble : 4; +#else + unsigned pbnHighNibble : 4; + unsigned mappingState : 4; +#endif + + /** 32 low-order bits of the 36-bit PBN, in little-endian byte order */ + byte pbnLowWord[4]; + } fields; + + // A raw view of the packed encoding. + uint8_t raw[5]; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + // This view is only valid on little-endian machines and is only present for + // ease of directly examining packed entries in GDB. 
+ struct __attribute__((packed)) { + unsigned mappingState : 4; + unsigned pbnHighNibble : 4; + uint32_t pbnLowWord; + } littleEndian; +#endif +} BlockMapEntry; + +/** + * Unpack the fields of a BlockMapEntry, returning them as a DataLocation. + * + * @param entry A pointer to the entry to unpack + * + * @return the location of the data mapped by the block map entry + **/ +static inline DataLocation unpackBlockMapEntry(const BlockMapEntry *entry) +{ + PhysicalBlockNumber low32 = getUInt32LE(entry->fields.pbnLowWord); + PhysicalBlockNumber high4 = entry->fields.pbnHighNibble; + return (DataLocation) { + .pbn = ((high4 << 32) | low32), + .state = entry->fields.mappingState, + }; +} + +/**********************************************************************/ +static inline bool isMappedLocation(const DataLocation *location) +{ + return (location->state != MAPPING_STATE_UNMAPPED); +} + +/**********************************************************************/ +static inline bool isValidLocation(const DataLocation *location) +{ + if (location->pbn == ZERO_BLOCK) { + return !isCompressed(location->state); + } else { + return isMappedLocation(location); + } +} + +/** + * Pack a PhysicalBlockNumber into a BlockMapEntry. + * + * @param pbn The physical block number to convert to its + * packed five-byte representation + * @param mappingState The mapping state of the block + * + * @return the packed representation of the block number and mapping state + * + * @note unrepresentable high bits of the unpacked PBN are silently truncated + **/ +static inline BlockMapEntry packPBN(PhysicalBlockNumber pbn, + BlockMappingState mappingState) +{ + BlockMapEntry entry; + entry.fields.mappingState = (mappingState & 0x0F); + entry.fields.pbnHighNibble = ((pbn >> 32) & 0x0F), + storeUInt32LE(entry.fields.pbnLowWord, pbn & UINT_MAX); + return entry; +} + +#endif // BLOCK_MAP_ENTRY_H diff --git a/source/vdo/base/blockMapInternals.h b/source/vdo/base/blockMapInternals.h new file mode 100644 index 0000000..9b2f7a5 --- /dev/null +++ b/source/vdo/base/blockMapInternals.h @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapInternals.h#12 $ + */ + +#ifndef BLOCK_MAP_INTERNALS_H +#define BLOCK_MAP_INTERNALS_H + +#include "adminState.h" +#include "blockMapEntry.h" +#include "blockMapTree.h" +#include "completion.h" +#include "dirtyLists.h" +#include "header.h" +#include "intMap.h" +#include "ringNode.h" +#include "types.h" +#include "vdoPageCache.h" +#include "vioPool.h" + +/** + * The per-zone fields used by the block map tree. 
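+ *
+ * One of these is embedded in each BlockMapZone. It tracks dirty tree pages,
+ * in-progress lookups, and the VIO pool used for tree page I/O.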
+ **/ +struct blockMapTreeZone { + /** The BlockMapZone which owns this tree zone */ + BlockMapZone *mapZone; + /** The lists of dirty tree pages */ + DirtyLists *dirtyLists; + /** The number of tree lookups in progress */ + VIOCount activeLookups; + /** The map of pages currently being loaded */ + IntMap *loadingPages; + /** The pool of VIOs for tree I/O */ + VIOPool *vioPool; + /** The tree page which has issued or will be issuing a flush */ + TreePage *flusher; + /** The queue of pages waiting for a flush so they can be written out */ + WaitQueue flushWaiters; + /** The generation after the most recent flush */ + uint8_t generation; + /** The oldest active generation */ + uint8_t oldestGeneration; + /** The counts of dirty pages in each generation */ + uint32_t dirtyPageCounts[256]; +}; + +/** + * The per-zone fields of the block map. + **/ +struct blockMapZone { + /** The number of the zone this is */ + ZoneCount zoneNumber; + /** The ID of this zone's logical thread */ + ThreadID threadID; + /** The BlockMap which owns this BlockMapZone */ + BlockMap *blockMap; + /** The ReadOnlyNotifier of the VDO */ + ReadOnlyNotifier *readOnlyNotifier; + /** The page cache for this zone */ + VDOPageCache *pageCache; + /** The per-zone portion of the tree for this zone */ + BlockMapTreeZone treeZone; + /** The administrative state of the zone */ + AdminState state; +}; + +struct blockMap { + /** The manager for block map actions */ + ActionManager *actionManager; + /** The count of pages in the linear part of the block map */ + BlockCount flatPageCount; + /** The absolute PBN of the first root of the tree part of the block map */ + PhysicalBlockNumber rootOrigin; + /** The count of root pages of the tree part of the block map */ + BlockCount rootCount; + + /** The era point we are currently distributing to the zones */ + SequenceNumber currentEraPoint; + /** The next era point, not yet distributed to any zone */ + SequenceNumber pendingEraPoint; + + /** The number of entries in block map */ + BlockCount entryCount; + /** The VDO's nonce, for the pages */ + Nonce nonce; + /** The recovery journal for this map */ + RecoveryJournal *journal; + + /** The trees for finding block map pages */ + Forest *forest; + /** The expanded trees awaiting growth */ + Forest *nextForest; + /** The number of entries after growth */ + BlockCount nextEntryCount; + + /** The number of logical zones */ + ZoneCount zoneCount; + /** The per zone block map structure */ + BlockMapZone zones[]; +}; + +/** + * Compute the number of pages required for a block map with the specified + * parameters. + * + * @param entries The number of block map entries + * + * @return The number of pages required + **/ +PageCount computeBlockMapPageCount(BlockCount entries); + +/** + * Compute the number of the block map page on which the entry for a given + * logical block resides. + * + * @param lbn The logical block number whose page is desired + * + * @return The number of the block map page containing the entry for + * the given logical block number + **/ +__attribute__((warn_unused_result)) +static inline PageNumber computePageNumber(LogicalBlockNumber lbn) +{ + return (lbn / BLOCK_MAP_ENTRIES_PER_PAGE); +} + +/** + * Find the block map page slot in which the entry for a given logical + * block resides. 
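+ * For example, if BLOCK_MAP_ENTRIES_PER_PAGE were 812, LBN 5000 would be
+ * found in slot 5000 % 812 = 128 of page 5000 / 812 = 6 (illustrative
+ * numbers only).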
+ * + * @param lbn The logical block number whose slot + * + * @return The slot containing the entry for the given logical block number + **/ +__attribute__((warn_unused_result)) +static inline SlotNumber computeSlot(LogicalBlockNumber lbn) +{ + return (lbn % BLOCK_MAP_ENTRIES_PER_PAGE); +} + +/** + * Check whether a zone of the block map has drained, and if so, send a + * notification thereof. + * + * @param zone The zone to check + **/ +void checkForDrainComplete(BlockMapZone *zone); + + +#endif // BLOCK_MAP_INTERNALS_H diff --git a/source/vdo/base/blockMapPage.c b/source/vdo/base/blockMapPage.c new file mode 100644 index 0000000..8272e12 --- /dev/null +++ b/source/vdo/base/blockMapPage.c @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapPage.c#8 $ + */ + +#include "blockMapPage.h" + +#include "permassert.h" + +#include "blockMap.h" +#include "blockMapInternals.h" +#include "blockMapTree.h" +#include "constants.h" +#include "dataVIO.h" +#include "recoveryJournal.h" +#include "statusCodes.h" +#include "types.h" + +enum { + PAGE_HEADER_4_1_SIZE = 8 + 8 + 8 + 1 + 1 + 1 + 1, +}; + +static const VersionNumber BLOCK_MAP_4_1 = { + .majorVersion = 4, + .minorVersion = 1, +}; + +/**********************************************************************/ +bool isCurrentBlockMapPage(const BlockMapPage *page) +{ + return areSameVersion(BLOCK_MAP_4_1, unpackVersionNumber(page->version)); +} + +/**********************************************************************/ +BlockMapPage *formatBlockMapPage(void *buffer, + Nonce nonce, + PhysicalBlockNumber pbn, + bool initialized) +{ + memset(buffer, 0, VDO_BLOCK_SIZE); + BlockMapPage *page = (BlockMapPage *) buffer; + page->version = packVersionNumber(BLOCK_MAP_4_1); + storeUInt64LE(page->header.fields.nonce, nonce); + storeUInt64LE(page->header.fields.pbn, pbn); + page->header.fields.initialized = initialized; + return page; +} + +/**********************************************************************/ +BlockMapPageValidity validateBlockMapPage(BlockMapPage *page, + Nonce nonce, + PhysicalBlockNumber pbn) +{ + // Make sure the page layout isn't accidentally changed by changing the + // length of the page header. 
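+  // (PAGE_HEADER_4_1_SIZE is 8 + 8 + 8 + 1 + 1 + 1 + 1 = 28 bytes.)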
+ STATIC_ASSERT_SIZEOF(PageHeader, PAGE_HEADER_4_1_SIZE); + + if (!areSameVersion(BLOCK_MAP_4_1, unpackVersionNumber(page->version)) + || !isBlockMapPageInitialized(page) + || (nonce != getUInt64LE(page->header.fields.nonce))) { + return BLOCK_MAP_PAGE_INVALID; + } + + if (pbn != getBlockMapPagePBN(page)) { + return BLOCK_MAP_PAGE_BAD; + } + + return BLOCK_MAP_PAGE_VALID; +} + +/**********************************************************************/ +void updateBlockMapPage(BlockMapPage *page, + DataVIO *dataVIO, + PhysicalBlockNumber pbn, + BlockMappingState mappingState, + SequenceNumber *recoveryLock) +{ + // Encode the new mapping. + TreeLock *treeLock = &dataVIO->treeLock; + SlotNumber slot = treeLock->treeSlots[treeLock->height].blockMapSlot.slot; + page->entries[slot] = packPBN(pbn, mappingState); + + // Adjust references (locks) on the recovery journal blocks. + BlockMapZone *zone = getBlockMapForZone(dataVIO->logical.zone); + BlockMap *blockMap = zone->blockMap; + RecoveryJournal *journal = blockMap->journal; + SequenceNumber oldLocked = *recoveryLock; + SequenceNumber newLocked = dataVIO->recoverySequenceNumber; + + if ((oldLocked == 0) || (oldLocked > newLocked)) { + // Acquire a lock on the newly referenced journal block. + acquireRecoveryJournalBlockReference(journal, newLocked, ZONE_TYPE_LOGICAL, + zone->zoneNumber); + + // If the block originally held a newer lock, release it. + if (oldLocked > 0) { + releaseRecoveryJournalBlockReference(journal, oldLocked, + ZONE_TYPE_LOGICAL, + zone->zoneNumber); + } + + *recoveryLock = newLocked; + } + + // Release the transferred lock from the DataVIO. + releasePerEntryLockFromOtherZone(journal, newLocked); + dataVIO->recoverySequenceNumber = 0; +} diff --git a/source/vdo/base/blockMapPage.h b/source/vdo/base/blockMapPage.h new file mode 100644 index 0000000..ee011b3 --- /dev/null +++ b/source/vdo/base/blockMapPage.h @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapPage.h#8 $ + */ + +#ifndef BLOCK_MAP_PAGE_H +#define BLOCK_MAP_PAGE_H + +#include "numeric.h" + +#include "blockMapEntry.h" +#include "header.h" +#include "types.h" + +/** + * The packed, on-disk representation of a block map page header. + **/ +typedef union __attribute__((packed)) { + struct __attribute__((packed)) { + /** + * The 64-bit nonce of the current VDO, in little-endian byte order. Used + * to determine whether or not a page has been formatted. + **/ + byte nonce[8]; + + /** The 64-bit PBN of this page, in little-endian byte order */ + byte pbn[8]; + + /** Formerly recoverySequenceNumber; may be non-zero on disk */ + byte unusedLongWord[8]; + + /** Whether this page has been initialized on disk (i.e. 
written twice) */ + bool initialized; + + /** Formerly entryOffset; now unused since it should always be zero */ + byte unusedByte1; + + /** Formerly interiorTreePageWriting; may be non-zero on disk */ + byte unusedByte2; + + /** Formerly generation (for dirty tree pages); may be non-zero on disk */ + byte unusedByte3; + } fields; + + // A raw view of the packed encoding. + uint8_t raw[8 + 8 + 8 + 1 + 1 + 1 + 1]; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + // This view is only valid on little-endian machines and is only present for + // ease of directly examining packed entries in GDB. + struct __attribute__((packed)) { + uint64_t nonce; + PhysicalBlockNumber pbn; + uint64_t unusedLongWord; + bool initialized; + uint8_t unusedByte1; + uint8_t unusedByte2; + uint8_t unusedByte3; + } littleEndian; +#endif +} PageHeader; + +/** + * The format of a block map page. + **/ +typedef struct __attribute__((packed)) { + PackedVersionNumber version; + PageHeader header; + BlockMapEntry entries[]; +} BlockMapPage; + +typedef enum { + // A block map page is correctly initialized + BLOCK_MAP_PAGE_VALID, + // A block map page is uninitialized + BLOCK_MAP_PAGE_INVALID, + // A block map page is intialized, but is the wrong page + BLOCK_MAP_PAGE_BAD, +} BlockMapPageValidity; + +/** + * Check whether a block map page has been initialized. + * + * @param page The page to check + * + * @return true if the page has been initialized + **/ +__attribute__((warn_unused_result)) +static inline bool isBlockMapPageInitialized(const BlockMapPage *page) +{ + return page->header.fields.initialized; +} + +/** + * Mark whether a block map page has been initialized. + * + * @param page The page to mark + * @param initialized The state to set + * + * @return true if the initialized flag was modified + **/ +static inline bool markBlockMapPageInitialized(BlockMapPage *page, + bool initialized) +{ + if (initialized == page->header.fields.initialized) { + return false; + } + + page->header.fields.initialized = initialized; + return true; +} + +/** + * Get the physical block number where a block map page is stored. + * + * @param page The page to query + * + * @return the page's physical block number + **/ +__attribute__((warn_unused_result)) +static inline PhysicalBlockNumber getBlockMapPagePBN(const BlockMapPage *page) +{ + return getUInt64LE(page->header.fields.pbn); +} + +/** + * Check whether a block map page is of the current version. + * + * @param page The page to check + * + * @return true if the page has the current version + **/ +bool isCurrentBlockMapPage(const BlockMapPage *page) + __attribute__((warn_unused_result)); + +/** + * Format a block map page in memory. + * + * @param buffer The buffer which holds the page + * @param nonce The VDO nonce + * @param pbn The absolute PBN of the page + * @param initialized Whether the page should be marked as initialized + * + * @return the buffer pointer, as a block map page (for convenience) + **/ +BlockMapPage *formatBlockMapPage(void *buffer, + Nonce nonce, + PhysicalBlockNumber pbn, + bool initialized); + +/** + * Check whether a newly read page is valid, upgrading its in-memory format if + * possible and necessary. If the page is valid, clear fields which are not + * meaningful on disk. 
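+ * A page with the wrong version, an unset initialized flag, or a mismatched
+ * nonce is reported as BLOCK_MAP_PAGE_INVALID; an otherwise valid page whose
+ * recorded PBN does not match the expected PBN is reported as
+ * BLOCK_MAP_PAGE_BAD.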
+ * + * @param page The page to validate + * @param nonce The VDO nonce + * @param pbn The expected absolute PBN of the page + * + * @return The validity of the page + **/ +BlockMapPageValidity validateBlockMapPage(BlockMapPage *page, + Nonce nonce, + PhysicalBlockNumber pbn) + __attribute__((warn_unused_result)); + +/** + * Update an entry on a block map page. + * + * @param [in] page The page to update + * @param [in] dataVIO The DataVIO making the update + * @param [in] pbn The new PBN for the entry + * @param [in] mappingState The new mapping state for the entry + * @param [in,out] recoveryLock A reference to the current recovery sequence + * number lock held by the page. Will be updated + * if the lock changes to protect the new entry + **/ +void updateBlockMapPage(BlockMapPage *page, + DataVIO *dataVIO, + PhysicalBlockNumber pbn, + BlockMappingState mappingState, + SequenceNumber *recoveryLock); + +#endif // BLOCK_MAP_PAGE_H diff --git a/source/vdo/base/blockMapRecovery.c b/source/vdo/base/blockMapRecovery.c new file mode 100644 index 0000000..f70be42 --- /dev/null +++ b/source/vdo/base/blockMapRecovery.c @@ -0,0 +1,542 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapRecovery.c#7 $ + */ + +#include "blockMapRecovery.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "blockMapInternals.h" +#include "blockMapPage.h" +#include "heap.h" +#include "numUtils.h" +#include "refCounts.h" +#include "slabDepot.h" +#include "types.h" +#include "vdoInternal.h" +#include "vdoPageCache.h" + +/** + * A completion to manage recovering the block map from the recovery journal. + * Note that the page completions kept in this structure are not immediately + * freed, so the corresponding pages will be locked down in the page cache + * until the recovery frees them. + **/ +typedef struct { + /** completion header */ + VDOCompletion completion; + /** the completion for flushing the block map */ + VDOCompletion subTaskCompletion; + /** the thread from which the block map may be flushed */ + ThreadID adminThread; + /** the thread on which all block map operations must be done */ + ThreadID logicalThreadID; + /** the block map */ + BlockMap *blockMap; + /** whether this recovery has been aborted */ + bool aborted; + /** whether we are currently launching the initial round of requests */ + bool launching; + + // Fields for the journal entries. + /** the journal entries to apply */ + NumberedBlockMapping *journalEntries; + /** + * a heap wrapping journalEntries. It re-orders and sorts journal entries in + * ascending LBN order, then original journal order. This permits efficient + * iteration over the journal entries in order. + **/ + Heap replayHeap; + + // Fields tracking progress through the journal entries. 
+ /** a pointer to the next journal entry to apply */ + NumberedBlockMapping *currentEntry; + /** the next entry for which the block map page has not been requested */ + NumberedBlockMapping *currentUnfetchedEntry; + + // Fields tracking requested pages. + /** the absolute PBN of the current page being processed */ + PhysicalBlockNumber pbn; + /** number of pending (non-ready) requests */ + PageCount outstanding; + /** number of page completions */ + PageCount pageCount; + /** array of requested, potentially ready page completions */ + VDOPageCompletion pageCompletions[]; +} BlockMapRecoveryCompletion; + +/** + * This is a HeapComparator function that orders NumberedBlockMappings using + * the 'blockMapSlot' field as the primary key and the mapping 'number' field + * as the secondary key. Using the mapping number preserves the journal order + * of entries for the same slot, allowing us to sort by slot while still + * ensuring we replay all entries with the same slot in the exact order as they + * appeared in the journal. + * + *
The comparator order is reversed from the usual sense since Heap is a + * max-heap, returning larger elements before smaller ones, but we want to pop + * entries off the heap in ascending LBN order. + **/ +static int compareMappings(const void *item1, const void *item2) +{ + const NumberedBlockMapping *mapping1 = (const NumberedBlockMapping *) item1; + const NumberedBlockMapping *mapping2 = (const NumberedBlockMapping *) item2; + + if (mapping1->blockMapSlot.pbn != mapping2->blockMapSlot.pbn) { + return + ((mapping1->blockMapSlot.pbn < mapping2->blockMapSlot.pbn) ? 1 : -1); + } + + if (mapping1->blockMapSlot.slot != mapping2->blockMapSlot.slot) { + return + ((mapping1->blockMapSlot.slot < mapping2->blockMapSlot.slot) ? 1 : -1); + } + + if (mapping1->number != mapping2->number) { + return ((mapping1->number < mapping2->number) ? 1 : -1); + } + + return 0; +} + +/** + * Swap two NumberedBlockMapping structures. Implements HeapSwapper. + **/ +static void swapMappings(void *item1, void *item2) +{ + NumberedBlockMapping *mapping1 = item1; + NumberedBlockMapping *mapping2 = item2; + NumberedBlockMapping temp = *mapping1; + *mapping1 = *mapping2; + *mapping2 = temp; +} + +/** + * Convert a VDOCompletion to a BlockMapRecoveryCompletion. + * + * @param completion The completion to convert + * + * @return The completion as a BlockMapRecoveryCompletion + **/ +__attribute__((warn_unused_result)) +static inline BlockMapRecoveryCompletion * +asBlockMapRecoveryCompletion(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(BlockMapRecoveryCompletion, completion) == 0); + assertCompletionType(completion->type, BLOCK_MAP_RECOVERY_COMPLETION); + return (BlockMapRecoveryCompletion *) completion; +} + +/** + * Free a BlockMapRecoveryCompletion and null out the reference to it. + * + * @param completionPtr a pointer to the completion to free + **/ +static void freeRecoveryCompletion(VDOCompletion **completionPtr) +{ + VDOCompletion *completion = *completionPtr; + if (completion == NULL) { + return; + } + + BlockMapRecoveryCompletion *recovery + = asBlockMapRecoveryCompletion(*completionPtr); + destroyEnqueueable(completion); + destroyEnqueueable(&recovery->subTaskCompletion); + FREE(recovery); + *completionPtr = NULL; +} + +/** + * Free the BlockMapRecoveryCompletion and notify the parent that the block map + * recovery is done. This callback is registered in makeRecoveryCompletion(). + * + * @param completion The BlockMapRecoveryCompletion + **/ +static void finishBlockMapRecovery(VDOCompletion *completion) +{ + int result = completion->result; + VDOCompletion *parent = completion->parent; + freeRecoveryCompletion(&completion); + finishCompletion(parent, result); +} + +/** + * Make a new block map recovery completion. 
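+ * The journal entries are wrapped in a heap so they can be drained in sorted
+ * order incrementally rather than being fully sorted up front.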
+ * + * @param [in] vdo The VDO + * @param [in] entryCount The number of journal entries + * @param [in] journalEntries An array of journal entries to process + * @param [in] parent The parent of the recovery completion + * @param [out] recoveryPtr The new block map recovery completion + * + * @return a success or error code + **/ +static int makeRecoveryCompletion(VDO *vdo, + BlockCount entryCount, + NumberedBlockMapping *journalEntries, + VDOCompletion *parent, + BlockMapRecoveryCompletion **recoveryPtr) +{ + BlockMap *blockMap = getBlockMap(vdo); + PageCount pageCount + = minPageCount(getConfiguredCacheSize(vdo) >> 1, + MAXIMUM_SIMULTANEOUS_BLOCK_MAP_RESTORATION_READS); + + BlockMapRecoveryCompletion *recovery; + int result = ALLOCATE_EXTENDED(BlockMapRecoveryCompletion, pageCount, + VDOPageCompletion, __func__, &recovery); + if (result != UDS_SUCCESS) { + return result; + } + + result = initializeEnqueueableCompletion(&recovery->completion, + BLOCK_MAP_RECOVERY_COMPLETION, + vdo->layer); + if (result != VDO_SUCCESS) { + VDOCompletion *completion = &recovery->completion; + freeRecoveryCompletion(&completion); + return result; + } + + result = initializeEnqueueableCompletion(&recovery->subTaskCompletion, + SUB_TASK_COMPLETION, vdo->layer); + if (result != VDO_SUCCESS) { + VDOCompletion *completion = &recovery->completion; + freeRecoveryCompletion(&completion); + return result; + } + + recovery->blockMap = blockMap; + recovery->journalEntries = journalEntries; + recovery->pageCount = pageCount; + recovery->currentEntry = &recovery->journalEntries[entryCount - 1]; + + const ThreadConfig *threadConfig = getThreadConfig(vdo); + recovery->adminThread = getAdminThread(threadConfig); + recovery->logicalThreadID = getLogicalZoneThread(threadConfig, 0); + + // Organize the journal entries into a binary heap so we can iterate over + // them in sorted order incrementally, avoiding an expensive sort call. + initializeHeap(&recovery->replayHeap, compareMappings, swapMappings, + journalEntries, entryCount, sizeof(NumberedBlockMapping)); + buildHeap(&recovery->replayHeap, entryCount); + + ASSERT_LOG_ONLY((getCallbackThreadID() == recovery->logicalThreadID), + "%s must be called on logical thread %u (not %u)", __func__, + recovery->logicalThreadID, getCallbackThreadID()); + prepareCompletion(&recovery->completion, finishBlockMapRecovery, + finishBlockMapRecovery, recovery->logicalThreadID, parent); + + // This message must be recognizable by VDOTest::RebuildBase. + logInfo("Replaying %zu recovery entries into block map", + recovery->replayHeap.count); + + *recoveryPtr = recovery; + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void flushBlockMap(VDOCompletion *completion) +{ + logInfo("Flushing block map changes"); + BlockMapRecoveryCompletion *recovery + = asBlockMapRecoveryCompletion(completion->parent); + ASSERT_LOG_ONLY((completion->callbackThreadID == recovery->adminThread), + "flushBlockMap() called on admin thread"); + + prepareToFinishParent(completion, completion->parent); + drainBlockMap(recovery->blockMap, ADMIN_STATE_RECOVERING, completion); +} + +/** + * Check whether the recovery is done. If so, finish it by either flushing the + * block map (if the recovery was successful), or by cleaning up (if it + * wasn't). 
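+ * Cleaning up releases any page completions which are still ready; by that
+ * point none can still be outstanding.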
+ * + * @param recovery The recovery completion + * + * @return true if the recovery or recovery is complete + **/ +static bool finishIfDone(BlockMapRecoveryCompletion *recovery) +{ + // Pages are still being launched or there is still work to do + if (recovery->launching || (recovery->outstanding > 0) + || (!recovery->aborted + && (recovery->currentEntry >= recovery->journalEntries))) { + return false; + } + + if (recovery->aborted) { + /* + * We need to be careful here to only free completions that exist. But + * since we know none are outstanding, we just go through the ready ones. + */ + for (size_t i = 0; i < recovery->pageCount; i++) { + VDOPageCompletion *pageCompletion = &recovery->pageCompletions[i]; + if (recovery->pageCompletions[i].ready) { + releaseVDOPageCompletion(&pageCompletion->completion); + } + } + completeCompletion(&recovery->completion); + } else { + launchCallbackWithParent(&recovery->subTaskCompletion, flushBlockMap, + recovery->adminThread, &recovery->completion); + } + + return true; +} + +/** + * Note that there has been an error during the recovery and finish it if there + * is nothing else outstanding. + * + * @param recovery The BlockMapRecoveryCompletion + * @param result The error result to use, if one is not already saved + **/ +static void abortRecovery(BlockMapRecoveryCompletion *recovery, int result) +{ + recovery->aborted = true; + setCompletionResult(&recovery->completion, result); + finishIfDone(recovery); +} + +/** + * Find the first journal entry after a given entry which is not on the same + * block map page. + * + * @param recovery the BlockMapRecoveryCompletion + * @param currentEntry the entry to search from + * @param needsSort Whether sorting is needed to proceed + * + * @return Pointer to the first later journal entry on a different block map + * page, or a pointer to just before the journal entries if no + * subsequent entry is on a different block map page. + **/ +static NumberedBlockMapping * +findEntryStartingNextPage(BlockMapRecoveryCompletion *recovery, + NumberedBlockMapping *currentEntry, + bool needsSort) +{ + // If currentEntry is invalid, return immediately. + if (currentEntry < recovery->journalEntries) { + return currentEntry; + } + size_t currentPage = currentEntry->blockMapSlot.pbn; + + // Decrement currentEntry until it's out of bounds or on a different page. + while ((currentEntry >= recovery->journalEntries) + && (currentEntry->blockMapSlot.pbn == currentPage)) { + if (needsSort) { + NumberedBlockMapping *justSortedEntry + = sortNextHeapElement(&recovery->replayHeap); + ASSERT_LOG_ONLY(justSortedEntry < currentEntry, + "heap is returning elements in an unexpected order"); + } + currentEntry--; + } + return currentEntry; +} + +/** + * Apply a range of journal entries to a block map page. 
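+ * Entries are consumed from startingEntry downward (toward lower addresses)
+ * until endingEntry is reached, matching the order in which the replay heap
+ * yields them.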
+ * + * @param page The block map page being modified + * @param startingEntry The first journal entry to apply + * @param endingEntry The entry just past the last journal entry to apply + **/ +static void applyJournalEntriesToPage(BlockMapPage *page, + NumberedBlockMapping *startingEntry, + NumberedBlockMapping *endingEntry) +{ + NumberedBlockMapping *currentEntry = startingEntry; + while (currentEntry != endingEntry) { + page->entries[currentEntry->blockMapSlot.slot] + = currentEntry->blockMapEntry; + currentEntry--; + } +} + +/**********************************************************************/ +static void recoverReadyPages(BlockMapRecoveryCompletion *recovery, + VDOCompletion *completion); + +/** + * Note that a page is now ready and attempt to process pages. This callback is + * registered in fetchPage(). + * + * @param completion The VDOPageCompletion for the fetched page + **/ +static void pageLoaded(VDOCompletion *completion) +{ + BlockMapRecoveryCompletion *recovery + = asBlockMapRecoveryCompletion(completion->parent); + recovery->outstanding--; + if (!recovery->launching) { + recoverReadyPages(recovery, completion); + } +} + +/** + * Handle an error loading a page. + * + * @param completion The VDOPageCompletion + **/ +static void handlePageLoadError(VDOCompletion *completion) +{ + BlockMapRecoveryCompletion *recovery + = asBlockMapRecoveryCompletion(completion->parent); + recovery->outstanding--; + abortRecovery(recovery, completion->result); +} + +/** + * Fetch a page from the block map. + * + * @param recovery the BlockMapRecoveryCompletion + * @param completion the page completion to use + **/ +static void fetchPage(BlockMapRecoveryCompletion *recovery, + VDOCompletion *completion) +{ + if (recovery->currentUnfetchedEntry < recovery->journalEntries) { + // Nothing left to fetch. + return; + } + + // Fetch the next page we haven't yet requested. + PhysicalBlockNumber newPBN + = recovery->currentUnfetchedEntry->blockMapSlot.pbn; + recovery->currentUnfetchedEntry + = findEntryStartingNextPage(recovery, recovery->currentUnfetchedEntry, + true); + initVDOPageCompletion(((VDOPageCompletion *) completion), + recovery->blockMap->zones[0].pageCache, + newPBN, true, &recovery->completion, + pageLoaded, handlePageLoadError); + recovery->outstanding++; + getVDOPageAsync(completion); +} + +/** + * Get the next page completion to process. If it isn't ready, we'll try again + * when it is. + * + * @param recovery The recovery completion + * @param completion The current page completion + * + * @return The next page completion to process + **/ +static VDOPageCompletion * +getNextPageCompletion(BlockMapRecoveryCompletion *recovery, + VDOPageCompletion *completion) +{ + completion++; + if (completion == (&recovery->pageCompletions[recovery->pageCount])) { + completion = &recovery->pageCompletions[0]; + } + return completion; +} + +/** + * Recover from as many pages as possible. 
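+ * Page completions are processed in the order in which they were fetched; if
+ * the completion for the next needed PBN is not yet ready, processing stops
+ * until it becomes ready.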
+ * + * @param recovery The recovery completion + * @param completion The first page completion to process + **/ +static void recoverReadyPages(BlockMapRecoveryCompletion *recovery, + VDOCompletion *completion) +{ + if (finishIfDone(recovery)) { + return; + } + + VDOPageCompletion *pageCompletion = (VDOPageCompletion *) completion; + if (recovery->pbn != pageCompletion->pbn) { + return; + } + + while (pageCompletion->ready) { + BlockMapPage *page = dereferenceWritableVDOPage(completion); + int result = ASSERT(page != NULL, "page available"); + if (result != VDO_SUCCESS) { + abortRecovery(recovery, result); + return; + } + + NumberedBlockMapping *startOfNextPage + = findEntryStartingNextPage(recovery, recovery->currentEntry, false); + applyJournalEntriesToPage(page, recovery->currentEntry, startOfNextPage); + recovery->currentEntry = startOfNextPage; + requestVDOPageWrite(completion); + releaseVDOPageCompletion(completion); + + if (finishIfDone(recovery)) { + return; + } + + recovery->pbn = recovery->currentEntry->blockMapSlot.pbn; + fetchPage(recovery, completion); + pageCompletion = getNextPageCompletion(recovery, pageCompletion); + completion = &pageCompletion->completion; + } +} + +/**********************************************************************/ +void recoverBlockMap(VDO *vdo, + BlockCount entryCount, + NumberedBlockMapping *journalEntries, + VDOCompletion *parent) +{ + BlockMapRecoveryCompletion *recovery; + int result = makeRecoveryCompletion(vdo, entryCount, journalEntries, parent, + &recovery); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + if (isHeapEmpty(&recovery->replayHeap)) { + finishCompletion(&recovery->completion, VDO_SUCCESS); + return; + } + + NumberedBlockMapping *firstSortedEntry + = sortNextHeapElement(&recovery->replayHeap); + ASSERT_LOG_ONLY(firstSortedEntry == recovery->currentEntry, + "heap is returning elements in an unexpected order"); + + // Prevent any page from being processed until all pages have been launched. + recovery->launching = true; + recovery->pbn = recovery->currentEntry->blockMapSlot.pbn; + recovery->currentUnfetchedEntry = recovery->currentEntry; + for (PageCount i = 0; i < recovery->pageCount; i++) { + if (recovery->currentUnfetchedEntry < recovery->journalEntries) { + break; + } + + fetchPage(recovery, &recovery->pageCompletions[i].completion); + } + recovery->launching = false; + + // Process any ready pages. + recoverReadyPages(recovery, &recovery->pageCompletions[0].completion); +} diff --git a/source/vdo/base/blockMapRecovery.h b/source/vdo/base/blockMapRecovery.h new file mode 100644 index 0000000..9029bf0 --- /dev/null +++ b/source/vdo/base/blockMapRecovery.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapRecovery.h#1 $ + */ + +#ifndef BLOCK_MAP_RECOVERY_H +#define BLOCK_MAP_RECOVERY_H + +#include "blockMap.h" +#include "blockMappingState.h" +#include "types.h" + +/** + * An explicitly numbered block mapping. Numbering the mappings allows them to + * be sorted by logical block number during recovery while still preserving + * the relative order of journal entries with the same logical block number. + **/ +typedef struct { + BlockMapSlot blockMapSlot; // Block map slot to map + BlockMapEntry blockMapEntry; // The encoded block map entry for the LBN + uint32_t number; // The serial number to use during replay +} __attribute__((packed)) NumberedBlockMapping; + +/** + * Recover the block map (normal rebuild). + * + * @param vdo The VDO + * @param entryCount The number of journal entries + * @param journalEntries An array of journal entries to process + * @param parent The completion to notify when the rebuild is complete + **/ +void recoverBlockMap(VDO *vdo, + BlockCount entryCount, + NumberedBlockMapping *journalEntries, + VDOCompletion *parent); + +#endif // BLOCK_MAP_RECOVERY_H diff --git a/source/vdo/base/blockMapTree.c b/source/vdo/base/blockMapTree.c new file mode 100644 index 0000000..fb2b4f4 --- /dev/null +++ b/source/vdo/base/blockMapTree.c @@ -0,0 +1,1272 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapTree.c#21 $ + */ + +#include "blockMapTree.h" + +#include "logger.h" + +#include "blockMap.h" +#include "blockMapInternals.h" +#include "blockMapPage.h" +#include "blockMapTreeInternals.h" +#include "constants.h" +#include "dataVIO.h" +#include "dirtyLists.h" +#include "forest.h" +#include "numUtils.h" +#include "recoveryJournal.h" +#include "referenceOperation.h" +#include "slabDepot.h" +#include "slabJournal.h" +#include "types.h" +#include "vdoInternal.h" +#include "vdoPageCache.h" +#include "vioPool.h" + +enum { + BLOCK_MAP_VIO_POOL_SIZE = 64, +}; + +typedef struct __attribute__((packed)) { + RootCount rootIndex; + Height height; + PageNumber pageIndex; + SlotNumber slot; +} PageDescriptor; + +typedef union { + PageDescriptor descriptor; + uint64_t key; +} PageKey; + +typedef struct { + BlockMapTreeZone *zone; + uint8_t generation; +} WriteIfNotDirtiedContext; + +/** + * An invalid PBN used to indicate that the page holding the location of a + * tree root has been "loaded". + **/ +const PhysicalBlockNumber INVALID_PBN = 0xFFFFFFFFFFFFFFFF; + +/** + * Convert a RingNode to a TreePage. 
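+ * This is a standard container-of conversion, subtracting the offset of the
+ * embedded node field from the RingNode pointer.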
+ * + * @param ringNode The RingNode to convert + * + * @return The TreePage which owns the RingNode + **/ +static inline TreePage *treePageFromRingNode(RingNode *ringNode) +{ + return (TreePage *) ((byte *) ringNode - offsetof(TreePage, node)); +} + +/**********************************************************************/ +static void writeDirtyPagesCallback(RingNode *expired, void *context); + +/** + * Make VIOs for reading, writing, and allocating the arboreal block map. + * + * Implements VIOConstructor. + **/ +__attribute__((warn_unused_result)) +static int makeBlockMapVIOs(PhysicalLayer *layer, + void *parent, + void *buffer, + VIO **vioPtr) +{ + return createVIO(layer, VIO_TYPE_BLOCK_MAP_INTERIOR, VIO_PRIORITY_METADATA, + parent, buffer, vioPtr); +} + +/**********************************************************************/ +int initializeTreeZone(BlockMapZone *zone, + PhysicalLayer *layer, + BlockCount eraLength) +{ + STATIC_ASSERT_SIZEOF(PageDescriptor, sizeof(uint64_t)); + BlockMapTreeZone *treeZone = &zone->treeZone; + treeZone->mapZone = zone; + + int result = makeDirtyLists(eraLength, writeDirtyPagesCallback, treeZone, + &treeZone->dirtyLists); + if (result != VDO_SUCCESS) { + return result; + } + + result = makeIntMap(LOCK_MAP_CAPACITY, 0, &treeZone->loadingPages); + if (result != VDO_SUCCESS) { + return result; + } + + return makeVIOPool(layer, BLOCK_MAP_VIO_POOL_SIZE, zone->threadID, + makeBlockMapVIOs, treeZone, &treeZone->vioPool); +} + +/**********************************************************************/ +int replaceTreeZoneVIOPool(BlockMapTreeZone *zone, + PhysicalLayer *layer, + size_t poolSize) +{ + freeVIOPool(&zone->vioPool); + return makeVIOPool(layer, poolSize, zone->mapZone->threadID, + makeBlockMapVIOs, zone, &zone->vioPool); +} + +/**********************************************************************/ +void uninitializeBlockMapTreeZone(BlockMapTreeZone *treeZone) +{ + freeDirtyLists(&treeZone->dirtyLists); + freeVIOPool(&treeZone->vioPool); + freeIntMap(&treeZone->loadingPages); +} + +/**********************************************************************/ +void setTreeZoneInitialPeriod(BlockMapTreeZone *treeZone, + SequenceNumber period) +{ + setCurrentPeriod(treeZone->dirtyLists, period); +} + +/** + * Get the BlockMapTreeZone in which a DataVIO is operating. + * + * @param dataVIO The DataVIO + * + * @return The BlockMapTreeZone + **/ +__attribute__((warn_unused_result)) +static inline BlockMapTreeZone *getBlockMapTreeZone(DataVIO *dataVIO) +{ + return &(getBlockMapForZone(dataVIO->logical.zone)->treeZone); +} + +/** + * Get the TreePage for a given lock. This will be the page referred to by the + * lock's tree slot for the lock's current height. 
+ * + * @param zone The tree zone of the tree + * @param lock The lock describing the page to get + * + * @return The requested page + **/ +static inline TreePage *getTreePage(const BlockMapTreeZone *zone, + const TreeLock *lock) +{ + return getTreePageByIndex(zone->mapZone->blockMap->forest, + lock->rootIndex, + lock->height, + lock->treeSlots[lock->height].pageIndex); +} + +/**********************************************************************/ +bool copyValidPage(char *buffer, + Nonce nonce, + PhysicalBlockNumber pbn, + BlockMapPage *page) +{ + BlockMapPage *loaded = (BlockMapPage *) buffer; + BlockMapPageValidity validity = validateBlockMapPage(loaded, nonce, pbn); + if (validity == BLOCK_MAP_PAGE_VALID) { + memcpy(page, loaded, VDO_BLOCK_SIZE); + return true; + } + + if (validity == BLOCK_MAP_PAGE_BAD) { + logErrorWithStringError(VDO_BAD_PAGE, + "Expected page %" PRIu64 + " but got page %llu instead", + pbn, getBlockMapPagePBN(loaded)); + } + + return false; +} + +/**********************************************************************/ +bool isTreeZoneActive(BlockMapTreeZone *zone) +{ + return ((zone->activeLookups != 0) + || hasWaiters(&zone->flushWaiters) + || isVIOPoolBusy(zone->vioPool)); +} + +/** + * Put the VDO in read-only mode and wake any VIOs waiting for a flush. + * + * @param zone The zone + * @param result The error which is causing read-only mode + **/ +static void enterZoneReadOnlyMode(BlockMapTreeZone *zone, int result) +{ + enterReadOnlyMode(zone->mapZone->readOnlyNotifier, result); + + // We are in read-only mode, so we won't ever write any page out. Just take + // all waiters off the queue so the tree zone can be closed. + while (hasWaiters(&zone->flushWaiters)) { + dequeueNextWaiter(&zone->flushWaiters); + } + + checkForDrainComplete(zone->mapZone); +} + +/** + * Check whether a generation is strictly older than some other generation in + * the context of a zone's current generation range. + * + * @param zone The zone in which to do the comparison + * @param a The generation in question + * @param b The generation to compare to + * + * @return true if generation a is not strictly older than + * generation b in the context of the zone + **/ +__attribute__((warn_unused_result)) +static bool isNotOlder(BlockMapTreeZone *zone, uint8_t a, uint8_t b) +{ + int result = ASSERT((inCyclicRange(zone->oldestGeneration, a, + zone->generation, 1 << 8) + && inCyclicRange(zone->oldestGeneration, b, + zone->generation, 1 << 8)), + "generation(s) %u, %u are out of range [%u, %u]", + a, b, zone->oldestGeneration, zone->generation); + if (result != VDO_SUCCESS) { + enterZoneReadOnlyMode(zone, result); + return true; + } + + return inCyclicRange(b, a, zone->generation, 1 << 8); +} + +/** + * Decrement the count for a generation and roll the oldest generation if there + * are no longer any active pages in it. + * + * @param zone The zone + * @param generation The generation to release + **/ +static void releaseGeneration(BlockMapTreeZone *zone, uint8_t generation) +{ + int result = ASSERT((zone->dirtyPageCounts[generation] > 0), + "dirty page count underflow for generation %u", + generation); + if (result != VDO_SUCCESS) { + enterZoneReadOnlyMode(zone, result); + return; + } + + zone->dirtyPageCounts[generation]--; + while ((zone->dirtyPageCounts[zone->oldestGeneration] == 0) + && (zone->oldestGeneration != zone->generation)) { + zone->oldestGeneration++; + } +} + +/** + * Set the generation of a page and update the dirty page count in the zone. 
+ * + * @param zone The zone which owns the page + * @param page The page + * @param newGeneration The generation to set + * @param decrementOld Whether to decrement the count of the page's old + * generation + **/ +static void setGeneration(BlockMapTreeZone *zone, + TreePage *page, + uint8_t newGeneration, + bool decrementOld) +{ + uint8_t oldGeneration = page->generation; + if (decrementOld && (oldGeneration == newGeneration)) { + return; + } + + page->generation = newGeneration; + uint32_t newCount = ++zone->dirtyPageCounts[newGeneration]; + int result = ASSERT((newCount != 0), + "dirty page count overflow for generation %u", + newGeneration); + if (result != VDO_SUCCESS) { + enterZoneReadOnlyMode(zone, result); + return; + } + + if (decrementOld) { + releaseGeneration(zone, oldGeneration); + } +} + +/**********************************************************************/ +static void writePage(TreePage *treePage, VIOPoolEntry *entry); + +/** + * Write out a dirty page if it is still covered by the most recent flush + * or if it is the flusher. + * + *
Implements WaiterCallback + * + * @param waiter The page to write + * @param context The VIOPoolEntry with which to do the write + **/ +static void writePageCallback(Waiter *waiter, void *context) +{ + STATIC_ASSERT(offsetof(TreePage, waiter) == 0); + writePage((TreePage *) waiter, (VIOPoolEntry *) context); +} + +/** + * Acquire a VIO for writing a dirty page. + * + * @param waiter The page which needs a VIO + * @param zone The zone + **/ +static void acquireVIO(Waiter *waiter, BlockMapTreeZone *zone) +{ + waiter->callback = writePageCallback; + int result = acquireVIOFromPool(zone->vioPool, waiter); + if (result != VDO_SUCCESS) { + enterZoneReadOnlyMode(zone, result); + } +} + +/** + * Attempt to increment the generation. + * + * @param zone The zone whose generation is to be incremented + * + * @return true if all possible generations were not already + * active + **/ +static bool attemptIncrement(BlockMapTreeZone *zone) +{ + uint8_t generation = zone->generation + 1; + if (zone->oldestGeneration == generation) { + return false; + } + + zone->generation = generation; + return true; +} + +/** + * Enqueue a page to either launch a flush or wait for the current flush which + * is already in progress. + * + * @param page The page to enqueue + * @param zone The zone + **/ +static void enqueuePage(TreePage *page, BlockMapTreeZone *zone) +{ + if ((zone->flusher == NULL) && attemptIncrement(zone)) { + zone->flusher = page; + acquireVIO(&page->waiter, zone); + return; + } + + int result = enqueueWaiter(&zone->flushWaiters, &page->waiter); + if (result != VDO_SUCCESS) { + enterZoneReadOnlyMode(zone, result); + } +} + +/** + * Write pages which were waiting for a flush and have not been redirtied. + * Requeue those pages which were redirtied. + * + *
Implements WaiterCallback. + * + * @param waiter The dirty page + * @param context The zone and generation + **/ +static void writePageIfNotDirtied(Waiter *waiter, void *context) +{ + STATIC_ASSERT(offsetof(TreePage, waiter) == 0); + TreePage *page = (TreePage *) waiter; + WriteIfNotDirtiedContext *writeContext = context; + if (page->generation == writeContext->generation) { + acquireVIO(waiter, writeContext->zone); + return; + } + + enqueuePage(page, writeContext->zone); +} + +/** + * Return a VIO to the zone's pool. + * + * @param zone The zone which owns the pool + * @param entry The pool entry to return + **/ +static void returnToPool(BlockMapTreeZone *zone, VIOPoolEntry *entry) +{ + returnVIOToPool(zone->vioPool, entry); + checkForDrainComplete(zone->mapZone); +} + +/** + * Handle the successful write of a tree page. This callback is registered in + * writeInitializedPage(). + * + * @param completion The VIO doing the write + **/ +static void finishPageWrite(VDOCompletion *completion) +{ + VIOPoolEntry *entry = completion->parent; + TreePage *page = entry->parent; + BlockMapTreeZone *zone = entry->context; + releaseRecoveryJournalBlockReference(zone->mapZone->blockMap->journal, + page->writingRecoveryLock, + ZONE_TYPE_LOGICAL, + zone->mapZone->zoneNumber); + + bool dirty = (page->writingGeneration != page->generation); + releaseGeneration(zone, page->writingGeneration); + page->writing = false; + + if (zone->flusher == page) { + WriteIfNotDirtiedContext context = { + .zone = zone, + .generation = page->writingGeneration, + }; + notifyAllWaiters(&zone->flushWaiters, writePageIfNotDirtied, &context); + if (dirty && attemptIncrement(zone)) { + writePage(page, entry); + return; + } + + zone->flusher = NULL; + } + + if (dirty) { + enqueuePage(page, zone); + } else if ((zone->flusher == NULL) + && hasWaiters(&zone->flushWaiters) + && attemptIncrement(zone)) { + zone->flusher = (TreePage *) dequeueNextWaiter(&zone->flushWaiters); + writePage(zone->flusher, entry); + return; + } + + returnToPool(zone, entry); +} + +/** + * Handle an error writing a tree page. This error handler is registered in + * writePage() and writeInitializedPage(). + * + * @param completion The VIO doing the write + **/ +static void handleWriteError(VDOCompletion *completion) +{ + int result = completion->result; + VIOPoolEntry *entry = completion->parent; + BlockMapTreeZone *zone = entry->context; + enterZoneReadOnlyMode(zone, result); + returnToPool(zone, entry); +} + +/** + * Write a page which has been written at least once. This callback is + * registered in (or called directly from) writePage(). + * + * @param completion The VIO which will do the write + **/ +static void writeInitializedPage(VDOCompletion *completion) +{ + VIOPoolEntry *entry = completion->parent; + BlockMapTreeZone *zone = (BlockMapTreeZone *) entry->context; + TreePage *treePage = (TreePage *) entry->parent; + + /* + * Set the initialized field of the copy of the page we are writing to true. + * We don't want to set it true on the real page in memory until after this + * write succeeds. + */ + BlockMapPage *page = (BlockMapPage *) entry->buffer; + markBlockMapPageInitialized(page, true); + launchWriteMetadataVIOWithFlush(entry->vio, getBlockMapPagePBN(page), + finishPageWrite, handleWriteError, + (zone->flusher == treePage), false); +} + +/** + * Write a dirty tree page now that we have a VIO with which to write it. 
+ * + * @param treePage The page to write + * @param entry The VIOPoolEntry with which to write + **/ +static void writePage(TreePage *treePage, VIOPoolEntry *entry) +{ + BlockMapTreeZone *zone = (BlockMapTreeZone *) entry->context; + if ((zone->flusher != treePage) + && (isNotOlder(zone, treePage->generation, zone->generation))) { + // This page was re-dirtied after the last flush was issued, hence we need + // to do another flush. + enqueuePage(treePage, zone); + returnToPool(zone, entry); + return; + } + + entry->parent = treePage; + memcpy(entry->buffer, treePage->pageBuffer, VDO_BLOCK_SIZE); + + VDOCompletion *completion = vioAsCompletion(entry->vio); + completion->callbackThreadID = zone->mapZone->threadID; + + treePage->writing = true; + treePage->writingGeneration = treePage->generation; + treePage->writingRecoveryLock = treePage->recoveryLock; + + // Clear this now so that we know this page is not on any dirty list. + treePage->recoveryLock = 0; + + BlockMapPage *page = asBlockMapPage(treePage); + if (!markBlockMapPageInitialized(page, true)) { + writeInitializedPage(completion); + return; + } + + launchWriteMetadataVIO(entry->vio, getBlockMapPagePBN(page), + writeInitializedPage, handleWriteError); +} + +/** + * Schedule a batch of dirty pages for writing. + * + *
Implements DirtyListsCallback. + * + * @param expired The pages to write + * @param context The zone + **/ +static void writeDirtyPagesCallback(RingNode *expired, void *context) +{ + BlockMapTreeZone *zone = (BlockMapTreeZone *) context; + uint8_t generation = zone->generation; + while (!isRingEmpty(expired)) { + TreePage *page = treePageFromRingNode(chopRingNode(expired)); + + int result = ASSERT(!isWaiting(&page->waiter), + "Newly expired page not already waiting to write"); + if (result != VDO_SUCCESS) { + enterZoneReadOnlyMode(zone, result); + continue; + } + + setGeneration(zone, page, generation, false); + if (!page->writing) { + enqueuePage(page, zone); + } + } +} + +/**********************************************************************/ +void advanceZoneTreePeriod(BlockMapTreeZone *zone, SequenceNumber period) +{ + advancePeriod(zone->dirtyLists, period); +} + +/**********************************************************************/ +void drainZoneTrees(BlockMapTreeZone *zone) +{ + ASSERT_LOG_ONLY((zone->activeLookups == 0), + "drainZoneTrees() called with no active lookups"); + if (!isSuspending(&zone->mapZone->state)) { + flushDirtyLists(zone->dirtyLists); + } +} + +/** + * Release a lock on a page which was being loaded or allocated. + * + * @param dataVIO The DataVIO releasing the page lock + * @param what What the DataVIO was doing (for logging) + **/ +static void releasePageLock(DataVIO *dataVIO, char *what) +{ + TreeLock *lock = &dataVIO->treeLock; + ASSERT_LOG_ONLY(lock->locked, + "release of unlocked block map page %s for key %" PRIu64 + " in tree %u", + what, lock->key, lock->rootIndex); + BlockMapTreeZone *zone = getBlockMapTreeZone(dataVIO); + TreeLock *lockHolder = intMapRemove(zone->loadingPages, lock->key); + ASSERT_LOG_ONLY((lockHolder == lock), + "block map page %s mismatch for key %llu in tree %u", + what, lock->key, lock->rootIndex); + lock->locked = false; +} + +/** + * Continue a DataVIO now that the lookup is complete. + * + * @param dataVIO The DataVIO + * @param result The result of the lookup + **/ +static void finishLookup(DataVIO *dataVIO, int result) +{ + dataVIO->treeLock.height = 0; + + BlockMapTreeZone *zone = getBlockMapTreeZone(dataVIO); + --zone->activeLookups; + + VDOCompletion *completion = dataVIOAsCompletion(dataVIO); + setCompletionResult(completion, result); + launchCallback(completion, dataVIO->treeLock.callback, + dataVIO->treeLock.threadID); +} + +/** + * Abort a block map PBN lookup due to an error in the load or allocation on + * which we were waiting. + * + * @param waiter The DataVIO which was waiting for a page load or allocation + * @param context The error which caused the abort + **/ +static void abortLookupForWaiter(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + int result = *((int *) context); + if (isReadDataVIO(dataVIO)) { + if (result == VDO_NO_SPACE) { + result = VDO_SUCCESS; + } + } else if (result != VDO_NO_SPACE) { + result = VDO_READ_ONLY; + } + + finishLookup(dataVIO, result); +} + +/** + * Abort a block map PBN lookup due to an error loading or allocating a page. 
+ * + * @param dataVIO The DataVIO which was loading or allocating a page + * @param result The error code + * @param what What the DataVIO was doing (for logging) + **/ +static void abortLookup(DataVIO *dataVIO, int result, char *what) +{ + if (result != VDO_NO_SPACE) { + enterZoneReadOnlyMode(getBlockMapTreeZone(dataVIO), result); + } + + if (dataVIO->treeLock.locked) { + releasePageLock(dataVIO, what); + notifyAllWaiters(&dataVIO->treeLock.waiters, abortLookupForWaiter, + &result); + } + + finishLookup(dataVIO, result); +} + +/** + * Abort a block map PBN lookup due to an error loading a page. + * + * @param dataVIO The DataVIO doing the page load + * @param result The error code + **/ +static void abortLoad(DataVIO *dataVIO, int result) +{ + abortLookup(dataVIO, result, "load"); +} + +/** + * Determine if a location represents a valid mapping for a tree page. + * + * @param vdo The VDO + * @param mapping The DataLocation to check + * @param height The height of the entry in the tree + * + * @return true if the entry represents a invalid page mapping + **/ +__attribute__((warn_unused_result)) +static bool isInvalidTreeEntry(const VDO *vdo, + const DataLocation *mapping, + Height height) +{ + if (!isValidLocation(mapping) + || isCompressed(mapping->state) + || (isMappedLocation(mapping) && (mapping->pbn == ZERO_BLOCK))) { + return true; + } + + // Roots aren't physical data blocks, so we can't check their PBNs. + if (height == BLOCK_MAP_TREE_HEIGHT) { + return false; + } + + return !isPhysicalDataBlock(vdo->depot, mapping->pbn); +} + +/**********************************************************************/ +static void loadBlockMapPage(BlockMapTreeZone *zone, DataVIO *dataVIO); +static void allocateBlockMapPage(BlockMapTreeZone *zone, DataVIO *dataVIO); + +/** + * Continue a block map PBN lookup now that a page has been loaded by + * descending one level in the tree. + * + * @param dataVIO The DataVIO doing the lookup + * @param page The page which was just loaded + **/ +static void continueWithLoadedPage(DataVIO *dataVIO, BlockMapPage *page) +{ + TreeLock *lock = &dataVIO->treeLock; + BlockMapTreeSlot slot = lock->treeSlots[lock->height]; + DataLocation mapping + = unpackBlockMapEntry(&page->entries[slot.blockMapSlot.slot]); + if (isInvalidTreeEntry(getVDOFromDataVIO(dataVIO), &mapping, lock->height)) { + logErrorWithStringError(VDO_BAD_MAPPING, + "Invalid block map tree PBN: %llu with " + "state %u for page index %u at height %u", + mapping.pbn, mapping.state, + lock->treeSlots[lock->height - 1].pageIndex, + lock->height - 1); + abortLoad(dataVIO, VDO_BAD_MAPPING); + return; + } + + if (!isMappedLocation(&mapping)) { + // The page we need is unallocated + allocateBlockMapPage(getBlockMapTreeZone(dataVIO), dataVIO); + return; + } + + lock->treeSlots[lock->height - 1].blockMapSlot.pbn = mapping.pbn; + if (lock->height == 1) { + finishLookup(dataVIO, VDO_SUCCESS); + return; + } + + // We know what page we need to load next + loadBlockMapPage(getBlockMapTreeZone(dataVIO), dataVIO); +} + +/** + * Continue a block map PBN lookup now that the page load we were waiting on + * has finished. + * + * @param waiter The DataVIO waiting for a page to be loaded + * @param context The page which was just loaded + **/ +static void continueLoadForWaiter(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + dataVIO->treeLock.height--; + continueWithLoadedPage(dataVIO, (BlockMapPage *) context); +} + +/** + * Finish loading a page now that it has been read in from disk. 
This callback + * is registered in loadPage(). + * + * @param completion The VIO doing the page read + **/ +static void finishBlockMapPageLoad(VDOCompletion *completion) +{ + VIOPoolEntry *entry = completion->parent; + DataVIO *dataVIO = entry->parent; + BlockMapTreeZone *zone = (BlockMapTreeZone *) entry->context; + TreeLock *treeLock = &dataVIO->treeLock; + + treeLock->height--; + PhysicalBlockNumber pbn + = treeLock->treeSlots[treeLock->height].blockMapSlot.pbn; + TreePage *treePage = getTreePage(zone, treeLock); + BlockMapPage *page = (BlockMapPage *) treePage->pageBuffer; + Nonce nonce = zone->mapZone->blockMap->nonce; + if (!copyValidPage(entry->buffer, nonce, pbn, page)) { + formatBlockMapPage(page, nonce, pbn, false); + } + returnVIOToPool(zone->vioPool, entry); + + // Release our claim to the load and wake any waiters + releasePageLock(dataVIO, "load"); + notifyAllWaiters(&treeLock->waiters, continueLoadForWaiter, page); + continueWithLoadedPage(dataVIO, page); +} + +/** + * Handle an error loading a tree page. + * + * @param completion The VIO doing the page read + **/ +static void handleIOError(VDOCompletion *completion) +{ + int result = completion->result; + VIOPoolEntry *entry = completion->parent; + DataVIO *dataVIO = entry->parent; + BlockMapTreeZone *zone = (BlockMapTreeZone *) entry->context; + returnVIOToPool(zone->vioPool, entry); + abortLoad(dataVIO, result); +} + +/** + * Read a tree page from disk now that we've gotten a VIO with which to do the + * read. This WaiterCallback is registered in loadBlockMapPage(). + * + * @param waiter The DataVIO which requires a page load + * @param context The VIOPool entry with which to do the read + **/ +static void loadPage(Waiter *waiter, void *context) +{ + VIOPoolEntry *entry = context; + DataVIO *dataVIO = waiterAsDataVIO(waiter); + + entry->parent = dataVIO; + entry->vio->completion.callbackThreadID + = getBlockMapForZone(dataVIO->logical.zone)->threadID; + + TreeLock *lock = &dataVIO->treeLock; + launchReadMetadataVIO(entry->vio, + lock->treeSlots[lock->height - 1].blockMapSlot.pbn, + finishBlockMapPageLoad, handleIOError); +} + +/** + * Attempt to acquire a lock on a page in the block map tree. If the page is + * already locked, queue up to wait for the lock to be released. If the lock is + * acquired, the DataVIO's treeLock.locked field will be set to true. + * + * @param zone The BlockMapTreeZone in which the DataVIO operates + * @param dataVIO The DataVIO which desires a page lock + * + * @return VDO_SUCCESS or an error + **/ +static int attemptPageLock(BlockMapTreeZone *zone, DataVIO *dataVIO) +{ + TreeLock *lock = &dataVIO->treeLock; + Height height = lock->height; + BlockMapTreeSlot treeSlot = lock->treeSlots[height]; + PageKey key; + key.descriptor = (PageDescriptor) { + .rootIndex = lock->rootIndex, + .height = height, + .pageIndex = treeSlot.pageIndex, + .slot = treeSlot.blockMapSlot.slot, + }; + lock->key = key.key; + + TreeLock *lockHolder; + int result = intMapPut(zone->loadingPages, lock->key, lock, false, + (void **) &lockHolder); + if (result != VDO_SUCCESS) { + return result; + } + + if (lockHolder == NULL) { + // We got the lock + dataVIO->treeLock.locked = true; + return VDO_SUCCESS; + } + + // Someone else is loading or allocating the page we need + return enqueueDataVIO(&lockHolder->waiters, dataVIO, + THIS_LOCATION("$F;cb=blockMapTreePage")); +} + +/** + * Load a block map tree page from disk. 
+ * + * @param zone The BlockMapTreeZone in which the DataVIO operates + * @param dataVIO The DataVIO which requires a page to be loaded + **/ +static void loadBlockMapPage(BlockMapTreeZone *zone, DataVIO *dataVIO) +{ + int result = attemptPageLock(zone, dataVIO); + if (result != VDO_SUCCESS) { + abortLoad(dataVIO, result); + return; + } + + if (dataVIO->treeLock.locked) { + Waiter *waiter = dataVIOAsWaiter(dataVIO); + waiter->callback = loadPage; + result = acquireVIOFromPool(zone->vioPool, waiter); + if (result != VDO_SUCCESS) { + abortLoad(dataVIO, result); + } + } +} + +/** + * Set the callback of a DataVIO after it has allocated a block map page. + * + * @param dataVIO The DataVIO + **/ +static void setPostAllocationCallback(DataVIO *dataVIO) +{ + setCallback(dataVIOAsCompletion(dataVIO), dataVIO->treeLock.callback, + dataVIO->treeLock.threadID); +} + +/** + * Abort a block map PBN lookup due to an error allocating a page. + * + * @param dataVIO The DataVIO doing the page allocation + * @param result The error code + **/ +static void abortAllocation(DataVIO *dataVIO, int result) +{ + setPostAllocationCallback(dataVIO); + abortLookup(dataVIO, result, "allocation"); +} + +/** + * Callback to handle an error while attempting to allocate a page. This + * callback is used to transfer back to the logical zone along the block map + * page allocation path. + * + * @param completion The DataVIO doing the allocation + **/ +static void allocationFailure(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + abortAllocation(dataVIO, completion->result); +} + +/** + * Continue with page allocations now that a parent page has been allocated. + * + * @param waiter The DataVIO which was waiting for a page to be allocated + * @param context The physical block number of the page which was just + * allocated + **/ +static void continueAllocationForWaiter(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + TreeLock *treeLock = &dataVIO->treeLock; + PhysicalBlockNumber pbn = *((PhysicalBlockNumber *) context); + + treeLock->height--; + dataVIO->treeLock.treeSlots[treeLock->height].blockMapSlot.pbn = pbn; + + if (treeLock->height == 0) { + finishLookup(dataVIO, VDO_SUCCESS); + return; + } + + allocateBlockMapPage(getBlockMapTreeZone(dataVIO), dataVIO); +} + +/** + * Finish the page allocation process by recording the allocation in the tree + * and waking any waiters now that the write lock has been released. This + * callback is registered in releaseBlockMapWriteLock(). + * + * @param completion The DataVIO doing the allocation + **/ +static void finishBlockMapAllocation(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + if (completion->result != VDO_SUCCESS) { + allocationFailure(completion); + return; + } + + BlockMapTreeZone *zone = getBlockMapTreeZone(dataVIO); + TreeLock *treeLock = &dataVIO->treeLock; + TreePage *treePage = getTreePage(zone, treeLock); + Height height = treeLock->height; + + PhysicalBlockNumber pbn = treeLock->treeSlots[height - 1].blockMapSlot.pbn; + + // Record the allocation. + BlockMapPage *page = (BlockMapPage *) treePage->pageBuffer; + SequenceNumber oldLock = treePage->recoveryLock; + updateBlockMapPage(page, dataVIO, pbn, MAPPING_STATE_UNCOMPRESSED, + &treePage->recoveryLock); + + if (isWaiting(&treePage->waiter)) { + // This page is waiting to be written out. 
+ if (zone->flusher != treePage) { + // The outstanding flush won't cover the update we just made, so mark + // the page as needing another flush. + setGeneration(zone, treePage, zone->generation, true); + } + } else { + // Put the page on a dirty list + if (oldLock == 0) { + initializeRing(&treePage->node); + } + addToDirtyLists(zone->dirtyLists, &treePage->node, oldLock, + treePage->recoveryLock); + } + + treeLock->height--; + if (height > 1) { + // Format the interior node we just allocated (in memory). + treePage = getTreePage(zone, treeLock); + formatBlockMapPage(treePage->pageBuffer, zone->mapZone->blockMap->nonce, + pbn, false); + } + + // Release our claim to the allocation and wake any waiters + releasePageLock(dataVIO, "allocation"); + notifyAllWaiters(&treeLock->waiters, continueAllocationForWaiter, &pbn); + if (treeLock->height == 0) { + finishLookup(dataVIO, VDO_SUCCESS); + return; + } + + allocateBlockMapPage(zone, dataVIO); +} + +/** + * Release the write lock on a newly allocated block map page now that we + * have made its journal entries and reference count updates. This callback + * is registered in setBlockMapPageReferenceCount(). + * + * @param completion The DataVIO doing the allocation + **/ +static void releaseBlockMapWriteLock(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + AllocatingVIO *allocatingVIO = dataVIOAsAllocatingVIO(dataVIO); + assertInAllocatedZone(dataVIO); + if (completion->result != VDO_SUCCESS) { + launchLogicalCallback(dataVIO, allocationFailure, THIS_LOCATION(NULL)); + return; + } + + releaseAllocationLock(allocatingVIO); + resetAllocation(allocatingVIO); + launchLogicalCallback(dataVIO, finishBlockMapAllocation, + THIS_LOCATION("$F;cb=finishBlockMapAllocation")); +} + +/** + * Set the reference count of a newly allocated block map page to + * MAXIMUM_REFERENCES now that we have made a recovery journal entry for it. + * MAXIMUM_REFERENCES is used to prevent deduplication against the block after + * we release the write lock on it, but before we write out the page. + * + * @param completion The DataVIO doing the allocation + **/ +static void setBlockMapPageReferenceCount(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInAllocatedZone(dataVIO); + if (completion->result != VDO_SUCCESS) { + launchLogicalCallback(dataVIO, allocationFailure, THIS_LOCATION(NULL)); + return; + } + + TreeLock *lock = &dataVIO->treeLock; + PhysicalBlockNumber pbn = lock->treeSlots[lock->height - 1].blockMapSlot.pbn; + completion->callback = releaseBlockMapWriteLock; + addSlabJournalEntry(getSlabJournal(getVDOFromDataVIO(dataVIO)->depot, pbn), + dataVIO); +} + +/** + * Make a recovery journal entry for a newly allocated block map page. + * This callback is registered in continueBlockMapPageAllocation(). + * + * @param completion The DataVIO doing the allocation + **/ +static void journalBlockMapAllocation(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInJournalZone(dataVIO); + if (completion->result != VDO_SUCCESS) { + launchLogicalCallback(dataVIO, allocationFailure, THIS_LOCATION(NULL)); + return; + } + + setAllocatedZoneCallback(dataVIO, setBlockMapPageReferenceCount, + THIS_LOCATION(NULL)); + addRecoveryJournalEntry(getVDOFromDataVIO(dataVIO)->recoveryJournal, + dataVIO); +} + +/** + * Continue the process of allocating a block map page now that the + * BlockAllocator has given us a block. 
This method is supplied as the callback + * to allocateDataBlock() by allocateBlockMapPage(). + * + * @param allocatingVIO The DataVIO which is doing the allocation + **/ +static void continueBlockMapPageAllocation(AllocatingVIO *allocatingVIO) +{ + DataVIO *dataVIO = allocatingVIOAsDataVIO(allocatingVIO); + if (!hasAllocation(dataVIO)) { + setLogicalCallback(dataVIO, allocationFailure, THIS_LOCATION(NULL)); + continueDataVIO(dataVIO, VDO_NO_SPACE); + return; + } + + PhysicalBlockNumber pbn = allocatingVIO->allocation; + TreeLock *lock = &dataVIO->treeLock; + lock->treeSlots[lock->height - 1].blockMapSlot.pbn = pbn; + setUpReferenceOperationWithLock(BLOCK_MAP_INCREMENT, pbn, + MAPPING_STATE_UNCOMPRESSED, + allocatingVIO->allocationLock, + &dataVIO->operation); + launchJournalCallback(dataVIO, journalBlockMapAllocation, + THIS_LOCATION("$F;cb=journalBlockMapAllocation")); +} + +/** + * Allocate a block map page. + * + * @param zone The zone in which the DataVIO is operating + * @param dataVIO The DataVIO which needs to allocate a page + **/ +static void allocateBlockMapPage(BlockMapTreeZone *zone, DataVIO *dataVIO) +{ + if (!isWriteDataVIO(dataVIO) || isTrimDataVIO(dataVIO)) { + // This is a pure read, the read phase of a read-modify-write, or a trim, + // so there's nothing left to do here. + finishLookup(dataVIO, VDO_SUCCESS); + return; + } + + int result = attemptPageLock(zone, dataVIO); + if (result != VDO_SUCCESS) { + abortAllocation(dataVIO, result); + return; + } + + if (!dataVIO->treeLock.locked) { + return; + } + + allocateDataBlock(dataVIOAsAllocatingVIO(dataVIO), + getAllocationSelector(dataVIO->logical.zone), + VIO_BLOCK_MAP_WRITE_LOCK, + continueBlockMapPageAllocation); +} + +/**********************************************************************/ +void lookupBlockMapPBN(DataVIO *dataVIO) +{ + BlockMapTreeZone *zone = getBlockMapTreeZone(dataVIO); + zone->activeLookups++; + if (isDraining(&zone->mapZone->state)) { + finishLookup(dataVIO, VDO_SHUTTING_DOWN); + return; + } + + TreeLock *lock = &dataVIO->treeLock; + PageNumber pageIndex + = ((lock->treeSlots[0].pageIndex - zone->mapZone->blockMap->flatPageCount) + / zone->mapZone->blockMap->rootCount); + BlockMapTreeSlot treeSlot = { + .pageIndex = pageIndex / BLOCK_MAP_ENTRIES_PER_PAGE, + .blockMapSlot = { + .pbn = 0, + .slot = pageIndex % BLOCK_MAP_ENTRIES_PER_PAGE, + }, + }; + + BlockMapPage *page = NULL; + for (lock->height = 1; lock->height <= BLOCK_MAP_TREE_HEIGHT; + lock->height++) { + lock->treeSlots[lock->height] = treeSlot; + page = (BlockMapPage *) (getTreePage(zone, lock)->pageBuffer); + PhysicalBlockNumber pbn = getBlockMapPagePBN(page); + if (pbn != ZERO_BLOCK) { + lock->treeSlots[lock->height].blockMapSlot.pbn = pbn; + break; + } + + // Calculate the index and slot for the next level. + treeSlot.blockMapSlot.slot + = treeSlot.pageIndex % BLOCK_MAP_ENTRIES_PER_PAGE; + treeSlot.pageIndex + = treeSlot.pageIndex / BLOCK_MAP_ENTRIES_PER_PAGE; + } + + // The page at this height has been allocated and loaded. 
+ DataLocation mapping + = unpackBlockMapEntry(&page->entries[treeSlot.blockMapSlot.slot]); + if (isInvalidTreeEntry(getVDOFromDataVIO(dataVIO), &mapping, lock->height)) { + logErrorWithStringError(VDO_BAD_MAPPING, + "Invalid block map tree PBN: %llu with " + "state %u for page index %u at height %u", + mapping.pbn, mapping.state, + lock->treeSlots[lock->height - 1].pageIndex, + lock->height - 1); + abortLoad(dataVIO, VDO_BAD_MAPPING); + return; + } + + if (!isMappedLocation(&mapping)) { + // The page we want one level down has not been allocated, so allocate it. + allocateBlockMapPage(zone, dataVIO); + return; + } + + lock->treeSlots[lock->height - 1].blockMapSlot.pbn = mapping.pbn; + if (lock->height == 1) { + // This is the ultimate block map page, so we're done + finishLookup(dataVIO, VDO_SUCCESS); + return; + } + + // We know what page we need to load. + loadBlockMapPage(zone, dataVIO); +} + +/**********************************************************************/ +PhysicalBlockNumber findBlockMapPagePBN(BlockMap *map, PageNumber pageNumber) +{ + if (pageNumber < map->flatPageCount) { + return (BLOCK_MAP_FLAT_PAGE_ORIGIN + pageNumber); + } + + RootCount rootIndex = pageNumber % map->rootCount; + PageNumber pageIndex = ((pageNumber - map->flatPageCount) / map->rootCount); + SlotNumber slot = pageIndex % BLOCK_MAP_ENTRIES_PER_PAGE; + pageIndex /= BLOCK_MAP_ENTRIES_PER_PAGE; + + TreePage *treePage + = getTreePageByIndex(map->forest, rootIndex, 1, pageIndex); + BlockMapPage *page = (BlockMapPage *) treePage->pageBuffer; + if (!isBlockMapPageInitialized(page)) { + return ZERO_BLOCK; + } + + DataLocation mapping = unpackBlockMapEntry(&page->entries[slot]); + if (!isValidLocation(&mapping) || isCompressed(mapping.state)) { + return ZERO_BLOCK; + } + return mapping.pbn; +} + +/**********************************************************************/ +void writeTreePage(TreePage *page, BlockMapTreeZone *zone) +{ + bool waiting = isWaiting(&page->waiter); + if (waiting && (zone->flusher == page)) { + return; + } + + setGeneration(zone, page, zone->generation, waiting); + if (waiting || page->writing) { + return; + } + + enqueuePage(page, zone); +} diff --git a/source/vdo/base/blockMapTree.h b/source/vdo/base/blockMapTree.h new file mode 100644 index 0000000..c581454 --- /dev/null +++ b/source/vdo/base/blockMapTree.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapTree.h#7 $ + */ + +#ifndef BLOCK_MAP_TREE_H +#define BLOCK_MAP_TREE_H + +#include "constants.h" +#include "types.h" + +typedef struct treePage TreePage; + +/** + * Intialize a BlockMapTreeZone. 
+ * + * @param zone The BlockMapZone of the tree zone to intialize + * @param layer The physical layer + * @param maximumAge The number of journal blocks before a dirtied page + * is considered old and may be written out + * + * @return VDO_SUCCESS or an error + **/ +int initializeTreeZone(BlockMapZone *zone, + PhysicalLayer *layer, + BlockCount maximumAge) + __attribute__((warn_unused_result)); + +/** + * Clean up a BlockMapTreeZone. + * + * @param treeZone The zone to clean up + **/ +void uninitializeBlockMapTreeZone(BlockMapTreeZone *treeZone); + +/** + * Set the initial dirty period for a tree zone. + * + * @param treeZone The tree zone + * @param period The initial dirty period to set + **/ +void setTreeZoneInitialPeriod(BlockMapTreeZone *treeZone, + SequenceNumber period); + +/** + * Check whether a tree zone is active (i.e. has any active lookups, + * outstanding I/O, or pending I/O). + * + * @param zone The zone to check + * + * @return true if the zone is active + **/ +bool isTreeZoneActive(BlockMapTreeZone *zone) + __attribute__((warn_unused_result)); + +/** + * Advance the dirty period for a tree zone. + * + * @param zone The BlockMapTreeZone to advance + * @param period The new dirty period + **/ +void advanceZoneTreePeriod(BlockMapTreeZone *zone, SequenceNumber period); + +/** + * Drain the zone trees, i.e. ensure that all I/O is quiesced. If required by + * the drain type, all dirty block map trees will be written to disk. This + * method must not be called when lookups are active. + * + * @param zone The BlockMapTreeZone to drain + **/ +void drainZoneTrees(BlockMapTreeZone *zone); + +/** + * Look up the PBN of the block map page for a DataVIO's LBN in the arboreal + * block map. If necessary, the block map page will be allocated. Also, the + * ancestors of the block map page will be allocated or loaded if necessary. + * + * @param dataVIO The DataVIO requesting the lookup + **/ +void lookupBlockMapPBN(DataVIO *dataVIO); + +/** + * Find the PBN of a leaf block map page. This method may only be used after + * all allocated tree pages have been loaded, otherwise, it may give the wrong + * answer (0). + * + * @param map The block map containing the forest + * @param pageNumber The page number of the desired block map page + * + * @return The PBN of the page + **/ +PhysicalBlockNumber findBlockMapPagePBN(BlockMap *map, PageNumber pageNumber); + +/** + * Write a tree page or indicate that it has been re-dirtied if it is already + * being written. This method is used when correcting errors in the tree during + * read-only rebuild. + * + * @param page The page to write + * @param zone The tree zone managing the page + **/ +void writeTreePage(TreePage *page, BlockMapTreeZone *zone); + +#endif // BLOCK_MAP_TREE_H diff --git a/source/vdo/base/blockMapTreeInternals.h b/source/vdo/base/blockMapTreeInternals.h new file mode 100644 index 0000000..49b69eb --- /dev/null +++ b/source/vdo/base/blockMapTreeInternals.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapTreeInternals.h#4 $ + */ + +#ifndef BLOCK_MAP_TREE_INTERNALS_H +#define BLOCK_MAP_TREE_INTERNALS_H + +#include "blockMapTree.h" + +#include "blockMapPage.h" +#include "types.h" + +/** A single page of a block map tree */ +struct treePage { + /** Waiter for a VIO to write out this page */ + Waiter waiter; + + /** Dirty list node */ + RingNode node; + + /** + * If this is a dirty tree page, the tree zone flush generation in which it + * was last dirtied. + */ + uint8_t generation; + + /** Whether this page is an interior tree page being written out. */ + bool writing; + + /** + * If this page is being written, the tree zone flush generation of the + * copy of the page being written. + **/ + uint8_t writingGeneration; + + /** The earliest journal block containing uncommitted updates to this page */ + SequenceNumber recoveryLock; + + /** The value of recoveryLock when the this page last started writing */ + SequenceNumber writingRecoveryLock; + + /** The buffer to hold the on-disk representation of this page */ + char pageBuffer[VDO_BLOCK_SIZE]; +}; + +typedef struct { + PageNumber levels[BLOCK_MAP_TREE_HEIGHT]; +} Boundary; + +/** + * An invalid PBN used to indicate that the page holding the location of a + * tree root has been "loaded". + **/ +extern const PhysicalBlockNumber INVALID_PBN; + +/** + * Extract the BlockMapPage from a TreePage. + * + * @param treePage The TreePage + * + * @return The BlockMapPage of the TreePage + **/ +__attribute__((warn_unused_result)) +static inline BlockMapPage *asBlockMapPage(TreePage *treePage) +{ + return (BlockMapPage *) treePage->pageBuffer; +} + +/** + * Replace the VIOPool in a tree zone. This method is used by unit tests. + * + * @param zone The zone whose pool is to be replaced + * @param layer The physical layer from which to make VIOs + * @param poolSize The size of the new pool + * + * @return VDO_SUCCESS or an error + **/ +int replaceTreeZoneVIOPool(BlockMapTreeZone *zone, + PhysicalLayer *layer, + size_t poolSize) + __attribute__((warn_unused_result)); + +/** + * Check whether a buffer contains a valid page. If the page is bad, log an + * error. If the page is valid, copy it to the supplied page. + * + * @param buffer The buffer to validate (and copy) + * @param nonce The VDO nonce + * @param pbn The absolute PBN of the page + * @param page The page to copy into if valid + * + * @return true if the page was copied (valid) + **/ +bool copyValidPage(char *buffer, + Nonce nonce, + PhysicalBlockNumber pbn, + BlockMapPage *page); + +#endif // BLOCK_MAP_TREE_INTERNALS_H diff --git a/source/vdo/base/blockMappingState.h b/source/vdo/base/blockMappingState.h new file mode 100644 index 0000000..ad2460a --- /dev/null +++ b/source/vdo/base/blockMappingState.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMappingState.h#1 $ + */ + +#ifndef BLOCK_MAPPING_STATE_H +#define BLOCK_MAPPING_STATE_H + +#include "common.h" + +/** + * Four bits of each five-byte block map entry contain a mapping state value + * used to distinguish unmapped or trimmed logical blocks (which are treated + * as mapped to the zero block) from entries that have been mapped to a + * physical block, including the zero block. + **/ +typedef enum { + MAPPING_STATE_UNMAPPED = 0, // Must be zero to be the default value + MAPPING_STATE_UNCOMPRESSED = 1, // A normal (uncompressed) block + MAPPING_STATE_COMPRESSED_BASE = 2, // Compressed in slot 0 + MAPPING_STATE_COMPRESSED_MAX = 15, // Compressed in slot 13 +} BlockMappingState; + +/** + * The total number of compressed blocks that can live in a physical block. + **/ +enum { + MAX_COMPRESSION_SLOTS = + MAPPING_STATE_COMPRESSED_MAX - MAPPING_STATE_COMPRESSED_BASE + 1, +}; + +/**********************************************************************/ +static inline BlockMappingState getStateForSlot(byte slotNumber) +{ + return (slotNumber + MAPPING_STATE_COMPRESSED_BASE); +} + +/**********************************************************************/ +static inline byte getSlotFromState(BlockMappingState mappingState) +{ + return (mappingState - MAPPING_STATE_COMPRESSED_BASE); +} + +/**********************************************************************/ +static inline bool isCompressed(const BlockMappingState mappingState) +{ + return (mappingState > MAPPING_STATE_UNCOMPRESSED); +} + +#endif // BLOCK_MAPPING_STATE_H diff --git a/source/vdo/base/completion.c b/source/vdo/base/completion.c new file mode 100644 index 0000000..d27fd72 --- /dev/null +++ b/source/vdo/base/completion.c @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/completion.c#10 $ + */ + +#include "completion.h" + +#include "logger.h" +#include "statusCodes.h" + +static const char *VDO_COMPLETION_TYPE_NAMES[] = { + // Keep UNSET_COMPLETION_TYPE at the top. + "UNSET_COMPLETION_TYPE", + + // Keep the rest of these in sorted order. If you add or remove an entry, + // be sure to update the corresponding list in completion.h. 
+ "ACTION_COMPLETION", + "ADMIN_COMPLETION", + "ASYNC_ACTION_CONTEXT", + "BLOCK_ALLOCATOR_COMPLETION", + "BLOCK_MAP_RECOVERY_COMPLETION", + "CHECK_IDENTIFIER_COMPLETION", + "EXTERNAL_COMPLETION", + "FLUSH_NOTIFICATION_COMPLETION", + "GENERATION_FLUSHED_COMPLETION", + "HEARTBEAT_COMPLETION", + "LOCK_COUNTER_COMPLETION", + "PARTITION_COPY_COMPLETION", + "READ_ONLY_MODE_COMPLETION", + "READ_ONLY_REBUILD_COMPLETION", + "RECOVERY_COMPLETION", + "REFERENCE_COUNT_REBUILD_COMPLETION", + "SLAB_SCRUBBER_COMPLETION", + "SUB_TASK_COMPLETION", + "TEST_COMPLETION", + "VDO_COMMAND_COMPLETION", + "VDO_COMMAND_SUB_COMPLETION", + "VDO_EXTENT_COMPLETION", + "VDO_PAGE_COMPLETION", + "VIO_COMPLETION", + "WRAPPING_COMPLETION", +}; + +/**********************************************************************/ +void initializeCompletion(VDOCompletion *completion, + VDOCompletionType type, + PhysicalLayer *layer) +{ + memset(completion, 0, sizeof(*completion)); + completion->layer = layer; + completion->type = type; + resetCompletion(completion); +} + +/**********************************************************************/ +int initializeEnqueueableCompletion(VDOCompletion *completion, + VDOCompletionType type, + PhysicalLayer *layer) +{ + initializeCompletion(completion, type, layer); + return ((layer->createEnqueueable == NULL) + ? VDO_SUCCESS : layer->createEnqueueable(completion)); +} + +/**********************************************************************/ +void resetCompletion(VDOCompletion *completion) +{ + completion->result = VDO_SUCCESS; + completion->complete = false; +} + +/** + * Assert that a completion is not complete. + * + * @param completion The completion to check + **/ +static inline void assertIncomplete(VDOCompletion *completion) +{ + ASSERT_LOG_ONLY(!completion->complete, "completion is not complete"); +} + +/**********************************************************************/ +void setCompletionResult(VDOCompletion *completion, int result) +{ + assertIncomplete(completion); + if (completion->result == VDO_SUCCESS) { + completion->result = result; + } +} + +/** + * Check whether a completion's callback must be enqueued, or if it can be run + * on the current thread. Side effect: clears the requeue flag if it is set, + * so the caller MUST requeue if this returns true. 
+ * + * @param completion The completion whose callback is to be invoked + * + * @return false if the callback must be run on this thread + * true if the callback must be enqueued + **/ +__attribute__((warn_unused_result)) +static inline bool requiresEnqueue(VDOCompletion *completion) +{ + if (completion->requeue) { + completion->requeue = false; + return true; + } + + ThreadID callbackThread = completion->callbackThreadID; + return (callbackThread != completion->layer->getCurrentThreadID()); +} + +/**********************************************************************/ +void invokeCallback(VDOCompletion *completion) +{ + if (requiresEnqueue(completion)) { + if (completion->enqueueable != NULL) { + completion->layer->enqueue(completion->enqueueable); + return; + } + ASSERT_LOG_ONLY(false, + "non-enqueueable completion (type %s) on correct thread", + getCompletionTypeName(completion->type)); + } + + runCallback(completion); +} + +/**********************************************************************/ +void continueCompletion(VDOCompletion *completion, int result) +{ + setCompletionResult(completion, result); + invokeCallback(completion); +} + +/**********************************************************************/ +void completeCompletion(VDOCompletion *completion) +{ + assertIncomplete(completion); + completion->complete = true; + if (completion->callback != NULL) { + invokeCallback(completion); + } +} + +/**********************************************************************/ +void releaseCompletion(VDOCompletion **completionPtr) +{ + VDOCompletion *completion = *completionPtr; + if (completion == NULL) { + return; + } + + *completionPtr = NULL; + completeCompletion(completion); +} + +/**********************************************************************/ +void releaseCompletionWithResult(VDOCompletion **completionPtr, int result) +{ + if (*completionPtr == NULL) { + return; + } + + setCompletionResult(*completionPtr, result); + releaseCompletion(completionPtr); +} + +/**********************************************************************/ +void finishParentCallback(VDOCompletion *completion) +{ + finishCompletion((VDOCompletion *) completion->parent, completion->result); +} + +/**********************************************************************/ +void preserveErrorAndContinue(VDOCompletion *completion) +{ + if (completion->parent != NULL) { + setCompletionResult(completion->parent, completion->result); + } + + resetCompletion(completion); + invokeCallback(completion); +} + +/**********************************************************************/ +const char *getCompletionTypeName(VDOCompletionType completionType) +{ + // Try to catch failures to update the array when the enum values change. 
+ STATIC_ASSERT(COUNT_OF(VDO_COMPLETION_TYPE_NAMES) + == (MAX_COMPLETION_TYPE - UNSET_COMPLETION_TYPE)); + + if (completionType >= MAX_COMPLETION_TYPE) { + static char numeric[100]; + snprintf(numeric, 99, "%d (%#x)", completionType, completionType); + return numeric; + } + + return VDO_COMPLETION_TYPE_NAMES[completionType]; +} + +/**********************************************************************/ +void destroyEnqueueable(VDOCompletion *completion) +{ + if ((completion == NULL) || (completion->layer == NULL) + || (completion->layer->destroyEnqueueable == NULL)) { + return; + } + + completion->layer->destroyEnqueueable(&completion->enqueueable); +} + +/**********************************************************************/ +int assertCompletionType(VDOCompletionType actual, + VDOCompletionType expected) +{ + return ASSERT((expected == actual), + "completion type is %s instead of %s", + getCompletionTypeName(actual), + getCompletionTypeName(expected)); +} diff --git a/source/vdo/base/completion.h b/source/vdo/base/completion.h new file mode 100644 index 0000000..d245814 --- /dev/null +++ b/source/vdo/base/completion.h @@ -0,0 +1,396 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/completion.h#11 $ + */ + +#ifndef COMPLETION_H +#define COMPLETION_H + +#include "permassert.h" + +#include "physicalLayer.h" +#include "ringNode.h" +#include "types.h" + +typedef enum __attribute__((packed)) { + // Keep UNSET_COMPLETION_TYPE at the top. + UNSET_COMPLETION_TYPE = 0, + + // Keep the rest of these in sorted order. If you add or remove an entry, + // be sure to update the corresponding list in completion.c. + ACTION_COMPLETION, + ADMIN_COMPLETION, + ASYNC_ACTION_CONTEXT, + BLOCK_ALLOCATOR_COMPLETION, + BLOCK_MAP_RECOVERY_COMPLETION, + CHECK_IDENTIFIER_COMPLETION, + EXTERNAL_COMPLETION, + FLUSH_NOTIFICATION_COMPLETION, + GENERATION_FLUSHED_COMPLETION, + HEARTBEAT_COMPLETION, + LOCK_COUNTER_COMPLETION, + PARTITION_COPY_COMPLETION, + READ_ONLY_MODE_COMPLETION, + READ_ONLY_REBUILD_COMPLETION, + RECOVERY_COMPLETION, + REFERENCE_COUNT_REBUILD_COMPLETION, + SLAB_SCRUBBER_COMPLETION, + SUB_TASK_COMPLETION, + TEST_COMPLETION, // each unit test may define its own + VDO_COMMAND_COMPLETION, + VDO_COMMAND_SUB_COMPLETION, + VDO_EXTENT_COMPLETION, + VDO_PAGE_COMPLETION, + VIO_COMPLETION, + WRAPPING_COMPLETION, + + // Keep MAX_COMPLETION_TYPE at the bottom. + MAX_COMPLETION_TYPE +} VDOCompletionType; + +/** + * An asynchronous VDO operation. + * + * @param completion the completion of the operation + **/ +typedef void VDOAction(VDOCompletion *completion); + +struct vdoCompletion { + /** The type of completion this is */ + VDOCompletionType type; + + /** + * true once the processing of the operation is complete. 
+ * This flag should not be used by waiters external to the VDO base as + * it is used to gate calling the callback. + **/ + bool complete; + + /** + * If true, queue this completion on the next callback invocation, even if + * it is already running on the correct thread. + **/ + bool requeue; + + /** The ID of the thread which should run the next callback */ + ThreadID callbackThreadID; + + /** The result of the operation */ + int result; + + /** The physical layer on which this completion operates */ + PhysicalLayer *layer; + + /** The callback which will be called once the operation is complete */ + VDOAction *callback; + + /** The callback which, if set, will be called if an error result is set */ + VDOAction *errorHandler; + + /** The parent object, if any, that spawned this completion */ + void *parent; + + /** The enqueueable for this completion (may be NULL) */ + Enqueueable *enqueueable; +}; + +/** + * Actually run the callback. This function must be called from the correct + * callback thread. + **/ +static inline void runCallback(VDOCompletion *completion) +{ + if ((completion->result != VDO_SUCCESS) + && (completion->errorHandler != NULL)) { + completion->errorHandler(completion); + return; + } + + completion->callback(completion); +} + +/** + * Set the result of a completion. Older errors will not be masked. + * + * @param completion The completion whose result is to be set + * @param result The result to set + **/ +void setCompletionResult(VDOCompletion *completion, int result); + +/** + * Initialize a completion to a clean state, for reused completions. + * + * @param completion The completion to initialize + * @param type The type of the completion + * @param layer The physical layer of the completion + **/ +void initializeCompletion(VDOCompletion *completion, + VDOCompletionType type, + PhysicalLayer *layer); + +/** + * Initialize a completion to a clean state and make an enqueueable for it. + * + * @param completion The completion to initialize + * @param type The type of the completion + * @param layer The physical layer of the completion + * + * @return VDO_SUCCESS or an error + **/ +int initializeEnqueueableCompletion(VDOCompletion *completion, + VDOCompletionType type, + PhysicalLayer *layer) + __attribute__((warn_unused_result)); + +/** + * Reset a completion to a clean state, while keeping + * the type, layer and parent information. + * + * @param completion the completion to reset + **/ +void resetCompletion(VDOCompletion *completion); + +/** + * Invoke the callback of a completion. If called on the correct thread (i.e. + * the one specified in the completion's callbackThreadID field), the + * completion will be run immediately. Otherwise, the completion will be + * enqueued on the correct callback thread. + **/ +void invokeCallback(VDOCompletion *completion); + +/** + * Continue processing a completion by setting the current result and calling + * invokeCallback(). + * + * @param completion The completion to continue + * @param result The current result (will not mask older errors) + **/ +void continueCompletion(VDOCompletion *completion, int result); + +/** + * Complete a completion. + * + * @param completion The completion to complete + **/ +void completeCompletion(VDOCompletion *completion); + +/** + * Finish a completion. 
+ * + * @param completion The completion to finish + * @param result The result of the completion (will not mask older errors) + **/ +static inline void finishCompletion(VDOCompletion *completion, int result) +{ + setCompletionResult(completion, result); + completeCompletion(completion); +} + +/** + * Complete a completion and NULL out the reference to it. + * + * @param completionPtr A pointer to the completion to release + **/ +void releaseCompletion(VDOCompletion **completionPtr); + +/** + * Finish a completion and NULL out the reference to it. + * + * @param completionPtr A pointer to the completion to release + * @param result The result of the completion + **/ +void releaseCompletionWithResult(VDOCompletion **completionPtr, int result); + +/** + * A callback to finish the parent of a completion. + * + * @param completion The completion which has finished and whose parent should + * be finished + **/ +void finishParentCallback(VDOCompletion *completion); + +/** + * Error handler which preserves an error in the parent (if there is one), + * and then resets the failing completion and calls its non-error callback. + * + * @param completion The completion which failed + **/ +void preserveErrorAndContinue(VDOCompletion *completion); + +/** + * A callback which does nothing. This callback is intended to be set as an + * error handler in the case where an error should do nothing. + * + * @param completion The completion being called back + **/ +static inline +void noopCallback(VDOCompletion *completion __attribute__((unused))) +{ +} + +/** + * Destroy the enqueueable associated with this completion. + * + * @param completion The completion + **/ +void destroyEnqueueable(VDOCompletion *completion); + +/** + * Assert that a completion is of the correct type + * + * @param actual The actual completion type + * @param expected The expected completion type + * + * @return VDO_SUCCESS or VDO_PARAMETER_MISMATCH + **/ +int assertCompletionType(VDOCompletionType actual, + VDOCompletionType expected); + +/** + * Return the name of a completion type. + * + * @param completionType the completion type + * + * @return a pointer to a static string; if the completionType is unknown + * this is to a static buffer that may be overwritten. + **/ +const char *getCompletionTypeName(VDOCompletionType completionType); + +/** + * Set the callback for a completion. + * + * @param completion The completion + * @param callback The callback to register + * @param threadID The ID of the thread on which the callback should run + **/ +static inline void setCallback(VDOCompletion *completion, + VDOAction *callback, + ThreadID threadID) +{ + completion->callback = callback; + completion->callbackThreadID = threadID; +} + +/** + * Set the callback for a completion and invoke it immediately. + * + * @param completion The completion + * @param callback The callback to register + * @param threadID The ID of the thread on which the callback should run + **/ +static inline void launchCallback(VDOCompletion *completion, + VDOAction *callback, + ThreadID threadID) +{ + setCallback(completion, callback, threadID); + invokeCallback(completion); +} + +/** + * Set the callback and parent for a completion. 
+ * + * @param completion The completion + * @param callback The callback to register + * @param threadID The ID of the thread on which the callback should run + * @param parent The new parent of the completion + **/ +static inline void setCallbackWithParent(VDOCompletion *completion, + VDOAction *callback, + ThreadID threadID, + void *parent) +{ + setCallback(completion, callback, threadID); + completion->parent = parent; +} + +/** + * Set the callback and parent for a completion and invoke the callback + * immediately. + * + * @param completion The completion + * @param callback The callback to register + * @param threadID The ID of the thread on which the callback should run + * @param parent The new parent of the completion + **/ +static inline void launchCallbackWithParent(VDOCompletion *completion, + VDOAction *callback, + ThreadID threadID, + void *parent) +{ + setCallbackWithParent(completion, callback, threadID, parent); + invokeCallback(completion); +} + +/** + * Prepare a completion for launch. Reset it, and then set its callback, error + * handler, callback thread, and parent. + * + * @param completion The completion + * @param callback The callback to register + * @param errorHandler The error handler to register + * @param threadID The ID of the thread on which the callback should run + * @param parent The new parent of the completion + **/ +static inline void prepareCompletion(VDOCompletion *completion, + VDOAction *callback, + VDOAction *errorHandler, + ThreadID threadID, + void *parent) +{ + resetCompletion(completion); + setCallbackWithParent(completion, callback, threadID, parent); + completion->errorHandler = errorHandler; +} + +/** + * Prepare a completion for launch ensuring that it will always be requeued. + * Reset it, and then set its callback, error handler, callback thread, and + * parent. + * + * @param completion The completion + * @param callback The callback to register + * @param errorHandler The error handler to register + * @param threadID The ID of the thread on which the callback should run + * @param parent The new parent of the completion + **/ +static inline void prepareForRequeue(VDOCompletion *completion, + VDOAction *callback, + VDOAction *errorHandler, + ThreadID threadID, + void *parent) +{ + prepareCompletion(completion, callback, errorHandler, threadID, parent); + completion->requeue = true; +} + +/** + * Prepare a completion for launch which will complete its parent when + * finished. + * + * @param completion The completion + * @param parent The parent to complete + **/ +static inline void prepareToFinishParent(VDOCompletion *completion, + VDOCompletion *parent) +{ + prepareCompletion(completion, finishParentCallback, finishParentCallback, + parent->callbackThreadID, parent); +} + +#endif // COMPLETION_H diff --git a/source/vdo/base/compressedBlock.c b/source/vdo/base/compressedBlock.c new file mode 100644 index 0000000..d9f93e8 --- /dev/null +++ b/source/vdo/base/compressedBlock.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/compressedBlock.c#3 $ + */ + +#include "compressedBlock.h" + +#include "memoryAlloc.h" +#include "numeric.h" + +static const VersionNumber COMPRESSED_BLOCK_1_0 = { + .majorVersion = 1, + .minorVersion = 0, +}; + +/**********************************************************************/ +void resetCompressedBlockHeader(CompressedBlockHeader *header) +{ + STATIC_ASSERT(sizeof(header->fields) == sizeof(header->raw)); + + header->fields.version = packVersionNumber(COMPRESSED_BLOCK_1_0); + memset(header->fields.sizes, 0, sizeof(header->fields.sizes)); +} + +/**********************************************************************/ +static uint16_t +getCompressedFragmentSize(const CompressedBlockHeader *header, byte slot) +{ + return getUInt16LE(header->fields.sizes[slot]); +} + +/**********************************************************************/ +int getCompressedBlockFragment(BlockMappingState mappingState, + char *buffer, + BlockSize blockSize, + uint16_t *fragmentOffset, + uint16_t *fragmentSize) +{ + if (!isCompressed(mappingState)) { + return VDO_INVALID_FRAGMENT; + } + + CompressedBlockHeader *header = (CompressedBlockHeader *) buffer; + VersionNumber version = unpackVersionNumber(header->fields.version); + if (!areSameVersion(version, COMPRESSED_BLOCK_1_0)) { + return VDO_INVALID_FRAGMENT; + } + + byte slot = getSlotFromState(mappingState); + if (slot >= MAX_COMPRESSION_SLOTS) { + return VDO_INVALID_FRAGMENT; + } + + uint16_t compressedSize = getCompressedFragmentSize(header, slot); + uint16_t offset = sizeof(CompressedBlockHeader); + for (unsigned int i = 0; i < slot; i++) { + offset += getCompressedFragmentSize(header, i); + if (offset >= blockSize) { + return VDO_INVALID_FRAGMENT; + } + } + + if ((offset + compressedSize) > blockSize) { + return VDO_INVALID_FRAGMENT; + } + + *fragmentOffset = offset; + *fragmentSize = compressedSize; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void putCompressedBlockFragment(CompressedBlock *block, + unsigned int fragment, + uint16_t offset, + const char *data, + uint16_t size) +{ + storeUInt16LE(block->header.fields.sizes[fragment], size); + memcpy(&block->data[offset], data, size); +} diff --git a/source/vdo/base/compressedBlock.h b/source/vdo/base/compressedBlock.h new file mode 100644 index 0000000..603841f --- /dev/null +++ b/source/vdo/base/compressedBlock.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/compressedBlock.h#3 $ + */ + +#ifndef COMPRESSED_BLOCK_H +#define COMPRESSED_BLOCK_H + +#include "blockMappingState.h" +#include "header.h" + +/** + * The header of a compressed block. + **/ +typedef union __attribute__((packed)) { + struct __attribute__((packed)) { + /** Unsigned 32-bit major and minor versions, in little-endian byte order */ + PackedVersionNumber version; + + /** List of unsigned 16-bit compressed block sizes, in little-endian order */ + byte sizes[MAX_COMPRESSION_SLOTS][2]; + } fields; + + // A raw view of the packed encoding. + byte raw[4 + 4 + (2 * MAX_COMPRESSION_SLOTS)]; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + // This view is only valid on little-endian machines and is only present for + // ease of directly examining compressed block headers in GDB. + struct __attribute__((packed)) { + VersionNumber version; + uint16_t sizes[MAX_COMPRESSION_SLOTS]; + } littleEndian; +#endif +} CompressedBlockHeader; + +/** + * The compressed block overlay. + **/ +typedef struct { + CompressedBlockHeader header; + char data[]; +} __attribute__((packed)) CompressedBlock; + +/** + * Initializes/resets a compressed block header. + * + * @param header the header + * + * When done, the version number is set to the current version, and all + * fragments are empty. + **/ +void resetCompressedBlockHeader(CompressedBlockHeader *header); + +/** + * Get a reference to a compressed fragment from a compression block. + * + * @param [in] mappingState the mapping state for the look up + * @param [in] buffer buffer that contains compressed data + * @param [in] blockSize size of a data block + * @param [out] fragmentOffset the offset of the fragment within a + * compressed block + * @param [out] fragmentSize the size of the fragment + * + * @return If a valid compressed fragment is found, VDO_SUCCESS; + * otherwise, VDO_INVALID_FRAGMENT if the fragment is invalid. + **/ +int getCompressedBlockFragment(BlockMappingState mappingState, + char *buffer, + BlockSize blockSize, + uint16_t *fragmentOffset, + uint16_t *fragmentSize); + +/** + * Copy a fragment into the compressed block. + * + * @param block the compressed block + * @param fragment the number of the fragment + * @param offset the byte offset of the fragment in the data area + * @param data a pointer to the compressed data + * @param size the size of the data + * + * @note no bounds checking -- the data better fit without smashing other stuff + **/ +void putCompressedBlockFragment(CompressedBlock *block, + unsigned int fragment, + uint16_t offset, + const char *data, + uint16_t size); + +#endif // COMPRESSED_BLOCK_H diff --git a/source/vdo/base/compressionState.c b/source/vdo/base/compressionState.c new file mode 100644 index 0000000..d773756 --- /dev/null +++ b/source/vdo/base/compressionState.c @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/compressionState.c#2 $ + */ + +#include "compressionStateInternals.h" + +#include "dataVIO.h" +#include "packer.h" + +static const uint32_t STATUS_MASK = 0xff; +static const uint32_t MAY_NOT_COMPRESS_MASK = 0x80000000; + +/**********************************************************************/ +VIOCompressionState getCompressionState(DataVIO *dataVIO) +{ + uint32_t packedValue = atomicLoad32(&dataVIO->compression.state); + return (VIOCompressionState) { + .status = packedValue & STATUS_MASK, + .mayNotCompress = ((packedValue & MAY_NOT_COMPRESS_MASK) != 0), + }; +} + +/** + * Convert a VIOCompressionState into a uint32_t which may be stored + * atomically. + * + * @param state The state to convert + * + * @return The compression state packed into a uint32_t + **/ +__attribute__((warn_unused_result)) +static uint32_t packState(VIOCompressionState state) +{ + return state.status | (state.mayNotCompress ? MAY_NOT_COMPRESS_MASK : 0); +} + +/**********************************************************************/ +bool setCompressionState(DataVIO *dataVIO, + VIOCompressionState state, + VIOCompressionState newState) +{ + return compareAndSwap32(&dataVIO->compression.state, packState(state), + packState(newState)); +} + +/** + * Advance to the next compression state along the compression path. + * + * @param dataVIO The DataVIO to advance + * + * @return The new compression status of the DataVIO + **/ +static VIOCompressionStatus advanceStatus(DataVIO *dataVIO) +{ + for (;;) { + VIOCompressionState state = getCompressionState(dataVIO); + if (state.status == VIO_POST_PACKER) { + // We're already in the last state. + return state.status; + } + + VIOCompressionState newState = state; + if (state.mayNotCompress) { + // Compression has been dis-allowed for this VIO, so skip the rest of the + // path and go to the end. + newState.status = VIO_POST_PACKER; + } else { + // Go to the next state. + newState.status++; + } + + if (setCompressionState(dataVIO, state, newState)) { + return newState.status; + } + + // Another thread changed the state out from under us so try again. + } +} + +/**********************************************************************/ +bool mayCompressDataVIO(DataVIO *dataVIO) +{ + if (!hasAllocation(dataVIO) + || ((getWritePolicy(getVDOFromDataVIO(dataVIO)) != WRITE_POLICY_SYNC) + && vioRequiresFlushAfter(dataVIOAsVIO(dataVIO))) + || !getVDOCompressing(getVDOFromDataVIO(dataVIO))) { + /* + * If this VIO didn't get an allocation, the compressed write probably + * won't either, so don't try compressing it. Also, if compression is off, + * don't compress. + */ + setCompressionDone(dataVIO); + return false; + } + + if (dataVIO->hashLock == NULL) { + // DataVIOs without a HashLock (which should be extremely rare) aren't + // able to share the packer's PBN lock, so don't try to compress them. 
+ return false; + } + + return (advanceStatus(dataVIO) == VIO_COMPRESSING); +} + +/**********************************************************************/ +bool mayPackDataVIO(DataVIO *dataVIO) +{ + if (!isSufficientlyCompressible(dataVIO) + || !getVDOCompressing(getVDOFromDataVIO(dataVIO)) + || getCompressionState(dataVIO).mayNotCompress) { + // If the data in this VIO doesn't compress, or compression is off, or + // compression for this VIO has been canceled, don't send it to the packer. + setCompressionDone(dataVIO); + return false; + } + + return true; +} + +/**********************************************************************/ +bool mayBlockInPacker(DataVIO *dataVIO) +{ + return (advanceStatus(dataVIO) == VIO_PACKING); +} + +/**********************************************************************/ +bool mayWriteCompressedDataVIO(DataVIO *dataVIO) +{ + advanceStatus(dataVIO); + return !getCompressionState(dataVIO).mayNotCompress; +} + +/**********************************************************************/ +void setCompressionDone(DataVIO *dataVIO) +{ + for (;;) { + VIOCompressionState state = getCompressionState(dataVIO); + if (state.status == VIO_POST_PACKER) { + // The VIO is already done. + return; + } + + // If compression was cancelled on this VIO, preserve that fact. + VIOCompressionState newState = { + .status = VIO_POST_PACKER, + .mayNotCompress = true, + }; + if (setCompressionState(dataVIO, state, newState)) { + return; + } + } +} + +/**********************************************************************/ +bool cancelCompression(DataVIO *dataVIO) +{ + VIOCompressionState state; + for (;;) { + state = getCompressionState(dataVIO); + if (state.mayNotCompress || (state.status == VIO_POST_PACKER)) { + // This DataVIO is already set up to not block in the packer. + break; + } + + VIOCompressionState newState = { + .status = state.status, + .mayNotCompress = true, + }; + if (setCompressionState(dataVIO, state, newState)) { + break; + } + } + + return ((state.status == VIO_PACKING) && !state.mayNotCompress); +} diff --git a/source/vdo/base/compressionState.h b/source/vdo/base/compressionState.h new file mode 100644 index 0000000..19a4143 --- /dev/null +++ b/source/vdo/base/compressionState.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/compressionState.h#2 $ + */ + +#ifndef COMPRESSION_STATE_H +#define COMPRESSION_STATE_H + +#include "atomic.h" +#include "types.h" + +/** + * Where a DataVIO is on the compression path; advanceStatus() depends on the + * order of this enum. 
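+ *
+ * A DataVIO normally moves one step at a time through
+ * VIO_PRE_COMPRESSOR -> VIO_COMPRESSING -> VIO_PACKING -> VIO_POST_PACKER
+ * via advanceStatus(); once the mayNotCompress flag has been set, the next
+ * advanceStatus() call jumps directly to VIO_POST_PACKER.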
+ **/ +typedef enum { + /* A VIO which has not yet entered the compression path */ + VIO_PRE_COMPRESSOR = 0, + /* A VIO which is in the compressor */ + VIO_COMPRESSING, + /* A VIO which is blocked in the packer */ + VIO_PACKING, + /* A VIO which is no longer on the compression path (and never will be) */ + VIO_POST_PACKER, +} VIOCompressionStatus; + +typedef struct { + VIOCompressionStatus status; + bool mayNotCompress; +} VIOCompressionState; + +/** + * Get the compression state of a DataVIO. + * + * @param dataVIO The DataVIO + * + * @return The compression state + **/ +__attribute__((warn_unused_result)) +VIOCompressionState getCompressionState(DataVIO *dataVIO); + +/** + * Check whether a DataVIO may go to the compressor. + * + * @param dataVIO The DataVIO to check + * + * @return true if the DataVIO may be compressed at this time + **/ +bool mayCompressDataVIO(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Check whether a DataVIO may go to the packer. + * + * @param dataVIO The DataVIO to check + * + * @return true if the DataVIO may be packed at this time + **/ +bool mayPackDataVIO(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Check whether a DataVIO which has gone to the packer may block there. Any + * cancelation after this point and before the DataVIO is written out requires + * this DataVIO to be picked up by the canceling DataVIO. + * + * @param dataVIO The DataVIO to check + * + * @return true if the DataVIO may block in the packer + **/ +bool mayBlockInPacker(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Check whether the packer may write out a DataVIO as part of a compressed + * block. + * + * @param dataVIO The DataVIO to check + * + * @return true if the DataVIO may be written as part of a + * compressed block at this time + **/ +bool mayWriteCompressedDataVIO(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Indicate that this DataVIO is leaving the compression path. + * + * @param dataVIO The DataVIO leaving the compression path + **/ +void setCompressionDone(DataVIO *dataVIO); + +/** + * Prevent this DataVIO from being compressed or packed. + * + * @param dataVIO The DataVIO to cancel + * + * @return true if the DataVIO is in the packer and the caller + * was the first caller to cancel it + **/ +bool cancelCompression(DataVIO *dataVIO); + +#endif /* COMPRESSION_STATE_H */ diff --git a/source/vdo/base/compressionStateInternals.h b/source/vdo/base/compressionStateInternals.h new file mode 100644 index 0000000..a9b8dec --- /dev/null +++ b/source/vdo/base/compressionStateInternals.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/compressionStateInternals.h#1 $ + */ + +#ifndef COMPRESSION_STATE_INTERNALS_H +#define COMPRESSION_STATE_INTERNALS_H + +#include "compressionState.h" + +/** + * Set the compression state of a DataVIO (exposed for testing). + * + * @param dataVIO The DataVIO whose compression state is to be set + * @param state The expected current state of the DataVIO + * @param newState The state to set + * + * @return true if the new state was set, false if the DataVIO's + * compression state did not match the expected state, and so was + * left unchanged + **/ +bool setCompressionState(DataVIO *dataVIO, + VIOCompressionState state, + VIOCompressionState newState); + +#endif /* COMPRESSION_STATE_H */ diff --git a/source/vdo/base/constants.c b/source/vdo/base/constants.c new file mode 100644 index 0000000..05d3a42 --- /dev/null +++ b/source/vdo/base/constants.c @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/constants.c#1 $ + */ + +#include "types.h" + +/** The maximum logical space is 4 petabytes, which is 1 terablock. */ +const BlockCount MAXIMUM_LOGICAL_BLOCKS = 1024ULL * 1024 * 1024 * 1024; + +/** The maximum physical space is 256 terabytes, which is 64 gigablocks. */ +const BlockCount MAXIMUM_PHYSICAL_BLOCKS = 1024ULL * 1024 * 1024 * 64; + +// unit test minimum +const BlockCount MINIMUM_SLAB_JOURNAL_BLOCKS = 2; diff --git a/source/vdo/base/constants.h b/source/vdo/base/constants.h new file mode 100644 index 0000000..8b61c5f --- /dev/null +++ b/source/vdo/base/constants.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/constants.h#2 $ + */ + +#ifndef CONSTANTS_H +#define CONSTANTS_H + +#include "types.h" + +enum { + /** The number of entries on a block map page */ + BLOCK_MAP_ENTRIES_PER_PAGE = 812, + + /** The origin of the flat portion of the block map */ + BLOCK_MAP_FLAT_PAGE_ORIGIN = 1, + + /** + * The height of a block map tree. 
Assuming a root count of 60 and 812 + * entries per page, this is big enough to represent almost 95 PB of logical + * space. + **/ + BLOCK_MAP_TREE_HEIGHT = 5, + + /** The number of trees in the arboreal block map */ + DEFAULT_BLOCK_MAP_TREE_ROOT_COUNT = 60, + + /** The default size of the recovery journal, in blocks */ + DEFAULT_RECOVERY_JOURNAL_SIZE = 32 * 1024, + + /** The default size of each slab journal, in blocks */ + DEFAULT_SLAB_JOURNAL_SIZE = 224, + + /** + * The initial size of lbnOperations and pbnOperations, which is based + * upon the expected maximum number of outstanding VIOs. This value was + * chosen to make it highly unlikely that the maps would need to be resized. + **/ + LOCK_MAP_CAPACITY = 10000, + + /** The maximum number of logical zones */ + MAX_LOGICAL_ZONES = 60, + + /** The maximum number of physical zones */ + MAX_PHYSICAL_ZONES = 16, + + /** The base-2 logarithm of the maximum blocks in one slab */ + MAX_SLAB_BITS = 23, + + /** The maximum number of slabs the slab depot supports */ + MAX_SLABS = 8192, + + /** + * The maximum number of block map pages to load simultaneously during + * recovery or rebuild. + **/ + MAXIMUM_SIMULTANEOUS_BLOCK_MAP_RESTORATION_READS = 1024, + + /** The maximum number of VIOs in the system at once */ + MAXIMUM_USER_VIOS = 2048, + + /** + * The number of in-memory recovery journal blocks is determined by: + * -- 311 journal entries in a 4k block + * -- maximum of 2048 VIOs making entries at once + * so we need at least 2048 / 312 = 7 journal blocks. + **/ + RECOVERY_JOURNAL_TAIL_BUFFER_SIZE = 64, + + /** The number of sectors per block */ + SECTORS_PER_BLOCK = 8, + + /** The only physical block size supported by VDO */ + VDO_BLOCK_SIZE = 4096, + + /** The size of a sector that will not be torn */ + VDO_SECTOR_SIZE = 512, + + /** The physical block number reserved for storing the zero block */ + ZERO_BLOCK = 0, +}; + +/** The maximum logical space is 4 petabytes, which is 1 terablock. */ +extern const BlockCount MAXIMUM_LOGICAL_BLOCKS; + +/** The maximum physical space is 256 terabytes, which is 64 gigablocks. */ + extern const BlockCount MAXIMUM_PHYSICAL_BLOCKS; + +// unit test minimum +extern const BlockCount MINIMUM_SLAB_JOURNAL_BLOCKS; + +#endif // CONSTANTS_H diff --git a/source/vdo/base/dataVIO.c b/source/vdo/base/dataVIO.c new file mode 100644 index 0000000..a9778f5 --- /dev/null +++ b/source/vdo/base/dataVIO.c @@ -0,0 +1,362 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/dataVIO.c#7 $ + */ + +#include "dataVIO.h" + +#include "logger.h" + +#include "atomic.h" +#include "blockMap.h" +#include "compressionState.h" +#include "extent.h" +#include "logicalZone.h" +#include "threadConfig.h" +#include "vdoInternal.h" +#include "vioRead.h" +#include "vioWrite.h" + +static const char *ASYNC_OPERATION_NAMES[] = { + "launch", + "acknowledgeWrite", + "acquireHashLock", + "acquireLogicalBlockLock", + "acquirePBNReadLock", + "checkForDedupeForRollover", + "checkForDeduplication", + "compressData", + "continueVIOAsync", + "findBlockMapSlot", + "getMappedBlock", + "getMappedBlockForDedupe", + "getMappedBlockForWrite", + "hashData", + "journalDecrementForDedupe", + "journalDecrementForWrite", + "journalIncrementForCompression", + "journalIncrementForDedupe", + "journalIncrementForWrite", + "journalMappingForCompression", + "journalMappingForDedupe", + "journalMappingForWrite", + "journalUnmappingForDedupe", + "journalUnmappingForWrite", + "attemptPacking", + "putMappedBlock", + "putMappedBlockForDedupe", + "readData", + "updateIndex", + "verifyDeduplication", + "writeData", +}; + +/** + * Initialize the LBN lock of a DataVIO. In addition to recording the LBN on + * which the DataVIO will operate, it will also find the logical zone + * associated with the LBN. + * + * @param dataVIO The dataVIO to initialize + * @param lbn The lbn on which the dataVIO will operate + **/ +static void initializeLBNLock(DataVIO *dataVIO, LogicalBlockNumber lbn) +{ + LBNLock *lock = &dataVIO->logical; + lock->lbn = lbn; + lock->locked = false; + initializeWaitQueue(&lock->waiters); + + VDO *vdo = getVDOFromDataVIO(dataVIO); + lock->zone = getLogicalZone(vdo->logicalZones, computeLogicalZone(dataVIO)); +} + +/**********************************************************************/ +void prepareDataVIO(DataVIO *dataVIO, + LogicalBlockNumber lbn, + VIOOperation operation, + bool isTrim, + VDOAction *callback) +{ + // Clearing the tree lock must happen before initializing the LBN lock, + // which also adds information to the tree lock. + memset(&dataVIO->treeLock, 0, sizeof(dataVIO->treeLock)); + initializeLBNLock(dataVIO, lbn); + initializeRing(&dataVIO->hashLockNode); + initializeRing(&dataVIO->writeNode); + + resetAllocation(dataVIOAsAllocatingVIO(dataVIO)); + + dataVIO->isDuplicate = false; + + memset(&dataVIO->chunkName, 0, sizeof(dataVIO->chunkName)); + memset(&dataVIO->duplicate, 0, sizeof(dataVIO->duplicate)); + + VIO *vio = dataVIOAsVIO(dataVIO); + vio->operation = operation; + vio->callback = callback; + dataVIO->pageCompletion.completion.enqueueable + = vioAsCompletion(vio)->enqueueable; + + dataVIO->mapped.state = MAPPING_STATE_UNCOMPRESSED; + dataVIO->newMapped.state + = (isTrim ? 
MAPPING_STATE_UNMAPPED : MAPPING_STATE_UNCOMPRESSED); + resetCompletion(vioAsCompletion(vio)); + setLogicalCallback(dataVIO, attemptLogicalBlockLock, + THIS_LOCATION("$F;cb=acquireLogicalBlockLock")); +} + +/**********************************************************************/ +void completeDataVIO(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + if (completion->result != VDO_SUCCESS) { + VIO *vio = dataVIOAsVIO(dataVIO); + updateVIOErrorStats(vio, + "Completing %s VIO for LBN %" PRIu64 + " with error after %s", + getVIOReadWriteFlavor(vio), dataVIO->logical.lbn, + getOperationName(dataVIO)); + } + + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION("$F($io)")); + if (isReadDataVIO(dataVIO)) { + cleanupReadDataVIO(dataVIO); + } else { + cleanupWriteDataVIO(dataVIO); + } +} + +/**********************************************************************/ +void finishDataVIO(DataVIO *dataVIO, int result) +{ + VDOCompletion *completion = dataVIOAsCompletion(dataVIO); + setCompletionResult(completion, result); + completeDataVIO(completion); +} + +/**********************************************************************/ +const char *getOperationName(DataVIO *dataVIO) +{ + STATIC_ASSERT((MAX_ASYNC_OPERATION_NUMBER - MIN_ASYNC_OPERATION_NUMBER) + == COUNT_OF(ASYNC_OPERATION_NAMES)); + + return ((dataVIO->lastAsyncOperation < MAX_ASYNC_OPERATION_NUMBER) + ? ASYNC_OPERATION_NAMES[dataVIO->lastAsyncOperation] + : "unknown async operation"); +} + +/**********************************************************************/ +void receiveDedupeAdvice(DataVIO *dataVIO, const DataLocation *advice) +{ + /* + * NOTE: this is called on non-base-code threads. Be very careful to not do + * anything here that needs a base code thread-local variable, such as + * trying to get the current thread ID, or that does a lot of work. + */ + + VDO *vdo = getVDOFromDataVIO(dataVIO); + ZonedPBN duplicate = validateDedupeAdvice(vdo, advice, dataVIO->logical.lbn); + setDuplicateLocation(dataVIO, duplicate); +} + +/**********************************************************************/ +void setDuplicateLocation(DataVIO *dataVIO, const ZonedPBN source) +{ + dataVIO->isDuplicate = (source.pbn != ZERO_BLOCK); + dataVIO->duplicate = source; +} + +/**********************************************************************/ +void clearMappedLocation(DataVIO *dataVIO) +{ + dataVIO->mapped = (ZonedPBN) { .state = MAPPING_STATE_UNMAPPED }; +} + +/**********************************************************************/ +int setMappedLocation(DataVIO *dataVIO, + PhysicalBlockNumber pbn, + BlockMappingState state) +{ + PhysicalZone *zone; + int result = getPhysicalZone(getVDOFromDataVIO(dataVIO), pbn, &zone); + if (result != VDO_SUCCESS) { + return result; + } + + dataVIO->mapped = (ZonedPBN) { + .pbn = pbn, + .state = state, + .zone = zone, + }; + return VDO_SUCCESS; +} + +/** + * Launch a request which has acquired an LBN lock. 
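+ *
+ * Reads and writes part ways here: the request continues via
+ * launchReadDataVIO() or launchWriteDataVIO() as appropriate.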
+ * + * @param dataVIO The DataVIO which has just acquired a lock + **/ +static void launchLockedRequest(DataVIO *dataVIO) +{ + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + dataVIO->logical.locked = true; + + if (isWriteDataVIO(dataVIO)) { + launchWriteDataVIO(dataVIO); + } else { + launchReadDataVIO(dataVIO); + } +} + +/**********************************************************************/ +void attemptLogicalBlockLock(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + + if (dataVIO->logical.lbn + >= getVDOFromDataVIO(dataVIO)->config.logicalBlocks) { + finishDataVIO(dataVIO, VDO_OUT_OF_RANGE); + return; + } + + DataVIO *lockHolder; + LBNLock *lock = &dataVIO->logical; + int result = intMapPut(getLBNLockMap(lock->zone), lock->lbn, dataVIO, false, + (void **) &lockHolder); + if (result != VDO_SUCCESS) { + finishDataVIO(dataVIO, result); + return; + } + + if (lockHolder == NULL) { + // We got the lock + launchLockedRequest(dataVIO); + return; + } + + result = ASSERT(lockHolder->logical.locked, "logical block lock held"); + if (result != VDO_SUCCESS) { + finishDataVIO(dataVIO, result); + return; + } + + /* + * If the new request is a pure read request (not read-modify-write) and + * the lockHolder is writing and has received an allocation (VDO-2683), + * service the read request immediately by copying data from the lockHolder + * to avoid having to flush the write out of the packer just to prevent the + * read from waiting indefinitely. If the lockHolder does not yet have an + * allocation, prevent it from blocking in the packer and wait on it. + */ + if (isReadDataVIO(dataVIO) && atomicLoadBool(&lockHolder->hasAllocation)) { + dataVIOAsCompletion(dataVIO)->layer->copyData(lockHolder, dataVIO); + finishDataVIO(dataVIO, VDO_SUCCESS); + return; + } + + dataVIO->lastAsyncOperation = ACQUIRE_LOGICAL_BLOCK_LOCK; + result = enqueueDataVIO(&lockHolder->logical.waiters, dataVIO, + THIS_LOCATION("$F;cb=logicalBlockLock")); + if (result != VDO_SUCCESS) { + finishDataVIO(dataVIO, result); + return; + } + + // Prevent writes and read-modify-writes from blocking indefinitely on + // lock holders in the packer. + if (!isReadDataVIO(lockHolder) && cancelCompression(lockHolder)) { + dataVIO->compression.lockHolder = lockHolder; + launchPackerCallback(dataVIO, removeLockHolderFromPacker, + THIS_LOCATION("$F;cb=removeLockHolderFromPacker")); + } +} + +/** + * Release an uncontended LBN lock. + * + * @param dataVIO The DataVIO holding the lock + **/ +static void releaseLock(DataVIO *dataVIO) +{ + LBNLock *lock = &dataVIO->logical; + IntMap *lockMap = getLBNLockMap(lock->zone); + if (!lock->locked) { + // The lock is not locked, so it had better not be registered in the lock + // map. + DataVIO *lockHolder = intMapGet(lockMap, lock->lbn); + ASSERT_LOG_ONLY((dataVIO != lockHolder), + "no logical block lock held for block %llu", + lock->lbn); + return; + } + + // Remove the lock from the logical block lock map, releasing the lock. 
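+  // intMapRemove() hands back whatever DataVIO was registered for this LBN,
+  // which the assertion below checks is the one releasing the lock.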
+ DataVIO *lockHolder = intMapRemove(lockMap, lock->lbn); + ASSERT_LOG_ONLY((dataVIO == lockHolder), + "logical block lock mismatch for block %llu", lock->lbn); + lock->locked = false; + return; +} + +/**********************************************************************/ +void releaseLogicalBlockLock(DataVIO *dataVIO) +{ + assertInLogicalZone(dataVIO); + if (!hasWaiters(&dataVIO->logical.waiters)) { + releaseLock(dataVIO); + return; + } + + LBNLock *lock = &dataVIO->logical; + ASSERT_LOG_ONLY(lock->locked, "LBNLock with waiters is not locked"); + + // Another DataVIO is waiting for the lock, so just transfer it in a single + // lock map operation + DataVIO *nextLockHolder = waiterAsDataVIO(dequeueNextWaiter(&lock->waiters)); + + // Transfer the remaining lock waiters to the next lock holder. + transferAllWaiters(&lock->waiters, &nextLockHolder->logical.waiters); + + DataVIO *lockHolder; + int result = intMapPut(getLBNLockMap(lock->zone), lock->lbn, nextLockHolder, + true, (void **) &lockHolder); + if (result != VDO_SUCCESS) { + finishDataVIO(nextLockHolder, result); + return; + } + + ASSERT_LOG_ONLY((lockHolder == dataVIO), + "logical block lock mismatch for block %llu", lock->lbn); + lock->locked = false; + + /* + * If there are still waiters, other DataVIOs must be trying to get the lock + * we just transferred. We must ensure that the new lock holder doesn't block + * in the packer. + */ + if (hasWaiters(&nextLockHolder->logical.waiters)) { + cancelCompression(nextLockHolder); + } + + // Avoid stack overflow on lock transfer. + // XXX: this is only an issue in the 1 thread config. + dataVIOAsCompletion(nextLockHolder)->requeue = true; + launchLockedRequest(nextLockHolder); +} diff --git a/source/vdo/base/dataVIO.h b/source/vdo/base/dataVIO.h new file mode 100644 index 0000000..ec6e9f6 --- /dev/null +++ b/source/vdo/base/dataVIO.h @@ -0,0 +1,945 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/dataVIO.h#4 $ + */ + +#ifndef DATA_VIO_H +#define DATA_VIO_H + +#include "allocatingVIO.h" +#include "atomic.h" +#include "blockMapEntry.h" +#include "blockMappingState.h" +#include "constants.h" +#include "hashZone.h" +#include "journalPoint.h" +#include "logicalZone.h" +#include "referenceOperation.h" +#include "ringNode.h" +#include "threadConfig.h" +#include "trace.h" +#include "types.h" +#include "vdoPageCache.h" +#include "vio.h" +#include "waitQueue.h" + +/** + * Codes for describing the last asynchronous operation performed on a VIO. 
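+ *
+ * This enum must stay in step with ASYNC_OPERATION_NAMES in dataVIO.c:
+ * getOperationName() indexes that array by lastAsyncOperation, and a
+ * STATIC_ASSERT there keeps the two lengths in agreement.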
+ **/ +typedef enum __attribute__((packed)) { + MIN_ASYNC_OPERATION_NUMBER = 0, + LAUNCH = MIN_ASYNC_OPERATION_NUMBER, + ACKNOWLEDGE_WRITE, + ACQUIRE_HASH_LOCK, + ACQUIRE_LOGICAL_BLOCK_LOCK, + ACQUIRE_PBN_READ_LOCK, + CHECK_FOR_DEDUPE_FOR_ROLLOVER, + CHECK_FOR_DEDUPLICATION, + COMPRESS_DATA, + CONTINUE_VIO_ASYNC, + FIND_BLOCK_MAP_SLOT, + GET_MAPPED_BLOCK, + GET_MAPPED_BLOCK_FOR_DEDUPE, + GET_MAPPED_BLOCK_FOR_WRITE, + HASH_DATA, + JOURNAL_DECREMENT_FOR_DEDUPE, + JOURNAL_DECREMENT_FOR_WRITE, + JOURNAL_INCREMENT_FOR_COMPRESSION, + JOURNAL_INCREMENT_FOR_DEDUPE, + JOURNAL_INCREMENT_FOR_WRITE, + JOURNAL_MAPPING_FOR_COMPRESSION, + JOURNAL_MAPPING_FOR_DEDUPE, + JOURNAL_MAPPING_FOR_WRITE, + JOURNAL_UNMAPPING_FOR_DEDUPE, + JOURNAL_UNMAPPING_FOR_WRITE, + PACK_COMPRESSED_BLOCK, + PUT_MAPPED_BLOCK, + PUT_MAPPED_BLOCK_FOR_DEDUPE, + READ_DATA, + UPDATE_INDEX, + VERIFY_DEDUPLICATION, + WRITE_DATA, + MAX_ASYNC_OPERATION_NUMBER, +} AsyncOperationNumber; + +/* + * An LBN lock. + */ +struct lbnLock { + /* The LBN being locked */ + LogicalBlockNumber lbn; + /* Whether the lock is locked */ + bool locked; + /* The queue of waiters for the lock */ + WaitQueue waiters; + /* The logical zone of the LBN */ + LogicalZone *zone; +}; + +/* + * Fields for using the arboreal block map. + */ +typedef struct { + /* The current height at which this DataVIO is operating */ + Height height; + /* The block map tree for this LBN */ + RootCount rootIndex; + /* Whether we hold a page lock */ + bool locked; + /* The thread on which to run the callback */ + ThreadID threadID; + /* The function to call after looking up a block map slot */ + VDOAction *callback; + /* The key for the lock map */ + uint64_t key; + /* The queue of waiters for the page this VIO is allocating or loading */ + WaitQueue waiters; + /* The block map tree slots for this LBN */ + BlockMapTreeSlot treeSlots[BLOCK_MAP_TREE_HEIGHT + 1]; +} TreeLock; + +typedef struct { + /* + * The current compression state of this VIO. This field contains a value + * which consists of a VIOCompressionState possibly ORed with a flag + * indicating that a request has been made to cancel (or prevent) compression + * for this VIO. + * + * This field should be accessed through the getCompressionState() and + * setCompressionState() methods. It should not be accessed directly. + */ + Atomic32 state; + + /* The compressed size of this block */ + uint16_t size; + + /* The packer input or output bin slot which holds the enclosing DataVIO */ + SlotNumber slot; + + /* The packer input bin to which the enclosing DataVIO has been assigned */ + InputBin *bin; + + /* A pointer to the compressed form of this block */ + char *data; + + /* + * A VIO which is blocked in the packer while holding a lock this VIO needs. + */ + DataVIO *lockHolder; + +} CompressionState; + +/** + * A VIO for processing user data requests. 
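+ *
+ * A DataVIO embeds an AllocatingVIO (which in turn embeds a VIO) as its
+ * first member, so the conversion helpers below can cast between the three
+ * views of one object; STATIC_ASSERTs on the field offsets keep those casts
+ * honest.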
+ **/ +struct dataVIO { + /* The underlying AllocatingVIO */ + AllocatingVIO allocatingVIO; + + /* The logical block of this request */ + LBNLock logical; + + /* The state for traversing the block map tree */ + TreeLock treeLock; + + /* The current partition address of this block */ + ZonedPBN mapped; + + /** The hash of this VIO (if not zero) */ + UdsChunkName chunkName; + + /* Used for logging and debugging */ + AsyncOperationNumber lastAsyncOperation; + + /* The operation to record in the recovery and slab journals */ + ReferenceOperation operation; + + /* Whether this VIO is a read-and-write VIO */ + bool isPartialWrite; + + /* Whether this VIO contains all zeros */ + bool isZeroBlock; + + /* Whether this VIO write is a duplicate */ + bool isDuplicate; + + /* + * Whether this VIO has received an allocation (needs to be atomic so it can + * be examined from threads not in the allocation zone). + */ + AtomicBool hasAllocation; + + /* The new partition address of this block after the VIO write completes */ + ZonedPBN newMapped; + + /* The hash zone responsible for the chunk name (NULL if isZeroBlock) */ + HashZone *hashZone; + + /* The lock this VIO holds or shares with other VIOs with the same data */ + HashLock *hashLock; + + /* All DataVIOs sharing a hash lock are kept in a ring linking these nodes */ + RingNode hashLockNode; + + /* The block number in the partition of the albireo deduplication advice */ + ZonedPBN duplicate; + + /* + * The sequence number of the recovery journal block containing the increment + * entry for this VIO. + */ + SequenceNumber recoverySequenceNumber; + + /* The point in the recovery journal where this write last made an entry */ + JournalPoint recoveryJournalPoint; + + /* The RingNode of VIOs in user initiated write requests */ + RingNode writeNode; + + /* A flag indicating that a data write VIO has a flush generation lock */ + bool hasFlushGenerationLock; + + /* The generation number of the VDO that this VIO belongs to */ + SequenceNumber flushGeneration; + + /* The completion to use for fetching block map pages for this vio */ + VDOPageCompletion pageCompletion; + + /* All of the fields necessary for the compression path */ + CompressionState compression; +}; + +/** + * Convert an AllocatingVIO to a DataVIO. + * + * @param allocatingVIO The AllocatingVIO to convert + * + * @return The AllocatingVIO as a DataVIO + **/ +static inline DataVIO *allocatingVIOAsDataVIO(AllocatingVIO *allocatingVIO) +{ + STATIC_ASSERT(offsetof(DataVIO, allocatingVIO) == 0); + ASSERT_LOG_ONLY((allocatingVIOAsVIO(allocatingVIO)->type == VIO_TYPE_DATA), + "AllocatingVIO is a DataVIO"); + return (DataVIO *) allocatingVIO; +} + +/** + * Convert a VIO to a DataVIO. + * + * @param vio The VIO to convert + * + * @return The VIO as a DataVIO + **/ +static inline DataVIO *vioAsDataVIO(VIO *vio) +{ + STATIC_ASSERT(offsetof(DataVIO, allocatingVIO) == 0); + STATIC_ASSERT(offsetof(AllocatingVIO, vio) == 0); + ASSERT_LOG_ONLY((vio->type == VIO_TYPE_DATA), "VIO is a DataVIO"); + return (DataVIO *) vio; +} + +/** + * Convert a DataVIO to an AllocatingVIO. + * + * @param dataVIO The DataVIO to convert + * + * @return The DataVIO as an AllocatingVIO + **/ +static inline AllocatingVIO *dataVIOAsAllocatingVIO(DataVIO *dataVIO) +{ + return &dataVIO->allocatingVIO; +} + +/** + * Convert a DataVIO to a VIO. 
+ * + * @param dataVIO The DataVIO to convert + * + * @return The DataVIO as a VIO + **/ +static inline VIO *dataVIOAsVIO(DataVIO *dataVIO) +{ + return allocatingVIOAsVIO(dataVIOAsAllocatingVIO(dataVIO)); +} + +/** + * Convert a generic VDOCompletion to a DataVIO. + * + * @param completion The completion to convert + * + * @return The completion as a DataVIO + **/ +static inline DataVIO *asDataVIO(VDOCompletion *completion) +{ + return vioAsDataVIO(asVIO(completion)); +} + +/** + * Convert a DataVIO to a generic completion. + * + * @param dataVIO The DataVIO to convert + * + * @return The DataVIO as a completion + **/ +static inline VDOCompletion *dataVIOAsCompletion(DataVIO *dataVIO) +{ + return allocatingVIOAsCompletion(dataVIOAsAllocatingVIO(dataVIO)); +} + +/** + * Convert a DataVIO to a generic wait queue entry. + * + * @param dataVIO The DataVIO to convert + * + * @return The DataVIO as a wait queue entry + **/ +static inline Waiter *dataVIOAsWaiter(DataVIO *dataVIO) +{ + return allocatingVIOAsWaiter(dataVIOAsAllocatingVIO(dataVIO)); +} + +/** + * Convert a DataVIO's generic wait queue entry back to the DataVIO. + * + * @param waiter The wait queue entry to convert + * + * @return The wait queue entry as a DataVIO + **/ +static inline DataVIO *waiterAsDataVIO(Waiter *waiter) +{ + if (waiter == NULL) { + return NULL; + } + + return allocatingVIOAsDataVIO(waiterAsAllocatingVIO(waiter)); +} + +/** + * Check whether a DataVIO is a read. + * + * @param dataVIO The DataVIO to check + **/ +static inline bool isReadDataVIO(DataVIO *dataVIO) +{ + return isReadVIO(dataVIOAsVIO(dataVIO)); +} + +/** + * Check whether a DataVIO is a write. + * + * @param dataVIO The DataVIO to check + **/ +static inline bool isWriteDataVIO(DataVIO *dataVIO) +{ + return isWriteVIO(dataVIOAsVIO(dataVIO)); +} + +/** + * Check whether a DataVIO is a compressed block write. + * + * @param dataVIO The DataVIO to check + * + * @return true if the DataVIO is a compressed block write + **/ +static inline bool isCompressedWriteDataVIO(DataVIO *dataVIO) +{ + return isCompressedWriteVIO(dataVIOAsVIO(dataVIO)); +} + +/** + * Check whether a DataVIO is a trim. + * + * @param dataVIO The DataVIO to check + * + * @return true if the DataVIO is a trim + **/ +static inline bool isTrimDataVIO(DataVIO *dataVIO) +{ + return (dataVIO->newMapped.state == MAPPING_STATE_UNMAPPED); +} + +/** + * Get the location that should passed Albireo as the new advice for where to + * find the data written by this DataVIO. + * + * @param dataVIO The write DataVIO that is ready to update Albireo + * + * @return a DataLocation containing the advice to store in Albireo + **/ +static inline DataLocation getDataVIONewAdvice(const DataVIO *dataVIO) +{ + return (DataLocation) { + .pbn = dataVIO->newMapped.pbn, + .state = dataVIO->newMapped.state, + }; +} + +/** + * Get the VDO from a DataVIO. + * + * @param dataVIO The DataVIO from which to get the VDO + * + * @return The VDO to which a DataVIO belongs + **/ +static inline VDO *getVDOFromDataVIO(DataVIO *dataVIO) +{ + return dataVIOAsVIO(dataVIO)->vdo; +} + +/** + * Get the ThreadConfig from a DataVIO. + * + * @param dataVIO The DataVIO from which to get the ThreadConfig + * + * @return The ThreadConfig of the VDO to which a DataVIO belongs + **/ +static inline const ThreadConfig *getThreadConfigFromDataVIO(DataVIO *dataVIO) +{ + return getThreadConfig(getVDOFromDataVIO(dataVIO)); +} + +/** + * Get the allocation of a DataVIO. 
+ * + * @param dataVIO The DataVIO + * + * @return The allocation of the DataVIO + **/ +static inline PhysicalBlockNumber getDataVIOAllocation(DataVIO *dataVIO) +{ + return dataVIOAsAllocatingVIO(dataVIO)->allocation; +} + +/** + * Check whether a DataVIO has an allocation. + * + * @param dataVIO The DataVIO to check + * + * @return true if the DataVIO has an allocated block + **/ +static inline bool hasAllocation(DataVIO *dataVIO) +{ + return (getDataVIOAllocation(dataVIO) != ZERO_BLOCK); +} + +/** + * (Re)initialize a DataVIO to have a new logical block number, keeping the + * same parent and other state. This method must be called before using a + * DataVIO. + * + * @param dataVIO The DataVIO to initialize + * @param lbn The logical block number of the DataVIO + * @param operation The operation this DataVIO will perform + * @param isTrim true if this DataVIO is for a trim request + * @param callback The function to call once the VIO has completed its + * operation + **/ +void prepareDataVIO(DataVIO *dataVIO, + LogicalBlockNumber lbn, + VIOOperation operation, + bool isTrim, + VDOAction *callback); + +/** + * Complete the processing of a DataVIO. + * + * @param completion The completion of the VIO to complete + **/ +void completeDataVIO(VDOCompletion *completion); + +/** + * Finish processing a DataVIO, possibly due to an error. This function will + * set any error, and then initiate DataVIO clean up. + * + * @param dataVIO The DataVIO to abort + * @param result The result of processing the DataVIO + **/ +void finishDataVIO(DataVIO *dataVIO, int result); + +/** + * Continue processing a DataVIO that has been waiting for an event, setting + * the result from the event and calling the current callback. + * + * @param dataVIO The DataVIO to continue + * @param result The current result (will not mask older errors) + **/ +static inline void continueDataVIO(DataVIO *dataVIO, int result) +{ + continueCompletion(dataVIOAsCompletion(dataVIO), result); +} + +/** + * Get the name of the last asynchronous operation performed on a DataVIO. + * + * @param dataVIO The DataVIO in question + * + * @return The name of the last operation performed on the DataVIO + **/ +const char *getOperationName(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Add a trace record for the current source location. + * + * @param dataVIO The DataVIO structure to be updated + * @param location The source-location descriptor to be recorded + **/ +static inline void dataVIOAddTraceRecord(DataVIO *dataVIO, + TraceLocation location) +{ + vioAddTraceRecord(dataVIOAsVIO(dataVIO), location); +} + +/** + * Add a DataVIO to the tail end of a wait queue. The DataVIO must not already + * be waiting in a queue. A trace record is also generated for the DataVIO. + * + * @param queue The queue to which to add the waiter + * @param waiter The DataVIO to add to the queue + * @param location The source-location descriptor to be traced in the DataVIO + * + * @return VDO_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static inline int enqueueDataVIO(WaitQueue *queue, + DataVIO *waiter, + TraceLocation location) +{ + dataVIOAddTraceRecord(waiter, location); + return enqueueWaiter(queue, dataVIOAsWaiter(waiter)); +} + +/** + * Check that a DataVIO is running on the correct thread for its hash zone. 
+ * + * @param dataVIO The DataVIO in question + **/ +static inline void assertInHashZone(DataVIO *dataVIO) +{ + ThreadID expected = getHashZoneThreadID(dataVIO->hashZone); + ThreadID threadID = getCallbackThreadID(); + // It's odd to use the LBN, but converting the chunk name to hex is a bit + // clunky for an inline, and the LBN better than nothing as an identifier. + ASSERT_LOG_ONLY((expected == threadID), + "DataVIO for logical block %" PRIu64 + " on thread %u, should be on hash zone thread %u", + dataVIO->logical.lbn, threadID, expected); +} + +/** + * Set a callback as a hash zone operation. This function presumes that the + * hashZone field of the DataVIO has already been set. + * + * @param dataVIO The DataVIO with which to set the callback + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setHashZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setCallback(dataVIOAsCompletion(dataVIO), callback, + getHashZoneThreadID(dataVIO->hashZone)); + dataVIOAddTraceRecord(dataVIO, location); +} + +/** + * Set a callback as a hash zone operation and invoke it immediately. + * + * @param dataVIO The DataVIO with which to set the callback + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void launchHashZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setHashZoneCallback(dataVIO, callback, location); + invokeCallback(dataVIOAsCompletion(dataVIO)); +} + +/** + * Check that a DataVIO is running on the correct thread for its logical zone. + * + * @param dataVIO The DataVIO in question + **/ +static inline void assertInLogicalZone(DataVIO *dataVIO) +{ + ThreadID expected = getLogicalZoneThreadID(dataVIO->logical.zone); + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((expected == threadID), + "DataVIO for logical block %" PRIu64 + " on thread %u, should be on thread %u", + dataVIO->logical.lbn, threadID, expected); +} + +/** + * Set a callback as a logical block operation. This function presumes that the + * logicalZone field of the DataVIO has already been set. + * + * @param dataVIO The DataVIO with which to set the callback + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setLogicalCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setCallback(dataVIOAsCompletion(dataVIO), callback, + getLogicalZoneThreadID(dataVIO->logical.zone)); + dataVIOAddTraceRecord(dataVIO, location); +} + +/** + * Set a callback as a logical block operation and invoke it immediately. + * + * @param dataVIO The DataVIO with which to set the callback + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void launchLogicalCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setLogicalCallback(dataVIO, callback, location); + invokeCallback(dataVIOAsCompletion(dataVIO)); +} + +/** + * Check that a DataVIO is running on the correct thread for its allocated + * zone. + * + * @param dataVIO The DataVIO in question + **/ +static inline void assertInAllocatedZone(DataVIO *dataVIO) +{ + assertInPhysicalZone(dataVIOAsAllocatingVIO(dataVIO)); +} + +/** + * Set a callback as a physical block operation in a DataVIO's allocated zone. 
+ * + * @param dataVIO The DataVIO + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setAllocatedZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setPhysicalZoneCallback(dataVIOAsAllocatingVIO(dataVIO), callback, + location); +} + +/** + * Set a callback as a physical block operation in a DataVIO's allocated zone + * and queue the DataVIO and invoke it immediately. + * + * @param dataVIO The DataVIO + * @param callback The callback to invoke + * @param location The tracing info for the call site + **/ +static inline void launchAllocatedZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + launchPhysicalZoneCallback(dataVIOAsAllocatingVIO(dataVIO), callback, + location); +} + +/** + * Check that a DataVIO is running on the correct thread for its duplicate + * zone. + * + * @param dataVIO The DataVIO in question + **/ +static inline void assertInDuplicateZone(DataVIO *dataVIO) +{ + ThreadID expected = getPhysicalZoneThreadID(dataVIO->duplicate.zone); + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((expected == threadID), + "DataVIO for duplicate physical block %" PRIu64 + " on thread %u, should be on thread %u", + dataVIO->duplicate.pbn, threadID, expected); +} + +/** + * Set a callback as a physical block operation in a DataVIO's duplicate zone. + * + * @param dataVIO The DataVIO + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setDuplicateZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setCallback(dataVIOAsCompletion(dataVIO), callback, + getPhysicalZoneThreadID(dataVIO->duplicate.zone)); + dataVIOAddTraceRecord(dataVIO, location); +} + +/** + * Set a callback as a physical block operation in a DataVIO's duplicate zone + * and queue the DataVIO and invoke it immediately. + * + * @param dataVIO The DataVIO + * @param callback The callback to invoke + * @param location The tracing info for the call site + **/ +static inline void launchDuplicateZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setDuplicateZoneCallback(dataVIO, callback, location); + invokeCallback(dataVIOAsCompletion(dataVIO)); +} + +/** + * Check that a DataVIO is running on the correct thread for its mapped zone. + * + * @param dataVIO The DataVIO in question + **/ +static inline void assertInMappedZone(DataVIO *dataVIO) +{ + ThreadID expected = getPhysicalZoneThreadID(dataVIO->mapped.zone); + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((expected == threadID), + "DataVIO for mapped physical block %" PRIu64 + " on thread %u, should be on thread %u", + dataVIO->mapped.pbn, threadID, expected); +} + +/** + * Set a callback as a physical block operation in a DataVIO's mapped zone. + * + * @param dataVIO The DataVIO + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setMappedZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setCallback(dataVIOAsCompletion(dataVIO), callback, + getPhysicalZoneThreadID(dataVIO->mapped.zone)); + dataVIOAddTraceRecord(dataVIO, location); +} + +/** + * Check that a DataVIO is running on the correct thread for its newMapped + * zone. 
+ * + * @param dataVIO The DataVIO in question + **/ +static inline void assertInNewMappedZone(DataVIO *dataVIO) +{ + ThreadID expected = getPhysicalZoneThreadID(dataVIO->newMapped.zone); + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((expected == threadID), + "DataVIO for newMapped physical block %" PRIu64 + " on thread %u, should be on thread %u", + dataVIO->newMapped.pbn, threadID, expected); +} + +/** + * Set a callback as a physical block operation in a DataVIO's newMapped zone. + * + * @param dataVIO The DataVIO + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setNewMappedZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setCallback(dataVIOAsCompletion(dataVIO), callback, + getPhysicalZoneThreadID(dataVIO->newMapped.zone)); + dataVIOAddTraceRecord(dataVIO, location); +} + +/** + * Set a callback as a physical block operation in a DataVIO's newMapped zone + * and queue the DataVIO and invoke it immediately. + * + * @param dataVIO The DataVIO + * @param callback The callback to invoke + * @param location The tracing info for the call site + **/ +static inline void launchNewMappedZoneCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setNewMappedZoneCallback(dataVIO, callback, location); + invokeCallback(dataVIOAsCompletion(dataVIO)); +} + +/** + * Check that a DataVIO is running on the journal thread. + * + * @param dataVIO The DataVIO in question + **/ +static inline void assertInJournalZone(DataVIO *dataVIO) +{ + ThreadID expected + = getJournalZoneThread(getThreadConfigFromDataVIO(dataVIO)); + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((expected == threadID), + "DataVIO for logical block %" PRIu64 + " on thread %u, should be on journal thread %u", + dataVIO->logical.lbn, threadID, expected); +} + +/** + * Set a callback as a journal operation. + * + * @param dataVIO The DataVIO with which to set the callback + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setJournalCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setCallback(dataVIOAsCompletion(dataVIO), callback, + getJournalZoneThread(getThreadConfigFromDataVIO(dataVIO))); + dataVIOAddTraceRecord(dataVIO, location); +} + +/** + * Set a callback as a journal operation and invoke it immediately. + * + * @param dataVIO The DataVIO with which to set the callback + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void launchJournalCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setJournalCallback(dataVIO, callback, location); + invokeCallback(dataVIOAsCompletion(dataVIO)); +} + +/** + * Check that a DataVIO is running on the packer thread + * + * @param dataVIO The DataVIO in question + **/ +static inline void assertInPackerZone(DataVIO *dataVIO) +{ + ThreadID expected = getPackerZoneThread(getThreadConfigFromDataVIO(dataVIO)); + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((expected == threadID), + "DataVIO for logical block %" PRIu64 + " on thread %u, should be on packer thread %u", + dataVIO->logical.lbn, threadID, expected); +} + +/** + * Set a callback as a packer operation. 
+ * + * @param dataVIO The DataVIO with which to set the callback + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void setPackerCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setCallback(dataVIOAsCompletion(dataVIO), callback, + getPackerZoneThread(getThreadConfigFromDataVIO(dataVIO))); + dataVIOAddTraceRecord(dataVIO, location); +} + +/** + * Set a callback as a packer operation and invoke it immediately. + * + * @param dataVIO The DataVIO with which to set the callback + * @param callback The callback to set + * @param location The tracing info for the call site + **/ +static inline void launchPackerCallback(DataVIO *dataVIO, + VDOAction *callback, + TraceLocation location) +{ + setPackerCallback(dataVIO, callback, location); + invokeCallback(dataVIOAsCompletion(dataVIO)); +} + +/** + * Check whether the advice received from Albireo is a valid data location, + * and if it is, accept it as the location of a potential duplicate of the + * DataVIO. + * + * @param dataVIO The DataVIO that queried Albireo + * @param advice A potential location of the data, or NULL for no advice + **/ +void receiveDedupeAdvice(DataVIO *dataVIO, const DataLocation *advice); + +/** + * Set the location of the duplicate block for a DataVIO, updating the + * isDuplicate and duplicate fields from a ZonedPBN. + * + * @param dataVIO The DataVIO to modify + * @param source The location of the duplicate + **/ +void setDuplicateLocation(DataVIO *dataVIO, const ZonedPBN source); + +/** + * Clear a DataVIO's mapped block location, setting it to be unmapped. This + * indicates the block map entry for the logical block is either unmapped or + * corrupted. + * + * @param dataVIO The DataVIO whose mapped block location is to be reset + **/ +void clearMappedLocation(DataVIO *dataVIO); + +/** + * Set a DataVIO's mapped field to the physical location recorded in the block + * map for the logical block in the VIO. + * + * @param dataVIO The DataVIO whose field is to be set + * @param pbn The physical block number to set + * @param state The mapping state to set + * + * @return VDO_SUCCESS or an error code if the mapping is unusable + **/ +int setMappedLocation(DataVIO *dataVIO, + PhysicalBlockNumber pbn, + BlockMappingState state) + __attribute__((warn_unused_result)); + +/** + * Attempt to acquire the lock on a logical block. This is the start of the + * path for all external requests. It is registered in prepareDataVIO(). + * + * @param completion The DataVIO for an external data request as a completion + **/ +void attemptLogicalBlockLock(VDOCompletion *completion); + +/** + * Release the lock on the logical block, if any, that a DataVIO has acquired. + * + * @param dataVIO The DataVIO releasing its logical block lock + **/ +void releaseLogicalBlockLock(DataVIO *dataVIO); + +#endif // DATA_VIO_H diff --git a/source/vdo/base/dirtyLists.c b/source/vdo/base/dirtyLists.c new file mode 100644 index 0000000..d16b790 --- /dev/null +++ b/source/vdo/base/dirtyLists.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/dirtyLists.c#1 $ + */ + +#include "dirtyLists.h" +#include "dirtyListsInternals.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "types.h" + +struct dirtyLists { + /** The number of periods after which an element will be expired */ + BlockCount maximumAge; + /** The oldest period which has unexpired elements */ + SequenceNumber oldestPeriod; + /** One more than the current period */ + SequenceNumber nextPeriod; + /** The function to call on expired elements */ + DirtyCallback *callback; + /** The callback context */ + void *context; + /** The offset in the array of lists of the oldest period */ + BlockCount offset; + /** The list of elements which are being expired */ + RingNode expired; + /** The lists of dirty elements */ + RingNode lists[]; +}; + +/**********************************************************************/ +int makeDirtyLists(BlockCount maximumAge, + DirtyCallback *callback, + void *context, + DirtyLists **dirtyListsPtr) +{ + DirtyLists *dirtyLists; + int result = ALLOCATE_EXTENDED(DirtyLists, maximumAge, RingNode, __func__, + &dirtyLists); + if (result != VDO_SUCCESS) { + return result; + } + + dirtyLists->maximumAge = maximumAge; + dirtyLists->callback = callback; + dirtyLists->context = context; + + initializeRing(&dirtyLists->expired); + for (BlockCount i = 0; i < maximumAge; i++) { + initializeRing(&dirtyLists->lists[i]); + } + + *dirtyListsPtr = dirtyLists; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeDirtyLists(DirtyLists **dirtyListsPtr) +{ + DirtyLists *lists = *dirtyListsPtr; + if (lists == NULL) { + return; + } + + FREE(lists); + *dirtyListsPtr = NULL; +} + +/**********************************************************************/ +void setCurrentPeriod(DirtyLists *dirtyLists, SequenceNumber period) +{ + ASSERT_LOG_ONLY(dirtyLists->nextPeriod == 0, "current period not set"); + dirtyLists->oldestPeriod = period; + dirtyLists->nextPeriod = period + 1; + dirtyLists->offset = period % dirtyLists->maximumAge; +} + +/** + * Expire the oldest list. + * + * @param dirtyLists The DirtyLists to expire + **/ +static void expireOldestList(DirtyLists *dirtyLists) +{ + dirtyLists->oldestPeriod++; + RingNode *ring = &(dirtyLists->lists[dirtyLists->offset++]); + if (!isRingEmpty(ring)) { + spliceRingChainBefore(ring->next, ring->prev, &dirtyLists->expired); + } + + if (dirtyLists->offset == dirtyLists->maximumAge) { + dirtyLists->offset = 0; + } +} + +/** + * Update the period if necessary. + * + * @param dirtyLists The DirtyLists + * @param period The new period + **/ +static void updatePeriod(DirtyLists *dirtyLists, SequenceNumber period) +{ + while (dirtyLists->nextPeriod <= period) { + if ((dirtyLists->nextPeriod - dirtyLists->oldestPeriod) + == dirtyLists->maximumAge) { + expireOldestList(dirtyLists); + } + dirtyLists->nextPeriod++; + } +} + +/** + * Write out the expired list. 
 + * + * @param dirtyLists The dirtyLists + **/ +static void writeExpiredElements(DirtyLists *dirtyLists) +{ + if (isRingEmpty(&dirtyLists->expired)) { + return; + } + + dirtyLists->callback(&dirtyLists->expired, dirtyLists->context); + ASSERT_LOG_ONLY(isRingEmpty(&dirtyLists->expired), + "no expired elements remain"); +} + +/**********************************************************************/ +void addToDirtyLists(DirtyLists *dirtyLists, + RingNode *node, + SequenceNumber oldPeriod, + SequenceNumber newPeriod) +{ + if ((oldPeriod == newPeriod) + || ((oldPeriod != 0) && (oldPeriod < newPeriod))) { + return; + } + + if (newPeriod < dirtyLists->oldestPeriod) { + pushRingNode(&dirtyLists->expired, node); + } else { + updatePeriod(dirtyLists, newPeriod); + pushRingNode(&dirtyLists->lists[newPeriod % dirtyLists->maximumAge], node); + } + + writeExpiredElements(dirtyLists); +} + +/**********************************************************************/ +void advancePeriod(DirtyLists *dirtyLists, SequenceNumber period) +{ + updatePeriod(dirtyLists, period); + writeExpiredElements(dirtyLists); +} + +/**********************************************************************/ +void flushDirtyLists(DirtyLists *dirtyLists) +{ + while (dirtyLists->oldestPeriod < dirtyLists->nextPeriod) { + expireOldestList(dirtyLists); + } + writeExpiredElements(dirtyLists); +} + +/**********************************************************************/ +SequenceNumber getDirtyListsNextPeriod(DirtyLists *dirtyLists) +{ + return dirtyLists->nextPeriod; +} diff --git a/source/vdo/base/dirtyLists.h b/source/vdo/base/dirtyLists.h new file mode 100644 index 0000000..f3d27f7 --- /dev/null +++ b/source/vdo/base/dirtyLists.h @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/dirtyLists.h#1 $ + */ + +#ifndef DIRTY_LISTS_H +#define DIRTY_LISTS_H + +#include "ringNode.h" +#include "types.h" + +/** + * A collection of lists of dirty elements ordered by age. An element is always + * placed on the oldest list in which it was dirtied (moving between lists or + * removing altogether is cheap). Whenever the current period is advanced, any + * elements older than the maximum age are expired. If an element is to be added + * with a dirty age older than the maximum age, it is expired immediately. + **/ +typedef struct dirtyLists DirtyLists; + +/** + * A function which will be called with a ring of dirty elements which have + * been expired. All of the expired elements must be removed from the ring + * before this function returns. + * + * @param expired The list of expired elements + * @param context The context for the callback + **/ +typedef void DirtyCallback(RingNode *expired, void *context); + +/** + * Construct a new set of dirty lists. 
+ * + * @param [in] maximumAge The age at which an element will be expired + * @param [in] callback The function to call when a set of elements have + * expired + * @param [in] context The context for the callback + * @param [out] dirtyListsPtr A pointer to hold the new DirtyLists + * + * @return VDO_SUCCESS or an error + **/ +int makeDirtyLists(BlockCount maximumAge, + DirtyCallback *callback, + void *context, + DirtyLists **dirtyListsPtr) + __attribute__((warn_unused_result)); + +/** + * Free a set of dirty lists and null out the pointer to them. + * + * @param dirtyListsPtr A pointer to the dirty lists to be freed + **/ +void freeDirtyLists(DirtyLists **dirtyListsPtr); + +/** + * Set the current period. This function should only be called once. + * + * @param dirtyLists The dirtyLists + * @param period The current period + **/ +void setCurrentPeriod(DirtyLists *dirtyLists, SequenceNumber period); + +/** + * Add an element to the dirty lists. + * + * @param dirtyLists The DirtyLists receiving the element + * @param node The RingNode of the element to add + * @param oldPeriod The period in which the element was previous dirtied, + * or 0 if it was not dirty + * @param newPeriod The period in which the element has now been dirtied, + * or 0 if it does not hold a lock + **/ +void addToDirtyLists(DirtyLists *dirtyLists, + RingNode *node, + SequenceNumber oldPeriod, + SequenceNumber newPeriod); + +/** + * Advance the current period. If the current period is greater than the number + * of lists, expire the oldest lists. + * + * @param dirtyLists The DirtyLists to advance + * @param period The new current period + **/ +void advancePeriod(DirtyLists *dirtyLists, SequenceNumber period); + +/** + * Flush all dirty lists. This will cause the period to be advanced past the + * current period. + * + * @param dirtyLists The dirtyLists to flush + **/ +void flushDirtyLists(DirtyLists *dirtyLists); + +#endif // DIRTY_LISTS_H diff --git a/source/vdo/base/dirtyListsInternals.h b/source/vdo/base/dirtyListsInternals.h new file mode 100644 index 0000000..d5876d0 --- /dev/null +++ b/source/vdo/base/dirtyListsInternals.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/dirtyListsInternals.h#1 $ + */ + +#ifndef DIRTY_LISTS_INTERNALS_H +#define DIRTY_LISTS_INTERNALS_H + +#include "dirtyLists.h" +#include "types.h" + +/** + * Get the next period from a DirtyLists. This method is used by unit tests. 
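/*
 * Illustrative sketch, not part of the patch: the expected lifecycle of a
 * DirtyLists built from the declarations above. The callback, the period
 * numbers, and the sketch function names are invented for illustration; a
 * real callback must empty the ring it is given before returning.
 */
static void writeOutDirtyElementsSketch(RingNode *expired, void *context)
{
  (void) context;
  // A real implementation would pop each node off 'expired' and write out the
  // element it belongs to; leaving nodes on the ring trips the assertion in
  // writeExpiredElements().
  (void) expired;
}

static int dirtyListsLifecycleSketch(void)
{
  DirtyLists *lists;
  int result = makeDirtyLists(16, writeOutDirtyElementsSketch, NULL, &lists);
  if (result != VDO_SUCCESS) {
    return result;
  }

  setCurrentPeriod(lists, 1);   // may only be called once, before any adds
  // Elements are added with addToDirtyLists() as they are dirtied ...
  advancePeriod(lists, 20);     // expires any list more than maximumAge periods old
  flushDirtyLists(lists);       // expires everything still held
  freeDirtyLists(&lists);       // frees and nulls the pointer
  return VDO_SUCCESS;
}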
+ * + * @param dirtyLists The DirtyLists to examine + **/ +SequenceNumber getDirtyListsNextPeriod(DirtyLists *dirtyLists) + __attribute__((warn_unused_result)); + +#endif // DIRTY_LISTS_INTERNALS_H diff --git a/source/vdo/base/extent.c b/source/vdo/base/extent.c new file mode 100644 index 0000000..5983615 --- /dev/null +++ b/source/vdo/base/extent.c @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/extent.c#3 $ + */ + +#include "extent.h" + +#include "memoryAlloc.h" + +#include "completion.h" +#include "constants.h" +#include "logger.h" +#include "physicalLayer.h" +#include "types.h" +#include "vdo.h" +#include "vioRead.h" +#include "vioWrite.h" + +/**********************************************************************/ +int createExtent(PhysicalLayer *layer, + VIOType vioType, + VIOPriority priority, + BlockCount blockCount, + char *data, + VDOExtent **extentPtr) +{ + int result = ASSERT(isMetadataVIOType(vioType), + "createExtent() called for metadata"); + if (result != VDO_SUCCESS) { + return result; + } + + VDOExtent *extent; + result = ALLOCATE_EXTENDED(VDOExtent, blockCount, VIO *, __func__, &extent); + if (result != VDO_SUCCESS) { + return result; + } + + result = initializeEnqueueableCompletion(&extent->completion, + VDO_EXTENT_COMPLETION, layer); + if (result != VDO_SUCCESS) { + FREE(extent); + return result; + } + + for (; extent->count < blockCount; extent->count++) { + result = layer->createMetadataVIO(layer, vioType, priority, extent, data, + &extent->vios[extent->count]); + if (result != VDO_SUCCESS) { + freeExtent(&extent); + return result; + } + + data += VDO_BLOCK_SIZE; + } + + *extentPtr = extent; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeExtent(VDOExtent **extentPtr) +{ + VDOExtent *extent = *extentPtr; + if (extent == NULL) { + return; + } + + for (BlockCount i = 0; i < extent->count; i++) { + freeVIO(&extent->vios[i]); + } + + destroyEnqueueable(&extent->completion); + FREE(extent); + *extentPtr = NULL; +} + +/** + * Launch a metadata extent. 
+ * + * @param extent The extent + * @param startBlock The absolute physical block at which the extent should + * begin its I/O + * @param count The number of blocks to write + * @param operation The operation to perform on the extent + **/ +static void launchMetadataExtent(VDOExtent *extent, + PhysicalBlockNumber startBlock, + BlockCount count, + VIOOperation operation) +{ + resetCompletion(&extent->completion); + if (count > extent->count) { + finishCompletion(&extent->completion, VDO_OUT_OF_RANGE); + return; + } + + extent->completeCount = extent->count - count; + for (BlockCount i = 0; i < count; i++) { + VIO *vio = extent->vios[i]; + vio->completion.callbackThreadID = extent->completion.callbackThreadID; + launchMetadataVIO(vio, startBlock++, handleVIOCompletion, + handleVIOCompletion, operation); + } +} + +/**********************************************************************/ +void readPartialMetadataExtent(VDOExtent *extent, + PhysicalBlockNumber startBlock, + BlockCount count) +{ + launchMetadataExtent(extent, startBlock, count, VIO_READ); +} + +/**********************************************************************/ +void writePartialMetadataExtent(VDOExtent *extent, + PhysicalBlockNumber startBlock, + BlockCount count) +{ + launchMetadataExtent(extent, startBlock, count, VIO_WRITE); +} + +/**********************************************************************/ +void handleVIOCompletion(VDOCompletion *completion) +{ + VDOExtent *extent = asVDOExtent(completion->parent); + if (++extent->completeCount != extent->count) { + setCompletionResult(extentAsCompletion(extent), completion->result); + return; + } + + finishCompletion(extentAsCompletion(extent), completion->result); +} diff --git a/source/vdo/base/extent.h b/source/vdo/base/extent.h new file mode 100644 index 0000000..b023c06 --- /dev/null +++ b/source/vdo/base/extent.h @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/extent.h#2 $ + */ + +#ifndef EXTENT_H +#define EXTENT_H + +#include "permassert.h" + +#include "completion.h" +#include "types.h" +#include "vio.h" + +/** + * A chain of VIOs which are part of the same request. An extent contains + * a chain of at least 'count' VIOs. The 'next' pointer of the last VIO + * in the extent (as indicated by the count) may not be NULL, but it is not + * part of the extent. A VIO may belong to a single extent. + **/ +struct vdoExtent { + // The completion for asynchronous extent processing + VDOCompletion completion; + // The number of VIOs in the extent + BlockCount count; + // The number of completed VIOs in the extent + BlockCount completeCount; + // The VIOs in the extent + VIO *vios[]; +}; + +/** + * Convert a generic VDOCompletion to a VDOExtent. 
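/*
 * Illustrative sketch, not part of the patch: allocating a four-block
 * metadata extent over a caller-supplied buffer and reading its first two
 * blocks. The layer, buffer, startBlock, threadID, and loadDone() callback
 * are parameters invented for this sketch, and VIO_TYPE_BLOCK_MAP and
 * VIO_PRIORITY_METADATA are assumed to be the usual VIOType and VIOPriority
 * constants; createExtent() and readPartialMetadataExtent() are from this
 * file.
 */
static int loadTwoBlocksSketch(PhysicalLayer *layer,
                               char *buffer,   // at least 4 * VDO_BLOCK_SIZE bytes
                               PhysicalBlockNumber startBlock,
                               ThreadID threadID,
                               VDOAction *loadDone)
{
  VDOExtent *extent;
  int result = createExtent(layer, VIO_TYPE_BLOCK_MAP, VIO_PRIORITY_METADATA,
                            4, buffer, &extent);
  if (result != VDO_SUCCESS) {
    return result;
  }

  // The extent's completion fires loadDone once both launched VIOs finish;
  // production code would normally prepare it with an error handler as well.
  extent->completion.callback = loadDone;
  extent->completion.callbackThreadID = threadID;
  readPartialMetadataExtent(extent, startBlock, 2);
  return VDO_SUCCESS;
}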
+ * + * @param completion The completion to convert + * + * @return The completion as an extent + **/ +static inline VDOExtent *asVDOExtent(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(VDOExtent, completion) == 0); + assertCompletionType(completion->type, VDO_EXTENT_COMPLETION); + return (VDOExtent *) completion; +} + +/** + * Convert a VDOExtent to VDOCompletion. + * + * @param extent The extent to convert + * + * @return The extent as a VDOCompletion + **/ +static inline VDOCompletion *extentAsCompletion(VDOExtent *extent) +{ + return &extent->completion; +} + +/** + * Create a VDOExtent. + * + * @param [in] layer The layer + * @param [in] vioType The usage type to assign to the VIOs in the extent + * (data / block map / journal) + * @param [in] priority The relative priority to assign to the VIOs + * @param [in] blockCount The number of blocks in the buffer + * @param [in] data The buffer + * @param [out] extentPtr A pointer to hold the new extent + * + * @return VDO_SUCCESS or an error + **/ +int createExtent(PhysicalLayer *layer, + VIOType vioType, + VIOPriority priority, + BlockCount blockCount, + char *data, + VDOExtent **extentPtr) + __attribute__((warn_unused_result)); + +/** + * Free an extent and null out the reference to it. + * + * @param [in,out] extentPtr The reference to the extent to free + **/ +void freeExtent(VDOExtent **extentPtr); + +/** + * Read metadata from the underlying storage. + * + * @param extent The extent to read + * @param startBlock The physical block number of the first block + * in the extent + * @param count The number of blocks to read (must be less than or + * equal to the length of the extent) + **/ +void readPartialMetadataExtent(VDOExtent *extent, + PhysicalBlockNumber startBlock, + BlockCount count); + +/** + * Read metadata from the underlying storage. + * + * @param extent The extent to read + * @param startBlock The physical block number of the first block + * in the extent + **/ +static inline void readMetadataExtent(VDOExtent *extent, + PhysicalBlockNumber startBlock) +{ + readPartialMetadataExtent(extent, startBlock, extent->count); +} + +/** + * Write metadata to the underlying storage. + * + * @param extent The extent to write + * @param startBlock The physical block number of the first block in the + * extent + * @param count The number of blocks to read (must be less than or + * equal to the length of the extent) + **/ +void writePartialMetadataExtent(VDOExtent *extent, + PhysicalBlockNumber startBlock, + BlockCount count); +/** + * Write metadata to the underlying storage. + * + * @param extent The extent to write + * @param startBlock The physical block number of the first block in the + * extent + **/ +static inline void writeMetadataExtent(VDOExtent *extent, + PhysicalBlockNumber startBlock) +{ + writePartialMetadataExtent(extent, startBlock, extent->count); +} + +/** + * Notify an extent that one of its VIOs has completed. If the signaling VIO + * is the last of the extent's VIOs to complete, the extent will finish. This + * function is set as the VIO callback in completeVIO(). + * + * @param completion The completion of the VIO which has just finished + **/ +void handleVIOCompletion(VDOCompletion *completion); + +#endif /* EXTENT_H */ diff --git a/source/vdo/base/fixedLayout.c b/source/vdo/base/fixedLayout.c new file mode 100644 index 0000000..4ea048a --- /dev/null +++ b/source/vdo/base/fixedLayout.c @@ -0,0 +1,534 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/fixedLayout.c#3 $ + */ + +#include "fixedLayout.h" + +#include "buffer.h" +#include "logger.h" +#include "memoryAlloc.h" + +#include "header.h" +#include "statusCodes.h" + +const BlockCount ALL_FREE_BLOCKS = (uint64_t) -1; + +struct fixedLayout { + PhysicalBlockNumber firstFree; + PhysicalBlockNumber lastFree; + size_t numPartitions; + Partition *head; +}; + +struct partition { + PartitionID id; // The id of this partition + FixedLayout *layout; // The layout to which this partition belongs + PhysicalBlockNumber offset; // The offset into the layout of this partition + PhysicalBlockNumber base; // The untranslated number of the first block + BlockCount count; // The number of blocks in the partition + Partition *next; // A pointer to the next partition in the layout +}; + +typedef struct { + PhysicalBlockNumber firstFree; + PhysicalBlockNumber lastFree; + byte partitionCount; +} __attribute__((packed)) Layout3_0; + +typedef struct { + PartitionID id; + PhysicalBlockNumber offset; + PhysicalBlockNumber base; + BlockCount count; +} __attribute__((packed)) Partition3_0; + +static const Header LAYOUT_HEADER_3_0 = { + .id = FIXED_LAYOUT, + .version = { + .majorVersion = 3, + .minorVersion = 0, + }, + .size = sizeof(Layout3_0), // Minimum size (contains no partitions) +}; + +/**********************************************************************/ +int makeFixedLayout(BlockCount totalBlocks, + PhysicalBlockNumber startOffset, + FixedLayout **layoutPtr) +{ + FixedLayout *layout; + int result = ALLOCATE(1, FixedLayout, "fixed layout", &layout); + if (result != UDS_SUCCESS) { + return result; + } + + layout->firstFree = startOffset; + layout->lastFree = startOffset + totalBlocks; + layout->numPartitions = 0; + layout->head = NULL; + + *layoutPtr = layout; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeFixedLayout(FixedLayout **layoutPtr) +{ + FixedLayout *layout = *layoutPtr; + if (layout == NULL) { + return; + } + + while (layout->head != NULL) { + Partition *part = layout->head; + layout->head = part->next; + FREE(part); + } + + FREE(layout); + *layoutPtr = NULL; +} + +/**********************************************************************/ +BlockCount getTotalFixedLayoutSize(const FixedLayout *layout) +{ + BlockCount size = getFixedLayoutBlocksAvailable(layout); + for (Partition *partition = layout->head; partition != NULL; + partition = partition->next) { + size += partition->count; + } + + return size; +} + +/**********************************************************************/ +int getPartition(FixedLayout *layout, PartitionID id, Partition **partitionPtr) +{ + for (Partition *partition = layout->head; partition != NULL; + partition = 
partition->next) { + if (partition->id == id) { + if (partitionPtr != NULL) { + *partitionPtr = partition; + } + return VDO_SUCCESS; + } + } + + return VDO_UNKNOWN_PARTITION; +} + +/**********************************************************************/ +int translateToPBN(const Partition *partition, + PhysicalBlockNumber partitionBlockNumber, + PhysicalBlockNumber *layerBlockNumber) +{ + if (partition == NULL) { + *layerBlockNumber = partitionBlockNumber; + return VDO_SUCCESS; + } + + if (partitionBlockNumber < partition->base) { + return VDO_OUT_OF_RANGE; + } + + PhysicalBlockNumber offsetFromBase = partitionBlockNumber - partition->base; + if (offsetFromBase >= partition->count) { + return VDO_OUT_OF_RANGE; + } + + *layerBlockNumber = partition->offset + offsetFromBase; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int translateFromPBN(const Partition *partition, + PhysicalBlockNumber layerBlockNumber, + PhysicalBlockNumber *partitionBlockNumberPtr) +{ + if (partition == NULL) { + *partitionBlockNumberPtr = layerBlockNumber; + return VDO_SUCCESS; + } + + if (layerBlockNumber < partition->offset) { + return VDO_OUT_OF_RANGE; + } + + PhysicalBlockNumber partitionBlockNumber + = layerBlockNumber - partition->offset; + if (partitionBlockNumber >= partition->count) { + return VDO_OUT_OF_RANGE; + } + + *partitionBlockNumberPtr = partitionBlockNumber + partition->base; + return VDO_SUCCESS; +} + +/**********************************************************************/ +BlockCount getFixedLayoutBlocksAvailable(const FixedLayout *layout) +{ + return layout->lastFree - layout->firstFree; +} + +/** + * Allocate a partition. The partition will be attached to the partition + * list in the layout. + * + * @param layout The layout containing the partition + * @param id The id of the partition + * @param offset The offset into the layout at which the partition begins + * @param base The number of the first block for users of the partition + * @param blockCount The number of blocks in the partition + * + * @return VDO_SUCCESS or an error + **/ +static int allocatePartition(FixedLayout *layout, + byte id, + PhysicalBlockNumber offset, + PhysicalBlockNumber base, + BlockCount blockCount) +{ + Partition *partition; + int result = ALLOCATE(1, Partition, "fixed layout partition", &partition); + if (result != UDS_SUCCESS) { + return result; + } + + partition->id = id; + partition->layout = layout; + partition->offset = offset; + partition->base = base; + partition->count = blockCount; + partition->next = layout->head; + layout->head = partition; + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeFixedLayoutPartition(FixedLayout *layout, + PartitionID id, + BlockCount blockCount, + PartitionDirection direction, + PhysicalBlockNumber base) +{ + BlockCount freeBlocks = layout->lastFree - layout->firstFree; + if (blockCount == ALL_FREE_BLOCKS) { + if (freeBlocks == 0) { + return VDO_NO_SPACE; + } else { + blockCount = freeBlocks; + } + } else if (blockCount > freeBlocks) { + return VDO_NO_SPACE; + } + + int result = getPartition(layout, id, NULL); + if (result != VDO_UNKNOWN_PARTITION) { + return VDO_PARTITION_EXISTS; + } + + PhysicalBlockNumber offset = ((direction == FROM_END) + ? 
(layout->lastFree - blockCount) + : layout->firstFree); + result = allocatePartition(layout, id, offset, base, blockCount); + if (result != VDO_SUCCESS) { + return result; + } + + layout->numPartitions++; + if (direction == FROM_END) { + layout->lastFree = layout->lastFree - blockCount; + } else { + layout->firstFree += blockCount; + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +BlockCount getFixedLayoutPartitionSize(const Partition *partition) +{ + return partition->count; +} + +/**********************************************************************/ +PhysicalBlockNumber getFixedLayoutPartitionOffset(const Partition *partition) +{ + return partition->offset; +} + +/**********************************************************************/ +PhysicalBlockNumber getFixedLayoutPartitionBase(const Partition *partition) +{ + return partition->base; +} + +/**********************************************************************/ +static inline size_t getEncodedSize(const FixedLayout *layout) +{ + return sizeof(Layout3_0) + (sizeof(Partition3_0) * layout->numPartitions); +} + +/**********************************************************************/ +size_t getFixedLayoutEncodedSize(const FixedLayout *layout) +{ + return ENCODED_HEADER_SIZE + getEncodedSize(layout); +} + +/** + * Encode a null-terminated list of fixed layout partitions into a buffer + * using partition format 3.0. + * + * @param layout The layout containing the list of partitions to encode + * @param buffer A buffer positioned at the start of the encoding + * + * @return UDS_SUCCESS or an error code + **/ +static int encodePartitions_3_0(const FixedLayout *layout, Buffer *buffer) +{ + for (const Partition *partition = layout->head; + partition != NULL; + partition = partition->next) { + STATIC_ASSERT_SIZEOF(PartitionID, sizeof(byte)); + int result = putByte(buffer, partition->id); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, partition->offset); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, partition->base); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, partition->count); + if (result != UDS_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/** + * Encode the header fields of a fixed layout into a buffer using layout + * format 3.0. 
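/*
 * Illustrative sketch, not part of the patch: building a small layout,
 * carving one partition from each end, and translating a partition-relative
 * block number to a layer PBN. The ids (1 and 2) and the sizes are arbitrary
 * values invented for this sketch; all functions are defined in this file or
 * declared in fixedLayout.h.
 */
static int layoutSketch(void)
{
  FixedLayout *layout;
  int result = makeFixedLayout(1000, 0, &layout);  // 1000 free blocks at PBN 0
  if (result != VDO_SUCCESS) {
    return result;
  }

  // 100 blocks at the front, numbered from 0 for the partition's users.
  result = makeFixedLayoutPartition(layout, 1, 100, FROM_BEGINNING, 0);
  if (result == VDO_SUCCESS) {
    // Whatever remains, carved from the end.
    result = makeFixedLayoutPartition(layout, 2, ALL_FREE_BLOCKS, FROM_END, 0);
  }

  if (result == VDO_SUCCESS) {
    Partition *partition;
    result = getPartition(layout, 1, &partition);
    if (result == VDO_SUCCESS) {
      PhysicalBlockNumber pbn;
      // Block 7 of partition 1 is layer PBN 7, since that partition starts at 0.
      result = translateToPBN(partition, 7, &pbn);
    }
  }

  freeFixedLayout(&layout);
  return result;
}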
+ * + * @param layout The layout to encode + * @param buffer A buffer positioned at the start of the encoding + * + * @return UDS_SUCCESS or an error code + **/ +static int encodeLayout_3_0(const FixedLayout *layout, Buffer *buffer) +{ + int result = ASSERT(layout->numPartitions <= UINT8_MAX, + "fixed layout partition count must fit in a byte"); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, layout->firstFree); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, layout->lastFree); + if (result != UDS_SUCCESS) { + return result; + } + + return putByte(buffer, layout->numPartitions); +} + +/**********************************************************************/ +int encodeFixedLayout(const FixedLayout *layout, Buffer *buffer) +{ + if (!ensureAvailableSpace(buffer, getFixedLayoutEncodedSize(layout))) { + return UDS_BUFFER_ERROR; + } + + Header header = LAYOUT_HEADER_3_0; + header.size = getEncodedSize(layout); + int result = encodeHeader(&header, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + size_t initialLength = contentLength(buffer); + + result = encodeLayout_3_0(layout, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + size_t encodedSize = contentLength(buffer) - initialLength; + result = ASSERT(encodedSize == sizeof(Layout3_0), + "encoded size of fixed layout header must match structure"); + if (result != UDS_SUCCESS) { + return result; + } + + result = encodePartitions_3_0(layout, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + encodedSize = contentLength(buffer) - initialLength; + return ASSERT(encodedSize == header.size, + "encoded size of fixed layout must match header size"); +} + +/** + * Decode a sequence of fixed layout partitions from a buffer + * using partition format 3.0. + * + * @param buffer A buffer positioned at the start of the encoding + * @param layout The layout in which to allocate the decoded partitions + * + * @return UDS_SUCCESS or an error code + **/ +static int decodePartitions_3_0(Buffer *buffer, FixedLayout *layout) +{ + for (size_t i = 0; i < layout->numPartitions; i++) { + byte id; + int result = getByte(buffer, &id); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t offset; + result = getUInt64LEFromBuffer(buffer, &offset); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t base; + result = getUInt64LEFromBuffer(buffer, &base); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t count; + result = getUInt64LEFromBuffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + + result = allocatePartition(layout, id, offset, base, count); + if (result != VDO_SUCCESS) { + return result; + } + } + + return UDS_SUCCESS; +} + +/** + * Decode the header fields of a fixed layout from a buffer using layout + * format 3.0. 
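/*
 * Illustrative sketch, not part of the patch: round-tripping a layout through
 * its on-disk encoding. makeBuffer() and freeBuffer() are assumed to be the
 * buffer helpers from buffer.h; the other functions are defined in this file.
 */
static int saveAndReloadLayoutSketch(const FixedLayout *layout,
                                     FixedLayout **reloadedPtr)
{
  Buffer *buffer;
  int result = makeBuffer(getFixedLayoutEncodedSize(layout), &buffer);
  if (result != UDS_SUCCESS) {
    return result;
  }

  result = encodeFixedLayout(layout, buffer);
  if (result == UDS_SUCCESS) {
    // Decoding consumes the same bytes the encoder just produced.
    result = decodeFixedLayout(buffer, reloadedPtr);
  }

  freeBuffer(&buffer);
  return result;
}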
+ * + * @param buffer A buffer positioned at the start of the encoding + * @param layout The structure to receive the decoded fields + * + * @return UDS_SUCCESS or an error code + **/ +static int decodeLayout_3_0(Buffer *buffer, Layout3_0 *layout) +{ + size_t initialLength = contentLength(buffer); + + PhysicalBlockNumber firstFree; + int result = getUInt64LEFromBuffer(buffer, &firstFree); + if (result != UDS_SUCCESS) { + return result; + } + + PhysicalBlockNumber lastFree; + result = getUInt64LEFromBuffer(buffer, &lastFree); + if (result != UDS_SUCCESS) { + return result; + } + + byte partitionCount; + result = getByte(buffer, &partitionCount); + if (result != UDS_SUCCESS) { + return result; + } + + *layout = (Layout3_0) { + .firstFree = firstFree, + .lastFree = lastFree, + .partitionCount = partitionCount, + }; + + size_t decodedSize = initialLength - contentLength(buffer); + return ASSERT(decodedSize == sizeof(Layout3_0), + "decoded size of fixed layout header must match structure"); +} + +/**********************************************************************/ +int decodeFixedLayout(Buffer *buffer, FixedLayout **layoutPtr) +{ + Header header; + int result = decodeHeader(buffer, &header); + if (result != UDS_SUCCESS) { + return result; + } + + // Layout is variable size, so only do a minimum size check here. + result = validateHeader(&LAYOUT_HEADER_3_0, &header, false, __func__); + if (result != VDO_SUCCESS) { + return result; + } + + Layout3_0 layoutHeader; + result = decodeLayout_3_0(buffer, &layoutHeader); + if (result != UDS_SUCCESS) { + return result; + } + + if (contentLength(buffer) + < (sizeof(Partition3_0) * layoutHeader.partitionCount)) { + return VDO_UNSUPPORTED_VERSION; + } + + FixedLayout *layout; + result = ALLOCATE(1, FixedLayout, "fixed layout", &layout); + if (result != UDS_SUCCESS) { + return result; + } + + layout->firstFree = layoutHeader.firstFree; + layout->lastFree = layoutHeader.lastFree; + layout->numPartitions = layoutHeader.partitionCount; + + result = decodePartitions_3_0(buffer, layout); + if (result != VDO_SUCCESS) { + freeFixedLayout(&layout); + return result; + } + + *layoutPtr = layout; + return VDO_SUCCESS; +} diff --git a/source/vdo/base/fixedLayout.h b/source/vdo/base/fixedLayout.h new file mode 100644 index 0000000..0907299 --- /dev/null +++ b/source/vdo/base/fixedLayout.h @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/fixedLayout.h#1 $ + */ + +#ifndef FIXED_LAYOUT_H +#define FIXED_LAYOUT_H + +#include "buffer.h" + +#include "types.h" + +typedef enum { + FROM_BEGINNING, + FROM_END, +} PartitionDirection; + +extern const BlockCount ALL_FREE_BLOCKS; + +/** + * A fixed layout is like a traditional disk partitioning scheme. 
In the + * beginning there is one large unused area, of which parts are carved off. + * Each carved off section has its own internal offset and size. + **/ +typedef struct fixedLayout FixedLayout; +typedef struct partition Partition; + +/** + * Make an unpartitioned fixed layout. + * + * @param [in] totalBlocks The total size of the layout, in blocks + * @param [in] startOffset The block offset in the underlying layer at which + * the fixed layout begins + * @param [out] layoutPtr The pointer to hold the resulting layout + * + * @return a success or error code + **/ +int makeFixedLayout(BlockCount totalBlocks, + PhysicalBlockNumber startOffset, + FixedLayout **layoutPtr) + __attribute__((warn_unused_result)); + +/** + * Free the fixed layout and null out the reference to it. + * + * @param layoutPtr The reference to the layout to free + * + * @note all partitions created by this layout become invalid pointers + **/ +void freeFixedLayout(FixedLayout **layoutPtr); + +/** + * Get the total size of the layout in blocks. + * + * @param layout The layout + * + * @return The size of the layout + **/ +BlockCount getTotalFixedLayoutSize(const FixedLayout *layout) + __attribute__((warn_unused_result)); + +/** + * Get a partition by id. + * + * @param layout The layout from which to get a partition + * @param id The id of the partition + * @param partitionPtr A pointer to hold the partition + * + * @return VDO_SUCCESS or an error + **/ +int getPartition(FixedLayout *layout, PartitionID id, Partition **partitionPtr) + __attribute__((warn_unused_result)); + +/** + * Translate a block number from the partition's view to the layer's + * + * @param partition The partition to use for translation + * @param partitionBlockNumber The block number relative to the partition + * @param layerBlockNumber The block number relative to the layer + * + * @return VDO_SUCCESS or an error code + **/ +int translateToPBN(const Partition *partition, + PhysicalBlockNumber partitionBlockNumber, + PhysicalBlockNumber *layerBlockNumber) + __attribute__((warn_unused_result)); + +/** + * Translate a block number from the layer's view to the partition's. + * This is the inverse of translateToPBN(). + * + * @param partition The partition to use for translation + * @param layerBlockNumber The block number relative to the layer + * @param partitionBlockNumber The block number relative to the partition + * + * @return VDO_SUCCESS or an error code + **/ +int translateFromPBN(const Partition *partition, + PhysicalBlockNumber layerBlockNumber, + PhysicalBlockNumber *partitionBlockNumber) + __attribute__((warn_unused_result)); + +/** + * Return the number of unallocated blocks available. + * + * @param layout the fixed layout + * + * @return the number of blocks yet unallocated to partitions + **/ +BlockCount getFixedLayoutBlocksAvailable(const FixedLayout *layout) + __attribute__((warn_unused_result)); + +/** + * Create a new partition from the beginning or end of the unused space + * within a fixed layout. 
+ * + * @param layout the fixed layout + * @param id the id of the partition to make + * @param blockCount the number of blocks to carve out, if set + * to ALL_FREE_BLOCKS, all remaining blocks will + * be used + * @param direction whether to carve out from beginning or end + * @param base the number of the first block in the partition + * from the point of view of its users + * + * @return a success or error code, particularly + * VDO_NO_SPACE if there are less than blockCount blocks remaining + **/ +int makeFixedLayoutPartition(FixedLayout *layout, + PartitionID id, + BlockCount blockCount, + PartitionDirection direction, + PhysicalBlockNumber base) + __attribute__((warn_unused_result)); + +/** + * Return the size in blocks of a partition. + * + * @param partition a partition of the fixedLayout + * + * @return the size of the partition in blocks + **/ +BlockCount getFixedLayoutPartitionSize(const Partition *partition) + __attribute__((warn_unused_result)); + +/** + * Get the first block of the partition in the layout. + * + * @param partition a partition of the fixedLayout + * + * @return the partition's offset in blocks + **/ +PhysicalBlockNumber getFixedLayoutPartitionOffset(const Partition *partition) + __attribute__((warn_unused_result)); + +/** + * Get the number of the first block in the partition from the partition users + * point of view. + * + * @param partition a partition of the fixedLayout + * + * @return the number of the first block in the partition + **/ +PhysicalBlockNumber getFixedLayoutPartitionBase(const Partition *partition) + __attribute__((warn_unused_result)); + +/** + * Get the size of an encoded layout + * + * @param layout The layout + * + * @return The encoded size of the layout + **/ +size_t getFixedLayoutEncodedSize(const FixedLayout *layout) + __attribute__((warn_unused_result)); + +/** + * Encode a layout into a buffer. + * + * @param layout The layout to encode + * @param buffer The buffer to encode into + * + * @return UDS_SUCCESS or an error + **/ +int encodeFixedLayout(const FixedLayout *layout, Buffer *buffer) + __attribute__((warn_unused_result)); + +/** + * Decode a fixed layout from a buffer. + * + * @param [in] buffer The buffer from which to decode + * @param [out] layoutPtr A pointer to hold the layout + * + * @return VDO_SUCCESS or an error + **/ +int decodeFixedLayout(Buffer *buffer, FixedLayout **layoutPtr) + __attribute__((warn_unused_result)); + +#endif // FIXED_LAYOUT_H diff --git a/source/vdo/base/flush.c b/source/vdo/base/flush.c new file mode 100644 index 0000000..4c6b94c --- /dev/null +++ b/source/vdo/base/flush.c @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/flush.c#3 $ + */ + +#include "flush.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "blockAllocator.h" +#include "completion.h" +#include "logicalZone.h" +#include "numUtils.h" +#include "readOnlyNotifier.h" +#include "slabDepot.h" +#include "vdoInternal.h" + +struct flusher { + VDOCompletion completion; + /** The VDO to which this flusher belongs */ + VDO *vdo; + /** The current flush generation of the VDO */ + SequenceNumber flushGeneration; + /** The first unacknowledged flush generation */ + SequenceNumber firstUnacknowledgedGeneration; + /** The queue of flush requests waiting to notify other threads */ + WaitQueue notifiers; + /** The queue of flush requests waiting for VIOs to complete */ + WaitQueue pendingFlushes; + /** The flush generation for which notifications are being sent */ + SequenceNumber notifyGeneration; + /** The logical zone to notify next */ + LogicalZone *logicalZoneToNotify; + /** The ID of the thread on which flush requests should be made */ + ThreadID threadID; +}; + +/** + * Convert a generic VDOCompletion to a Flusher. + * + * @param completion The completion to convert + * + * @return The completion as a Flusher + **/ +static Flusher *asFlusher(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(Flusher, completion) == 0); + assertCompletionType(completion->type, FLUSH_NOTIFICATION_COMPLETION); + return (Flusher *) completion; +} + +/** + * Convert a VDOFlush's generic wait queue entry back to the VDOFlush. + * + * @param waiter The wait queue entry to convert + * + * @return The wait queue entry as a VDOFlush + **/ +static VDOFlush *waiterAsFlush(Waiter *waiter) +{ + STATIC_ASSERT(offsetof(VDOFlush, waiter) == 0); + return (VDOFlush *) waiter; +} + +/**********************************************************************/ +int makeFlusher(VDO *vdo) +{ + int result = ALLOCATE(1, Flusher, __func__, &vdo->flusher); + if (result != VDO_SUCCESS) { + return result; + } + + vdo->flusher->vdo = vdo; + vdo->flusher->threadID = getPackerZoneThread(getThreadConfig(vdo)); + return initializeEnqueueableCompletion(&vdo->flusher->completion, + FLUSH_NOTIFICATION_COMPLETION, + vdo->layer); +} + +/**********************************************************************/ +void freeFlusher(Flusher **flusherPtr) +{ + if (*flusherPtr == NULL) { + return; + } + + Flusher *flusher = *flusherPtr; + destroyEnqueueable(&flusher->completion); + FREE(flusher); + *flusherPtr = NULL; +} + +/**********************************************************************/ +ThreadID getFlusherThreadID(Flusher *flusher) +{ + return flusher->threadID; +} + +/**********************************************************************/ +static void notifyFlush(Flusher *flusher); + +/** + * Finish the notification process by checking if any flushes have completed + * and then starting the notification of the next flush request if one came in + * while the current notification was in progress. This callback is registered + * in flushPackerCallback(). 
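/*
 * Illustrative sketch, not part of the patch: the same "embed the completion
 * as the first member" downcast pattern that asFlusher() above relies on,
 * shown for an invented FooCompletion wrapper. FooCompletion and fooState are
 * not real names in this codebase.
 */
typedef struct {
  VDOCompletion completion;  // must stay first so the cast below is valid
  int fooState;
} FooCompletion;

static inline FooCompletion *asFooCompletion(VDOCompletion *completion)
{
  STATIC_ASSERT(offsetof(FooCompletion, completion) == 0);
  // A real converter would also check completion->type, as asFlusher() does.
  return (FooCompletion *) completion;
}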
 + * + * @param completion The flusher completion + **/ +static void finishNotification(VDOCompletion *completion) +{ + Flusher *flusher = asFlusher(completion); + ASSERT_LOG_ONLY((getCallbackThreadID() == flusher->threadID), + "finishNotification() called from flusher thread"); + + Waiter *waiter = dequeueNextWaiter(&flusher->notifiers); + int result = enqueueWaiter(&flusher->pendingFlushes, waiter); + if (result != VDO_SUCCESS) { + enterReadOnlyMode(flusher->vdo->readOnlyNotifier, result); + VDOFlush *flush = waiterAsFlush(waiter); + completion->layer->completeFlush(&flush); + return; + } + + completeFlushes(flusher); + if (hasWaiters(&flusher->notifiers)) { + notifyFlush(flusher); + } +} + +/** + * Flush the packer now that all of the logical and physical zones have been + * notified of the new flush request. This callback is registered in + * incrementGeneration(). + * + * @param completion The flusher completion + **/ +static void flushPackerCallback(VDOCompletion *completion) +{ + Flusher *flusher = asFlusher(completion); + incrementPackerFlushGeneration(flusher->vdo->packer); + launchCallback(completion, finishNotification, flusher->threadID); +} + +/** + * Increment the flush generation in a logical zone. If there are more logical + * zones, go on to the next one; otherwise, flush the packer. This + * callback is registered both in notifyFlush() and in itself. + * + * @param completion The flusher as a completion + **/ +static void incrementGeneration(VDOCompletion *completion) +{ + Flusher *flusher = asFlusher(completion); + incrementFlushGeneration(flusher->logicalZoneToNotify, + flusher->notifyGeneration); + flusher->logicalZoneToNotify + = getNextLogicalZone(flusher->logicalZoneToNotify); + if (flusher->logicalZoneToNotify == NULL) { + launchCallback(completion, flushPackerCallback, flusher->threadID); + return; + } + + launchCallback(completion, incrementGeneration, + getLogicalZoneThreadID(flusher->logicalZoneToNotify)); +} + +/** + * Launch a flush notification. 
+ * + * @param flusher The flusher doing the notification + **/ +static void notifyFlush(Flusher *flusher) +{ + VDOFlush *flush = waiterAsFlush(getFirstWaiter(&flusher->notifiers)); + flusher->notifyGeneration = flush->flushGeneration; + flusher->logicalZoneToNotify = getLogicalZone(flusher->vdo->logicalZones, 0); + flusher->completion.requeue = true; + launchCallback(&flusher->completion, incrementGeneration, + getLogicalZoneThreadID(flusher->logicalZoneToNotify)); +} + +/**********************************************************************/ +void flush(VDO *vdo, VDOFlush *flush) +{ + Flusher *flusher = vdo->flusher; + ASSERT_LOG_ONLY((getCallbackThreadID() == flusher->threadID), + "flush() called from flusher thread"); + + flush->flushGeneration = flusher->flushGeneration++; + bool mayNotify = !hasWaiters(&flusher->notifiers); + + int result = enqueueWaiter(&flusher->notifiers, &flush->waiter); + if (result != VDO_SUCCESS) { + enterReadOnlyMode(vdo->readOnlyNotifier, result); + flusher->completion.layer->completeFlush(&flush); + return; + } + + if (mayNotify) { + notifyFlush(flusher); + } +} + +/**********************************************************************/ +void completeFlushes(Flusher *flusher) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() == flusher->threadID), + "completeFlushes() called from flusher thread"); + + SequenceNumber oldestActiveGeneration = UINT64_MAX; + for (LogicalZone *zone = getLogicalZone(flusher->vdo->logicalZones, 0); + zone != NULL; + zone = getNextLogicalZone(zone)) { + SequenceNumber oldestInZone = getOldestLockedGeneration(zone); + oldestActiveGeneration = minSequenceNumber(oldestActiveGeneration, + oldestInZone); + } + + while (hasWaiters(&flusher->pendingFlushes)) { + VDOFlush *flush = waiterAsFlush(getFirstWaiter(&flusher->pendingFlushes)); + if (flush->flushGeneration >= oldestActiveGeneration) { + return; + } + + ASSERT_LOG_ONLY((flush->flushGeneration + == flusher->firstUnacknowledgedGeneration), + "acknowledged next expected flush, %" PRIu64 + ", was: %llu", + flusher->firstUnacknowledgedGeneration, + flush->flushGeneration); + dequeueNextWaiter(&flusher->pendingFlushes); + flusher->completion.layer->completeFlush(&flush); + flusher->firstUnacknowledgedGeneration++; + } +} + +/**********************************************************************/ +void dumpFlusher(const Flusher *flusher) +{ + logInfo("Flusher"); + logInfo(" flushGeneration=%" PRIu64 + " firstUnacknowledgedGeneration=%llu", + flusher->flushGeneration, flusher->firstUnacknowledgedGeneration); + logInfo(" notifiers queue is %s; pendingFlushes queue is %s", + (hasWaiters(&flusher->notifiers) ? "not empty" : "empty"), + (hasWaiters(&flusher->pendingFlushes) ? "not empty" : "empty")); +} diff --git a/source/vdo/base/flush.h b/source/vdo/base/flush.h new file mode 100644 index 0000000..da7c8bc --- /dev/null +++ b/source/vdo/base/flush.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/flush.h#1 $ + */ + +#ifndef FLUSH_H +#define FLUSH_H + +#include "types.h" +#include "waitQueue.h" + +/** + * A marker for tracking which journal entries are affected by a flush request. + **/ +struct vdoFlush { + /** The wait queue entry for this flush */ + Waiter waiter; + /** Which flush this struct represents */ + SequenceNumber flushGeneration; +}; + +/** + * Make a flusher for a VDO. + * + * @param vdo The VDO which owns the flusher + * + * @return VDO_SUCCESS or an error + **/ +int makeFlusher(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Free a flusher and null out the reference to it. + * + * @param flusherPtr A pointer to the flusher to free + **/ +void freeFlusher(Flusher **flusherPtr); + +/** + * Get the ID of the thread on which flusher functions should be called. + * + * @param flusher The flusher to query + * + * @return The ID of the thread which handles the flusher + **/ +ThreadID getFlusherThreadID(Flusher *flusher) + __attribute__((warn_unused_result)); + +/** + * Handle empty flush requests. + * + * @param vdo The VDO + * @param vdoFlush The opaque flush request + **/ +void flush(VDO *vdo, VDOFlush *vdoFlush); + +/** + * Attempt to complete any flushes which might have finished. + * + * @param flusher The flusher + **/ +void completeFlushes(Flusher *flusher); + +/** + * Dump the flusher, in a thread-unsafe fashion. + * + * @param flusher The flusher + **/ +void dumpFlusher(const Flusher *flusher); + +#endif /* FLUSH_H */ diff --git a/source/vdo/base/forest.c b/source/vdo/base/forest.c new file mode 100644 index 0000000..eabd6c3 --- /dev/null +++ b/source/vdo/base/forest.c @@ -0,0 +1,565 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
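/*
 * Illustrative sketch, not part of the patch: how a physical layer might wrap
 * and submit an empty flush. LayerFlushRequest and submitFlushSketch() are
 * invented for this sketch; flush() and getFlusherThreadID() are declared in
 * flush.h above. The submission must happen on the flusher's thread, and the
 * request comes back through the layer's completeFlush() hook once its
 * generation has been persisted.
 */
typedef struct {
  VDOFlush vdoFlush;   // kept first so the VDOFlush pointer is easy to recover
  void *layerPrivate;  // whatever the layer needs in order to acknowledge it
} LayerFlushRequest;

static void submitFlushSketch(VDO *vdo, LayerFlushRequest *request)
{
  // flush() assigns the next flush generation and queues the request.
  flush(vdo, &request->vdoFlush);
}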
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/forest.c#8 $ + */ + +#include "forest.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "blockMap.h" +#include "blockMapInternals.h" +#include "blockMapPage.h" +#include "blockMapTree.h" +#include "blockMapTreeInternals.h" +#include "constants.h" +#include "dirtyLists.h" +#include "forest.h" +#include "numUtils.h" +#include "recoveryJournal.h" +#include "slabDepot.h" +#include "slabJournal.h" +#include "types.h" +#include "vdoInternal.h" +#include "vio.h" +#include "vioPool.h" + +enum { + BLOCK_MAP_VIO_POOL_SIZE = 64, +}; + +typedef struct { + TreePage *levels[BLOCK_MAP_TREE_HEIGHT]; +} BlockMapTreeSegment; + +typedef struct blockMapTree { + BlockMapTreeSegment *segments; +} BlockMapTree; + +struct forest { + BlockMap *map; + size_t segments; + Boundary *boundaries; + TreePage **pages; + BlockMapTree trees[]; +}; + +typedef struct { + PageNumber pageIndex; + SlotNumber slot; +} CursorLevel; + +typedef struct cursors Cursors; + +typedef struct { + Waiter waiter; + BlockMapTree *tree; + Height height; + Cursors *parent; + Boundary boundary; + CursorLevel levels[BLOCK_MAP_TREE_HEIGHT]; + VIOPoolEntry *vioPoolEntry; +} Cursor; + +struct cursors { + BlockMap *map; + BlockMapTreeZone *zone; + VIOPool *pool; + EntryCallback *entryCallback; + VDOCompletion *parent; + RootCount activeRoots; + Cursor cursors[]; +}; + +/**********************************************************************/ +TreePage *getTreePageByIndex(Forest *forest, + RootCount rootIndex, + Height height, + PageNumber pageIndex) +{ + PageNumber offset = 0; + for (size_t segment = 0; segment < forest->segments; segment++) { + PageNumber border = forest->boundaries[segment].levels[height - 1]; + if (pageIndex < border) { + BlockMapTree *tree = &forest->trees[rootIndex]; + return &(tree->segments[segment].levels[height - 1][pageIndex - offset]); + } + offset = border; + } + + return NULL; +} + +/** + * Compute the number of pages which must be allocated at each level in order + * to grow the forest to a new number of entries. + * + * @param [in] rootCount The number of roots + * @param [in] flatPageCount The number of flat block map pages + * @param [in] oldSizes The current size of the forest at each level + * @param [in] entries The new number of entries the block map must + * address + * @param [out] newSizes The new size of the forest at each level + * + * @return The total number of non-leaf pages required + **/ +static BlockCount computeNewPages(RootCount rootCount, + BlockCount flatPageCount, + Boundary *oldSizes, + BlockCount entries, + Boundary *newSizes) +{ + PageCount leafPages + = maxPageCount(computeBlockMapPageCount(entries) - flatPageCount, 1); + PageCount levelSize = computeBucketCount(leafPages, rootCount); + BlockCount totalPages = 0; + for (Height height = 0; height < BLOCK_MAP_TREE_HEIGHT; height++) { + levelSize = computeBucketCount(levelSize, BLOCK_MAP_ENTRIES_PER_PAGE); + newSizes->levels[height] = levelSize; + BlockCount newPages = levelSize; + if (oldSizes != NULL) { + newPages -= oldSizes->levels[height]; + } + totalPages += (newPages * rootCount); + } + + return totalPages; +} + +/**********************************************************************/ +static int makeSegment(Forest *oldForest, + BlockCount newPages, + Boundary *newBoundary, + Forest *forest) +{ + size_t index = (oldForest == NULL) ? 
0 : oldForest->segments; + forest->segments = index + 1; + + int result = ALLOCATE(forest->segments, Boundary, "forest boundary array", + &forest->boundaries); + if (result != VDO_SUCCESS) { + return result; + } + + result = ALLOCATE(forest->segments, TreePage *, "forest page pointers", + &forest->pages); + if (result != VDO_SUCCESS) { + return result; + } + + result = ALLOCATE(newPages, TreePage, "new forest pages", + &forest->pages[index]); + if (result != VDO_SUCCESS) { + return result; + } + + if (index > 0) { + memcpy(forest->boundaries, oldForest->boundaries, + index * sizeof(Boundary)); + memcpy(forest->pages, oldForest->pages, index * sizeof(TreePage *)); + } + + memcpy(&(forest->boundaries[index]), newBoundary, sizeof(Boundary)); + + PageCount segmentSizes[BLOCK_MAP_TREE_HEIGHT]; + for (Height height = 0; height < BLOCK_MAP_TREE_HEIGHT; height++) { + segmentSizes[height] = newBoundary->levels[height]; + if (index > 0) { + segmentSizes[height] -= oldForest->boundaries[index - 1].levels[height]; + } + } + + TreePage *pagePtr = forest->pages[index]; + for (RootCount root = 0; root < forest->map->rootCount; root++) { + BlockMapTree *tree = &(forest->trees[root]); + int result = ALLOCATE(forest->segments, BlockMapTreeSegment, + "tree root segments", &tree->segments); + if (result != VDO_SUCCESS) { + return result; + } + + if (index > 0) { + memcpy(tree->segments, oldForest->trees[root].segments, + index * sizeof(BlockMapTreeSegment)); + } + + BlockMapTreeSegment *segment = &(tree->segments[index]); + for (Height height = 0; height < BLOCK_MAP_TREE_HEIGHT; height++) { + if (segmentSizes[height] == 0) { + continue; + } + + segment->levels[height] = pagePtr; + if (height == (BLOCK_MAP_TREE_HEIGHT - 1)) { + // Record the root. + BlockMapPage *page = formatBlockMapPage(pagePtr->pageBuffer, + forest->map->nonce, + INVALID_PBN, true); + page->entries[0] = packPBN(forest->map->rootOrigin + root, + MAPPING_STATE_UNCOMPRESSED); + } + pagePtr += segmentSizes[height]; + } + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void deforest(Forest *forest, size_t firstPageSegment) +{ + if (forest->pages != NULL) { + for (size_t segment = firstPageSegment; segment < forest->segments; + segment++) { + FREE(forest->pages[segment]); + } + FREE(forest->pages); + } + + for (RootCount root = 0; root < forest->map->rootCount; root++) { + BlockMapTree *tree = &(forest->trees[root]); + FREE(tree->segments); + } + + FREE(forest->boundaries); + FREE(forest); +} + +/**********************************************************************/ +int makeForest(BlockMap *map, BlockCount entries) +{ + STATIC_ASSERT(offsetof(TreePage, waiter) == 0); + + Forest *oldForest = map->forest; + Boundary *oldBoundary = NULL; + if (oldForest != NULL) { + oldBoundary = &(oldForest->boundaries[oldForest->segments - 1]); + } + + Boundary newBoundary; + BlockCount newPages = computeNewPages(map->rootCount, map->flatPageCount, + oldBoundary, entries, &newBoundary); + if (newPages == 0) { + map->nextEntryCount = entries; + return VDO_SUCCESS; + } + + Forest *forest; + int result = ALLOCATE_EXTENDED(Forest, map->rootCount, BlockMapTree, + __func__, &forest); + if (result != VDO_SUCCESS) { + return result; + } + + forest->map = map; + result = makeSegment(oldForest, newPages, &newBoundary, forest); + if (result != VDO_SUCCESS) { + deforest(forest, forest->segments - 1); + return result; + } + + map->nextForest = forest; + map->nextEntryCount = entries; + return VDO_SUCCESS; +} + 
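The per-level arithmetic used by computeNewPages() above may be easier to follow with a small standalone sketch. The snippet below mirrors only the grow-from-scratch case (oldSizes == NULL, no flat pages): leaf pages are divided among the roots, and each higher level needs one page per page-worth of entries below it. The constants (812 entries per page, a tree height of 5) and the sample inputs are illustrative assumptions for this sketch, not values taken from this patch.

#include <stdio.h>

/* Assumed stand-ins for BLOCK_MAP_ENTRIES_PER_PAGE and BLOCK_MAP_TREE_HEIGHT. */
enum {
  EXAMPLE_ENTRIES_PER_PAGE = 812,
  EXAMPLE_TREE_HEIGHT      = 5
};

/* Round-up division, playing the role of computeBucketCount(). */
static unsigned long buckets(unsigned long count, unsigned long size)
{
  return (count + size - 1) / size;
}

int main(void)
{
  unsigned long rootCount = 60;                   /* trees in the forest (example) */
  unsigned long entries   = 100UL * 1000 * 1000;  /* logical blocks to map (example) */

  /* Leaf pages for the whole map, then the share handled by each root. */
  unsigned long leafPages = buckets(entries, EXAMPLE_ENTRIES_PER_PAGE);
  unsigned long levelSize = buckets(leafPages, rootCount);
  unsigned long totalNonLeaf = 0;

  /* Each level up needs one page per EXAMPLE_ENTRIES_PER_PAGE pages below it. */
  for (unsigned int height = 0; height < EXAMPLE_TREE_HEIGHT; height++) {
    levelSize = buckets(levelSize, EXAMPLE_ENTRIES_PER_PAGE);
    printf("height %u: %lu page(s) per root\n", height, levelSize);
    totalNonLeaf += levelSize * rootCount;
  }
  printf("non-leaf pages across all roots: %lu\n", totalNonLeaf);
  return 0;
}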
+/**********************************************************************/ +void freeForest(Forest **forestPtr) +{ + Forest *forest = *forestPtr; + if (forest == NULL) { + return; + } + + deforest(forest, 0); + *forestPtr = NULL; +} + +/**********************************************************************/ +void abandonForest(BlockMap *map) +{ + Forest *forest = map->nextForest; + map->nextForest = NULL; + if (forest != NULL) { + deforest(forest, forest->segments - 1); + } + + map->nextEntryCount = 0; +} + +/**********************************************************************/ +void replaceForest(BlockMap *map) +{ + if (map->nextForest != NULL) { + if (map->forest != NULL) { + deforest(map->forest, map->forest->segments); + } + map->forest = map->nextForest; + map->nextForest = NULL; + } + + map->entryCount = map->nextEntryCount; + map->nextEntryCount = 0; +} + +/** + * Finish the traversal of a single tree. If it was the last cursor, finish + * the traversal. + * + * @param cursor The cursor doing the traversal + **/ +static void finishCursor(Cursor *cursor) +{ + Cursors *cursors = cursor->parent; + returnVIOToPool(cursors->pool, cursor->vioPoolEntry); + if (--cursors->activeRoots > 0) { + return; + } + + VDOCompletion *parent = cursors->parent; + FREE(cursors); + + finishCompletion(parent, VDO_SUCCESS); +} + +/**********************************************************************/ +static void traverse(Cursor *cursor); + +/** + * Continue traversing a block map tree. + * + * @param completion The VIO doing a read or write + **/ +static void continueTraversal(VDOCompletion *completion) +{ + VIOPoolEntry *poolEntry = completion->parent; + Cursor *cursor = poolEntry->parent; + traverse(cursor); +} + +/** + * Continue traversing a block map tree now that a page has been loaded. + * + * @param completion The VIO doing the read + **/ +static void finishTraversalLoad(VDOCompletion *completion) +{ + VIOPoolEntry *entry = completion->parent; + Cursor *cursor = entry->parent; + Height height = cursor->height; + CursorLevel *level = &cursor->levels[height]; + + TreePage *treePage + = &(cursor->tree->segments[0].levels[height][level->pageIndex]); + BlockMapPage *page = (BlockMapPage *) treePage->pageBuffer; + copyValidPage(entry->buffer, cursor->parent->map->nonce, + entry->vio->physical, page); + traverse(cursor); +} + +/** + * Traverse a single block map tree. This is the recursive heart of the + * traversal process. + * + * @param cursor The cursor doing the traversal + **/ +static void traverse(Cursor *cursor) +{ + for (; cursor->height < BLOCK_MAP_TREE_HEIGHT; cursor->height++) { + Height height = cursor->height; + CursorLevel *level = &cursor->levels[height]; + TreePage *treePage + = &(cursor->tree->segments[0].levels[height][level->pageIndex]); + BlockMapPage *page = (BlockMapPage *) treePage->pageBuffer; + if (!isBlockMapPageInitialized(page)) { + continue; + } + + for (; level->slot < BLOCK_MAP_ENTRIES_PER_PAGE; level->slot++) { + DataLocation location = unpackBlockMapEntry(&page->entries[level->slot]); + if (!isValidLocation(&location)) { + // This entry is invalid, so remove it from the page. + page->entries[level->slot] + = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); + writeTreePage(treePage, cursor->parent->zone); + continue; + } + + if (!isMappedLocation(&location)) { + continue; + } + + PageNumber entryIndex + = (BLOCK_MAP_ENTRIES_PER_PAGE * level->pageIndex) + level->slot; + + // Erase mapped entries past the end of the logical space. 
+ if (entryIndex >= cursor->boundary.levels[height]) { + page->entries[level->slot] + = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); + writeTreePage(treePage, cursor->parent->zone); + continue; + } + + if (cursor->height < BLOCK_MAP_TREE_HEIGHT - 1) { + int result = cursor->parent->entryCallback(location.pbn, + cursor->parent->parent); + if (result != VDO_SUCCESS) { + page->entries[level->slot] + = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); + writeTreePage(treePage, cursor->parent->zone); + continue; + } + } + + if (cursor->height == 0) { + continue; + } + + cursor->height--; + CursorLevel *nextLevel = &cursor->levels[cursor->height]; + nextLevel->pageIndex = entryIndex; + nextLevel->slot = 0; + level->slot++; + launchReadMetadataVIO(cursor->vioPoolEntry->vio, location.pbn, + finishTraversalLoad, continueTraversal); + return; + } + } + + finishCursor(cursor); +} + +/** + * Start traversing a single block map tree now that the Cursor has a VIO with + * which to load pages. + * + *
Implements WaiterCallback. + * + * @param waiter The Cursor + * @param context The VIOPoolEntry just acquired + **/ +static void launchCursor(Waiter *waiter, void *context) +{ + STATIC_ASSERT(offsetof(Cursor, waiter) == 0); + Cursor *cursor = (Cursor *) waiter; + cursor->vioPoolEntry = (VIOPoolEntry *) context; + cursor->vioPoolEntry->parent = cursor; + vioAsCompletion(cursor->vioPoolEntry->vio)->callbackThreadID + = cursor->parent->zone->mapZone->threadID; + traverse(cursor); +} + +/** + * Compute the number of pages used at each level of the given root's tree. + * + * @param map The block map + * @param rootIndex The index of the root to measure + * + * @return The list of page counts as a Boundary + **/ +static Boundary computeBoundary(BlockMap *map, RootCount rootIndex) +{ + PageCount leafPages = computeBlockMapPageCount(map->entryCount); + PageCount treeLeafPages = leafPages - map->flatPageCount; + + /* + * Compute the leaf pages for this root. If the number of leaf pages does + * not distribute evenly, we must determine if this root gets an extra page. + * Extra pages are assigned to roots starting at firstTreeRoot and going up. + */ + PageCount firstTreeRoot = map->flatPageCount % map->rootCount; + PageCount lastTreeRoot = (leafPages - 1) % map->rootCount; + + PageCount levelPages = treeLeafPages / map->rootCount; + if (inCyclicRange(firstTreeRoot, rootIndex, lastTreeRoot, map->rootCount)) { + levelPages++; + } + + Boundary boundary; + for (Height height = 0; height < BLOCK_MAP_TREE_HEIGHT - 1; height++) { + boundary.levels[height] = levelPages; + levelPages = computeBucketCount(levelPages, BLOCK_MAP_ENTRIES_PER_PAGE); + } + + // The root node always exists, even if the root is otherwise unused. + boundary.levels[BLOCK_MAP_TREE_HEIGHT - 1] = 1; + + return boundary; +} + +/**********************************************************************/ +void traverseForest(BlockMap *map, + EntryCallback *entryCallback, + VDOCompletion *parent) +{ + if (computeBlockMapPageCount(map->entryCount) <= map->flatPageCount) { + // There are no tree pages, so there's nothing to do. + finishCompletion(parent, VDO_SUCCESS); + return; + } + + Cursors *cursors; + int result = ALLOCATE_EXTENDED(Cursors, map->rootCount, Cursor, __func__, + &cursors); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + cursors->map = map; + cursors->zone = &(getBlockMapZone(map, 0)->treeZone); + cursors->pool = cursors->zone->vioPool; + cursors->entryCallback = entryCallback; + cursors->parent = parent; + cursors->activeRoots = map->rootCount; + for (RootCount root = 0; root < map->rootCount; root++) { + Cursor *cursor = &cursors->cursors[root]; + *cursor = (Cursor) { + .tree = &map->forest->trees[root], + .height = BLOCK_MAP_TREE_HEIGHT - 1, + .parent = cursors, + .boundary = computeBoundary(map, root), + }; + + cursor->waiter.callback = launchCursor; + acquireVIOFromPool(cursors->pool, &cursor->waiter); + }; +} + +/**********************************************************************/ +BlockCount computeForestSize(BlockCount logicalBlocks, RootCount rootCount) +{ + Boundary newSizes; + BlockCount approximateNonLeaves + = computeNewPages(rootCount, 0, NULL, logicalBlocks, &newSizes); + + // Exclude the tree roots since those aren't allocated from slabs, + // and also exclude the super-roots, which only exist in memory. 
+ approximateNonLeaves + -= rootCount * (newSizes.levels[BLOCK_MAP_TREE_HEIGHT - 2] + + newSizes.levels[BLOCK_MAP_TREE_HEIGHT - 1]); + + BlockCount approximateLeaves + = computeBlockMapPageCount(logicalBlocks - approximateNonLeaves); + + // This can be a slight over-estimate since the tree will never have to + // address these blocks, so it might be a tiny bit smaller. + return (approximateNonLeaves + approximateLeaves); +} diff --git a/source/vdo/base/forest.h b/source/vdo/base/forest.h new file mode 100644 index 0000000..9a5a7cf --- /dev/null +++ b/source/vdo/base/forest.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/forest.h#2 $ + */ + +#ifndef FOREST_H +#define FOREST_H + +#include "blockMapTree.h" +#include "types.h" + +/** + * A function to be called for each allocated PBN when traversing the forest. + * + * @param pbn A PBN of a tree node + * @param completion The parent completion of the traversal + * + * @return VDO_SUCCESS or an error + **/ +typedef int EntryCallback(PhysicalBlockNumber pbn, VDOCompletion *completion); + +/** + * Get the tree page for a given height and page index. + * + * @param forest The forest which holds the page + * @param rootIndex The index of the tree that holds the page + * @param height The height of the desired page + * @param pageIndex The index of the desired page + * + * @return The requested page + **/ +TreePage *getTreePageByIndex(Forest *forest, + RootCount rootIndex, + Height height, + PageNumber pageIndex) + __attribute__((warn_unused_result)); + +/** + * Make a collection of trees for a BlockMap, expanding the existing forest if + * there is one. + * + * @param map The block map + * @param entries The number of entries the block map will hold + * + * @return VDO_SUCCESS or an error + **/ +int makeForest(BlockMap *map, BlockCount entries) + __attribute__((warn_unused_result)); + +/** + * Free a forest and all of the segments it contains and NULL out the reference + * to it. + * + * @param forestPtr A pointer to the forest to free + **/ +void freeForest(Forest **forestPtr); + +/** + * Abandon the unused next forest from a BlockMap. + * + * @param map The block map + **/ +void abandonForest(BlockMap *map); + +/** + * Replace a BlockMap's Forest with the already-prepared larger forest. + * + * @param map The block map + **/ +void replaceForest(BlockMap *map); + +/** + * Walk the entire forest of a block map. 
+ *
+ * @param map            The block map to traverse
+ * @param entryCallback  A function to call with the pbn of each allocated node
+ *                       in the forest
+ * @param parent         The completion to notify on each traversed PBN, and
+ *                       when the traversal is complete
+ **/
+void traverseForest(BlockMap       *map,
+                    EntryCallback  *entryCallback,
+                    VDOCompletion  *parent);
+
+/**
+ * Compute the approximate number of pages which the forest will allocate in
+ * order to map the specified number of logical blocks. This method assumes
+ * that the block map is entirely arboreal.
+ *
+ * @param logicalBlocks  The number of blocks to map
+ * @param rootCount      The number of trees in the forest
+ *
+ * @return A (slight) over-estimate of the total number of possible forest
+ *         pages including the leaves
+ **/
+BlockCount computeForestSize(BlockCount logicalBlocks, RootCount rootCount)
+  __attribute__((warn_unused_result));
+#endif // FOREST_H
diff --git a/source/vdo/base/hashLock.c b/source/vdo/base/hashLock.c
new file mode 100644
index 0000000..8494f1d
--- /dev/null
+++ b/source/vdo/base/hashLock.c
@@ -0,0 +1,1605 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/hashLock.c#5 $
+ */
+
+/**
+ * HashLock controls and coordinates writing, index access, and dedupe among
+ * groups of DataVIOs concurrently writing identical blocks, allowing them to
+ * deduplicate not only against advice but also against each other. This saves
+ * on index queries and allows those DataVIOs to concurrently deduplicate
+ * against a single block instead of being serialized through a PBN read lock.
+ * Only one index query is needed for each HashLock, instead of one for every
+ * DataVIO.
+ *
+ * A HashLock acts like a state machine perhaps more than as a lock. Other
+ * than the starting and ending states INITIALIZING and DESTROYING, every
+ * state represents and is held for the duration of an asynchronous operation.
+ * All state transitions are performed on the thread of the HashZone
+ * containing the lock. An asynchronous operation is almost always performed
+ * upon entering a state, and the callback from that operation triggers
+ * exiting the state and entering a new state.
+ *
+ * In all states except DEDUPING, there is a single DataVIO, called the lock
+ * agent, performing the asynchronous operations on behalf of the lock. The
+ * agent will change during the lifetime of the lock if the lock is shared by
+ * more than one DataVIO. DataVIOs waiting to deduplicate are kept on a wait
+ * queue. Viewed a different way, the agent holds the lock exclusively until
+ * the lock enters the DEDUPING state, at which point it becomes a shared lock
+ * that all the waiters (and any new DataVIOs that arrive) use to share a PBN
+ * lock. In state DEDUPING, there is no agent.
When the last DataVIO in the + * lock calls back in DEDUPING, it becomes the agent and the lock becomes + * exclusive again. New DataVIOs that arrive in the lock will also go on the + * wait queue. + * + * The existence of lock waiters is a key factor controlling which state the + * lock transitions to next. When the lock is new or has waiters, it will + * always try to reach DEDUPING, and when it doesn't, it will try to clean up + * and exit. + * + * Deduping requires holding a PBN lock on a block that is known to contain + * data identical to the DataVIOs in the lock, so the lock will send the + * agent to the duplicate zone to acquire the PBN lock (LOCKING), to the + * kernel I/O threads to read and verify the data (VERIFYING), or to write a + * new copy of the data to a full data block or a slot in a compressed block + * (WRITING). + * + * Cleaning up consists of updating the index when the data location is + * different from the initial index query (UPDATING, triggered by stale + * advice, compression, and rollover), releasing the PBN lock on the duplicate + * block (UNLOCKING), and releasing the HashLock itself back to the hash zone + * (DESTROYING). + * + * The shortest sequence of states is for non-concurrent writes of new data: + * INITIALIZING -> QUERYING -> WRITING -> DESTROYING + * This sequence is short because no PBN read lock or index update is needed. + * + * Non-concurrent, finding valid advice looks like this (endpoints elided): + * -> QUERYING -> LOCKING -> VERIFYING -> DEDUPING -> UNLOCKING -> + * Or with stale advice (endpoints elided): + * -> QUERYING -> LOCKING -> VERIFYING -> UNLOCKING -> WRITING -> UPDATING -> + * + * When there are not enough available reference count increments available on + * a PBN for a DataVIO to deduplicate, a new lock is forked and the excess + * waiters roll over to the new lock (which goes directly to WRITING). The new + * lock takes the place of the old lock in the lock map so new DataVIOs will + * be directed to it. The two locks will proceed independently, but only the + * new lock will have the right to update the index (unless it also forks). + * + * Since rollover happens in a lock instance, once a valid data location has + * been selected, it will not change. QUERYING and WRITING are only performed + * once per lock lifetime. All other non-endpoint states can be re-entered. + * + * XXX still need doc on BYPASSING + * + * The function names in this module follow a convention referencing the + * states and transitions in the state machine diagram for VDOSTORY-190. + * [XXX link or repository path to it?] + * For example, for the LOCKING state, there are startLocking() and + * finishLocking() functions. startLocking() is invoked by the finish function + * of the state (or states) that transition to LOCKING. It performs the actual + * lock state change and must be invoked on the hash zone thread. + * finishLocking() is called by (or continued via callback from) the code + * actually obtaining the lock. It does any bookkeeping or decision-making + * required and invokes the appropriate start function of the state being + * transitioned to after LOCKING. 
+ **/ + +#include "hashLock.h" +#include "hashLockInternals.h" + +#include "logger.h" +#include "permassert.h" + +#include "compressionState.h" +#include "constants.h" +#include "dataVIO.h" +#include "hashZone.h" +#include "packer.h" +#include "pbnLock.h" +#include "physicalZone.h" +#include "ringNode.h" +#include "slab.h" +#include "slabDepot.h" +#include "trace.h" +#include "types.h" +#include "vdoInternal.h" +#include "vioWrite.h" +#include "waitQueue.h" + +static const char *LOCK_STATE_NAMES[] = { + [HASH_LOCK_BYPASSING] = "BYPASSING", + [HASH_LOCK_DEDUPING] = "DEDUPING", + [HASH_LOCK_DESTROYING] = "DESTROYING", + [HASH_LOCK_INITIALIZING] = "INITIALIZING", + [HASH_LOCK_LOCKING] = "LOCKING", + [HASH_LOCK_QUERYING] = "QUERYING", + [HASH_LOCK_UNLOCKING] = "UNLOCKING", + [HASH_LOCK_UPDATING] = "UPDATING", + [HASH_LOCK_VERIFYING] = "VERIFYING", + [HASH_LOCK_WRITING] = "WRITING", +}; + +// There are loops in the state diagram, so some forward decl's are needed. +static void startDeduping(HashLock *lock, DataVIO *agent, bool agentIsDone); +static void startLocking(HashLock *lock, DataVIO *agent); +static void startWriting(HashLock *lock, DataVIO *agent); +static void unlockDuplicatePBN(VDOCompletion *completion); +static void transferAllocationLock(DataVIO *dataVIO); + +/**********************************************************************/ +PBNLock *getDuplicateLock(DataVIO *dataVIO) +{ + if (dataVIO->hashLock == NULL) { + return NULL; + } + return dataVIO->hashLock->duplicateLock; +} + +/**********************************************************************/ +const char *getHashLockStateName(HashLockState state) +{ + // Catch if a state has been added without updating the name array. + STATIC_ASSERT((HASH_LOCK_DESTROYING + 1) == COUNT_OF(LOCK_STATE_NAMES)); + return (state < COUNT_OF(LOCK_STATE_NAMES)) ? LOCK_STATE_NAMES[state] : NULL; +} + +/** + * Set the current state of a hash lock. + * + * @param lock The lock to update + * @param newState The new state + **/ +static void setHashLockState(HashLock *lock, HashLockState newState) +{ + if (false) { + logWarning("XXX %" PRIptr " %s -> %s", (void *) lock, + getHashLockStateName(lock->state), + getHashLockStateName(newState)); + } + lock->state = newState; +} + +/** + * Assert that a DataVIO is the agent of its hash lock, and that this is being + * called in the hash zone. + * + * @param dataVIO The DataVIO expected to be the lock agent + * @param where A string describing the function making the assertion + **/ +static void assertHashLockAgent(DataVIO *dataVIO, const char *where) +{ + // Not safe to access the agent field except from the hash zone. + assertInHashZone(dataVIO); + ASSERT_LOG_ONLY(dataVIO == dataVIO->hashLock->agent, + "%s must be for the hash lock agent", where); +} + +/** + * Set or clear the lock agent. + * + * @param lock The hash lock to update + * @param newAgent The new lock agent (may be NULL to clear the agent) + **/ +static void setAgent(HashLock *lock, DataVIO *newAgent) +{ + lock->agent = newAgent; +} + +/** + * Set the duplicate lock held by a hash lock. May only be called in the + * physical zone of the PBN lock. 
+ *
+ * @param hashLock  The hash lock to update
+ * @param pbnLock   The PBN read lock to use as the duplicate lock
+ **/
+static void setDuplicateLock(HashLock *hashLock, PBNLock *pbnLock)
+{
+  ASSERT_LOG_ONLY((hashLock->duplicateLock == NULL),
+                  "hash lock must not already hold a duplicate lock");
+
+  pbnLock->holderCount += 1;
+  hashLock->duplicateLock = pbnLock;
+}
+
+/**
+ * Convert a pointer to the hashLockNode field in a DataVIO to the enclosing
+ * DataVIO.
+ *
+ * @param lockNode  The RingNode to convert
+ *
+ * @return A pointer to the DataVIO containing the RingNode
+ **/
+static inline DataVIO *dataVIOFromLockNode(RingNode *lockNode)
+{
+  return (DataVIO *) ((byte *) lockNode - offsetof(DataVIO, hashLockNode));
+}
+
+/**
+ * Remove the first DataVIO from the lock's wait queue and return it.
+ *
+ * @param lock  The lock containing the wait queue
+ *
+ * @return The first (oldest) waiter in the queue, or NULL if
+ *         the queue is empty
+ **/
+static inline DataVIO *dequeueLockWaiter(HashLock *lock)
+{
+  return waiterAsDataVIO(dequeueNextWaiter(&lock->waiters));
+}
+
+/**
+ * Continue processing a DataVIO that has been waiting for an event, setting
+ * the result from the event, and continuing in a specified callback function.
+ *
+ * @param dataVIO   The DataVIO to continue
+ * @param result    The current result (will not mask older errors)
+ * @param callback  The function in which to continue processing
+ **/
+static void continueDataVIOIn(DataVIO   *dataVIO,
+                              int        result,
+                              VDOAction *callback)
+{
+  dataVIOAsCompletion(dataVIO)->callback = callback;
+  continueDataVIO(dataVIO, result);
+}
+
+/**
+ * Set, change, or clear the hash lock a DataVIO is using. Updates the hash
+ * lock (or locks) to reflect the change in membership.
+ *
+ * @param dataVIO  The DataVIO to update
+ * @param newLock  The hash lock the DataVIO is joining
+ **/
+static void setHashLock(DataVIO *dataVIO, HashLock *newLock)
+{
+  HashLock *oldLock = dataVIO->hashLock;
+  if (oldLock != NULL) {
+    ASSERT_LOG_ONLY(dataVIO->hashZone != NULL,
+                    "must have a hash zone when holding a hash lock");
+    ASSERT_LOG_ONLY(!isRingEmpty(&dataVIO->hashLockNode),
+                    "must be on a hash lock ring when holding a hash lock");
+    ASSERT_LOG_ONLY(oldLock->referenceCount > 0,
+                    "hash lock reference must be counted");
+
+    if ((oldLock->state != HASH_LOCK_BYPASSING)
+        && (oldLock->state != HASH_LOCK_UNLOCKING)) {
+      // If the reference count goes to zero in a non-terminal state, we're
+      // most likely leaking this lock.
+      ASSERT_LOG_ONLY(oldLock->referenceCount > 1,
+                      "hash locks should only become unreferenced in"
+                      " a terminal state, not state %s",
+                      getHashLockStateName(oldLock->state));
+    }
+
+    unspliceRingNode(&dataVIO->hashLockNode);
+    oldLock->referenceCount -= 1;
+
+    dataVIO->hashLock = NULL;
+  }
+
+  if (newLock != NULL) {
+    // Keep all DataVIOs sharing the lock on a ring since they can complete in
+    // any order and we'll always need a pointer to one to compare data.
+    pushRingNode(&newLock->duplicateRing, &dataVIO->hashLockNode);
+    newLock->referenceCount += 1;
+
+    // XXX Not needed for VDOSTORY-190, but useful for checking whether a test
+    // is getting concurrent dedupe, and how much.
+    if (newLock->maxReferences < newLock->referenceCount) {
+      newLock->maxReferences = newLock->referenceCount;
+    }
+
+    dataVIO->hashLock = newLock;
+  }
+}
+
+/**
+ * Bottleneck for DataVIOs that have written or deduplicated and that are no
+ * longer needed to be an agent for the hash lock.
+ * + * @param dataVIO The DataVIO to complete and send to be cleaned up + **/ +static void exitHashLock(DataVIO *dataVIO) +{ + // XXX trace record? + + // Release the hash lock now, saving a thread transition in cleanup. + releaseHashLock(dataVIO); + + // Complete the DataVIO and start the clean-up path in vioWrite to release + // any locks it still holds. + finishDataVIO(dataVIO, VDO_SUCCESS); +} + +/** + * Retire the active lock agent, replacing it with the first lock waiter, and + * make the retired agent exit the hash lock. + * + * @param lock The hash lock to update + * + * @return The new lock agent (which will be NULL if there was no waiter) + **/ +static DataVIO *retireLockAgent(HashLock *lock) +{ + DataVIO *oldAgent = lock->agent; + DataVIO *newAgent = dequeueLockWaiter(lock); + setAgent(lock, newAgent); + exitHashLock(oldAgent); + if (newAgent != NULL) { + setDuplicateLocation(newAgent, lock->duplicate); + } + return newAgent; +} + +/** + * Callback to call compressData(), putting a DataVIO back on the write path. + * + * @param completion The DataVIO + **/ +static void compressDataCallback(VDOCompletion *completion) +{ + // XXX VDOSTORY-190 need an error check since compressData doesn't have one. + compressData(asDataVIO(completion)); +} + +/** + * Add a DataVIO to the lock's queue of waiters. + * + * @param lock The hash lock on which to wait + * @param dataVIO The DataVIO to add to the queue + **/ +static void waitOnHashLock(HashLock *lock, DataVIO *dataVIO) +{ + int result = enqueueDataVIO(&lock->waiters, dataVIO, THIS_LOCATION(NULL)); + if (result != VDO_SUCCESS) { + // This should be impossible, but if it somehow happens, give up on trying + // to dedupe the data. + setHashLock(dataVIO, NULL); + continueDataVIOIn(dataVIO, result, compressDataCallback); + return; + } + + // Make sure the agent doesn't block indefinitely in the packer since it now + // has at least one other DataVIO waiting on it. + if ((lock->state == HASH_LOCK_WRITING) && cancelCompression(lock->agent)) { + /* + * Even though we're waiting, we also have to send ourselves as a one-way + * message to the packer to ensure the agent continues executing. This is + * safe because cancelCompression() guarantees the agent won't continue + * executing until this message arrives in the packer, and because the + * wait queue link isn't used for sending the message. + */ + dataVIO->compression.lockHolder = lock->agent; + launchPackerCallback(dataVIO, removeLockHolderFromPacker, + THIS_LOCATION("$F;cb=removeLockHolderFromPacker")); + } +} + +/** + * WaiterCallback function that calls compressData on the DataVIO waiter. + * + * @param waiter The DataVIO's waiter link + * @param context Not used + **/ +static void compressWaiter(Waiter *waiter, + void *context __attribute__((unused))) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + dataVIO->isDuplicate = false; + compressData(dataVIO); +} + +/** + * Handle the result of the agent for the lock releasing a read lock on + * duplicate candidate due to aborting the hash lock. This continuation is + * registered in unlockDuplicatePBN(). 
+ * + * @param completion The completion of the DataVIO acting as the lock's agent + **/ +static void finishBypassing(VDOCompletion *completion) +{ + DataVIO *agent = asDataVIO(completion); + assertHashLockAgent(agent, __func__); + HashLock *lock = agent->hashLock; + + ASSERT_LOG_ONLY(lock->duplicateLock == NULL, + "must have released the duplicate lock for the hash lock"); + exitHashLock(agent); +} + +/** + * Stop using the hash lock, resuming the old write path for the lock agent + * and any DataVIOs waiting on it, and put it in a state where DataVIOs + * entering the lock will use the old dedupe path instead of waiting. + * + * @param lock The hash lock + * @param agent The DataVIO acting as the agent for the lock + **/ +static void startBypassing(HashLock *lock, DataVIO *agent) +{ + setHashLockState(lock, HASH_LOCK_BYPASSING); + + // Ensure we don't attempt to update advice when cleaning up. + lock->updateAdvice = false; + + ASSERT_LOG_ONLY(((agent != NULL) || !hasWaiters(&lock->waiters)), + "should not have waiters without an agent"); + notifyAllWaiters(&lock->waiters, compressWaiter, NULL); + + if (lock->duplicateLock != NULL) { + if (agent != NULL) { + // The agent must reference the duplicate zone to launch it. + agent->duplicate = lock->duplicate; + launchDuplicateZoneCallback(agent, unlockDuplicatePBN, + THIS_LOCATION(NULL)); + return; + } + ASSERT_LOG_ONLY(false, "hash lock holding a PBN lock must have an agent"); + } + + if (agent == NULL) { + return; + } + + setAgent(lock, NULL); + agent->isDuplicate = false; + compressData(agent); +} + +/** + * Abort processing on this hash lock when noticing an error. Currently, this + * moves the hash lock to the BYPASSING state, to release all pending DataVIOs. + * + * @param lock The HashLock + * @param dataVIO The DataVIO with the error + **/ +static void abortHashLock(HashLock *lock, DataVIO *dataVIO) +{ + // If we've already aborted the lock, don't try to re-abort it; just exit. + if (lock->state == HASH_LOCK_BYPASSING) { + exitHashLock(dataVIO); + return; + } + + if (dataVIO != lock->agent) { + if ((lock->agent != NULL) || (lock->referenceCount > 1)) { + // Other DataVIOs are still sharing the lock (which should be DEDUPING), + // so just kick this one out of the lock to report its error. + ASSERT_LOG_ONLY(lock->agent == NULL, + "only active agent should call abortHashLock"); + exitHashLock(dataVIO); + return; + } + // Make the lone DataVIO the lock agent so it can abort and clean up. + setAgent(lock, dataVIO); + } + + startBypassing(lock, dataVIO); +} + +/** + * Handle the result of the agent for the lock releasing a read lock on + * duplicate candidate. This continuation is registered in + * unlockDuplicatePBN(). + * + * @param completion The completion of the DataVIO acting as the lock's agent + **/ +static void finishUnlocking(VDOCompletion *completion) +{ + DataVIO *agent = asDataVIO(completion); + assertHashLockAgent(agent, __func__); + HashLock *lock = agent->hashLock; + + ASSERT_LOG_ONLY(lock->duplicateLock == NULL, + "must have released the duplicate lock for the hash lock"); + + if (completion->result != VDO_SUCCESS) { + abortHashLock(lock, agent); + return; + } + + if (!lock->verified) { + /* + * UNLOCKING -> WRITING transition: The lock we released was on an + * unverified block, so it must have been a lock on advice we were + * verifying, not on a location that was used for deduplication. Go write + * (or compress) the block to get a location to dedupe against. 
+ */ + startWriting(lock, agent); + return; + } + + // With the lock released, the verified duplicate block may already have + // changed and will need to be re-verified if a waiter arrived. + lock->verified = false; + + if (hasWaiters(&lock->waiters)) { + /* + * UNLOCKING -> LOCKING transition: A new DataVIO entered the hash lock + * while the agent was releasing the PBN lock. The current agent exits and + * the waiter has to re-lock and re-verify the duplicate location. + */ + // XXX VDOSTORY-190 If we used the current agent to re-acquire the PBN + // lock we wouldn't need to re-verify. + agent = retireLockAgent(lock); + startLocking(lock, agent); + return; + } + + /* + * UNLOCKING -> DESTROYING transition: The agent is done with the lock + * and no other DataVIOs reference it, so remove it from the lock map + * and return it to the pool. + */ + exitHashLock(agent); +} + +/** + * Release a read lock on the PBN of the block that may or may not have + * contained duplicate data. This continuation is launched by + * startUnlocking(), and calls back to finishUnlocking() on the hash zone + * thread. + * + * @param completion The completion of the DataVIO acting as the lock's agent + **/ +static void unlockDuplicatePBN(VDOCompletion *completion) +{ + DataVIO *agent = asDataVIO(completion); + assertInDuplicateZone(agent); + HashLock *lock = agent->hashLock; + + ASSERT_LOG_ONLY(lock->duplicateLock != NULL, + "must have a duplicate lock to release"); + + releasePBNLock(agent->duplicate.zone, agent->duplicate.pbn, + &lock->duplicateLock); + + if (lock->state == HASH_LOCK_BYPASSING) { + launchHashZoneCallback(agent, finishBypassing, THIS_LOCATION(NULL)); + } else { + launchHashZoneCallback(agent, finishUnlocking, THIS_LOCATION(NULL)); + } +} + +/** + * Release a read lock on the PBN of the block that may or may not have + * contained duplicate data. + * + * @param lock The hash lock + * @param agent The DataVIO currently acting as the agent for the lock + **/ +static void startUnlocking(HashLock *lock, DataVIO *agent) +{ + setHashLockState(lock, HASH_LOCK_UNLOCKING); + + /* + * XXX If we arrange to continue on the duplicate zone thread when + * verification fails, and don't explicitly change lock states (or use an + * agent-local state, or an atomic), we can avoid a thread transition here. + */ + launchDuplicateZoneCallback(agent, unlockDuplicatePBN, THIS_LOCATION(NULL)); +} + +/** + * Process the result of a UDS update performed by the agent for the lock. + * This continuation is registered in startQuerying(). + * + * @param completion The completion of the DataVIO that performed the update + **/ +static void finishUpdating(VDOCompletion *completion) +{ + DataVIO *agent = asDataVIO(completion); + assertHashLockAgent(agent, __func__); + HashLock *lock = agent->hashLock; + + if (completion->result != VDO_SUCCESS) { + abortHashLock(lock, agent); + return; + } + + // UDS was updated successfully, so don't update again unless the + // duplicate location changes due to rollover. + lock->updateAdvice = false; + + if (hasWaiters(&lock->waiters)) { + /* + * UPDATING -> DEDUPING transition: A new DataVIO arrived during the UDS + * update. Send it on the verified dedupe path. The agent is done with the + * lock, but the lock may still need to use it to clean up after rollover. + */ + startDeduping(lock, agent, true); + return; + } + + if (lock->duplicateLock != NULL) { + /* + * UPDATING -> UNLOCKING transition: No one is waiting to dedupe, but we + * hold a duplicate PBN lock, so go release it. 
+ */ + startUnlocking(lock, agent); + } else { + /* + * UPDATING -> DESTROYING transition: No one is waiting to dedupe and + * there's no lock to release. + */ + // XXX startDestroying(lock, agent); + startBypassing(lock, NULL); + exitHashLock(agent); + } +} + +/** + * Continue deduplication with the last step, updating UDS with the location + * of the duplicate that should be returned as advice in the future. + * + * @param lock The hash lock + * @param agent The DataVIO currently acting as the agent for the lock + **/ +static void startUpdating(HashLock *lock, DataVIO *agent) +{ + setHashLockState(lock, HASH_LOCK_UPDATING); + + ASSERT_LOG_ONLY(lock->verified, "new advice should have been verified"); + ASSERT_LOG_ONLY(lock->updateAdvice, "should only update advice if needed"); + + agent->lastAsyncOperation = UPDATE_INDEX; + setHashZoneCallback(agent, finishUpdating, THIS_LOCATION(NULL)); + dataVIOAsCompletion(agent)->layer->updateAlbireo(agent); +} + +/** + * Handle a DataVIO that has finished deduplicating against the block locked + * by the hash lock. If there are other DataVIOs still sharing the lock, this + * will just release the DataVIO's share of the lock and finish processing the + * DataVIO. If this is the last DataVIO holding the lock, this makes the + * DataVIO the lock agent and uses it to advance the state of the lock so it + * can eventually be released. + * + * @param lock The hash lock + * @param dataVIO The lock holder that has finished deduplicating + **/ +static void finishDeduping(HashLock *lock, DataVIO *dataVIO) +{ + ASSERT_LOG_ONLY(lock->agent == NULL, "shouldn't have an agent in DEDUPING"); + ASSERT_LOG_ONLY(!hasWaiters(&lock->waiters), + "shouldn't have any lock waiters in DEDUPING"); + + // Just release the lock reference if other DataVIOs are still deduping. + if (lock->referenceCount > 1) { + exitHashLock(dataVIO); + return; + } + + // The hash lock must have an agent for all other lock states. + DataVIO *agent = dataVIO; + setAgent(lock, agent); + + if (lock->updateAdvice) { + /* + * DEDUPING -> UPDATING transition: The location of the duplicate block + * changed since the initial UDS query because of compression, rollover, + * or because the query agent didn't have an allocation. The UDS update + * was delayed in case there was another change in location, but with only + * this DataVIO using the hash lock, it's time to update the advice. + */ + startUpdating(lock, agent); + } else { + /* + * DEDUPING -> UNLOCKING transition: Release the PBN read lock on the + * duplicate location so the hash lock itself can be released (contingent + * on no new DataVIOs arriving in the lock before the agent returns). + */ + startUnlocking(lock, agent); + } +} + +/** + * Implements WaiterCallback. Binds the DataVIO that was waiting to a new hash + * lock and waits on that lock. + **/ +static void enterForkedLock(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + HashLock *newLock = (HashLock *) context; + + setHashLock(dataVIO, newLock); + waitOnHashLock(newLock, dataVIO); +} + +/** + * Fork a hash lock because it has run out of increments on the duplicate PBN. + * Transfers the new agent and any lock waiters to a new hash lock instance + * which takes the place of the old lock in the lock map. The old lock remains + * active, but will not update advice. 
+ * + * @param oldLock The hash lock to fork + * @param newAgent The DataVIO that will be the agent for the new lock + **/ +static void forkHashLock(HashLock *oldLock, DataVIO *newAgent) +{ + HashLock *newLock; + int result = acquireHashLockFromZone(newAgent->hashZone, + &newAgent->chunkName, + oldLock, &newLock); + if (result != VDO_SUCCESS) { + abortHashLock(oldLock, newAgent); + return; + } + + // Only one of the two locks should update UDS. The old lock is out of + // references, so it would be poor dedupe advice in the short term. + oldLock->updateAdvice = false; + newLock->updateAdvice = true; + + setHashLock(newAgent, newLock); + setAgent(newLock, newAgent); + + notifyAllWaiters(&oldLock->waiters, enterForkedLock, newLock); + + newAgent->isDuplicate = false; + startWriting(newLock, newAgent); +} + +/** + * Reserve a reference count increment for a DataVIO and launch it on the + * dedupe path. If no increments are available, this will roll over to a new + * hash lock and launch the DataVIO as the writing agent for that lock. + * + * @param lock The hash lock + * @param dataVIO The DataVIO to deduplicate using the hash lock + * @param hasClaim true if the dataVIO already has claimed + * an increment from the duplicate lock + **/ +static void launchDedupe(HashLock *lock, DataVIO *dataVIO, bool hasClaim) +{ + if (!hasClaim && !claimPBNLockIncrement(lock->duplicateLock)) { + // Out of increments, so must roll over to a new lock. + forkHashLock(lock, dataVIO); + return; + } + + // Deduplicate against the lock's verified location. + setDuplicateLocation(dataVIO, lock->duplicate); + launchDuplicateZoneCallback(dataVIO, shareBlock, + THIS_LOCATION("$F;cb=shareBlock")); +} + +/** + * Enter the hash lock state where DataVIOs deduplicate in parallel against a + * true copy of their data on disk. If the agent itself needs to deduplicate, + * an increment for it must already have been claimed from the duplicate lock, + * ensuring the hash lock will still have a DataVIO holding it. + * + * @param lock The hash lock + * @param agent The DataVIO acting as the agent for the lock + * @param agentIsDone true only if the agent has already written + * or deduplicated against its data + **/ +static void startDeduping(HashLock *lock, DataVIO *agent, bool agentIsDone) +{ + setHashLockState(lock, HASH_LOCK_DEDUPING); + + // We don't take the downgraded allocation lock from the agent unless we + // actually need to deduplicate against it. + if (lock->duplicateLock == NULL) { + ASSERT_LOG_ONLY(!isCompressed(agent->newMapped.state), + "compression must have shared a lock"); + ASSERT_LOG_ONLY(agentIsDone, "agent must have written the new duplicate"); + transferAllocationLock(agent); + } + + ASSERT_LOG_ONLY(isPBNReadLock(lock->duplicateLock), + "duplicateLock must be a PBN read lock"); + + /* + * This state is not like any of the other states. There is no designated + * agent--the agent transitioning to this state and all the waiters will be + * launched to deduplicate in parallel. + */ + setAgent(lock, NULL); + + /* + * Launch the agent (if not already deduplicated) and as many lock waiters + * as we have available increments for on the dedupe path. If we run out of + * increments, rollover will be triggered and the remaining waiters will be + * transferred to the new lock. 
+   */
+  if (!agentIsDone) {
+    launchDedupe(lock, agent, true);
+    agent = NULL;
+  }
+  while (hasWaiters(&lock->waiters)) {
+    launchDedupe(lock, dequeueLockWaiter(lock), false);
+  }
+
+  if (agentIsDone) {
+    /*
+     * In the degenerate case where all the waiters rolled over to a new lock,
+     * this will continue to use the old agent to clean up this lock, and
+     * otherwise it just lets the agent exit the lock.
+     */
+    finishDeduping(lock, agent);
+  }
+}
+
+/**
+ * Handle the result of the agent for the lock comparing its data to the
+ * duplicate candidate. This continuation is registered in startVerifying().
+ *
+ * @param completion  The completion of the DataVIO used to verify dedupe
+ **/
+static void finishVerifying(VDOCompletion *completion)
+{
+  DataVIO *agent = asDataVIO(completion);
+  assertHashLockAgent(agent, __func__);
+  HashLock *lock = agent->hashLock;
+
+  if (completion->result != VDO_SUCCESS) {
+    // XXX VDOSTORY-190 should convert verify IO errors to verification failure
+    abortHashLock(lock, agent);
+    return;
+  }
+
+  lock->verified = agent->isDuplicate;
+
+  // Only count the result of the initial verification of the advice as valid
+  // or stale, and not any re-verifications due to PBN lock releases.
+  if (!lock->verifyCounted) {
+    lock->verifyCounted = true;
+    if (lock->verified) {
+      bumpHashZoneValidAdviceCount(agent->hashZone);
+    } else {
+      bumpHashZoneStaleAdviceCount(agent->hashZone);
+    }
+  }
+
+  // Even if the block is a verified duplicate, we can't start to deduplicate
+  // unless we can claim a reference count increment for the agent.
+  if (lock->verified && !claimPBNLockIncrement(lock->duplicateLock)) {
+    agent->isDuplicate = false;
+    lock->verified = false;
+  }
+
+  if (lock->verified) {
+    /*
+     * VERIFYING -> DEDUPING transition: The advice is for a true duplicate,
+     * so start deduplicating against it, if references are available.
+     */
+    startDeduping(lock, agent, false);
+  } else {
+    /*
+     * VERIFYING -> UNLOCKING transition: Either the verify failed or we'd try
+     * to dedupe and roll over immediately, which would fail because it would
+     * leave the lock without an agent to release the PBN lock. In both cases,
+     * the data will have to be written or compressed, but first the advice
+     * PBN must be unlocked by the VERIFYING agent.
+     */
+    lock->updateAdvice = true;
+    startUnlocking(lock, agent);
+  }
+}
+
+/**
+ * Continue the deduplication path for a hash lock by using the agent to read
+ * (and possibly decompress) the data at the candidate duplicate location,
+ * comparing it to the data in the agent to verify that the candidate is
+ * identical to all the DataVIOs sharing the hash. If so, it can be
+ * deduplicated against, otherwise a DataVIO allocation will have to be
+ * written to and used for dedupe.
+ *
+ * @param lock   The hash lock (must be LOCKING)
+ * @param agent  The DataVIO to use to read and compare candidate data
+ **/
+static void startVerifying(HashLock *lock, DataVIO *agent)
+{
+  setHashLockState(lock, HASH_LOCK_VERIFYING);
+  ASSERT_LOG_ONLY(!lock->verified, "hash lock only verifies advice once");
+
+  /*
+   * XXX VDOSTORY-190 Optimization: This is one of those places where the zone
+   * and continuation we want to use depends on the outcome of the comparison.
+   * If we could choose which path in the layer thread before continuing, we
+   * could save a thread transition in one of the two cases (assuming we're
+   * willing to delay visibility of the hash lock state change).
+ */ + VDOCompletion *completion = dataVIOAsCompletion(agent); + agent->lastAsyncOperation = VERIFY_DEDUPLICATION; + setHashZoneCallback(agent, finishVerifying, THIS_LOCATION(NULL)); + completion->layer->verifyDuplication(agent); +} + +/** + * Handle the result of the agent for the lock attempting to obtain a PBN read + * lock on the candidate duplicate block. this continuation is registered in + * lockDuplicatePBN(). + * + * @param completion The completion of the DataVIO that attempted to get + * the read lock + **/ +static void finishLocking(VDOCompletion *completion) +{ + DataVIO *agent = asDataVIO(completion); + assertHashLockAgent(agent, __func__); + HashLock *lock = agent->hashLock; + + if (completion->result != VDO_SUCCESS) { + // XXX clearDuplicateLocation()? + agent->isDuplicate = false; + abortHashLock(lock, agent); + return; + } + + if (!agent->isDuplicate) { + ASSERT_LOG_ONLY(lock->duplicateLock == NULL, + "must not hold duplicateLock if not flagged as a duplicate"); + /* + * LOCKING -> WRITING transition: The advice block is being modified or + * has no available references, so try to write or compress the data, + * remembering to update UDS later with the new advice. + */ + bumpHashZoneStaleAdviceCount(agent->hashZone); + lock->updateAdvice = true; + startWriting(lock, agent); + return; + } + + ASSERT_LOG_ONLY(lock->duplicateLock != NULL, + "must hold duplicateLock if flagged as a duplicate"); + + if (!lock->verified) { + /* + * LOCKING -> VERIFYING transition: Continue on the unverified dedupe path, + * reading the candidate duplicate and comparing it to the agent's data to + * decide whether it is a true duplicate or stale advice. + */ + startVerifying(lock, agent); + return; + } + + if (!claimPBNLockIncrement(lock->duplicateLock)) { + /* + * LOCKING -> UNLOCKING transition: The verified block was re-locked, but + * has no available increments left. Must first release the useless PBN + * read lock before rolling over to a new copy of the block. + */ + agent->isDuplicate = false; + lock->verified = false; + lock->updateAdvice = true; + startUnlocking(lock, agent); + return; + } + + /* + * LOCKING -> DEDUPING transition: Continue on the verified dedupe path, + * deduplicating against a location that was previously verified or + * written to. + */ + startDeduping(lock, agent, false); +} + +/** + * Acquire a read lock on the PBN of the block containing candidate duplicate + * data (compressed or uncompressed). If the PBN is already locked for + * writing, the lock attempt is abandoned and isDuplicate will be cleared + * before calling back. this continuation is launched from startLocking(), and + * calls back to finishLocking() on the hash zone thread. + * + * @param completion The completion of the DataVIO attempting to acquire the + * physical block lock on behalf of its hash lock + **/ +static void lockDuplicatePBN(VDOCompletion *completion) +{ + DataVIO *agent = asDataVIO(completion); + PhysicalZone *zone = agent->duplicate.zone; + assertInDuplicateZone(agent); + + setHashZoneCallback(agent, finishLocking, THIS_LOCATION(NULL)); + + // While in the zone that owns it, find out how many additional references + // can be made to the block if it turns out to truly be a duplicate. 
+ SlabDepot *depot = getSlabDepot(getVDOFromDataVIO(agent)); + unsigned int incrementLimit = getIncrementLimit(depot, agent->duplicate.pbn); + if (incrementLimit == 0) { + // We could deduplicate against it later if a reference happened to be + // released during verification, but it's probably better to bail out now. + // XXX clearDuplicateLocation()? + agent->isDuplicate = false; + continueDataVIO(agent, VDO_SUCCESS); + return; + } + + PBNLock *lock; + int result = attemptPBNLock(zone, agent->duplicate.pbn, VIO_READ_LOCK, + &lock); + if (result != VDO_SUCCESS) { + continueDataVIO(agent, result); + return; + } + + if (!isPBNReadLock(lock)) { + /* + * There are three cases of write locks: uncompressed data block writes, + * compressed (packed) block writes, and block map page writes. In all + * three cases, we give up on trying to verify the advice and don't bother + * to try deduplicate against the data in the write lock holder. + * + * 1) We don't ever want to try to deduplicate against a block map page. + * + * 2a) It's very unlikely we'd deduplicate against an entire packed block, + * both because of the chance of matching it, and because we don't record + * advice for it, but for the uncompressed representation of all the + * fragments it contains. The only way we'd be getting lock contention is + * if we've written the same representation coincidentally before, had it + * become unreferenced, and it just happened to be packed together from + * compressed writes when we go to verify the lucky advice. Giving up is a + * miniscule loss of potential dedupe. + * + * 2b) If the advice is for a slot of a compressed block, it's about to + * get smashed, and the write smashing it cannot contain our data--it + * would have to be writing on behalf of our hash lock, but that's + * impossible since we're the lock agent. + * + * 3a) If the lock is held by a DataVIO with different data, the advice is + * already stale or is about to become stale. + * + * 3b) If the lock is held by a DataVIO that matches us, we may as well + * either write it ourselves (or reference the copy we already wrote) + * instead of potentially having many duplicates wait for the lock holder + * to write, journal, hash, and finally arrive in the hash lock. All we + * lose is a chance to avoid a UDS update in the very rare case of advice + * for a free block that just happened to be allocated to a DataVIO with + * the same hash. In async mode, there's also a chance to save on a block + * write, at the cost of a block verify. Saving on a full block compare in + * all stale advice cases almost certainly outweighs saving a UDS update + * in a lucky case where advice would have been saved from becoming stale. + */ + // XXX clearDuplicateLocation()? + agent->isDuplicate = false; + continueDataVIO(agent, VDO_SUCCESS); + return; + } + + if (lock->holderCount == 0) { + // Ensure that the newly-locked block is referenced. + Slab *slab = getSlab(depot, agent->duplicate.pbn); + result = acquireProvisionalReference(slab, agent->duplicate.pbn, lock); + if (result != VDO_SUCCESS) { + logWarningWithStringError(result, + "Error acquiring provisional reference for " + "dedupe candidate; aborting dedupe"); + agent->isDuplicate = false; + releasePBNLock(zone, agent->duplicate.pbn, &lock); + continueDataVIO(agent, result); + return; + } + + /* + * The increment limit we grabbed earlier is still valid. The lock now + * holds the rights to acquire all those references. Those rights will be + * claimed by hash locks sharing this read lock. 
+ */ + lock->incrementLimit = incrementLimit; + } + + // We've successfully acquired a read lock on behalf of the hash lock, + // so mark it as such. + setDuplicateLock(agent->hashLock, lock); + + /* + * XXX VDOSTORY-190 Optimization: Same as startLocking() lazily changing + * state to save on having to switch back to the hash zone thread. Here we + * could directly launch the block verify, then switch to a hash thread. + */ + continueDataVIO(agent, VDO_SUCCESS); +} + +/** + * Continue deduplication for a hash lock that has obtained valid advice + * of a potential duplicate through its agent. + * + * @param lock The hash lock (currently must be QUERYING) + * @param agent The DataVIO bearing the dedupe advice + **/ +static void startLocking(HashLock *lock, DataVIO *agent) +{ + ASSERT_LOG_ONLY(lock->duplicateLock == NULL, + "must not acquire a duplicate lock when already holding it"); + + setHashLockState(lock, HASH_LOCK_LOCKING); + + /* + * XXX VDOSTORY-190 Optimization: If we arrange to continue on the duplicate + * zone thread when accepting the advice, and don't explicitly change lock + * states (or use an agent-local state, or an atomic), we can avoid a thread + * transition here. + */ + agent->lastAsyncOperation = ACQUIRE_PBN_READ_LOCK; + launchDuplicateZoneCallback(agent, lockDuplicatePBN, THIS_LOCATION(NULL)); +} + +/** + * Re-entry point for the lock agent after it has finished writing or + * compressing its copy of the data block. The agent will never need to dedupe + * against anything, so it's done with the lock, but the lock may not be + * finished with it, as a UDS update might still be needed. + * + * If there are other lock holders, the agent will hand the job to one of them + * and exit, leaving the lock to deduplicate against the just-written block. + * If there are no other lock holders, the agent either exits (and later tears + * down the hash lock), or it remains the agent and updates UDS. + * + * @param lock The hash lock, which must be in state WRITING + * @param agent The DataVIO that wrote its data for the lock + **/ +static void finishWriting(HashLock *lock, DataVIO *agent) +{ + // Dedupe against the data block or compressed block slot the agent wrote. + // Since we know the write succeeded, there's no need to verify it. + lock->duplicate = agent->newMapped; + lock->verified = true; + + if (isCompressed(lock->duplicate.state) && lock->registered) { + // Compression means the location we gave in the UDS query is not the + // location we're using to deduplicate. + lock->updateAdvice = true; + } + + // If there are any waiters, we need to start deduping them. + if (hasWaiters(&lock->waiters)) { + /* + * WRITING -> DEDUPING transition: an asynchronously-written block + * failed to compress, so the PBN lock on the written copy was already + * transferred. The agent is done with the lock, but the lock may + * still need to use it to clean up after rollover. + */ + startDeduping(lock, agent, true); + return; + } + + // There are no waiters and the agent has successfully written, so take a + // step towards being able to release the hash lock (or just release it). + if (lock->updateAdvice) { + /* + * WRITING -> UPDATING transition: There's no waiter and a UDS update is + * needed, so retain the WRITING agent and use it to launch the update. + * The happens on compression, rollover, or the QUERYING agent not having + * an allocation. 
+ */ + startUpdating(lock, agent); + } else if (lock->duplicateLock != NULL) { + /* + * WRITING -> UNLOCKING transition: There's no waiter and no update + * needed, but the compressed write gave us a shared duplicate lock that + * we must release. + */ + setDuplicateLocation(agent, lock->duplicate); + startUnlocking(lock, agent); + } else { + /* + * WRITING -> DESTROYING transition: There's no waiter, no update needed, + * and no duplicate lock held, so both the agent and lock have no more + * work to do. The agent will release its allocation lock in cleanup. + */ + // XXX startDestroying(lock, agent); + startBypassing(lock, NULL); + exitHashLock(agent); + } +} + +/** + * Search through the lock waiters for a DataVIO that has an allocation. If + * one is found, swap agents, put the old agent at the head of the wait queue, + * then return the new agent. Otherwise, just return the current agent. + * + * @param lock The hash lock to modify + **/ +static DataVIO *selectWritingAgent(HashLock *lock) +{ + // This should-be-impossible condition is the only cause for + // enqueueDataVIO() to fail later on, where it would be a pain to handle. + int result = ASSERT(!isWaiting(dataVIOAsWaiter(lock->agent)), + "agent must not be waiting"); + if (result != VDO_SUCCESS) { + return lock->agent; + } + + WaitQueue tempQueue; + initializeWaitQueue(&tempQueue); + + // Move waiters to the temp queue one-by-one until we find an allocation. + // Not ideal to search, but it only happens when nearly out of space. + DataVIO *dataVIO; + while (((dataVIO = dequeueLockWaiter(lock)) != NULL) + && !hasAllocation(dataVIO)) { + // Use the lower-level enqueue since we're just moving waiters around. + int result = enqueueWaiter(&tempQueue, dataVIOAsWaiter(dataVIO)); + // The only error is the DataVIO already being on a wait queue, and since + // we just dequeued it, that could only happen due to a memory smash or + // concurrent use of that DataVIO. + ASSERT_LOG_ONLY(result == VDO_SUCCESS, "impossible enqueueWaiter error"); + } + + if (dataVIO != NULL) { + // Move the rest of the waiters over to the temp queue, preserving the + // order they arrived at the lock. + transferAllWaiters(&lock->waiters, &tempQueue); + + // The current agent is being replaced and will have to wait to dedupe; + // make it the first waiter since it was the first to reach the lock. + int result = enqueueDataVIO(&lock->waiters, lock->agent, + THIS_LOCATION(NULL)); + ASSERT_LOG_ONLY(result == VDO_SUCCESS, + "impossible enqueueDataVIO error after isWaiting checked"); + setAgent(lock, dataVIO); + } else { + // No one has an allocation, so keep the current agent. + dataVIO = lock->agent; + } + + // Swap all the waiters back onto the lock's queue. + transferAllWaiters(&tempQueue, &lock->waiters); + return dataVIO; +} + +/** + * Begin the non-duplicate write path for a hash lock that had no advice, + * selecting a DataVIO with an allocation as a new agent, if necessary, + * then resuming the agent on the DataVIO write path. + * + * @param lock The hash lock (currently must be QUERYING) + * @param agent The DataVIO currently acting as the agent for the lock + **/ +static void startWriting(HashLock *lock, DataVIO *agent) +{ + setHashLockState(lock, HASH_LOCK_WRITING); + + // The agent might not have received an allocation and so can't be used for + // writing, but it's entirely possible that one of the waiters did. + if (!hasAllocation(agent)) { + agent = selectWritingAgent(lock); + // If none of the waiters had an allocation, the writes all have to fail. 
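+    /*
+     * Editor's note (illustrative, not part of the original source):
+     * selectWritingAgent() above is a rotate-through-a-FIFO search: waiters
+     * are dequeued into a temporary queue until one with an allocation is
+     * found, the displaced agent is queued at the head of the wait queue,
+     * and then everything is transferred back so arrival order is preserved.
+     * If no waiter had an allocation either, the original agent is kept and
+     * the check below sends the lock to BYPASSING.
+     */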
+ if (!hasAllocation(agent)) { + /* + * XXX VDOSTORY-190 Should we keep a variant of BYPASSING that causes + * new arrivals to fail immediately if they don't have an allocation? It + * might be possible that on some path there would be non-waiters still + * referencing the lock, so it would remain in the map as everything is + * currently spelled, even if the agent and all the waiters release. + */ + startBypassing(lock, agent); + return; + } + } + + // If the agent compresses, it might wait indefinitely in the packer, + // which would be bad if there are any other DataVIOs waiting. + if (hasWaiters(&lock->waiters)) { + // XXX in sync mode, transition directly to LOCKING to start dedupe? + cancelCompression(agent); + } + + /* + * Send the agent to the compress/pack/async-write path in vioWrite. If it + * succeeds, it will return to the hash lock via continueHashLock() and call + * finishWriting(). + */ + compressData(agent); +} + +/** + * Process the result of a UDS query performed by the agent for the lock. This + * continuation is registered in startQuerying(). + * + * @param completion The completion of the DataVIO that performed the query + **/ +static void finishQuerying(VDOCompletion *completion) +{ + DataVIO *agent = asDataVIO(completion); + assertHashLockAgent(agent, __func__); + HashLock *lock = agent->hashLock; + + if (completion->result != VDO_SUCCESS) { + abortHashLock(lock, agent); + return; + } + + if (agent->isDuplicate) { + lock->duplicate = agent->duplicate; + /* + * QUERYING -> LOCKING transition: Valid advice was obtained from UDS. + * Use the QUERYING agent to start the hash lock on the unverified dedupe + * path, verifying that the advice can be used. + */ + startLocking(lock, agent); + } else { + // The agent will be used as the duplicate if has an allocation; if it + // does, that location was posted to UDS, so no update will be needed. + lock->updateAdvice = !hasAllocation(agent); + /* + * QUERYING -> WRITING transition: There was no advice or the advice + * wasn't valid, so try to write or compress the data. + */ + startWriting(lock, agent); + } +} + +/** + * Start deduplication for a hash lock that has finished initializing by + * making the DataVIO that requested it the agent, entering the QUERYING + * state, and using the agent to perform the UDS query on behalf of the lock. + * + * @param lock The initialized hash lock + * @param dataVIO The DataVIO that has just obtained the new lock + **/ +static void startQuerying(HashLock *lock, DataVIO *dataVIO) +{ + setAgent(lock, dataVIO); + setHashLockState(lock, HASH_LOCK_QUERYING); + + VDOCompletion *completion = dataVIOAsCompletion(dataVIO); + dataVIO->lastAsyncOperation = CHECK_FOR_DEDUPLICATION; + setHashZoneCallback(dataVIO, finishQuerying, THIS_LOCATION(NULL)); + completion->layer->checkForDuplication(dataVIO); +} + +/** + * Complain that a DataVIO has entered a HashLock that is in an unimplemented + * or unusable state and continue the DataVIO with an error. 
+ * + * @param lock The hash lock + * @param dataVIO The DataVIO attempting to enter the lock + **/ +static void reportBogusLockState(HashLock *lock, DataVIO *dataVIO) +{ + int result = ASSERT_FALSE("hash lock must not be in unimplemented state %s", + getHashLockStateName(lock->state)); + continueDataVIOIn(dataVIO, result, compressDataCallback); +} + +/**********************************************************************/ +void enterHashLock(DataVIO *dataVIO) +{ + HashLock *lock = dataVIO->hashLock; + switch (lock->state) { + case HASH_LOCK_INITIALIZING: + startQuerying(lock, dataVIO); + break; + + case HASH_LOCK_QUERYING: + case HASH_LOCK_WRITING: + case HASH_LOCK_UPDATING: + case HASH_LOCK_LOCKING: + case HASH_LOCK_VERIFYING: + case HASH_LOCK_UNLOCKING: + // The lock is busy, and can't be shared yet. + waitOnHashLock(lock, dataVIO); + break; + + case HASH_LOCK_BYPASSING: + // Bypass dedupe entirely. + compressData(dataVIO); + break; + + case HASH_LOCK_DEDUPING: + launchDedupe(lock, dataVIO, false); + break; + + case HASH_LOCK_DESTROYING: + // A lock in this state should not be acquired by new VIOs. + reportBogusLockState(lock, dataVIO); + break; + + default: + reportBogusLockState(lock, dataVIO); + } +} + +/**********************************************************************/ +void continueHashLock(DataVIO *dataVIO) +{ + HashLock *lock = dataVIO->hashLock; + // XXX VDOSTORY-190 Eventually we may be able to fold the error handling + // in at this point instead of using a separate entry point for it. + + switch (lock->state) { + case HASH_LOCK_WRITING: + ASSERT_LOG_ONLY(dataVIO == lock->agent, + "only the lock agent may continue the lock"); + finishWriting(lock, dataVIO); + break; + + case HASH_LOCK_DEDUPING: + finishDeduping(lock, dataVIO); + break; + + case HASH_LOCK_BYPASSING: + // This DataVIO has finished the write path and the lock doesn't need it. + // XXX This isn't going to be correct if DEDUPING ever uses BYPASSING. + finishDataVIO(dataVIO, VDO_SUCCESS); + break; + + case HASH_LOCK_INITIALIZING: + case HASH_LOCK_QUERYING: + case HASH_LOCK_UPDATING: + case HASH_LOCK_LOCKING: + case HASH_LOCK_VERIFYING: + case HASH_LOCK_UNLOCKING: + case HASH_LOCK_DESTROYING: + // A lock in this state should never be re-entered. + reportBogusLockState(lock, dataVIO); + break; + + default: + reportBogusLockState(lock, dataVIO); + } +} + +/**********************************************************************/ +void continueHashLockOnError(DataVIO *dataVIO) +{ + // XXX We could simply use continueHashLock() and check for errors in that. + abortHashLock(dataVIO->hashLock, dataVIO); +} + +/** + * Check whether the data in DataVIOs sharing a lock is different than in a + * DataVIO seeking to share the lock, which should only be possible in the + * extremely unlikely case of a hash collision. 
+ * + * @param lock The lock to check + * @param candidate The DataVIO seeking to share the lock + * + * @return true if the given DataVIO must not share the lock + * because it doesn't have the same data as the lock holders + **/ +static bool isHashCollision(HashLock *lock, DataVIO *candidate) +{ + if (isRingEmpty(&lock->duplicateRing)) { + return false; + } + + DataVIO *lockHolder = dataVIOFromLockNode(lock->duplicateRing.next); + PhysicalLayer *layer = dataVIOAsCompletion(candidate)->layer; + bool collides = !layer->compareDataVIOs(lockHolder, candidate); + + if (collides) { + bumpHashZoneCollisionCount(candidate->hashZone); + } else { + bumpHashZoneDataMatchCount(candidate->hashZone); + } + + return collides; +} + +/**********************************************************************/ +static inline int assertHashLockPreconditions(const DataVIO *dataVIO) +{ + int result = ASSERT(dataVIO->hashLock == NULL, + "must not already hold a hash lock"); + if (result != VDO_SUCCESS) { + return result; + } + result = ASSERT(isRingEmpty(&dataVIO->hashLockNode), + "must not already be a member of a hash lock ring"); + if (result != VDO_SUCCESS) { + return result; + } + return ASSERT(dataVIO->recoverySequenceNumber == 0, + "must not hold a recovery lock when getting a hash lock"); +} + +/**********************************************************************/ +int acquireHashLock(DataVIO *dataVIO) +{ + int result = assertHashLockPreconditions(dataVIO); + if (result != VDO_SUCCESS) { + return result; + } + + HashLock *lock; + result = acquireHashLockFromZone(dataVIO->hashZone, &dataVIO->chunkName, + NULL, &lock); + if (result != VDO_SUCCESS) { + return result; + } + + if (isHashCollision(lock, dataVIO)) { + // Hash collisions are extremely unlikely, but the bogus dedupe would be a + // data corruption. Bypass dedupe entirely by leaving hashLock unset. + // XXX clear hashZone too? + return VDO_SUCCESS; + } + + setHashLock(dataVIO, lock); + return VDO_SUCCESS; +} + +/**********************************************************************/ +void releaseHashLock(DataVIO *dataVIO) +{ + HashLock *lock = dataVIO->hashLock; + if (lock == NULL) { + return; + } + + setHashLock(dataVIO, NULL); + + if (lock->referenceCount > 0) { + // The lock is still in use by other DataVIOs. + return; + } + + setHashLockState(lock, HASH_LOCK_DESTROYING); + returnHashLockToZone(dataVIO->hashZone, &lock); +} + +/** + * Transfer a DataVIO's downgraded allocation PBN lock to the DataVIO's hash + * lock, converting it to a duplicate PBN lock. + * + * @param dataVIO The DataVIO holding the allocation lock to transfer + **/ +static void transferAllocationLock(DataVIO *dataVIO) +{ + ASSERT_LOG_ONLY(dataVIO->newMapped.pbn == getDataVIOAllocation(dataVIO), + "transferred lock must be for the block written"); + + AllocatingVIO *allocatingVIO = dataVIOAsAllocatingVIO(dataVIO); + PBNLock *pbnLock = allocatingVIO->allocationLock; + allocatingVIO->allocationLock = NULL; + allocatingVIO->allocation = ZERO_BLOCK; + + ASSERT_LOG_ONLY(isPBNReadLock(pbnLock), + "must have downgraded the allocation lock before transfer"); + + HashLock *hashLock = dataVIO->hashLock; + hashLock->duplicate = dataVIO->newMapped; + dataVIO->duplicate = dataVIO->newMapped; + + // Since the lock is being transferred, the holder count doesn't change (and + // isn't even safe to examine on this thread). 
+ hashLock->duplicateLock = pbnLock; +} + +/**********************************************************************/ +void shareCompressedWriteLock(DataVIO *dataVIO, PBNLock *pbnLock) +{ + ASSERT_LOG_ONLY(getDuplicateLock(dataVIO) == NULL, + "a duplicate PBN lock should not exist when writing"); + ASSERT_LOG_ONLY(isCompressed(dataVIO->newMapped.state), + "lock transfer must be for a compressed write"); + assertInNewMappedZone(dataVIO); + + // First sharer downgrades the lock. + if (!isPBNReadLock(pbnLock)) { + downgradePBNWriteLock(pbnLock); + } + + // Get a share of the PBN lock, ensuring it cannot be released until + // after this DataVIO has had a chance to journal a reference. + dataVIO->duplicate = dataVIO->newMapped; + dataVIO->hashLock->duplicate = dataVIO->newMapped; + setDuplicateLock(dataVIO->hashLock, pbnLock); + + // Claim a reference for this DataVIO, which is necessary since another + // HashLock might start deduplicating against it before our incRef. + bool claimed = claimPBNLockIncrement(pbnLock); + ASSERT_LOG_ONLY(claimed, "impossible to fail to claim an initial increment"); +} diff --git a/source/vdo/base/hashLock.h b/source/vdo/base/hashLock.h new file mode 100644 index 0000000..b21e465 --- /dev/null +++ b/source/vdo/base/hashLock.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/hashLock.h#3 $ + */ + +#ifndef HASH_LOCK_H +#define HASH_LOCK_H + +#include "types.h" + +/** + * Get the PBN lock on the duplicate data location for a DataVIO from the + * HashLock the DataVIO holds (if there is one). + * + * @param dataVIO The DataVIO to query + * + * @return The PBN lock on the DataVIO's duplicate location + **/ +PBNLock *getDuplicateLock(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Acquire or share a lock on the hash (chunk name) of the data in a DataVIO, + * updating the DataVIO to reference the lock. This must only be called in the + * correct thread for the zone. In the unlikely case of a hash collision, this + * function will succeed, but the DataVIO will not get a lock reference. + * + * @param dataVIO The DataVIO acquiring a lock on its chunk name + **/ +int acquireHashLock(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Asynchronously process a DataVIO that has just acquired its reference to a + * hash lock. This may place the DataVIO on a wait queue, or it may use the + * DataVIO to perform operations on the lock's behalf. 
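+ *
+ * Editor's note: a hedged, illustrative call sequence (error handling
+ * elided; see acquireHashLock() for the collision case that leaves
+ * hashLock NULL):
+ *
+ *   int result = acquireHashLock(dataVIO);
+ *   if ((result == VDO_SUCCESS) && (dataVIO->hashLock != NULL)) {
+ *     enterHashLock(dataVIO);
+ *   }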
+ * + * @param dataVIO The DataVIO that has just acquired a lock on its chunk name + **/ +void enterHashLock(DataVIO *dataVIO); + +/** + * Asynchronously continue processing a DataVIO in its hash lock after it has + * finished writing, compressing, or deduplicating, so it can share the result + * with any DataVIOs waiting in the hash lock, or update Albireo, or simply + * release its share of the lock. This must only be called in the correct + * thread for the hash zone. + * + * @param dataVIO The DataVIO to continue processing in its hash lock + **/ +void continueHashLock(DataVIO *dataVIO); + +/** + * Re-enter the hash lock after encountering an error, to clean up the hash + * lock. + * + * @param dataVIO The DataVIO with an error + **/ +void continueHashLockOnError(DataVIO *dataVIO); + +/** + * Release a DataVIO's share of a hash lock, if held, and null out the + * DataVIO's reference to it. This must only be called in the correct thread + * for the hash zone. + * + * If the DataVIO is the only one holding the lock, this also releases any + * resources or locks used by the hash lock (such as a PBN read lock on a + * block containing data with the same hash) and returns the lock to the hash + * zone's lock pool. + * + * @param dataVIO The DataVIO releasing its hash lock + **/ +void releaseHashLock(DataVIO *dataVIO); + +/** + * Make a DataVIO's hash lock a shared holder of the PBN lock on the + * compressed block to which its data was just written. If the lock is still a + * write lock (as it will be for the first share), it will be converted to a + * read lock. This also reserves a reference count increment for the DataVIO. + * + * @param dataVIO The DataVIO which was just compressed + * @param pbnLock The PBN lock on the compressed block + **/ +void shareCompressedWriteLock(DataVIO *dataVIO, PBNLock *pbnLock); + +#endif // HASH_LOCK_H diff --git a/source/vdo/base/hashLockInternals.h b/source/vdo/base/hashLockInternals.h new file mode 100644 index 0000000..67b5634 --- /dev/null +++ b/source/vdo/base/hashLockInternals.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/hashLockInternals.h#2 $ + */ + +#ifndef HASH_LOCK_INTERNALS_H +#define HASH_LOCK_INTERNALS_H + +#include "completion.h" +#include "ringNode.h" +#include "types.h" +#include "uds.h" +#include "waitQueue.h" + +typedef enum { + /** State for locks that are not in use or are being initialized. */ + HASH_LOCK_INITIALIZING = 0, + + // This is the sequence of states typically used on the non-dedupe path. + HASH_LOCK_QUERYING, + HASH_LOCK_WRITING, + HASH_LOCK_UPDATING, + + // The remaining states are typically used on the dedupe path in this order. 
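+  //
+  // Editor's sketch of the typical transitions seen in hashLock.c (not
+  // exhaustive; error paths and BYPASSING are omitted):
+  //
+  //   write path:  QUERYING -> WRITING -> [UPDATING] -> [UNLOCKING]
+  //                -> DESTROYING
+  //   dedupe path: QUERYING -> LOCKING -> [VERIFYING] -> DEDUPING
+  //                -> UNLOCKING -> DESTROYING
+  //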
+ HASH_LOCK_LOCKING, + HASH_LOCK_VERIFYING, + HASH_LOCK_DEDUPING, + HASH_LOCK_UNLOCKING, + + // XXX This is a temporary state denoting a lock which is sending VIOs back + // to the old dedupe and vioWrite pathways. It won't be in the final version + // of VDOSTORY-190. + HASH_LOCK_BYPASSING, + + /** + * Terminal state for locks returning to the pool. Must be last both because + * it's the final state, and also because it's used to count the states. + **/ + HASH_LOCK_DESTROYING, +} HashLockState; + +struct hashLock { + /** When the lock is unused, this RingNode allows the lock to be pooled */ + RingNode poolNode; + + /** The block hash covered by this lock */ + UdsChunkName hash; + + /** + * A ring containing the DataVIOs sharing this lock, all having the same + * chunk name and data block contents, linked by their hashLockNode fields. + **/ + RingNode duplicateRing; + + /** The number of DataVIOs sharing this lock instance */ + VIOCount referenceCount; + + /** The maximum value of referenceCount in the lifetime of this lock */ + VIOCount maxReferences; + + /** The current state of this lock */ + HashLockState state; + + /** True if the UDS index should be updated with new advice */ + bool updateAdvice; + + /** True if the advice has been verified to be a true duplicate */ + bool verified; + + /** True if the lock has already accounted for an initial verification */ + bool verifyCounted; + + /** True if this lock is registered in the lock map (cleared on rollover) */ + bool registered; + + /** + * If verified is false, this is the location of a possible duplicate. + * If verified is true, is is the verified location of a true duplicate. + **/ + ZonedPBN duplicate; + + /** The PBN lock on the block containing the duplicate data */ + PBNLock *duplicateLock; + + /** The DataVIO designated to act on behalf of the lock */ + DataVIO *agent; + + /** + * Other DataVIOs with data identical to the agent who are currently waiting + * for the agent to get the information they all need to deduplicate--either + * against each other, or against an existing duplicate on disk. + **/ + WaitQueue waiters; +}; + +/** + * Initialize a HashLock instance which has been newly allocated. + * + * @param lock The lock to initialize + **/ +static inline void initializeHashLock(HashLock *lock) +{ + initializeRing(&lock->poolNode); + initializeRing(&lock->duplicateRing); + initializeWaitQueue(&lock->waiters); +} + +/** + * Get the string representation of a hash lock state. + * + * @param state The hash lock state + * + * @return The short string representing the state + **/ +const char *getHashLockStateName(HashLockState state) + __attribute__((warn_unused_result)); + +#endif // HASH_LOCK_INTERNALS_H diff --git a/source/vdo/base/hashZone.c b/source/vdo/base/hashZone.c new file mode 100644 index 0000000..61345a7 --- /dev/null +++ b/source/vdo/base/hashZone.c @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/hashZone.c#3 $ + */ + +#include "hashZone.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" +#include "permassert.h" + +#include "constants.h" +#include "dataVIO.h" +#include "hashLock.h" +#include "hashLockInternals.h" +#include "pointerMap.h" +#include "ringNode.h" +#include "statistics.h" +#include "threadConfig.h" +#include "types.h" +#include "vdoInternal.h" + +enum { + LOCK_POOL_CAPACITY = MAXIMUM_USER_VIOS, +}; + +/** + * These fields are only modified by the locks sharing the hash zone thread, + * but are queried by other threads. + **/ +typedef struct atomicHashLockStatistics { + /** Number of times the UDS advice proved correct */ + Atomic64 dedupeAdviceValid; + + /** Number of times the UDS advice proved incorrect */ + Atomic64 dedupeAdviceStale; + + /** Number of writes with the same data as another in-flight write */ + Atomic64 concurrentDataMatches; + + /** Number of writes whose hash collided with an in-flight write */ + Atomic64 concurrentHashCollisions; +} AtomicHashLockStatistics; + +struct hashZone { + /** Which hash zone this is */ + ZoneCount zoneNumber; + + /** The thread ID for this zone */ + ThreadID threadID; + + /** Mapping from chunkName fields to HashLocks */ + PointerMap *hashLockMap; + + /** Ring containing all unused HashLocks */ + RingNode lockPool; + + /** Statistics shared by all hash locks in this zone */ + AtomicHashLockStatistics statistics; + + /** Array of all HashLocks */ + HashLock *lockArray; +}; + +/** + * Implements PointerKeyComparator. + **/ +static bool compareKeys(const void *thisKey, const void *thatKey) +{ + // Null keys are not supported. + return (memcmp(thisKey, thatKey, sizeof(UdsChunkName)) == 0); +} + +/** + * Implements PointerKeyComparator. + **/ +static uint32_t hashKey(const void *key) +{ + const UdsChunkName *name = key; + /* + * Use a fragment of the chunk name as a hash code. It must not overlap with + * fragments used elsewhere to ensure uniform distributions. 
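+   *
+   * Editor's note: with the offset chosen below, the key is simply bytes
+   * 4..7 of the chunk name assembled in little-endian order, i.e.
+   * name->name[4] | (name->name[5] << 8) | (name->name[6] << 16) |
+   * (name->name[7] << 24).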
+ */ + // XXX pick an offset in the chunk name that isn't used elsewhere + return getUInt32LE(&name->name[4]); +} + +/**********************************************************************/ +static inline HashLock *asHashLock(RingNode *poolNode) +{ + STATIC_ASSERT(offsetof(HashLock, poolNode) == 0); + return (HashLock *) poolNode; +} + +/**********************************************************************/ +int makeHashZone(VDO *vdo, ZoneCount zoneNumber, HashZone **zonePtr) +{ + HashZone *zone; + int result = ALLOCATE(1, HashZone, __func__, &zone); + if (result != VDO_SUCCESS) { + return result; + } + + result = makePointerMap(LOCK_MAP_CAPACITY, 0, compareKeys, hashKey, + &zone->hashLockMap); + if (result != VDO_SUCCESS) { + freeHashZone(&zone); + return result; + } + + zone->zoneNumber = zoneNumber; + zone->threadID = getHashZoneThread(getThreadConfig(vdo), zoneNumber); + initializeRing(&zone->lockPool); + + result = ALLOCATE(LOCK_POOL_CAPACITY, HashLock, "HashLock array", + &zone->lockArray); + if (result != VDO_SUCCESS) { + freeHashZone(&zone); + return result; + } + + for (VIOCount i = 0; i < LOCK_POOL_CAPACITY; i++) { + HashLock *lock = &zone->lockArray[i]; + initializeHashLock(lock); + pushRingNode(&zone->lockPool, &lock->poolNode); + } + + *zonePtr = zone; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeHashZone(HashZone **zonePtr) +{ + if (*zonePtr == NULL) { + return; + } + + HashZone *zone = *zonePtr; + freePointerMap(&zone->hashLockMap); + FREE(zone->lockArray); + FREE(zone); + *zonePtr = NULL; +} + +/**********************************************************************/ +ZoneCount getHashZoneNumber(const HashZone *zone) +{ + return zone->zoneNumber; +} + +/**********************************************************************/ +ThreadID getHashZoneThreadID(const HashZone *zone) +{ + return zone->threadID; +} + +/**********************************************************************/ +HashLockStatistics getHashZoneStatistics(const HashZone *zone) +{ + const AtomicHashLockStatistics *atoms = &zone->statistics; + return (HashLockStatistics) { + .dedupeAdviceValid = relaxedLoad64(&atoms->dedupeAdviceValid), + .dedupeAdviceStale = relaxedLoad64(&atoms->dedupeAdviceStale), + .concurrentDataMatches = relaxedLoad64(&atoms->concurrentDataMatches), + .concurrentHashCollisions + = relaxedLoad64(&atoms->concurrentHashCollisions), + }; +} + +/** + * Return a hash lock to the zone's pool and null out the reference to it. + * + * @param [in] zone The zone from which the lock was borrowed + * @param [in,out] lockPtr The last reference to the lock being returned + **/ +static void returnHashLockToPool(HashZone *zone, HashLock **lockPtr) +{ + HashLock *lock = *lockPtr; + *lockPtr = NULL; + + memset(lock, 0, sizeof(*lock)); + initializeHashLock(lock); + pushRingNode(&zone->lockPool, &lock->poolNode); +} + +/**********************************************************************/ +int acquireHashLockFromZone(HashZone *zone, + const UdsChunkName *hash, + HashLock *replaceLock, + HashLock **lockPtr) +{ + // Borrow and prepare a lock from the pool so we don't have to do two + // PointerMap accesses in the common case of no lock contention. 
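+  /*
+   * Editor's note (illustrative summary, not part of the original source):
+   * the single pointerMapPut() below covers three outcomes:
+   *   1. no lock was mapped for the hash   -> the borrowed lock is installed
+   *   2. a lock was already mapped         -> the borrowed lock is returned
+   *                                           to the pool and the mapped
+   *                                           lock is shared
+   *   3. replaceLock was mapped (rollover) -> the borrowed lock replaces it
+   *                                           and becomes the registered lock
+   */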
+ HashLock *newLock = asHashLock(popRingNode(&zone->lockPool)); + int result = ASSERT(newLock != NULL, + "never need to wait for a free hash lock"); + if (result != VDO_SUCCESS) { + return result; + } + + // Fill in the hash of the new lock so we can map it, since we have to use + // the hash as the map key. + newLock->hash = *hash; + + HashLock *lock; + result = pointerMapPut(zone->hashLockMap, &newLock->hash, newLock, + (replaceLock != NULL), (void **) &lock); + if (result != VDO_SUCCESS) { + returnHashLockToPool(zone, &newLock); + return result; + } + + if (replaceLock != NULL) { + // XXX on mismatch put the old lock back and return a severe error + ASSERT_LOG_ONLY(lock == replaceLock, + "old lock must have been in the lock map"); + // XXX check earlier and bail out? + ASSERT_LOG_ONLY(replaceLock->registered, + "old lock must have been marked registered"); + replaceLock->registered = false; + } + + if (lock == replaceLock) { + lock = newLock; + lock->registered = true; + } else { + // There's already a lock for the hash, so we don't need the borrowed lock. + returnHashLockToPool(zone, &newLock); + } + + *lockPtr = lock; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void returnHashLockToZone(HashZone *zone, HashLock **lockPtr) +{ + HashLock *lock = *lockPtr; + *lockPtr = NULL; + + if (lock->registered) { + HashLock *removed = pointerMapRemove(zone->hashLockMap, &lock->hash); + ASSERT_LOG_ONLY(lock == removed, + "hash lock being released must have been mapped"); + } else { + ASSERT_LOG_ONLY(lock != pointerMapGet(zone->hashLockMap, &lock->hash), + "unregistered hash lock must not be in the lock map"); + } + + ASSERT_LOG_ONLY(!hasWaiters(&lock->waiters), + "hash lock returned to zone must have no waiters"); + ASSERT_LOG_ONLY((lock->duplicateLock == NULL), + "hash lock returned to zone must not reference a PBN lock"); + ASSERT_LOG_ONLY((lock->state == HASH_LOCK_DESTROYING), + "returned hash lock must not be in use with state %s", + getHashLockStateName(lock->state)); + ASSERT_LOG_ONLY(isRingEmpty(&lock->poolNode), + "hash lock returned to zone must not be in a pool ring"); + ASSERT_LOG_ONLY(isRingEmpty(&lock->duplicateRing), + "hash lock returned to zone must not reference DataVIOs"); + + returnHashLockToPool(zone, &lock); +} + +/** + * Dump a compact description of HashLock to the log if the lock is not on the + * free list. + * + * @param lock The hash lock to dump + **/ +static void dumpHashLock(const HashLock *lock) +{ + if (!isRingEmpty(&lock->poolNode)) { + // This lock is on the free list. + return; + } + + // Necessarily cryptic since we can log a lot of these. First three chars of + // state is unambiguous. 'U' indicates a lock not registered in the map. + const char *state = getHashLockStateName(lock->state); + logInfo(" hl %" PRIptr ": %3.3s %c%llu/%u rc=%u wc=%zu agt=%" PRIptr, + (const void *) lock, + state, + (lock->registered ? 'D' : 'U'), + lock->duplicate.pbn, + lock->duplicate.state, + lock->referenceCount, + countWaiters(&lock->waiters), + (void *) lock->agent); +} + +/**********************************************************************/ +void bumpHashZoneValidAdviceCount(HashZone *zone) +{ + // Must only be mutated on the hash zone thread. + relaxedAdd64(&zone->statistics.dedupeAdviceValid, 1); +} + +/**********************************************************************/ +void bumpHashZoneStaleAdviceCount(HashZone *zone) +{ + // Must only be mutated on the hash zone thread. 
+ relaxedAdd64(&zone->statistics.dedupeAdviceStale, 1); +} + +/**********************************************************************/ +void bumpHashZoneDataMatchCount(HashZone *zone) +{ + // Must only be mutated on the hash zone thread. + relaxedAdd64(&zone->statistics.concurrentDataMatches, 1); +} + +/**********************************************************************/ +void bumpHashZoneCollisionCount(HashZone *zone) +{ + // Must only be mutated on the hash zone thread. + relaxedAdd64(&zone->statistics.concurrentHashCollisions, 1); +} + +/**********************************************************************/ +void dumpHashZone(const HashZone *zone) +{ + if (zone->hashLockMap == NULL) { + logInfo("HashZone %u: NULL map", zone->zoneNumber); + return; + } + + logInfo("HashZone %u: mapSize=%zu", + zone->zoneNumber, pointerMapSize(zone->hashLockMap)); + for (VIOCount i = 0; i < LOCK_POOL_CAPACITY; i++) { + dumpHashLock(&zone->lockArray[i]); + } +} diff --git a/source/vdo/base/hashZone.h b/source/vdo/base/hashZone.h new file mode 100644 index 0000000..ac1b695 --- /dev/null +++ b/source/vdo/base/hashZone.h @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/hashZone.h#1 $ + */ + +#ifndef HASH_ZONE_H +#define HASH_ZONE_H + +#include "uds.h" + +#include "statistics.h" +#include "types.h" + +/** + * Create a hash zone. + * + * @param [in] vdo The VDO to which the zone will belong + * @param [in] zoneNumber The number of the zone to create + * @param [out] zonePtr A pointer to hold the new HashZone + * + * @return VDO_SUCCESS or an error code + **/ +int makeHashZone(VDO *vdo, ZoneCount zoneNumber, HashZone **zonePtr) + __attribute__((warn_unused_result)); + +/** + * Free a hash zone and null out the reference to it. + * + * @param zonePtr A pointer to the zone to free + **/ +void freeHashZone(HashZone **zonePtr); + +/** + * Get the zone number of a hash zone. + * + * @param zone The zone + * + * @return The number of the zone + **/ +ZoneCount getHashZoneNumber(const HashZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the ID of a hash zone's thread. + * + * @param zone The zone + * + * @return The zone's thread ID + **/ +ThreadID getHashZoneThreadID(const HashZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the statistics for this hash zone. + * + * @param zone The hash zone to query + * + * @return A copy of the current statistics for the hash zone + **/ +HashLockStatistics getHashZoneStatistics(const HashZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the lock for the hash (chunk name) of the data in a DataVIO, or if one + * does not exist (or if we are explicitly rolling over), initialize a new + * lock for the hash and register it in the zone. 
This must only be called in + * the correct thread for the zone. + * + * @param [in] zone The zone responsible for the hash + * @param [in] hash The hash to lock + * @param [in] replaceLock If non-NULL, the lock already registered for the + * hash which should be replaced by the new lock + * @param [out] lockPtr A pointer to receive the hash lock + * + * @return VDO_SUCCESS or an error code + **/ +int acquireHashLockFromZone(HashZone *zone, + const UdsChunkName *hash, + HashLock *replaceLock, + HashLock **lockPtr) + __attribute__((warn_unused_result)); + +/** + * Return a hash lock to the zone it was borrowed from, remove it from the + * zone's lock map, returning it to the pool, and nulling out the reference to + * it. This must only be called when the lock has been completely released, + * and only in the correct thread for the zone. + * + * @param [in] zone The zone from which the lock was borrowed + * @param [in,out] lockPtr The lock that is no longer in use + **/ +void returnHashLockToZone(HashZone *zone, HashLock **lockPtr); + +/** + * Increment the valid advice count in the hash zone statistics. + * Must only be called from the hash zone thread. + * + * @param zone The hash zone of the lock that received valid advice + **/ +void bumpHashZoneValidAdviceCount(HashZone *zone); + +/** + * Increment the stale advice count in the hash zone statistics. + * Must only be called from the hash zone thread. + * + * @param zone The hash zone of the lock that received stale advice + **/ +void bumpHashZoneStaleAdviceCount(HashZone *zone); + +/** + * Increment the concurrent dedupe count in the hash zone statistics. + * Must only be called from the hash zone thread. + * + * @param zone The hash zone of the lock that matched a new DataVIO + **/ +void bumpHashZoneDataMatchCount(HashZone *zone); + +/** + * Increment the concurrent hash collision count in the hash zone statistics. + * Must only be called from the hash zone thread. + * + * @param zone The hash zone of the lock that rejected a colliding DataVIO + **/ +void bumpHashZoneCollisionCount(HashZone *zone); + +/** + * Dump information about a hash zone to the log for debugging. + * + * @param zone The zone to dump + **/ +void dumpHashZone(const HashZone *zone); + +#endif // HASH_ZONE_H diff --git a/source/vdo/base/header.c b/source/vdo/base/header.c new file mode 100644 index 0000000..8f0582b --- /dev/null +++ b/source/vdo/base/header.c @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/header.c#5 $ + */ + +#include "header.h" + +#include "logger.h" +#include "permassert.h" +#include "statusCodes.h" + +/**********************************************************************/ +int validateVersion(VersionNumber expectedVersion, + VersionNumber actualVersion, + const char *componentName) +{ + if (!areSameVersion(expectedVersion, actualVersion)) { + return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, + "%s version mismatch," + " expected %d.%d, got %d.%d", + componentName, + expectedVersion.majorVersion, + expectedVersion.minorVersion, + actualVersion.majorVersion, + actualVersion.minorVersion); + } + return VDO_SUCCESS; +} + +/**********************************************************************/ +int validateHeader(const Header *expectedHeader, + const Header *actualHeader, + bool exactSize, + const char *componentName) +{ + if (expectedHeader->id != actualHeader->id) { + return logErrorWithStringError(VDO_INCORRECT_COMPONENT, + "%s ID mismatch, expected %d, got %d", + componentName, + expectedHeader->id, + actualHeader->id); + } + + int result = validateVersion(expectedHeader->version, + actualHeader->version, + componentName); + if (result != VDO_SUCCESS) { + return result; + } + + if ((expectedHeader->size > actualHeader->size) + || (exactSize && (expectedHeader->size < actualHeader->size))) { + return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, + "%s size mismatch, expected %zu, got %zu", + componentName, + expectedHeader->size, + actualHeader->size); + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int encodeHeader(const Header *header, Buffer *buffer) +{ + if (!ensureAvailableSpace(buffer, ENCODED_HEADER_SIZE)) { + return UDS_BUFFER_ERROR; + } + + int result = putUInt32LEIntoBuffer(buffer, header->id); + if (result != UDS_SUCCESS) { + return result; + } + + result = encodeVersionNumber(header->version, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + return putUInt64LEIntoBuffer(buffer, header->size); +} + +/**********************************************************************/ +int encodeVersionNumber(VersionNumber version, Buffer *buffer) +{ + PackedVersionNumber packed = packVersionNumber(version); + return putBytes(buffer, sizeof(packed), &packed); +} + +/**********************************************************************/ +int decodeHeader(Buffer *buffer, Header *header) +{ + ComponentID id; + int result = getUInt32LEFromBuffer(buffer, &id); + if (result != UDS_SUCCESS) { + return result; + } + + VersionNumber version; + result = decodeVersionNumber(buffer, &version); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t size; + result = getUInt64LEFromBuffer(buffer, &size); + if (result != UDS_SUCCESS) { + return result; + } + + *header = (Header) { + .id = id, + .version = version, + .size = size, + }; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int decodeVersionNumber(Buffer *buffer, VersionNumber *version) +{ + PackedVersionNumber packed; + int result = getBytesFromBuffer(buffer, sizeof(packed), &packed); + if (result != UDS_SUCCESS) { + return result; + } + + *version = unpackVersionNumber(packed); + return UDS_SUCCESS; +} diff --git a/source/vdo/base/header.h b/source/vdo/base/header.h new file mode 100644 index 0000000..d5b4f0e --- /dev/null +++ b/source/vdo/base/header.h @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/header.h#4 $ + */ + +#ifndef HEADER_H +#define HEADER_H + +#include "buffer.h" +#include "numeric.h" + +#include "types.h" + +/** + * An in-memory representation of a version number for versioned structures on + * disk. + * + * A version number consists of two portions, a major version and a + * minor version. Any format change which does not require an explicit + * upgrade step from the previous version should increment the minor + * version. Any format change which either requires an explicit + * upgrade step, or is wholly incompatible (i.e. can not be upgraded + * to), should increment the major version, and set the minor version + * to 0. + **/ +typedef struct { + uint32_t majorVersion; + uint32_t minorVersion; +} __attribute__((packed)) VersionNumber; + +/** + * A packed, machine-independent, on-disk representation of a VersionNumber. + * Both fields are stored in little-endian byte order. + **/ +typedef struct { + byte majorVersion[4]; + byte minorVersion[4]; +} __attribute__((packed)) PackedVersionNumber; + +/** + * The registry of component ids for use in headers + **/ +typedef enum { + SUPER_BLOCK = 0, + FIXED_LAYOUT = 1, + RECOVERY_JOURNAL = 2, + SLAB_DEPOT = 3, + BLOCK_MAP = 4, + GEOMETRY_BLOCK = 5, +} ComponentID; + +/** + * The header for versioned data stored on disk. + **/ +typedef struct { + ComponentID id; // The component this is a header for + VersionNumber version; // The version of the data format + size_t size; // The size of the data following this header +} __attribute__((packed)) Header; + +enum { + ENCODED_HEADER_SIZE = sizeof(Header), +}; + +/** + * Check whether two version numbers are the same. + * + * @param versionA The first version + * @param versionB The second version + * + * @return true if the two versions are the same + **/ +static inline bool areSameVersion(VersionNumber versionA, + VersionNumber versionB) +{ + return ((versionA.majorVersion == versionB.majorVersion) + && (versionA.minorVersion == versionB.minorVersion)); +} + +/** + * Check whether an actual version is upgradable to an expected version. + * An actual version is upgradable if its major number is expected but + * its minor number differs, and the expected version's minor number + * is greater than the actual version's minor number. + * + * @param expectedVersion The expected version + * @param actualVersion The version being validated + * + * @return true if the actual version is upgradable + **/ +static inline bool isUpgradableVersion(VersionNumber expectedVersion, + VersionNumber actualVersion) +{ + return ((expectedVersion.majorVersion == actualVersion.majorVersion) + && (expectedVersion.minorVersion > actualVersion.minorVersion)); +} + +/** + * Check whether a version matches an expected version. 
Logs an error + * describing a mismatch. + * + * @param expectedVersion The expected version + * @param actualVersion The version being validated + * @param componentName The name of the component or the calling function + * (for error logging) + * + * @return VDO_SUCCESS if the versions are the same + * VDO_UNSUPPORTED_VERSION if the versions don't match + **/ +int validateVersion(VersionNumber expectedVersion, + VersionNumber actualVersion, + const char *componentName) + __attribute__((warn_unused_result)); + +/** + * Check whether a header matches expectations. Logs an error describing the + * first mismatch found. + * + * @param expectedHeader The expected header + * @param actualHeader The header being validated + * @param exactSize If true, the size fields of the two headers must be + * the same, otherwise actualSize >= expectedSize is OK + * @param componentName The name of the component or the calling function + * (for error logging) + * + * @return VDO_SUCCESS if the header meets expectations + * VDO_INCORRECT_COMPONENT if the component ids don't match + * VDO_UNSUPPORTED_VERSION if the versions or sizes don't match + **/ +int validateHeader(const Header *expectedHeader, + const Header *actualHeader, + bool exactSize, + const char *componentName) + __attribute__((warn_unused_result)); + +/** + * Encode a header into a buffer. + * + * @param header The header to encode + * @param buffer The buffer in which to encode the header + * + * @return UDS_SUCCESS or an error + **/ +int encodeHeader(const Header *header, Buffer *buffer) + __attribute__((warn_unused_result)); + +/** + * Encode a version number into a buffer. + * + * @param version The version to encode + * @param buffer The buffer in which to encode the version + * + * @return UDS_SUCCESS or an error + **/ +int encodeVersionNumber(VersionNumber version, Buffer *buffer) + __attribute__((warn_unused_result)); + +/** + * Decode a header from a buffer. + * + * @param [in] buffer The buffer from which to decode the header + * @param [out] header The header to decode + * + * @return UDS_SUCCESS or an error + **/ +int decodeHeader(Buffer *buffer, Header *header) + __attribute__((warn_unused_result)); + +/** + * Decode a version number from a buffer. + * + * @param buffer The buffer from which to decode the version + * @param version The version structure to decode into + * + * @return UDS_SUCCESS or an error + **/ +int decodeVersionNumber(Buffer *buffer, VersionNumber *version) + __attribute__((warn_unused_result)); + +/** + * Convert a VersionNumber to its packed on-disk representation. + * + * @param version The version number to convert + * + * @return the platform-independent representation of the version + **/ +static inline PackedVersionNumber packVersionNumber(VersionNumber version) +{ + PackedVersionNumber packed; + storeUInt32LE(packed.majorVersion, version.majorVersion); + storeUInt32LE(packed.minorVersion, version.minorVersion); + return packed; +} + +/** + * Convert a PackedVersionNumber to its native in-memory representation. 
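+ *
+ * Editor's note: because packVersionNumber() and this function both use the
+ * little-endian store/load helpers, unpackVersionNumber(packVersionNumber(v))
+ * reproduces v exactly, regardless of the host byte order.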
+ * + * @param version The version number to convert + * + * @return the platform-independent representation of the version + **/ +static inline VersionNumber unpackVersionNumber(PackedVersionNumber version) +{ + return (VersionNumber) { + .majorVersion = getUInt32LE(version.majorVersion), + .minorVersion = getUInt32LE(version.minorVersion), + }; +} + +#endif // HEADER_H diff --git a/source/vdo/base/heap.c b/source/vdo/base/heap.c new file mode 100644 index 0000000..0928023 --- /dev/null +++ b/source/vdo/base/heap.c @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/heap.c#2 $ + */ + +#include "heap.h" + +#include "errors.h" +#include "logger.h" +#include "numeric.h" + +#include "statusCodes.h" + +/**********************************************************************/ +void initializeHeap(Heap *heap, + HeapComparator *comparator, + HeapSwapper *swapper, + void *array, + size_t capacity, + size_t elementSize) +{ + *heap = (Heap) { + .comparator = comparator, + .swapper = swapper, + .capacity = capacity, + .elementSize = elementSize, + }; + if (array != NULL) { + // Calculating child indexes is simplified by pretending the element array + // is 1-based. + heap->array = ((byte *) array - elementSize); + } +} + +/**********************************************************************/ +static void siftHeapDown(Heap *heap, size_t topNode, size_t lastNode) +{ + // Keep sifting until the sub-heap rooted at topNode has no children. + size_t leftChild; + while ((leftChild = (2 * topNode)) <= lastNode) { + // If there are two children, select the largest child to swap with. + size_t swapNode = leftChild; + if (leftChild < lastNode) { + size_t rightChild = leftChild + heap->elementSize; + if (heap->comparator(&heap->array[leftChild], + &heap->array[rightChild]) < 0) { + swapNode = rightChild; + } + } + + // Stop sifting if topNode is at least as large as its largest child, + // which means the heap invariant was restored by the previous swap. + if (heap->comparator(&heap->array[topNode], &heap->array[swapNode]) >= 0) { + return; + } + + // Swap the element we've been sifting down with the larger child. + heap->swapper(&heap->array[topNode], &heap->array[swapNode]); + + // Descend into the sub-heap rooted at that child, going around the loop + // again in place of a tail-recursive call to siftHeapDown(). + topNode = swapNode; + } + + // We sifted the element all the way to a leaf node of the heap, so the heap + // invariant has now been restored. 
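+
+  // Editor's illustration (using logical 1-based node indexes rather than
+  // byte offsets): sifting the root of {2, 9, 7, 5, 4} swaps 2 with its
+  // larger child 9, then with 5, leaving the valid max-heap {9, 5, 7, 2, 4}.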
+} + +/**********************************************************************/ +void buildHeap(Heap *heap, size_t count) +{ + heap->count = minSizeT(count, heap->capacity); + + if ((heap->count < 2) || (heap->elementSize == 0)) { + return; + } + + /* + * All the leaf nodes are trivially valid sub-heaps. Starting with the parent + * of the right-most leaf node, restore the heap invariant in that sub-heap + * by sifting the top node of the sub-heap down into one of its children's + * valid sub-heaps (or not, if the top node is already larger than its + * children). Continue iterating through all the interior nodes in the heap, + * in sort of a reverse breadth-first traversal, restoring the heap + * invariant for each (increasingly larger) sub-heap until we reach the root + * of the heap. Once we sift the root node down into one of its two valid + * children, the entire heap must be valid, by induction. + * + * Even though we operate on every node and potentially perform an O(log N) + * traversal for each node, the combined probabilities of actually needing + * to do a swap and the heights of the sub-heaps sum to a constant, so + * restoring a heap from the bottom-up like this has only O(N) complexity. + */ + size_t size = heap->elementSize; + size_t lastParent = size * (heap->count / 2); + size_t lastNode = size * heap->count; + for (size_t topNode = lastParent; topNode > 0; topNode -= size) { + siftHeapDown(heap, topNode, lastNode); + } +} + +/**********************************************************************/ +bool popMaxHeapElement(Heap *heap, void *elementPtr) +{ + if (heap->count == 0) { + return false; + } + + size_t rootNode = (heap->elementSize * 1); + size_t lastNode = (heap->elementSize * heap->count); + + // Return the maximum element (the root of the heap) if the caller wanted it. + if (elementPtr != NULL) { + memcpy(elementPtr, &heap->array[rootNode], heap->elementSize); + } + + // Move the right-most leaf node to the vacated root node, reducing the + // number of elements by one and violating the heap invariant. + if (rootNode != lastNode) { + memcpy(&heap->array[rootNode], &heap->array[lastNode], heap->elementSize); + } + heap->count -= 1; + lastNode -= heap->elementSize; + + // Restore the heap invariant by sifting the root back down into the heap. + siftHeapDown(heap, rootNode, lastNode); + return true; +} + +/**********************************************************************/ +static inline size_t siftAndSort(Heap *heap, size_t rootNode, size_t lastNode) +{ + /* + * We have a valid heap, so the largest unsorted element is now at the top + * of the heap. That element belongs at the start of the partially-sorted + * array, preceding all the larger elements that we've already removed + * from the heap. Swap that largest unsorted element with the the + * right-most leaf node in the heap, moving it to its sorted position in + * the array. + */ + heap->swapper(&heap->array[rootNode], &heap->array[lastNode]); + // The sorted list is now one element larger and valid. The heap is + // one element smaller, and invalid. + lastNode -= heap->elementSize; + // Restore the heap invariant by sifting the swapped element back down + // into the heap. + siftHeapDown(heap, rootNode, lastNode); + return lastNode; +} + +/**********************************************************************/ +size_t sortHeap(Heap *heap) +{ + // All zero-length records are identical and therefore already sorted, as + // are empty or singleton arrays. 
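+  //
+  // Editor's illustration (logical node indexes): sorting the max-heap
+  // {9, 5, 7, 2, 4} swaps 9 to the end and sifts, then 7, and so on, leaving
+  // the array in ascending order: {2, 4, 5, 7, 9}.
+  //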
+ if ((heap->count < 2) || (heap->elementSize == 0)) { + return heap->count; + } + + // Get the byte array offset of the root node, and the right-most leaf node + // in the 1-based array of records that will form the heap. + size_t rootNode = (heap->elementSize * 1); + size_t lastNode = (heap->elementSize * heap->count); + + while (lastNode > rootNode) { + lastNode = siftAndSort(heap, rootNode, lastNode); + } + + size_t count = heap->count; + heap->count = 0; + return count; +} + +/**********************************************************************/ +void *sortNextHeapElement(Heap *heap) +{ + if ((heap->count == 0) || (heap->elementSize == 0)) { + return NULL; + } + + // Get the byte array offset of the root node, and the right-most leaf node + // in the 1-based array of records that will form the heap. + size_t rootNode = (heap->elementSize * 1); + size_t lastNode = (heap->elementSize * heap->count); + if (heap->count > 1) { + siftAndSort(heap, rootNode, lastNode); + } + heap->count--; + + return &heap->array[lastNode]; +} diff --git a/source/vdo/base/heap.h b/source/vdo/base/heap.h new file mode 100644 index 0000000..916f017 --- /dev/null +++ b/source/vdo/base/heap.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/heap.h#2 $ + */ + +#ifndef HEAP_H +#define HEAP_H + +#include "common.h" + +/** + * Prototype for functions which compare two array elements. All the time + * complexity claims in this module assume this operation has O(1) time + * complexity. + * + * @param item1 The first element to compare + * @param item2 The second element to compare + * + * @return An integer which is less than, equal to, or greater than 0 + * depending on whether item1 is less than, equal to, or greater + * than item2, respectively + **/ +typedef int HeapComparator(const void *item1, const void *item2); + +/** + * Prototype for functions which swap two array elements. + * + * @param item1 The first element to swap + * @param item2 The second element to swap + **/ +typedef void HeapSwapper(void *item1, void *item2); + +/** + * A heap array can be any array of fixed-length elements in which the heap + * invariant can be established. In a max-heap, every child of a node must be + * at least as large as its children. Once that invariant is established in an + * array by calling buildHeap(), all the other heap operations may be used on + * that array. 
+ **/ +typedef struct heap { + /** the 1-based array of heap elements (nodes) */ + byte *array; + /** the function to use to compare two elements */ + HeapComparator *comparator; + /** the function to use to swap two elements */ + HeapSwapper *swapper; + /** the maximum number of elements that can be stored */ + size_t capacity; + /** the size of every element (in bytes) */ + size_t elementSize; + /** the current number of elements in the heap */ + size_t count; +} Heap; + +/** + * Initialize an binary heap by wrapping it around an array of elements. + * + * The heap will not own the array it wraps. Use buildHeap() subsequently to + * arrange any elements contained in the array into a valid heap. + * + * @param heap The heap to initialize + * @param comparator The function to use to compare two heap elements + * @param swapper The function to use to swap two heap elements + * @param array The array of elements (not modified by this call) + * @param capacity The maximum number of elements which fit in the array + * @param elementSize The size of every array element, in bytes + **/ +void initializeHeap(Heap *heap, + HeapComparator *comparator, + HeapSwapper *swapper, + void *array, + size_t capacity, + size_t elementSize); + +/** + * Build a max-heap in place in an array (heapify it) by re-ordering the + * elements to establish the heap invariant. Before calling this function, + * first copy the elements to be arranged into a heap into the array that was + * passed to initializeHeap(). This operation has O(N) time complexity in the + * number of elements in the array. + * + * @param heap The heap to build + * @param count The number of elements in the array to build into a heap + **/ +void buildHeap(Heap *heap, size_t count); + +/** + * Check whether the heap is currently empty. + * + * @param heap The heap to query + * + * @return true if there are no elements in the heap + **/ +static inline bool isHeapEmpty(const Heap *heap) +{ + return (heap->count == 0); +} + +/** + * Remove the largest element from the top of the heap and restore the heap + * invariant on the remaining elements. This operation has O(log2(N)) time + * complexity. + * + * @param [in] heap The heap to modify + * @param [out] elementPtr A pointer to receive the largest element (may be + * NULL if the caller just wishes to discard it) + * + * @return false if the heap was empty, so no element was removed + **/ +bool popMaxHeapElement(Heap *heap, void *elementPtr); + +/** + * Sort the elements contained in a heap. + * + * This function re-orders the elements contained in the heap to a sorted + * array in-place by repeatedly popping the maximum element off the heap and + * moving it to the spot vacated at the end of the heap array. When the + * function returns, the heap will be empty and the array will contain the + * elements in sorted order, from heap minimum to heap maximum. The sort is + * unstable--relative ordering of equal keys is not preserved. This operation + * has O(N*log2(N)) time complexity. + * + * @param heap The heap containing the elements to sort + * + * @return the number of elements that were sorted + **/ +size_t sortHeap(Heap *heap); + +/** + * Gets the next sorted heap element and returns a pointer to it, in O(log2(N)) + * time. + * + * @param heap The heap to sort one more step + * + * @return a pointer to the element sorted, or NULL if already fully sorted. 
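Taken together, these functions sort an arbitrary array of fixed-size records in place. A minimal usage sketch, assuming only the declarations in this header; the element type, comparator, swapper, and wrapper function below are hypothetical:

static int compareUInt64(const void *item1, const void *item2)
{
  uint64_t a = *((const uint64_t *) item1);
  uint64_t b = *((const uint64_t *) item2);
  return ((a < b) ? -1 : ((a > b) ? 1 : 0));
}

static void swapUInt64(void *item1, void *item2)
{
  uint64_t temp = *((uint64_t *) item1);
  *((uint64_t *) item1) = *((uint64_t *) item2);
  *((uint64_t *) item2) = temp;
}

static void sortValues(uint64_t *values, size_t count)
{
  Heap heap;
  initializeHeap(&heap, compareUInt64, swapUInt64, values, count,
                 sizeof(uint64_t));
  // Establish the max-heap invariant in O(N) ...
  buildHeap(&heap, count);
  // ... then sort in place; values[] ends up in ascending order and the
  // heap is left empty.
  sortHeap(&heap);
}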
+ **/ +void *sortNextHeapElement(Heap *heap); + +#endif /* HEAP_H */ diff --git a/source/vdo/base/intMap.c b/source/vdo/base/intMap.c new file mode 100644 index 0000000..2c690a6 --- /dev/null +++ b/source/vdo/base/intMap.c @@ -0,0 +1,661 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/intMap.c#1 $ + */ + +/** + * Hash table implementation of a map from integers to pointers, implemented + * using the Hopscotch Hashing algorithm by Herlihy, Shavit, and Tzafrir (see + * http://en.wikipedia.org/wiki/Hopscotch_hashing). This implementation does + * not contain any of the locking/concurrency features of the algorithm, just + * the collision resolution scheme. + * + * Hopscotch Hashing is based on hashing with open addressing and linear + * probing. All the entries are stored in a fixed array of buckets, with no + * dynamic allocation for collisions. Unlike linear probing, all the entries + * that hash to a given bucket are stored within a fixed neighborhood starting + * at that bucket. Chaining is effectively represented as a bit vector + * relative to each bucket instead of as pointers or explicit offsets. + * + * When an empty bucket cannot be found within a given neighborhood, + * subsequent neighborhoods are searched, and one or more entries will "hop" + * into those neighborhoods. When this process works, an empty bucket will + * move into the desired neighborhood, allowing the entry to be added. When + * that process fails (typically when the buckets are around 90% full), the + * table must be resized and the all entries rehashed and added to the + * expanded table. + * + * Unlike linear probing, the number of buckets that must be searched in the + * worst case has a fixed upper bound (the size of the neighborhood). Those + * entries occupy a small number of memory cache lines, leading to improved + * use of the cache (fewer misses on both successful and unsuccessful + * searches). Hopscotch hashing outperforms linear probing at much higher load + * factors, so even with the increased memory burden for maintaining the hop + * vectors, less memory is needed to achieve that performance. Hopscotch is + * also immune to "contamination" from deleting entries since entries are + * genuinely removed instead of being replaced by a placeholder. + * + * The published description of the algorithm used a bit vector, but the paper + * alludes to an offset scheme which is used by this implementation. Since the + * entries in the neighborhood are within N entries of the hash bucket at the + * start of the neighborhood, a pair of small offset fields each log2(N) bits + * wide is all that's needed to maintain the hops as a linked list. In order + * to encode "no next hop" (i.e. 
NULL) as the natural initial value of zero, + * the offsets are biased by one (i.e. 0 => NULL, 1 => offset=0, 2 => + * offset=1, etc.) We can represent neighborhoods of up to 255 entries with + * just 8+8=16 bits per entry. The hop list is sorted by hop offset so the + * first entry in the list is always the bucket closest to the start of the + * neighborhood. + * + * While individual accesses tend to be very fast, the table resize operations + * are very very expensive. If an upper bound on the latency of adding an + * entry to the table is needed, we either need to ensure the table is + * pre-sized to be large enough so no resize is ever needed, or we'll need to + * develop an approach to incrementally resize the table. + **/ + +#include "intMap.h" + +#include "errors.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" +#include "permassert.h" + +enum { + DEFAULT_CAPACITY = 16, // the number of neighborhoods in a new table + NEIGHBORHOOD = 255, // the number of buckets in each neighborhood + MAX_PROBES = 1024, // limit on the number of probes for a free bucket + NULL_HOP_OFFSET = 0, // the hop offset value terminating the hop list + DEFAULT_LOAD = 75 // a compromise between memory use and performance +}; + +/** + * Buckets are packed together to reduce memory usage and improve cache + * efficiency. It would be tempting to encode the hop offsets separately and + * maintain alignment of key/value pairs, but it's crucial to keep the hop + * fields near the buckets that they use them so they'll tend to share cache + * lines. + **/ +typedef struct __attribute__((packed)) bucket { + uint8_t firstHop; // the biased offset of the first entry in the hop list + // of the neighborhood that hashes to this bucket + uint8_t nextHop; // the biased offset of the next bucket in the hop list + + uint64_t key; // the key stored in this bucket + void *value; // the value stored in this bucket (NULL if empty) +} Bucket; + +/** + * The concrete definition of the opaque IntMap type. To avoid having to wrap + * the neighborhoods of the last entries back around to the start of the + * bucket array, we allocate a few more buckets at the end of the array + * instead, which is why capacity and bucketCount are different. + **/ +struct intMap { + size_t size; // the number of entries stored in the map + size_t capacity; // the number of neighborhoods in the map + size_t bucketCount; // the number of buckets in the bucket array + Bucket *buckets; // the array of hash buckets +}; + +/** + * This is the Google CityHash 16-byte hash mixing function. + * + * @param input1 the first input value + * @param input2 the second input value + * + * @return a hash of the two inputs + **/ +static uint64_t mix(uint64_t input1, uint64_t input2) +{ + static const uint64_t CITY_MULTIPLIER = 0x9ddfea08eb382d69ULL; + + uint64_t hash = (input1 ^ input2); + hash *= CITY_MULTIPLIER; + hash ^= (hash >> 47); + hash ^= input2; + hash *= CITY_MULTIPLIER; + hash ^= (hash >> 47); + hash *= CITY_MULTIPLIER; + return hash; +} + +/** + * Calculate a 64-bit non-cryptographic hash value for the provided 64-bit + * integer key. The implementation is based on Google's CityHash, only + * handling the specific case of an 8-byte input. + * + * @param key the mapping key + * + * @return the hash of the mapping key + **/ +static uint64_t hashKey(uint64_t key) +{ + // Aliasing restrictions forbid us from casting pointer types, so use a + // union to convert a single uint64_t to two uint32_t values. 
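  // (For example, on a little-endian machine the key 0x0000000100000002
  // splits into pun.u32[0] == 0x00000002 and pun.u32[1] == 0x00000001.)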
+ union { + uint64_t u64; + uint32_t u32[2]; + } pun = { .u64 = key }; + return mix(sizeof(key) + (((uint64_t) pun.u32[0]) << 3), pun.u32[1]); +} + +/** + * Initialize an IntMap. + * + * @param map the map to initialize + * @param capacity the initial capacity of the map + * + * @return UDS_SUCCESS or an error code + **/ +static int allocateBuckets(IntMap *map, size_t capacity) +{ + map->size = 0; + map->capacity = capacity; + + // Allocate NEIGHBORHOOD - 1 extra buckets so the last bucket can have a + // full neighborhood without have to wrap back around to element zero. + map->bucketCount = capacity + (NEIGHBORHOOD - 1); + return ALLOCATE(map->bucketCount, Bucket, "IntMap buckets", &map->buckets); +} + +/**********************************************************************/ +int makeIntMap(size_t initialCapacity, + unsigned int initialLoad, + IntMap **mapPtr) +{ + // Use the default initial load if the caller did not specify one. + if (initialLoad == 0) { + initialLoad = DEFAULT_LOAD; + } + if (initialLoad > 100) { + return UDS_INVALID_ARGUMENT; + } + + IntMap *map; + int result = ALLOCATE(1, IntMap, "IntMap", &map); + if (result != UDS_SUCCESS) { + return result; + } + + // Use the default capacity if the caller did not specify one. + size_t capacity = (initialCapacity > 0) ? initialCapacity : DEFAULT_CAPACITY; + + // Scale up the capacity by the specified initial load factor. + // (i.e to hold 1000 entries at 80% load we need a capacity of 1250) + capacity = capacity * 100 / initialLoad; + + result = allocateBuckets(map, capacity); + if (result != UDS_SUCCESS) { + freeIntMap(&map); + return result; + } + + *mapPtr = map; + return UDS_SUCCESS; +} + +/** + * Free the bucket array for the map. + * + * @param map the map whose bucket array is to be freed + **/ +static void freeBuckets(IntMap *map) +{ + FREE(map->buckets); + map->buckets = NULL; +} + +/**********************************************************************/ +void freeIntMap(IntMap **mapPtr) +{ + if (*mapPtr != NULL) { + freeBuckets(*mapPtr); + FREE(*mapPtr); + *mapPtr = NULL; + } +} + +/**********************************************************************/ +size_t intMapSize(const IntMap *map) +{ + return map->size; +} + +/** + * Convert a biased hop offset within a neighborhood to a pointer to the + * bucket it references. + * + * @param neighborhood the first bucket in the neighborhood + * @param hopOffset the biased hop offset to the desired bucket + * + * @return NULL if hopOffset is zero, otherwise a pointer to + * the bucket in the neighborhood at hopOffset - 1 + **/ +static Bucket *dereferenceHop(Bucket *neighborhood, unsigned int hopOffset) +{ + if (hopOffset == NULL_HOP_OFFSET) { + return NULL; + } + + STATIC_ASSERT(NULL_HOP_OFFSET == 0); + return &neighborhood[hopOffset - 1]; +} + +/** + * Add a bucket into the hop list for the neighborhood, inserting it into the + * list so the hop list remains sorted by hop offset. + * + * @param neighborhood the first bucket in the neighborhood + * @param newBucket the bucket to add to the hop list + **/ +static void insertInHopList(Bucket *neighborhood, Bucket *newBucket) +{ + // Zero indicates a NULL hop offset, so bias the hop offset by one. + int hopOffset = 1 + (newBucket - neighborhood); + + // Handle the special case of adding a bucket at the start of the list. 
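  /*
   * For illustration (with hypothetical positions): if newBucket is the
   * third bucket of the neighborhood, (newBucket - neighborhood) is 2 and
   * the biased hopOffset stored in the list is 3; dereferenceHop() subtracts
   * the bias to recover the bucket. A stored offset of zero always means
   * "no next hop", so a freshly zeroed bucket array starts out with every
   * hop list empty.
   */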
+ int nextHop = neighborhood->firstHop; + if ((nextHop == NULL_HOP_OFFSET) || (nextHop > hopOffset)) { + newBucket->nextHop = nextHop; + neighborhood->firstHop = hopOffset; + return; + } + + // Search the hop list for the insertion point that maintains the sort + // order. + for (;;) { + Bucket *bucket = dereferenceHop(neighborhood, nextHop); + nextHop = bucket->nextHop; + + if ((nextHop == NULL_HOP_OFFSET) || (nextHop > hopOffset)) { + newBucket->nextHop = nextHop; + bucket->nextHop = hopOffset; + return; + } + } +} + +/** + * Select and return the hash bucket for a given search key. + * + * @param map the map to search + * @param key the mapping key + **/ +static Bucket *selectBucket(const IntMap *map, uint64_t key) +{ + // Calculate a good hash value for the provided key. We want exactly 32 + // bits, so mask the result. + uint64_t hash = hashKey(key) & 0xFFFFFFFF; + + /* + * Scale the 32-bit hash to a bucket index by treating it as a binary + * fraction and multiplying that by the capacity. If the hash is uniformly + * distributed over [0 .. 2^32-1], then (hash * capacity / 2^32) should be + * uniformly distributed over [0 .. capacity-1]. The multiply and shift is + * much faster than a divide (modulus) on X86 CPUs. + */ + return &map->buckets[(hash * map->capacity) >> 32]; +} + +/** + * Search the hop list associated with given hash bucket for a given search + * key. If the key is found, returns a pointer to the entry (bucket or + * collision), otherwise returns NULL. + * + * @param [in] map the map being searched + * @param [in] bucket the map bucket to search for the key + * @param [in] key the mapping key + * @param [out] previousPtr if not NULL, a pointer in which to + * store the bucket in the list preceding the one + * that had the matching key + * + * @return an entry that matches the key, or NULL if not found + **/ +static Bucket *searchHopList(IntMap *map __attribute__((unused)), + Bucket *bucket, + uint64_t key, + Bucket **previousPtr) +{ + Bucket *previous = NULL; + unsigned int nextHop = bucket->firstHop; + while (nextHop != NULL_HOP_OFFSET) { + // Check the neighboring bucket indexed by the offset for the desired key. + Bucket *entry = dereferenceHop(bucket, nextHop); + if ((key == entry->key) && (entry->value != NULL)) { + if (previousPtr != NULL) { + *previousPtr = previous; + } + return entry; + } + nextHop = entry->nextHop; + previous = entry; + } + return NULL; +} + +/**********************************************************************/ +void *intMapGet(IntMap *map, uint64_t key) +{ + Bucket *match = searchHopList(map, selectBucket(map, key), key, NULL); + return ((match != NULL) ? match->value : NULL); +} + +/** + * Increase the number of hash buckets and rehash all the existing entries, + * storing them in the new buckets. + * + * @param map the map to resize + **/ +static int resizeBuckets(IntMap *map) +{ + // Copy the top-level map data to the stack. + IntMap oldMap = *map; + + // Re-initialize the map to be empty and 50% larger. + size_t newCapacity = map->capacity / 2 * 3; + logInfo("%s: attempting resize from %zu to %zu, current size=%zu", + __func__, map->capacity, newCapacity, map->size); + int result = allocateBuckets(map, newCapacity); + if (result != UDS_SUCCESS) { + *map = oldMap; + return result; + } + + // Populate the new hash table from the entries in the old bucket array. 
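  /*
   * Each surviving entry is re-inserted through intMapPut(), which re-derives
   * its bucket via the multiply-and-shift in selectBucket(). As an
   * illustration with made-up numbers: a 32-bit hash of 0x80000000 (half of
   * 2^32) scaled by a capacity of 1998 selects bucket
   * (0x80000000 * 1998) >> 32, which is bucket 999.
   */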
+ for (size_t i = 0; i < oldMap.bucketCount; i++) { + Bucket *entry = &oldMap.buckets[i]; + if (entry->value == NULL) { + continue; + } + + result = intMapPut(map, entry->key, entry->value, true, NULL); + if (result != UDS_SUCCESS) { + // Destroy the new partial map and restore the map from the stack. + freeBuckets(map); + *map = oldMap; + return result; + } + } + + // Destroy the old bucket array. + freeBuckets(&oldMap); + return UDS_SUCCESS; +} + +/** + * Probe the bucket array starting at the given bucket for the next empty + * bucket, returning a pointer to it. NULL will be returned if + * the search reaches the end of the bucket array or if the number of linear + * probes exceeds a specified limit. + * + * @param map the map containing the buckets to search + * @param bucket the bucket at which to start probing + * @param maxProbes the maximum number of buckets to search + * + * @return the next empty bucket, or NULL if the search failed + **/ +static Bucket *findEmptyBucket(IntMap *map, + Bucket *bucket, + unsigned int maxProbes) +{ + // Limit the search to either the nearer of the end of the bucket array or a + // fixed distance beyond the initial bucket. + size_t remaining = &map->buckets[map->bucketCount] - bucket; + Bucket *sentinel = &bucket[minSizeT(remaining, maxProbes)]; + + for (Bucket *entry = bucket; entry < sentinel; entry++) { + if (entry->value == NULL) { + return entry; + } + } + return NULL; +} + +/** + * Move an empty bucket closer to the start of the bucket array. This searches + * the neighborhoods that contain the empty bucket for a non-empty bucket + * closer to the start of the array. If such a bucket is found, this swaps the + * two buckets by moving the entry to the empty bucket. + * + * @param map the map containing the bucket + * @param hole the empty bucket to fill with an entry that precedes it in one + * of its enclosing neighborhoods + * + * @return the bucket that was vacated by moving its entry to the provided + * hole, or NULL if no entry could be moved + **/ +static Bucket *moveEmptyBucket(IntMap *map __attribute__((unused)), + Bucket *hole) +{ + /* + * Examine every neighborhood that the empty bucket is part of, starting + * with the one in which it is the last bucket. No boundary check is needed + * for the negative array arithmetic since this function is only called when + * hole is at least NEIGHBORHOOD cells deeper into the array than a valid + * bucket. + */ + for (Bucket *bucket = &hole[1 - NEIGHBORHOOD]; bucket < hole; bucket++) { + // Find the entry that is nearest to the bucket, which means it will be + // nearest to the hash bucket whose neighborhood is full. + Bucket *newHole = dereferenceHop(bucket, bucket->firstHop); + if (newHole == NULL) { + // There are no buckets in this neighborhood that are in use by this one + // (they must all be owned by overlapping neighborhoods). + continue; + } + + // Skip this bucket if its first entry is actually further away than the + // hole that we're already trying to fill. + if (hole < newHole) { + continue; + } + + /* + * We've found an entry in this neighborhood that we can "hop" further + * away, moving the hole closer to the hash bucket, if not all the way + * into its neighborhood. + */ + + // The entry that will be the new hole is the first bucket in the list, + // so setting firstHop is all that's needed remove it from the list. + bucket->firstHop = newHole->nextHop; + newHole->nextHop = NULL_HOP_OFFSET; + + // Move the entry into the original hole. 
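    /*
     * A worked example with a hypothetical NEIGHBORHOOD of 4 instead of 255:
     * if the hole is bucket 10, buckets 7, 8, and 9 are examined. If bucket
     * 7's hop list starts at bucket 8, the entry in bucket 8 is copied into
     * bucket 10 below and bucket 8 becomes the new hole, two slots closer to
     * the neighborhood that needs the vacancy.
     */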
+ hole->key = newHole->key; + hole->value = newHole->value; + newHole->value = NULL; + + // Insert the filled hole into the hop list for the neighborhood. + insertInHopList(bucket, hole); + return newHole; + } + + // We couldn't find an entry to relocate to the hole. + return NULL; +} + +/** + * Find and update any existing mapping for a given key, returning the value + * associated with the key in the provided pointer. + * + * @param [in] map the IntMap to attempt to modify + * @param [in] neighborhood the first bucket in the neighborhood that + * would contain the search key + * @param [in] key the key with which to associate the new value + * @param [in] newValue the value to be associated with the key + * @param [in] update whether to overwrite an existing value + * @param [out] oldValuePtr a pointer in which to store the old value + * (unmodified if no mapping was found) + * + * @return true if the map contains a mapping for the key + * false if it does not + **/ +static bool updateMapping(IntMap *map, + Bucket *neighborhood, + uint64_t key, + void *newValue, + bool update, + void **oldValuePtr) +{ + Bucket *bucket = searchHopList(map, neighborhood, key, NULL); + if (bucket == NULL) { + // There is no bucket containing the key in the neighborhood. + return false; + } + + // Return the value of the current mapping (if desired) and update the + // mapping with the new value (if desired). + if (oldValuePtr != NULL) { + *oldValuePtr = bucket->value; + } + if (update) { + bucket->value = newValue; + } + return true; +} + +/** + * Find an empty bucket in a specified neighborhood for a new mapping or + * attempt to re-arrange mappings so there is such a bucket. This operation + * may fail (returning NULL) if an empty bucket is not available or could not + * be relocated to the neighborhood. + * + * @param map the IntMap to search or modify + * @param neighborhood the first bucket in the neighborhood in which + * an empty bucket is needed for a new mapping + * + * @return a pointer to an empty bucket in the desired neighborhood, or + * NULL if a vacancy could not be found or arranged + **/ +static Bucket *findOrMakeVacancy(IntMap *map, Bucket *neighborhood) +{ + // Probe within and beyond the neighborhood for the first empty bucket. + Bucket *hole = findEmptyBucket(map, neighborhood, MAX_PROBES); + + // Keep trying until the empty bucket is in the bucket's neighborhood or we + // are unable to move it any closer by swapping it with a filled bucket. + while (hole != NULL) { + int distance = hole - neighborhood; + if (distance < NEIGHBORHOOD) { + // We've found or relocated an empty bucket close enough to the initial + // hash bucket to be referenced by its hop vector. + return hole; + } + + // The nearest empty bucket isn't within the neighborhood that must + // contain the new entry, so try to swap it with bucket that is closer. + hole = moveEmptyBucket(map, hole); + } + + return NULL; +} + +/**********************************************************************/ +int intMapPut(IntMap *map, + uint64_t key, + void *newValue, + bool update, + void **oldValuePtr) +{ + if (newValue == NULL) { + return UDS_INVALID_ARGUMENT; + } + + // Select the bucket at the start of the neighborhood that must contain any + // entry for the provided key. + Bucket *neighborhood = selectBucket(map, key); + + // Check whether the neighborhood already contains an entry for the key, in + // which case we optionally update it, returning the old value. 
+ if (updateMapping(map, neighborhood, key, newValue, update, oldValuePtr)) { + return UDS_SUCCESS; + } + + /* + * Find an empty bucket in the desired neighborhood for the new entry or + * re-arrange entries in the map so there is such a bucket. This operation + * will usually succeed; the loop body will only be executed on the rare + * occasions that we have to resize the map. + */ + Bucket *bucket; + while ((bucket = findOrMakeVacancy(map, neighborhood)) == NULL) { + /* + * There is no empty bucket in which to put the new entry in the current + * map, so we're forced to allocate a new bucket array with a larger + * capacity, re-hash all the entries into those buckets, and try again (a + * very expensive operation for large maps). + */ + int result = resizeBuckets(map); + if (result != UDS_SUCCESS) { + return result; + } + + // Resizing the map invalidates all pointers to buckets, so recalculate + // the neighborhood pointer. + neighborhood = selectBucket(map, key); + } + + // Put the new entry in the empty bucket, adding it to the neighborhood. + bucket->key = key; + bucket->value = newValue; + insertInHopList(neighborhood, bucket); + map->size += 1; + + // There was no existing entry, so there was no old value to be returned. + if (oldValuePtr != NULL) { + *oldValuePtr = NULL; + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +void *intMapRemove(IntMap *map, uint64_t key) +{ + // Select the bucket to search and search it for an existing entry. + Bucket *bucket = selectBucket(map, key); + Bucket *previous; + Bucket *victim = searchHopList(map, bucket, key, &previous); + + if (victim == NULL) { + // There is no matching entry to remove. + return NULL; + } + + // We found an entry to remove. Save the mapped value to return later and + // empty the bucket. + map->size -= 1; + void *value = victim->value; + victim->value = NULL; + victim->key = 0; + + // The victim bucket is now empty, but it still needs to be spliced out of + // the hop list. + if (previous == NULL) { + // The victim is the head of the list, so swing firstHop. + bucket->firstHop = victim->nextHop; + } else { + previous->nextHop = victim->nextHop; + } + victim->nextHop = NULL_HOP_OFFSET; + + return value; +} diff --git a/source/vdo/base/intMap.h b/source/vdo/base/intMap.h new file mode 100644 index 0000000..0b18209 --- /dev/null +++ b/source/vdo/base/intMap.h @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/intMap.h#1 $ + */ + +#ifndef INT_MAP_H +#define INT_MAP_H + +#include "common.h" + +/** + * IntMap associates pointers (void *) with integer keys + * (uint64_t). NULL pointer values are not + * supported. 
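A minimal usage sketch of this interface, assuming only the declarations in this header; the wrapping function, key, and stored value below are hypothetical:

static int mapExample(void)
{
  IntMap *map;
  int result = makeIntMap(0, 0, &map);  // zeroes select the built-in defaults
  if (result != UDS_SUCCESS) {
    return result;
  }

  static char payload[] = "per-key state";
  result = intMapPut(map, 4096, payload, true, NULL);
  if (result != UDS_SUCCESS) {
    freeIntMap(&map);
    return result;
  }

  // Both calls return the stored pointer; after the remove, the map is
  // empty again and intMapSize(map) is zero.
  void *found   = intMapGet(map, 4096);
  void *removed = intMapRemove(map, 4096);
  (void) found;
  (void) removed;

  // The map never owns the stored values, so only the map itself is freed.
  freeIntMap(&map);
  return UDS_SUCCESS;
}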
+ * + * The map is implemented as hash table, which should provide constant-time + * insert, query, and remove operations, although the insert may occasionally + * grow the table, which is linear in the number of entries in the map. The + * table will grow as needed to hold new entries, but will not shrink as + * entries are removed. + **/ + +typedef struct intMap IntMap; + +/** + * Allocate and initialize an IntMap. + * + * @param [in] initialCapacity the number of entries the map should + * initially be capable of holding (zero tells + * the map to use its own small default) + * @param [in] initialLoad the load factor of the map, expressed as an + * integer percentage (typically in the range + * 50 to 90, with zero telling the map to use + * its own default) + * @param [out] mapPtr a pointer to hold the new IntMap + * + * @return UDS_SUCCESS or an error code + **/ +int makeIntMap(size_t initialCapacity, + unsigned int initialLoad, + IntMap **mapPtr) + __attribute__((warn_unused_result)); + +/** + * Free an IntMap and null out the reference to it. NOTE: The map does not own + * the pointer values stored in the map and they are not freed by this call. + * + * @param [in,out] mapPtr the reference to the IntMap to free + **/ +void freeIntMap(IntMap **mapPtr); + +/** + * Get the number of entries stored in an IntMap. + * + * @param map the IntMap to query + * + * @return the number of entries in the map + **/ +size_t intMapSize(const IntMap *map); + +/** + * Retrieve the value associated with a given key from the IntMap. + * + * @param map the IntMap to query + * @param key the key to look up + * + * @return the value associated with the given key, or NULL + * if the key is not mapped to any value + **/ +void *intMapGet(IntMap *map, uint64_t key); + +/** + * Try to associate a value (a pointer) with an integer in an IntMap. If the + * map already contains a mapping for the provided key, the old value is + * only replaced with the specified value if update is true. In either case + * the old value is returned. If the map does not already contain a value for + * the specified key, the new value is added regardless of the value of update. + * + * @param [in] map the IntMap to attempt to modify + * @param [in] key the key with which to associate the new value + * @param [in] newValue the value to be associated with the key + * @param [in] update whether to overwrite an existing value + * @param [out] oldValuePtr a pointer in which to store either the old value + * (if the key was already mapped) or + * NULL if the map did not contain the + * key; NULL may be provided if the + * caller does not need to know the old value + * + * @return UDS_SUCCESS or an error code + **/ +int intMapPut(IntMap *map, + uint64_t key, + void *newValue, + bool update, + void **oldValuePtr) + __attribute__((warn_unused_result)); + +/** + * Remove the mapping for a given key from the IntMap. + * + * @param map the IntMap from which to remove the mapping + * @param key the key whose mapping is to be removed + * + * @return the value that was associated with the key, or + * NULL if it was not mapped + **/ +void *intMapRemove(IntMap *map, uint64_t key); + +#endif /* INT_MAP_H */ diff --git a/source/vdo/base/journalPoint.h b/source/vdo/base/journalPoint.h new file mode 100644 index 0000000..30d44cd --- /dev/null +++ b/source/vdo/base/journalPoint.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/journalPoint.h#2 $ + */ + +#ifndef JOURNAL_POINT_H +#define JOURNAL_POINT_H + +#include "numeric.h" +#include "types.h" + +typedef uint16_t JournalEntryCount; + +/** + * The absolute position of an entry in a recovery journal or slab journal. + **/ +typedef struct { + SequenceNumber sequenceNumber; + JournalEntryCount entryCount; +} JournalPoint; + +/** + * A packed, platform-independent encoding of a JournalPoint. + **/ +typedef struct { + /** + * The packed representation is the little-endian 64-bit representation of + * the low-order 48 bits of the sequence number, shifted up 16 bits, or'ed + * with the 16-bit entry count. + * + * Very long-term, the top 16 bits of the sequence number may not always be + * zero, as this encoding assumes--see BZ 1523240. + **/ + byte encodedPoint[8]; +} __attribute__((packed)) PackedJournalPoint; + +/** + * Move the given journal point forward by one entry. + * + * @param point the journal point to adjust + * @param entriesPerBlock the number of entries in one full block + **/ +static inline void advanceJournalPoint(JournalPoint *point, + JournalEntryCount entriesPerBlock) +{ + point->entryCount++; + if (point->entryCount == entriesPerBlock) { + point->sequenceNumber++; + point->entryCount = 0; + } +} + +/** + * Check whether a journal point is valid. + * + * @param point the journal point + * + * @return true if the journal point is valid + **/ +static inline bool isValidJournalPoint(const JournalPoint *point) +{ + return ((point != NULL) && (point->sequenceNumber > 0)); +} + +/** + * Check whether the first point precedes the second point. + * + * @param first the first journal point + * @param second the second journal point + + * + * @return true if the first point precedes the second point. + **/ +static inline bool beforeJournalPoint(const JournalPoint *first, + const JournalPoint *second) +{ + return ((first->sequenceNumber < second->sequenceNumber) + || ((first->sequenceNumber == second->sequenceNumber) + && (first->entryCount < second->entryCount))); +} + +/** + * Check whether the first point is the same as the second point. + * + * @param first the first journal point + * @param second the second journal point + * + * @return true if both points reference the same logical + * position of an entry the journal + **/ +static inline bool areEquivalentJournalPoints(const JournalPoint *first, + const JournalPoint *second) +{ + return ((first->sequenceNumber == second->sequenceNumber) + && (first->entryCount == second->entryCount)); +} + +/** + * Encode the journal location represented by a JournalPoint into a + * PackedJournalPoint. 
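For example, with illustrative values: a point whose sequenceNumber is 0x123456 and whose entryCount is 7 packs to the native value (0x123456 << 16) | 7 == 0x1234560007, which is stored little-endian as the bytes 07 00 56 34 12 00 00 00.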
+ * + * @param unpacked The unpacked input point + * @param packed The packed output point + **/ +static inline void packJournalPoint(const JournalPoint *unpacked, + PackedJournalPoint *packed) +{ + uint64_t native = ((unpacked->sequenceNumber << 16) | unpacked->entryCount); + storeUInt64LE(packed->encodedPoint, native); +} + +/** + * Decode the journal location represented by a PackedJournalPoint into a + * JournalPoint. + * + * @param packed The packed input point + * @param unpacked The unpacked output point + **/ +static inline void unpackJournalPoint(const PackedJournalPoint *packed, + JournalPoint *unpacked) +{ + uint64_t native = getUInt64LE(packed->encodedPoint); + unpacked->sequenceNumber = (native >> 16); + unpacked->entryCount = (native & 0xffff); +} + +#endif // JOURNAL_POINT_H diff --git a/source/vdo/base/lockCounter.c b/source/vdo/base/lockCounter.c new file mode 100644 index 0000000..e762576 --- /dev/null +++ b/source/vdo/base/lockCounter.c @@ -0,0 +1,391 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/lockCounter.c#3 $ + */ + +#include "lockCounter.h" + +#include "atomic.h" +#include "memoryAlloc.h" + +/** + * LockCounter is intended to keep all of the locks for the blocks in the + * recovery journal. The per-zone counters are all kept in a single array which + * is arranged by zone (i.e. zone 0's lock 0 is at index 0, zone 0's lock 1 is + * at index 1, and zone 1's lock 0 is at index 'locks'. This arrangement is + * intended to minimize cache-line contention for counters from different + * zones. + * + * The locks are implemented as a single object instead of as a lock counter + * per lock both to afford this opportunity to reduce cache line contention and + * also to eliminate the need to have a completion per lock. + * + * Lock sets are laid out with the set for recovery journal first, followed by + * the logical zones, and then the physical zones. 
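For example, with illustrative sizes: if 'locks' is 4, the logical-zone count for lock 2 held by logical zone 1 lives at index (4 * 1) + 2 == 6 of the logical counter array, so each zone's counters occupy a contiguous block of the array rather than interleaving with counters from other zones.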
+ **/ +typedef enum lockCounterState { + LOCK_COUNTER_STATE_NOT_NOTIFYING = 0, + LOCK_COUNTER_STATE_NOTIFYING, + LOCK_COUNTER_STATE_SUSPENDED, +} LockCounterState; + +struct lockCounter { + /** The completion for notifying the owner of a lock release */ + VDOCompletion completion; + /** The number of logical zones which may hold locks */ + ZoneCount logicalZones; + /** The number of physical zones which may hold locks */ + ZoneCount physicalZones; + /** The number of locks */ + BlockCount locks; + /** Whether the lock release notification is in flight */ + Atomic32 state; + /** The number of logical zones which hold each lock */ + Atomic32 *logicalZoneCounts; + /** The number of physical zones which hold each lock */ + Atomic32 *physicalZoneCounts; + /** The per-zone, per-lock counts for the journal zone */ + uint16_t *journalCounters; + /** The per-zone, per-lock decrement counts for the journal zone */ + Atomic32 *journalDecrementCounts; + /** The per-zone, per-lock reference counts for logical zones */ + uint16_t *logicalCounters; + /** The per-zone, per-lock reference counts for physical zones */ + uint16_t *physicalCounters; +}; + +/**********************************************************************/ +int makeLockCounter(PhysicalLayer *layer, + void *parent, + VDOAction callback, + ThreadID threadID, + ZoneCount logicalZones, + ZoneCount physicalZones, + BlockCount locks, + LockCounter **lockCounterPtr) +{ + LockCounter *lockCounter; + + int result = ALLOCATE(1, LockCounter, __func__, &lockCounter); + if (result != VDO_SUCCESS) { + return result; + } + + result = ALLOCATE(locks, uint16_t, __func__, &lockCounter->journalCounters); + if (result != VDO_SUCCESS) { + freeLockCounter(&lockCounter); + return result; + } + + result = ALLOCATE(locks, Atomic32, __func__, + &lockCounter->journalDecrementCounts); + if (result != VDO_SUCCESS) { + freeLockCounter(&lockCounter); + return result; + } + + result = ALLOCATE(locks * logicalZones, uint16_t, __func__, + &lockCounter->logicalCounters); + if (result != VDO_SUCCESS) { + freeLockCounter(&lockCounter); + return result; + } + + result = ALLOCATE(locks, Atomic32, __func__, + &lockCounter->logicalZoneCounts); + if (result != VDO_SUCCESS) { + freeLockCounter(&lockCounter); + return result; + } + + result = ALLOCATE(locks * physicalZones, uint16_t, __func__, + &lockCounter->physicalCounters); + if (result != VDO_SUCCESS) { + freeLockCounter(&lockCounter); + return result; + } + + result = ALLOCATE(locks, Atomic32, __func__, + &lockCounter->physicalZoneCounts); + if (result != VDO_SUCCESS) { + freeLockCounter(&lockCounter); + return result; + } + + result = initializeEnqueueableCompletion(&lockCounter->completion, + LOCK_COUNTER_COMPLETION, layer); + if (result != VDO_SUCCESS) { + freeLockCounter(&lockCounter); + return result; + } + + setCallbackWithParent(&lockCounter->completion, callback, threadID, parent); + lockCounter->logicalZones = logicalZones; + lockCounter->physicalZones = physicalZones; + lockCounter->locks = locks; + *lockCounterPtr = lockCounter; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeLockCounter(LockCounter **lockCounterPtr) +{ + if (*lockCounterPtr == NULL) { + return; + } + + LockCounter *lockCounter = *lockCounterPtr; + destroyEnqueueable(&lockCounter->completion); + freeVolatile(lockCounter->physicalZoneCounts); + freeVolatile(lockCounter->logicalZoneCounts); + freeVolatile(lockCounter->journalDecrementCounts); + FREE(lockCounter->journalCounters); + 
FREE(lockCounter->logicalCounters); + FREE(lockCounter->physicalCounters); + FREE(lockCounter); + *lockCounterPtr = NULL; +} + +/** + * Get a pointer to the zone count for a given lock on a given zone. + * + * @param counter The lock counter + * @param lockNumber The lock to get + * @param zoneType The zone type whose count is desired + * + * @return A pointer to the zone count for the given lock and zone + **/ +static inline Atomic32 *getZoneCountPtr(LockCounter *counter, + BlockCount lockNumber, + ZoneType zoneType) +{ + return ((zoneType == ZONE_TYPE_LOGICAL) + ? &counter->logicalZoneCounts[lockNumber] + : &counter->physicalZoneCounts[lockNumber]); +} + +/** + * Get the zone counter for a given lock on a given zone. + * + * @param counter The lock counter + * @param lockNumber The lock to get + * @param zoneType The zone type whose count is desired + * @param zoneID The zone index whose count is desired + * + * @return The counter for the given lock and zone + **/ +static inline uint16_t *getCounter(LockCounter *counter, + BlockCount lockNumber, + ZoneType zoneType, + ZoneCount zoneID) +{ + BlockCount zoneCounter = (counter->locks * zoneID) + lockNumber; + if (zoneType == ZONE_TYPE_JOURNAL) { + return &counter->journalCounters[zoneCounter]; + } + + if (zoneType == ZONE_TYPE_LOGICAL) { + return &counter->logicalCounters[zoneCounter]; + } + + return &counter->physicalCounters[zoneCounter]; +} + +/** + * Check whether the journal zone is locked for a given lock. + * + * @param counter The LockCounter + * @param lockNumber The lock to check + * + * @return true if the journal zone is locked + **/ +static bool isJournalZoneLocked(LockCounter *counter, BlockCount lockNumber) +{ + uint16_t journalValue + = *(getCounter(counter, lockNumber, ZONE_TYPE_JOURNAL, 0)); + uint32_t decrements + = atomicLoad32(&(counter->journalDecrementCounts[lockNumber])); + ASSERT_LOG_ONLY((decrements <= journalValue), + "journal zone lock counter must not underflow"); + + return (journalValue != decrements); +} + +/**********************************************************************/ +bool isLocked(LockCounter *lockCounter, + BlockCount lockNumber, + ZoneType zoneType) +{ + ASSERT_LOG_ONLY((zoneType != ZONE_TYPE_JOURNAL), + "isLocked() called for non-journal zone"); + return (isJournalZoneLocked(lockCounter, lockNumber) + || (atomicLoad32(getZoneCountPtr(lockCounter, lockNumber, zoneType)) + != 0)); +} + +/** + * Check that we are on the journal thread. 
+ * + * @param counter The LockCounter + * @param caller The name of the caller (for logging) + **/ +static void assertOnJournalThread(LockCounter *counter, const char *caller) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() + == counter->completion.callbackThreadID), + "%s() called from journal zone", caller); +} + +/**********************************************************************/ +void initializeLockCount(LockCounter *counter, + BlockCount lockNumber, + uint16_t value) +{ + assertOnJournalThread(counter, __func__); + uint16_t *journalValue = getCounter(counter, lockNumber, ZONE_TYPE_JOURNAL, + 0); + Atomic32 *decrementCount = &(counter->journalDecrementCounts[lockNumber]); + ASSERT_LOG_ONLY((*journalValue == atomicLoad32(decrementCount)), + "count to be initialized not in use"); + + *journalValue = value; + atomicStore32(decrementCount, 0); +} + +/**********************************************************************/ +void acquireLockCountReference(LockCounter *counter, + BlockCount lockNumber, + ZoneType zoneType, + ZoneCount zoneID) +{ + ASSERT_LOG_ONLY((zoneType != ZONE_TYPE_JOURNAL), + "invalid lock count increment from journal zone"); + + uint16_t *currentValue = getCounter(counter, lockNumber, zoneType, zoneID); + ASSERT_LOG_ONLY(*currentValue < UINT16_MAX, + "increment of lock counter must not overflow"); + + if (*currentValue == 0) { + // This zone is acquiring this lock for the first time. + atomicAdd32(getZoneCountPtr(counter, lockNumber, zoneType), 1); + } + *currentValue += 1; +} + +/** + * Decrement a non-atomic counter. + * + * @param counter The LockCounter + * @param lockNumber Which lock to decrement + * @param zoneType The type of the zone releasing the reference + * @param zoneID The ID of the zone releasing the reference + * + * @return The new value of the counter + **/ +static uint16_t releaseReference(LockCounter *counter, + BlockCount lockNumber, + ZoneType zoneType, + ZoneCount zoneID) +{ + uint16_t *currentValue = getCounter(counter, lockNumber, zoneType, zoneID); + ASSERT_LOG_ONLY((*currentValue >= 1), + "decrement of lock counter must not underflow"); + + *currentValue -= 1; + return *currentValue; +} + +/** + * Attempt to notify the owner of this LockCounter that some lock has been + * released for some zone type. Will do nothing if another notification is + * already in progress. + * + * @param counter The LockCounter + **/ +static void attemptNotification(LockCounter *counter) +{ + if (compareAndSwap32(&counter->state, + LOCK_COUNTER_STATE_NOT_NOTIFYING, + LOCK_COUNTER_STATE_NOTIFYING)) { + resetCompletion(&counter->completion); + invokeCallback(&counter->completion); + } +} + +/**********************************************************************/ +void releaseLockCountReference(LockCounter *counter, + BlockCount lockNumber, + ZoneType zoneType, + ZoneCount zoneID) +{ + ASSERT_LOG_ONLY((zoneType != ZONE_TYPE_JOURNAL), + "invalid lock count decrement from journal zone"); + if (releaseReference(counter, lockNumber, zoneType, zoneID) != 0) { + return; + } + + if (atomicAdd32(getZoneCountPtr(counter, lockNumber, zoneType), -1) == 0) { + // This zone was the last lock holder of its type, so try to notify the + // owner. 
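    /*
     * For illustration with made-up counts: if logical zones 0 and 2 each
     * hold two references on some lock, their per-zone counters are 2 and 2
     * and the atomic zone count for that lock is 2. Releasing all four
     * references drops both per-zone counters to zero, decrements the zone
     * count twice, and only the decrement that reaches zero gets this far
     * and attempts the notification.
     */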
+ attemptNotification(counter); + } +} + +/**********************************************************************/ +void releaseJournalZoneReference(LockCounter *counter, BlockCount lockNumber) +{ + assertOnJournalThread(counter, __func__); + releaseReference(counter, lockNumber, ZONE_TYPE_JOURNAL, 0); + if (!isJournalZoneLocked(counter, lockNumber)) { + // The journal zone is not locked, so try to notify the owner. + attemptNotification(counter); + } +} + +/**********************************************************************/ +void releaseJournalZoneReferenceFromOtherZone(LockCounter *counter, + BlockCount lockNumber) +{ + atomicAdd32(&(counter->journalDecrementCounts[lockNumber]), 1); +} + +/**********************************************************************/ +void acknowledgeUnlock(LockCounter *counter) +{ + atomicStore32(&counter->state, LOCK_COUNTER_STATE_NOT_NOTIFYING); +} + +/**********************************************************************/ +bool suspendLockCounter(LockCounter *counter) +{ + assertOnJournalThread(counter, __func__); + return ((atomicLoad32(&counter->state) == LOCK_COUNTER_STATE_SUSPENDED) + || compareAndSwap32(&counter->state, + LOCK_COUNTER_STATE_NOT_NOTIFYING, + LOCK_COUNTER_STATE_SUSPENDED)); +} + +/**********************************************************************/ +bool resumeLockCounter(LockCounter *counter) +{ + assertOnJournalThread(counter, __func__); + return compareAndSwap32(&counter->state, + LOCK_COUNTER_STATE_SUSPENDED, + LOCK_COUNTER_STATE_NOT_NOTIFYING); +} diff --git a/source/vdo/base/lockCounter.h b/source/vdo/base/lockCounter.h new file mode 100644 index 0000000..cbda7bd --- /dev/null +++ b/source/vdo/base/lockCounter.h @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/lockCounter.h#2 $ + */ + +#ifndef LOCK_COUNTER_H +#define LOCK_COUNTER_H + +#include "completion.h" +#include "types.h" + +/** + * LockCounter provides a set of shared reference count locks which is safe + * across multiple zones with a minimum of cross-thread synchronization + * operations. For each lock in the set, it maintains a set of per-zone lock + * counts, and a single, atomic count of the number of zones holding locks. + * Whenever a zone's individual counter for a lock goes from 0 to 1, the + * zone count for that lock is incremented. Whenever a zone's individual + * counter for a lock goes from 1 to 0, the zone count for that lock is + * decremented. If the zone count goes to 0, and the lock counter's + * completion is not in use, the completion is launched to inform the counter's + * owner that some lock has been released. 
It is the owner's responsibility to + * check for which locks have been released, and to inform the lock counter + * that it has received the notification by calling acknowledgeUnlock(). + **/ + +/** + * Create a lock counter. + * + * @param [in] layer The physical layer of the VDO + * @param [in] parent The parent to notify when the lock count goes + * to zero + * @param [in] callback The function to call when the lock count goes + * to zero + * @param [in] threadID The id of thread on which to run the callback + * @param [in] logicalZones The total number of logical zones + * @param [in] physicalZones The total number of physical zones + * @param [in] locks The number of locks + * @param [out] lockCounterPtr A pointer to hold the new counter + * + * @return VDO_SUCCESS or an error + **/ +int makeLockCounter(PhysicalLayer *layer, + void *parent, + VDOAction callback, + ThreadID threadID, + ZoneCount logicalZones, + ZoneCount physicalZones, + BlockCount locks, + LockCounter **lockCounterPtr) + __attribute__((warn_unused_result)); + +/** + * Destroy a lock counter and NULL out the reference to it. + * + * @param lockCounterPtr A pointer to the lock counter reference to free + **/ +void freeLockCounter(LockCounter **lockCounterPtr); + +/** + * Check whether a lock is locked for a zone type. If the recovery journal has + * a lock on the lock number, both logical and physical zones are considered + * locked. + * + * @param lockCounter The set of locks to check + * @param lockNumber The lock to check + * @param zoneType The type of the zone + * + * @return true if the specified lock has references (is locked) + **/ +bool isLocked(LockCounter *lockCounter, + BlockCount lockNumber, + ZoneType zoneType) + __attribute__((warn_unused_result)); + +/** + * Initialize the value of the journal zone's counter for a given lock. This + * must be called from the journal zone. + * + * @param counter The counter to initialize + * @param lockNumber Which lock to initialize + * @param value The value to set + **/ +void initializeLockCount(LockCounter *counter, + BlockCount lockNumber, + uint16_t value); + +/** + * Acquire a reference to a given lock in the specified zone. This method must + * not be used from the journal zone. + * + * @param counter The LockCounter + * @param lockNumber Which lock to increment + * @param zoneType The type of the zone acquiring the reference + * @param zoneID The ID of the zone acquiring the reference + **/ +void acquireLockCountReference(LockCounter *counter, + BlockCount lockNumber, + ZoneType zoneType, + ZoneCount zoneID); + +/** + * Release a reference to a given lock in the specified zone. This method + * must not be used from the journal zone. + * + * @param counter The LockCounter + * @param lockNumber Which lock to increment + * @param zoneType The type of the zone releasing the reference + * @param zoneID The ID of the zone releasing the reference + **/ +void releaseLockCountReference(LockCounter *counter, + BlockCount lockNumber, + ZoneType zoneType, + ZoneCount zoneID); + +/** + * Release a single journal zone reference from the journal zone. This method + * must be called from the journal zone. + * + * @param counter The counter from which to release a reference + * @param lockNumber The lock from which to release a reference + **/ +void releaseJournalZoneReference(LockCounter *counter, BlockCount lockNumber); + +/** + * Release a single journal zone reference from any zone. 
This method shouldn't + * be called from the journal zone as it would be inefficient; use + * releaseJournalZoneReference() instead. + * + * @param counter The counter from which to release a reference + * @param lockNumber The lock from which to release a reference + **/ +void releaseJournalZoneReferenceFromOtherZone(LockCounter *counter, + BlockCount lockNumber); + +/** + * Inform a lock counter that an unlock notification was received by the + * caller. + * + * @param counter The counter to inform + **/ +void acknowledgeUnlock(LockCounter *counter); + +/** + * Prevent the lock counter from issuing notifications. + * + * @param counter The counter + * + * @return true if the lock counter was not notifying and hence + * the suspend was efficacious + **/ +bool suspendLockCounter(LockCounter *counter) + __attribute__((warn_unused_result)); + +/** + * Re-allow notifications from a suspended lock counter. + * + * @param counter The counter + * + * @return true if the lock counter was suspended + **/ +bool resumeLockCounter(LockCounter *counter) + __attribute__((warn_unused_result)); + +#endif // LOCK_COUNTER_H diff --git a/source/vdo/base/logicalZone.c b/source/vdo/base/logicalZone.c new file mode 100644 index 0000000..0834ff1 --- /dev/null +++ b/source/vdo/base/logicalZone.c @@ -0,0 +1,463 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/logicalZone.c#6 $ + */ + +#include "logicalZone.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "actionManager.h" +#include "adminState.h" +#include "allocationSelector.h" +#include "atomic.h" +#include "blockMap.h" +#include "completion.h" +#include "constants.h" +#include "dataVIO.h" +#include "flush.h" +#include "intMap.h" +#include "vdoInternal.h" + +struct logicalZone { + /** The completion for flush notifications */ + VDOCompletion completion; + /** The owner of this zone */ + LogicalZones *zones; + /** Which logical zone this is */ + ZoneCount zoneNumber; + /** The thread id for this zone */ + ThreadID threadID; + /** In progress operations keyed by LBN */ + IntMap *lbnOperations; + /** The logical to physical map */ + BlockMapZone *blockMapZone; + /** The current flush generation */ + SequenceNumber flushGeneration; + /** The oldest active generation in this zone */ + SequenceNumber oldestActiveGeneration; + /** The number of IOs in the current flush generation */ + BlockCount iosInFlushGeneration; + /** + * The oldest locked generation in this zone (an atomic copy of + * oldestActiveGeneration) + **/ + Atomic64 oldestLockedGeneration; + /** The youngest generation of the current notification */ + SequenceNumber notificationGeneration; + /** Whether a notification is in progress */ + bool notifying; + /** The queue of active data write VIOs */ + RingNode writeVIOs; + /** The administrative state of the zone */ + AdminState state; + /** The selector for determining which physical zone to allocate from */ + AllocationSelector *selector; +}; + +struct logicalZones { + /** The VDO whose zones these are */ + VDO *vdo; + /** The manager for administrative actions */ + ActionManager *manager; + /** The number of zones */ + ZoneCount zoneCount; + /** The logical zones themselves */ + LogicalZone zones[]; +}; + +/** + * Convert a generic VDOCompletion to a LogicalZone. + * + * @param completion The completion to convert + * + * @return The completion as a LogicalZone + **/ +static LogicalZone *asLogicalZone(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(LogicalZone, completion) == 0); + assertCompletionType(completion->type, GENERATION_FLUSHED_COMPLETION); + return (LogicalZone *) completion; +} + +/**********************************************************************/ +LogicalZone *getLogicalZone(LogicalZones *zones, ZoneCount zoneNumber) +{ + return (zoneNumber < zones->zoneCount) ? &zones->zones[zoneNumber] : NULL; +} + +/** + * Implements ZoneThreadGetter + **/ +static ThreadID getThreadIDForZone(void *context, ZoneCount zoneNumber) +{ + return getLogicalZoneThreadID(getLogicalZone(context, zoneNumber)); +} + +/** + * Initialize a logical zone. 
+ * + * @param zones The LogicalZones to which this zone belongs + * @param zoneNumber The LogicalZone's index + **/ +static int initializeZone(LogicalZones *zones, ZoneCount zoneNumber) +{ + LogicalZone *zone = &zones->zones[zoneNumber]; + zone->zones = zones; + int result = makeIntMap(LOCK_MAP_CAPACITY, 0, &zone->lbnOperations); + if (result != VDO_SUCCESS) { + return result; + } + + VDO *vdo = zones->vdo; + result = initializeEnqueueableCompletion(&zone->completion, + GENERATION_FLUSHED_COMPLETION, + vdo->layer); + if (result != VDO_SUCCESS) { + return result; + } + + zone->zoneNumber = zoneNumber; + zone->threadID = getLogicalZoneThread(getThreadConfig(vdo), + zoneNumber); + zone->blockMapZone = getBlockMapZone(vdo->blockMap, zoneNumber); + initializeRing(&zone->writeVIOs); + atomicStore64(&zone->oldestLockedGeneration, 0); + + return makeAllocationSelector(getThreadConfig(vdo)->physicalZoneCount, + zone->threadID, &zone->selector); +} + +/**********************************************************************/ +int makeLogicalZones(VDO *vdo, LogicalZones **zonesPtr) +{ + const ThreadConfig *threadConfig = getThreadConfig(vdo); + if (threadConfig->logicalZoneCount == 0) { + return VDO_SUCCESS; + } + + LogicalZones *zones; + int result = ALLOCATE_EXTENDED(LogicalZones, threadConfig->logicalZoneCount, + LogicalZone, __func__, &zones); + if (result != VDO_SUCCESS) { + return result; + } + + zones->vdo = vdo; + zones->zoneCount = threadConfig->logicalZoneCount; + for (ZoneCount zone = 0; zone < threadConfig->logicalZoneCount; zone++) { + result = initializeZone(zones, zone); + if (result != VDO_SUCCESS) { + freeLogicalZones(&zones); + return result; + } + } + + result = makeActionManager(zones->zoneCount, getThreadIDForZone, + getAdminThread(threadConfig), zones, NULL, + vdo->layer, &zones->manager); + if (result != VDO_SUCCESS) { + freeLogicalZones(&zones); + return result; + } + + *zonesPtr = zones; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeLogicalZones(LogicalZones **zonesPtr) +{ + LogicalZones *zones = *zonesPtr; + if (zones == NULL) { + return; + } + + freeActionManager(&zones->manager); + + for (ZoneCount index = 0; index < zones->zoneCount; index++) { + LogicalZone *zone = &zones->zones[index]; + freeAllocationSelector(&zone->selector); + destroyEnqueueable(&zone->completion); + freeIntMap(&zone->lbnOperations); + } + + FREE(zones); + *zonesPtr = NULL; +} + +/**********************************************************************/ +static inline void assertOnZoneThread(LogicalZone *zone, const char *what) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() == zone->threadID), + "%s() called on correct thread", what); +} + +/** + * Check whether this zone has drained. + * + * @param zone The zone to check + **/ +static void checkForDrainComplete(LogicalZone *zone) +{ + if (!isDraining(&zone->state) || zone->notifying + || !isRingEmpty(&zone->writeVIOs)) { + return; + } + + finishDraining(&zone->state); +} + +/** + * Initiate a drain. + * + * Implements AdminInitiator. + **/ +static void initiateDrain(AdminState *state) +{ + checkForDrainComplete(container_of(state, LogicalZone, state)); +} + +/** + * Drain a logical zone. + * + *
Implements ZoneAction. + **/ +static void drainLogicalZone(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + LogicalZone *zone = getLogicalZone(context, zoneNumber); + startDraining(&zone->state, getCurrentManagerOperation(zone->zones->manager), + parent, initiateDrain); +} + +/**********************************************************************/ +void drainLogicalZones(LogicalZones *zones, + AdminStateCode operation, + VDOCompletion *parent) +{ + scheduleOperation(zones->manager, operation, NULL, drainLogicalZone, NULL, + parent); +} + +/** + * Resume a logical zone. + * + *
Implements ZoneAction. + **/ +static void resumeLogicalZone(void *context, + ZoneCount zoneNumber, + VDOCompletion *parent) +{ + LogicalZone *zone = getLogicalZone(context, zoneNumber); + finishCompletion(parent, resumeIfQuiescent(&zone->state)); +} + +/**********************************************************************/ +void resumeLogicalZones(LogicalZones *zones, VDOCompletion *parent) +{ + scheduleOperation(zones->manager, ADMIN_STATE_RESUMING, NULL, + resumeLogicalZone, NULL, parent); +} + +/**********************************************************************/ +ThreadID getLogicalZoneThreadID(const LogicalZone *zone) +{ + return zone->threadID; +} + +/**********************************************************************/ +BlockMapZone *getBlockMapForZone(const LogicalZone *zone) +{ + return zone->blockMapZone; +} + +/**********************************************************************/ +IntMap *getLBNLockMap(const LogicalZone *zone) +{ + return zone->lbnOperations; +} + +/**********************************************************************/ +LogicalZone *getNextLogicalZone(const LogicalZone *zone) +{ + return getLogicalZone(zone->zones, zone->zoneNumber + 1); +} + +/** + * Convert a RingNode to a DataVIO. + * + * @param ringNode The RingNode to convert + * + * @return The DataVIO which owns the RingNode + **/ +static inline DataVIO *dataVIOFromRingNode(RingNode *ringNode) +{ + return (DataVIO *) ((byte *) ringNode - offsetof(DataVIO, writeNode)); +} + +/** + * Update the oldest active generation. If it has changed, update the + * atomic copy as well. + * + * @param zone The zone + * + * @return true if the oldest active generation has changed + **/ +static bool updateOldestActiveGeneration(LogicalZone *zone) +{ + SequenceNumber currentOldest = zone->oldestActiveGeneration; + if (isRingEmpty(&zone->writeVIOs)) { + zone->oldestActiveGeneration = zone->flushGeneration; + } else { + zone->oldestActiveGeneration + = dataVIOFromRingNode(zone->writeVIOs.next)->flushGeneration; + } + + if (zone->oldestActiveGeneration == currentOldest) { + return false; + } + + atomicStore64(&zone->oldestLockedGeneration, zone->oldestActiveGeneration); + return true; +} + +/**********************************************************************/ +void incrementFlushGeneration(LogicalZone *zone, + SequenceNumber expectedGeneration) +{ + assertOnZoneThread(zone, __func__); + ASSERT_LOG_ONLY((zone->flushGeneration == expectedGeneration), + "logical zone %u flush generation %" PRIu64 + " should be %llu before increment", + zone->zoneNumber, zone->flushGeneration, + expectedGeneration); + + zone->flushGeneration++; + zone->iosInFlushGeneration = 0; + updateOldestActiveGeneration(zone); +} + +/**********************************************************************/ +SequenceNumber getOldestLockedGeneration(const LogicalZone *zone) +{ + return (SequenceNumber) atomicLoad64(&zone->oldestLockedGeneration); +} + +/**********************************************************************/ +int acquireFlushGenerationLock(DataVIO *dataVIO) +{ + LogicalZone *zone = dataVIO->logical.zone; + assertOnZoneThread(zone, __func__); + if (!isNormal(&zone->state)) { + return VDO_INVALID_ADMIN_STATE; + } + + dataVIO->flushGeneration = zone->flushGeneration; + pushRingNode(&zone->writeVIOs, &dataVIO->writeNode); + dataVIO->hasFlushGenerationLock = true; + zone->iosInFlushGeneration++; + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void 
attemptGenerationCompleteNotification(VDOCompletion *completion); + +/** + * Notify the flush that at least one generation no longer has active VIOs. + * This callback is registered in attemptGenerationCompleteNotification(). + * + * @param completion The zone completion + **/ +static void notifyFlusher(VDOCompletion *completion) +{ + LogicalZone *zone = asLogicalZone(completion); + completeFlushes(zone->zones->vdo->flusher); + launchCallback(completion, attemptGenerationCompleteNotification, + zone->threadID); +} + +/** + * Notify the flusher if some generation no longer has active VIOs. + * + * @param completion The zone completion + **/ +static void attemptGenerationCompleteNotification(VDOCompletion *completion) +{ + LogicalZone *zone = asLogicalZone(completion); + assertOnZoneThread(zone, __func__); + if (zone->oldestActiveGeneration <= zone->notificationGeneration) { + zone->notifying = false; + checkForDrainComplete(zone); + return; + } + + zone->notifying = true; + zone->notificationGeneration = zone->oldestActiveGeneration; + launchCallback(&zone->completion, notifyFlusher, + getFlusherThreadID(zone->zones->vdo->flusher)); +} + +/**********************************************************************/ +void releaseFlushGenerationLock(DataVIO *dataVIO) +{ + LogicalZone *zone = dataVIO->logical.zone; + assertOnZoneThread(zone, __func__); + if (isRingEmpty(&dataVIO->writeNode)) { + // This VIO never got a lock, either because it is a read, or because + // we are in read-only mode. + ASSERT_LOG_ONLY(!dataVIO->hasFlushGenerationLock, + "hasFlushGenerationLock false for VIO not on active list"); + return; + } + + unspliceRingNode(&dataVIO->writeNode); + dataVIO->hasFlushGenerationLock = false; + ASSERT_LOG_ONLY(zone->oldestActiveGeneration <= dataVIO->flushGeneration, + "DataVIO releasing lock on generation %" PRIu64 + " is not older than oldest active generation %llu", + dataVIO->flushGeneration, zone->oldestActiveGeneration); + + if (!updateOldestActiveGeneration(zone) || zone->notifying) { + return; + } + + attemptGenerationCompleteNotification(&zone->completion); +} + +/**********************************************************************/ +AllocationSelector *getAllocationSelector(LogicalZone *zone) +{ + return zone->selector; +} + +/**********************************************************************/ +void dumpLogicalZone(const LogicalZone *zone) +{ + logInfo("LogicalZone %u", zone->zoneNumber); + logInfo(" flushGeneration=%llu oldestActiveGeneration=%" PRIu64 + " oldestLockedGeneration=%llu notificationGeneration=%" PRIu64 + " notifying=%s iosInCurrentGeneration=%llu", + zone->flushGeneration, zone->oldestActiveGeneration, + relaxedLoad64(&zone->oldestLockedGeneration), + zone->notificationGeneration, boolToString(zone->notifying), + zone->iosInFlushGeneration); +} diff --git a/source/vdo/base/logicalZone.h b/source/vdo/base/logicalZone.h new file mode 100644 index 0000000..8e0eae6 --- /dev/null +++ b/source/vdo/base/logicalZone.h @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/logicalZone.h#3 $ + */ + +#ifndef LOGICAL_ZONE_H +#define LOGICAL_ZONE_H + +#include "adminState.h" +#include "intMap.h" +#include "types.h" + +/** + * Get a logical zone by number. + * + * @param zones A set of logical zones + * @param zoneNumber The number of the zone to get + * + * @return The requested zone + **/ +LogicalZone *getLogicalZone(LogicalZones *zones, ZoneCount zoneNumber) + __attribute__((warn_unused_result)); + +/** + * Create a set of logical zones. + * + * @param [in] vdo The VDO to which the zones will belong + * @param [out] zonesPtr A pointer to hold the new zones + * + * @return VDO_SUCCESS or an error code + **/ +int makeLogicalZones(VDO *vdo, LogicalZones **zonesPtr) + __attribute__((warn_unused_result)); + +/** + * Free a set of logical zones and null out the reference to it. + * + * @param zonePtr A pointer to the zone to free + **/ +void freeLogicalZones(LogicalZones **zonePtr); + +/** + * Drain a set of logical zones. + * + * @param zones The logical zones to suspend + * @param operation The type of drain to perform + * @param completion The object to notify when the zones are suspended + **/ +void drainLogicalZones(LogicalZones *zones, + AdminStateCode operation, + VDOCompletion *completion); + +/** + * Resume a set of logical zones. + * + * @param zones The logical zones to resume + * @param parent The object to notify when the zones have resumed + **/ +void resumeLogicalZones(LogicalZones *zones, VDOCompletion *parent); + +/** + * Get the ID of a logical zone's thread. + * + * @param zone The zone + * + * @return The zone's thread ID + **/ +ThreadID getLogicalZoneThreadID(const LogicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the portion of the block map for this zone. + * + * @param zone The zone + * + * @return The block map zone + **/ +BlockMapZone *getBlockMapForZone(const LogicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the logical lock map for this zone. + * + * @param zone The zone + * + * @return The logical lock map for the zone + **/ +IntMap *getLBNLockMap(const LogicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the next-highest-numbered logical zone, or NULL if the + * zone is the highest-numbered zone in its VDO. + * + * @param zone The logical zone to query + * + * @return The logical zone whose zone number is one greater than the given + * zone, or NULL if there is no such zone + **/ +LogicalZone *getNextLogicalZone(const LogicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Increment the flush generation in a logical zone. + * + * @param zone The logical zone + * @param expectedGeneration The expected value of the flush generation + * before the increment + **/ +void incrementFlushGeneration(LogicalZone *zone, + SequenceNumber expectedGeneration); + +/** + * Get the oldest flush generation which is locked by a logical zone. + * + * @param zone The logical zone + * + * @return The oldest generation locked by the zone + **/ +SequenceNumber getOldestLockedGeneration(const LogicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Acquire the shared lock on a flush generation by a write DataVIO. 
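 *
 * A minimal usage sketch (illustrative only; the real call sites live in the
 * data write path, not in this header). A write DataVIO pairs a successful
 * acquire with a later release:
 *
 *   int result = acquireFlushGenerationLock(dataVIO);
 *   if (result == VDO_SUCCESS) {
 *     // ... issue the write, then later ...
 *     releaseFlushGenerationLock(dataVIO);
 *   } else {
 *     // e.g. VDO_INVALID_ADMIN_STATE while the zone is draining
 *   }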
+ * + * @param dataVIO The DataVIO + * + * @return VDO_SUCCESS or an error code + **/ +int acquireFlushGenerationLock(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Release the shared lock on a flush generation held by a write DataVIO. If + * there are pending flushes, and this DataVIO completes the oldest generation + * active in this zone, an attempt will be made to finish any flushes which may + * now be complete. + * + * @param dataVIO The DataVIO whose lock is to be released + **/ +void releaseFlushGenerationLock(DataVIO *dataVIO); + +/** + * Get the selector for deciding which physical zone should be allocated from + * next for activities in a logical zone. + * + * @param zone The logical zone of the operation which needs an allocation + * + * @return The allocation selector for this zone + **/ +AllocationSelector *getAllocationSelector(LogicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Dump information about a logical zone to the log for debugging, in a + * thread-unsafe fashion. + * + * @param zone The zone to dump + **/ +void dumpLogicalZone(const LogicalZone *zone); + +#endif // LOGICAL_ZONE_H diff --git a/source/vdo/base/lz4.c b/source/vdo/base/lz4.c new file mode 100644 index 0000000..1114aa8 --- /dev/null +++ b/source/vdo/base/lz4.c @@ -0,0 +1,886 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/lz4.c#2 $ + */ + +// Get the memcpy fixup from common.h. +#include "common.h" + +/* + LZ4 - Fast LZ compression algorithm + Copyright (C) 2011-2012, Yann Collet. + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html + - LZ4 source repository : http://code.google.com/p/lz4/ +*/ +/* + * With authors permission dual licensed as BSD/GPL for linux kernel + * + * Origin: http://lz4.googlecode.com/svn/trunk + * Revision: 88 + */ + +//************************************** +// Tuning parameters +//************************************** +// MEMORY_USAGE : +// Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) +// Increasing memory usage improves compression ratio +// Reduced memory usage can improve speed, due to cache effect +// Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache +#define MEMORY_USAGE 14 + +// NOTCOMPRESSIBLE_DETECTIONLEVEL : +// Decreasing this value will make the algorithm skip faster data segments considered "incompressible" +// This may decrease compression ratio dramatically, but will be faster on incompressible data +// Increasing this value will make the algorithm search more before declaring a segment "incompressible" +// This could improve compression a bit, but will be slower on incompressible data +// The default value (6) is recommended +#define NOTCOMPRESSIBLE_DETECTIONLEVEL 6 + +// BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE : +// This will provide a small boost to performance for big endian cpu, but the resulting compressed stream will be incompatible with little-endian CPU. +// You can set this option to 1 in situations where data will remain within closed environment +// This option is useless on Little_Endian CPU (such as x86) +//#define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 + + + +//************************************** +// CPU Feature Detection +//************************************** +// 32 or 64 bits ? +#if (defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) || defined(__amd64) || defined(__ppc64__) || defined(_WIN64) || defined(__LP64__) || defined(_LP64) ) // Detects 64 bits mode +# define LZ4_ARCH64 1 +#else +# define LZ4_ARCH64 0 +#endif + +// Little Endian or Big Endian ? +// GCC normally defines these three macros (and PDP-endian which we ignore). +#if !defined(__ORDER_LITTLE_ENDIAN__) || !defined(__ORDER_BIG_ENDIAN__) \ + || !defined(__BYTE_ORDER__) +#error "GCC byte order macros not defined?" +#endif +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +# define LZ4_BIG_ENDIAN 1 +#elif __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ +# error "fix byte order check" +#endif + +// Unaligned memory access is automatically enabled for "common" CPU, such as x86. 
+// For others CPU, the compiler will be more cautious, and insert extra code to ensure aligned access is respected +// If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance +#if defined(__ARM_FEATURE_UNALIGNED) +# define LZ4_FORCE_UNALIGNED_ACCESS 1 +#endif + +// Define this parameter if your target system or compiler does not support hardware bit count +#if defined(_MSC_VER) && defined(_WIN32_WCE) // Visual Studio for Windows CE does not support Hardware bit count +# define LZ4_FORCE_SW_BITCOUNT +#endif + + +//************************************** +// Compiler Options +//************************************** +#if __STDC_VERSION__ >= 199901L // C99 +/* "restrict" is a known keyword */ +#else +# define restrict // Disable restrict +#endif + +#define _GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#ifdef _MSC_VER // Visual Studio +# include // For Visual 2005 +# if LZ4_ARCH64 // 64-bit +# pragma intrinsic(_BitScanForward64) // For Visual 2005 +# pragma intrinsic(_BitScanReverse64) // For Visual 2005 +# else +# pragma intrinsic(_BitScanForward) // For Visual 2005 +# pragma intrinsic(_BitScanReverse) // For Visual 2005 +# endif +#endif + +#ifdef _MSC_VER +# define lz4_bswap16(x) _byteswap_ushort(x) +#else +# define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8))) +#endif + +#if (_GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) +# define expect(expr,value) (__builtin_expect ((expr),(value)) ) +#else +# define expect(expr,value) (expr) +#endif + +//************************************** +// Includes +//************************************** +#ifdef __KERNEL__ +# include // for memset +#else /* __KERNEL__ */ +# include // for malloc +# include // for memset +#endif /* __KERNEL__ */ +#include "lz4.h" + + +//************************************** +// Basic Types +//************************************** +#if defined(_MSC_VER) // Visual Studio does not support 'stdint' natively +# define BYTE unsigned __int8 +# define U16 unsigned __int16 +# define U32 unsigned __int32 +# define S32 __int32 +# define U64 unsigned __int64 +#else +# ifdef __KERNEL__ +# include +# else /* __KERNEL__ */ +# include +# endif /* __KERNEL__ */ +# define BYTE uint8_t +# define U16 uint16_t +# define U32 uint32_t +# define S32 int32_t +# define U64 uint64_t +#endif + +#ifndef LZ4_FORCE_UNALIGNED_ACCESS +# pragma pack(push, 1) +#endif + +typedef struct _U16_S { U16 v; } U16_S; +typedef struct _U32_S { U32 v; } U32_S; +typedef struct _U64_S { U64 v; } U64_S; + +#ifndef LZ4_FORCE_UNALIGNED_ACCESS +# pragma pack(pop) +#endif + +#define A64(x) (((U64_S *)(x))->v) +#define A32(x) (((U32_S *)(x))->v) +#define A16(x) (((U16_S *)(x))->v) + + +//************************************** +// Constants +//************************************** +#define MINMATCH 4 + +#define HASH_LOG (MEMORY_USAGE-2) +#define HASHTABLESIZE (1 << HASH_LOG) +#define HASH_MASK (HASHTABLESIZE - 1) + +#define SKIPSTRENGTH (NOTCOMPRESSIBLE_DETECTIONLEVEL>2?NOTCOMPRESSIBLE_DETECTIONLEVEL:2) +#define STACKLIMIT 13 +#define HEAPMODE (HASH_LOG>STACKLIMIT) // Defines if memory is allocated into the stack (local variable), or into the heap (malloc()). 
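// Worked example with the defaults above (illustrative note, assuming 4-byte
// hash table entries; the HTYPE definition is not visible in this hunk):
//   MEMORY_USAGE = 14            -> 2^14 = 16KB of hash table, per the comment above
//   HASH_LOG     = 14 - 2 = 12   -> HASHTABLESIZE = 1 << 12 = 4096 entries
//   HEAPMODE     = (12 > 13) = 0 -> the table is a local (stack) array,
//                                   not a malloc()'d refTables structure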
+#define COPYLENGTH 8 +#define LASTLITERALS 5 +#define MFLIMIT (COPYLENGTH+MINMATCH) +#define MINLENGTH (MFLIMIT+1) + +#define MAXD_LOG 16 +#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) + +#define ML_BITS 4 +#define ML_MASK ((1U<> ((MINMATCH*8)-HASH_LOG)) +#define LZ4_HASH_VALUE(p) LZ4_HASH_FUNCTION(A32(p)) +#define LZ4_WILDCOPY(s,d,e) do { LZ4_COPYPACKET(s,d) } while (d>3); + #elif defined(__GNUC__) && (_GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clzll(val) >> 3); + #else + int r; + if (!(val>>32)) { r=4; } else { r=0; val>>=32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; + #endif +#else + #if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanForward64( &r, val ); + return (int)(r>>3); + #elif defined(__GNUC__) && (_GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctzll(val) >> 3); + #else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -val) * 0x0218A392CDABBD3F)) >> 58]; + #endif +#endif +} + +#else + +static inline int LZ4_NbCommonBytes (register U32 val) +{ +#if defined(LZ4_BIG_ENDIAN) + #if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse( &r, val ); + return (int)(r>>3); + #elif defined(__GNUC__) && (_GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clz(val) >> 3); + #else + int r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; + #endif +#else + #if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanForward( &r, val ); + return (int)(r>>3); + #elif defined(__GNUC__) && (_GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctz(val) >> 3); + #else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; + #endif +#endif +} + +#endif + + + +//****************************** +// Compression functions +//****************************** + +// LZ4_compressCtx : +// ----------------- +// Compress 'isize' bytes from 'source' into an output buffer 'dest' of maximum size 'maxOutputSize'. +// If it cannot achieve it, compression will stop, and result of the function will be zero. 
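//
// A minimal caller sketch (buffer names are illustrative; with the default
// tuning HEAPMODE is 0, so the ctx argument is ignored):
//
//   void *ctx = NULL;
//   char compressed[4096];
//   int written = LZ4_compressCtx(&ctx, data, compressed, 4096,
//                                 (int) sizeof(compressed));
//   if (written == 0) {
//     // the result would not fit in 'maxOutputSize'; keep 'data' uncompressed
//   }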
+// return : the number of bytes written in buffer 'dest', or 0 if the compression fails + +static inline int LZ4_compressCtx(void** ctx, + const char* source, + char* dest, + int isize, + int maxOutputSize) +{ +#if HEAPMODE + struct refTables *srt = (struct refTables *) (*ctx); + HTYPE* HashTable; +#else + HTYPE HashTable[HASHTABLESIZE] = {0}; +#endif + + const BYTE* ip = (BYTE*) source; + INITBASE(base); + const BYTE* anchor = ip; + const BYTE* const iend = ip + isize; + const BYTE* const mflimit = iend - MFLIMIT; +#define matchlimit (iend - LASTLITERALS) + + BYTE* op = (BYTE*) dest; + BYTE* const oend = op + maxOutputSize; + + int len, length; + const int skipStrength = SKIPSTRENGTH; + U32 forwardH; + + + // Init + if (isizehashTable); + memset((void*)HashTable, 0, sizeof(srt->hashTable)); +#else + (void) ctx; +#endif + + + // First Byte + HashTable[LZ4_HASH_VALUE(ip)] = ip - base; + ip++; forwardH = LZ4_HASH_VALUE(ip); + + // Main Loop + for ( ; ; ) + { + int findMatchAttempts = (1U << skipStrength) + 3; + const BYTE* forwardIp = ip; + const BYTE* ref; + BYTE* token; + + // Find a match + do { + U32 h = forwardH; + int step = findMatchAttempts++ >> skipStrength; + ip = forwardIp; + forwardIp = ip + step; + + if (unlikely(forwardIp > mflimit)) { goto _last_literals; } + + forwardH = LZ4_HASH_VALUE(forwardIp); + ref = base + HashTable[h]; + HashTable[h] = ip - base; + + } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip))); + + // Catch up + while ((ip>anchor) && (ref>(BYTE*)source) && unlikely(ip[-1]==ref[-1])) { ip--; ref--; } + + // Encode Literal length + length = (int)(ip - anchor); + token = op++; + if (unlikely(op + length + (2 + 1 + LASTLITERALS) + (length>>8) > oend)) return 0; // Check output limit +#ifdef _MSC_VER + if (length>=(int)RUN_MASK) + { + int len = length-RUN_MASK; + *token=(RUN_MASK<254) + { + do { *op++ = 255; len -= 255; } while (len>254); + *op++ = (BYTE)len; + memcpy(op, anchor, length); + op += length; + goto _next_match; + } + else + *op++ = (BYTE)len; + } + else *token = (length<=(int)RUN_MASK) { *token=(RUN_MASK< 254 ; len-=255) *op++ = 255; *op++ = (BYTE)len; } + else *token = (length<>8) > oend)) return 0; // Check output limit + if (len>=(int)ML_MASK) { *token+=ML_MASK; len-=ML_MASK; for(; len > 509 ; len-=510) { *op++ = 255; *op++ = 255; } if (len > 254) { len-=255; *op++ = 255; } *op++ = (BYTE)len; } + else *token += len; + + // Test end of chunk + if (ip > mflimit) { anchor = ip; break; } + + // Fill table + HashTable[LZ4_HASH_VALUE(ip-2)] = ip - 2 - base; + + // Test next position + ref = base + HashTable[LZ4_HASH_VALUE(ip)]; + HashTable[LZ4_HASH_VALUE(ip)] = ip - base; + if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) { token = op++; *token=0; goto _next_match; } + + // Prepare next loop + anchor = ip++; + forwardH = LZ4_HASH_VALUE(ip); + } + +_last_literals: + // Encode Last Literals + { + int lastRun = (int)(iend - anchor); + if (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize) return 0; + if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK< 254 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } + else *op++ = (lastRun<> ((MINMATCH*8)-HASHLOG64K)) +#define LZ4_HASH64K_VALUE(p) LZ4_HASH64K_FUNCTION(A32(p)) +static inline int LZ4_compress64kCtx(void** ctx, + const char* source, + char* dest, + int isize, + int maxOutputSize) +{ +#if HEAPMODE + struct refTables *srt = (struct refTables *) (*ctx); + U16* HashTable; +#else + U16 HashTable[HASH64KTABLESIZE] = {0}; +#endif + + const BYTE* ip = 
(BYTE*) source; + const BYTE* anchor = ip; + const BYTE* const base = ip; + const BYTE* const iend = ip + isize; + const BYTE* const mflimit = iend - MFLIMIT; +#define matchlimit (iend - LASTLITERALS) + + BYTE* op = (BYTE*) dest; + BYTE* const oend = op + maxOutputSize; + + int len, length; + const int skipStrength = SKIPSTRENGTH; + U32 forwardH; + + + // Init + if (isizehashTable); + memset((void*)HashTable, 0, sizeof(srt->hashTable)); +#else + (void) ctx; +#endif + + + // First Byte + ip++; forwardH = LZ4_HASH64K_VALUE(ip); + + // Main Loop + for ( ; ; ) + { + int findMatchAttempts = (1U << skipStrength) + 3; + const BYTE* forwardIp = ip; + const BYTE* ref; + BYTE* token; + + // Find a match + do { + U32 h = forwardH; + int step = findMatchAttempts++ >> skipStrength; + ip = forwardIp; + forwardIp = ip + step; + + if (forwardIp > mflimit) { goto _last_literals; } + + forwardH = LZ4_HASH64K_VALUE(forwardIp); + ref = base + HashTable[h]; + HashTable[h] = (U16)(ip - base); + + } while (A32(ref) != A32(ip)); + + // Catch up + while ((ip>anchor) && (ref>(BYTE*)source) && (ip[-1]==ref[-1])) { ip--; ref--; } + + // Encode Literal length + length = (int)(ip - anchor); + token = op++; + if (unlikely(op + length + (2 + 1 + LASTLITERALS) + (length>>8) > oend)) return 0; // Check output limit +#ifdef _MSC_VER + if (length>=(int)RUN_MASK) + { + int len = length-RUN_MASK; + *token=(RUN_MASK<254) + { + do { *op++ = 255; len -= 255; } while (len>254); + *op++ = (BYTE)len; + memcpy(op, anchor, length); + op += length; + goto _next_match; + } + else + *op++ = (BYTE)len; + } + else *token = (length<=(int)RUN_MASK) { *token=(RUN_MASK< 254 ; len-=255) *op++ = 255; *op++ = (BYTE)len; } + else *token = (length<>8) > oend)) return 0; // Check output limit + if (len>=(int)ML_MASK) { *token+=ML_MASK; len-=ML_MASK; for(; len > 509 ; len-=510) { *op++ = 255; *op++ = 255; } if (len > 254) { len-=255; *op++ = 255; } *op++ = (BYTE)len; } + else *token += len; + + // Test end of chunk + if (ip > mflimit) { anchor = ip; break; } + + // Fill table + HashTable[LZ4_HASH64K_VALUE(ip-2)] = (U16)(ip - 2 - base); + + // Test next position + ref = base + HashTable[LZ4_HASH64K_VALUE(ip)]; + HashTable[LZ4_HASH64K_VALUE(ip)] = (U16)(ip - base); + if (A32(ref) == A32(ip)) { token = op++; *token=0; goto _next_match; } + + // Prepare next loop + anchor = ip++; + forwardH = LZ4_HASH64K_VALUE(ip); + } + +_last_literals: + // Encode Last Literals + { + int lastRun = (int)(iend - anchor); + if (op + lastRun + 1 + (lastRun-RUN_MASK+255)/255 > oend) return 0; + if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK< 254 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } + else *op++ = (lastRun<>ML_BITS)) == RUN_MASK) { size_t len; for (;(len=*ip++)==255;length+=255){} length += len; } + + // copy literals + cpy = op+length; + if (unlikely(cpy>oend-COPYLENGTH)) + { + if (cpy != oend) goto _output_error; // Error : not enough place for another match (min 4) + 5 literals + memcpy(op, ip, length); + ip += length; + break; // EOF + } + LZ4_WILDCOPY(ip, op, cpy); ip -= (op-cpy); op = cpy; + + // get offset + LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2; + if (unlikely(ref < (BYTE* const)dest)) goto _output_error; // Error : offset create reference outside destination buffer + + // get matchlength + if ((length=(token&ML_MASK)) == ML_MASK) { for (;*ip==255;length+=255) {ip++;} length += *ip++; } + + // copy repeated sequence + if (unlikely((op-ref)oend-COPYLENGTH) + { + if (cpy > oend) goto _output_error; // Error : request to write beyond destination 
buffer + LZ4_SECURECOPY(ref, op, (oend-COPYLENGTH)); + while(op>ML_BITS)) == RUN_MASK) { int s=255; while ((ipoend-COPYLENGTH) || (ip+length>iend-COPYLENGTH)) + { + if (cpy > oend) goto _output_error; // Error : writes beyond output buffer + if (ip+length != iend) goto _output_error; // Error : LZ4 format requires to consume all input at this stage + memcpy(op, ip, length); + op += length; + break; // Necessarily EOF, due to parsing restrictions + } + LZ4_WILDCOPY(ip, op, cpy); ip -= (op-cpy); op = cpy; + + // get offset + LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2; + if (ref < (BYTE* const)dest) goto _output_error; // Error : offset creates reference outside of destination buffer + + // get matchlength + if ((length=(token&ML_MASK)) == ML_MASK) { while (ipoend-COPYLENGTH) + { + if (cpy > oend) goto _output_error; // Error : request to write outside of destination buffer + LZ4_SECURECOPY(ref, op, (oend-COPYLENGTH)); + while(op 0) && ((n & (n - 1)) == 0); +} + +/** + * Efficiently calculate the base-2 logarithm of a number truncated to an + * integer value. + * + * This also happens to be the bit index of the highest-order non-zero bit in + * the binary representation of the number, which can easily be used to + * calculate the bit shift corresponding to a bit mask or an array capacity, + * or to calculate the binary floor or ceiling (next lowest or highest power + * of two). + * + * @param n The input value + * + * @return the integer log2 of the value, or -1 if the value is zero + **/ +static inline int logBaseTwo(uint64_t n) +{ + if (n == 0) { + return -1; + } + // Many CPUs, including x86, directly support this calculation, so use the + // GCC function for counting the number of leading high-order zero bits. + return 63 - __builtin_clzll(n); +} + +/** + * Find the minimum of two physical block numbers. + **/ +__attribute__((warn_unused_result)) +static inline PhysicalBlockNumber minBlock(PhysicalBlockNumber a, + PhysicalBlockNumber b) +{ + return (a < b) ? a : b; +} + +/** + * Find the maximum of two physical block numbers. + **/ +__attribute__((warn_unused_result)) +static inline PhysicalBlockNumber maxBlock(PhysicalBlockNumber a, + PhysicalBlockNumber b) +{ + return (a > b) ? a : b; +} + +/** + * Find the minimum of two block counts. + **/ +__attribute__((warn_unused_result)) +static inline BlockCount minBlockCount(BlockCount a, BlockCount b) +{ + return (a < b) ? a : b; +} + +/** + * Find the maximum of two block counts. + **/ +__attribute__((warn_unused_result)) +static inline BlockCount maxBlockCount(BlockCount a, BlockCount b) +{ + return (a > b) ? a : b; +} + +/** + * Find the minimum of two sequence numbers. + **/ +__attribute__((warn_unused_result)) +static inline SequenceNumber minSequenceNumber(SequenceNumber a, + SequenceNumber b) +{ + return (a < b) ? a : b; +} + +/** + * Return the minimum of two page counts. + **/ +__attribute__((warn_unused_result)) +static inline PageCount minPageCount(PageCount a, PageCount b) +{ + return (a < b) ? a : b; +} + +/** + * Return the maximum of two page counts. + **/ +__attribute__((warn_unused_result)) +static inline PageCount maxPageCount(PageCount a, PageCount b) +{ + return (a > b) ? a : b; +} + +/** + * Round upward towards the nearest multiple of quantum. 
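 *
 * For example, roundUpToMultipleSizeT(5000, 4096) computes
 * 9095 - (9095 % 4096) = 9095 - 903 = 8192, the least multiple of 4096
 * that is not less than 5000.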
+ * + * @param number a number + * @param quantum the quantum + * + * @return the least multiple of quantum not less than number + **/ +__attribute__((warn_unused_result)) +static inline size_t roundUpToMultipleSizeT(size_t number, size_t quantum) +{ + return number + quantum - 1 - ((number + quantum - 1) % quantum); +} + +/** + * Round upward towards the nearest multiple of quantum for uint64_t + * + * @param number a number + * @param quantum the quantum + * + * @return the least multiple of quantum not less than number + **/ +__attribute__((warn_unused_result)) +static inline uint64_t roundUpToMultipleUInt64T(uint64_t number, + uint64_t quantum) +{ + return number + quantum - 1 - ((number + quantum - 1) % quantum); +} + +/** + * Check whether the given value is between the lower and upper bounds, + * within a cyclic range of values from 0 to (modulus - 1). The value + * and both bounds must be smaller than the modulus. + * + * @param lower The lowest value to accept + * @param value The value to check + * @param upper The highest value to accept + * @param modulus The size of the cyclic space, no more than 2^15 + * + * @return true if the value is in range + **/ +static inline bool inCyclicRange(uint16_t lower, + uint16_t value, + uint16_t upper, + uint16_t modulus) +{ + if (value < lower) { + value += modulus; + } + if (upper < lower) { + upper += modulus; + } + return (value <= upper); +} + +/** + * Compute the number of buckets of a given size which are required to hold a + * given number of objects. + * + * @param objectCount The number of objects to hold + * @param bucketSize The size of a bucket + * + * @return The number of buckets required + **/ +static inline uint64_t computeBucketCount(uint64_t objectCount, + uint64_t bucketSize) +{ + uint64_t quotient = objectCount / bucketSize; + if ((objectCount % bucketSize) > 0) { + ++quotient; + } + return quotient; +} + +#endif // NUM_UTILS_H diff --git a/source/vdo/base/packedRecoveryJournalBlock.h b/source/vdo/base/packedRecoveryJournalBlock.h new file mode 100644 index 0000000..b592225 --- /dev/null +++ b/source/vdo/base/packedRecoveryJournalBlock.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/packedRecoveryJournalBlock.h#3 $ + */ + +#ifndef PACKED_RECOVERY_JOURNAL_BLOCK_H +#define PACKED_RECOVERY_JOURNAL_BLOCK_H + +#include "numeric.h" + +#include "constants.h" +#include "recoveryJournalEntry.h" +#include "types.h" + +typedef struct { + SequenceNumber blockMapHead; // Block map head sequence number + SequenceNumber slabJournalHead; // Slab journal head sequence number + SequenceNumber sequenceNumber; // Sequence number for this block + Nonce nonce; // A given VDO instance's nonce + BlockCount logicalBlocksUsed; // Count of logical blocks in use + BlockCount blockMapDataBlocks; // Count of allocated block map pages + JournalEntryCount entryCount; // Number of entries written + uint8_t checkByte; // The protection check byte + uint8_t recoveryCount; // The number of recoveries completed + VDOMetadataType metadataType; // Metadata type +} RecoveryBlockHeader; + +/** + * The packed, on-disk representation of a recovery journal block header. + * All fields are kept in little-endian byte order. + **/ +typedef union __attribute__((packed)) { + struct __attribute__((packed)) { + /** Block map head 64-bit sequence number */ + byte blockMapHead[8]; + + /** Slab journal head 64-bit sequence number */ + byte slabJournalHead[8]; + + /** The 64-bit sequence number for this block */ + byte sequenceNumber[8]; + + /** A given VDO instance's 64-bit nonce */ + byte nonce[8]; + + /** 8-bit metadata type (should always be one for the recovery journal) */ + uint8_t metadataType; + + /** 16-bit count of the entries encoded in the block */ + byte entryCount[2]; + + /** 64-bit count of the logical blocks used when this block was opened */ + byte logicalBlocksUsed[8]; + + /** 64-bit count of the block map blocks used when this block was opened */ + byte blockMapDataBlocks[8]; + + /** The protection check byte */ + uint8_t checkByte; + + /** The number of recoveries completed */ + uint8_t recoveryCount; + } fields; + + // A raw view of the packed encoding. + uint8_t raw[8 + 8 + 8 + 8 + 1 + 2 + 8 + 8 + 1 + 1]; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + // This view is only valid on little-endian machines and is only present for + // ease of directly examining packed entries in GDB. + struct __attribute__((packed)) { + SequenceNumber blockMapHead; + SequenceNumber slabJournalHead; + SequenceNumber sequenceNumber; + Nonce nonce; + VDOMetadataType metadataType; + JournalEntryCount entryCount; + BlockCount logicalBlocksUsed; + BlockCount blockMapDataBlocks; + uint8_t checkByte; + uint8_t recoveryCount; + } littleEndian; +#endif +} PackedJournalHeader; + +typedef struct { + /** The protection check byte */ + uint8_t checkByte; + + /** The number of recoveries completed */ + uint8_t recoveryCount; + + /** The number of entries in this sector */ + uint8_t entryCount; + + /** Journal entries for this sector */ + PackedRecoveryJournalEntry entries[]; +} __attribute__((packed)) PackedJournalSector; + +enum { + // Allowing more than 311 entries in each block changes the math + // concerning the amortization of metadata writes and recovery speed. 
+ RECOVERY_JOURNAL_ENTRIES_PER_BLOCK = 311, + /** The number of entries in each sector (except the last) when filled */ + RECOVERY_JOURNAL_ENTRIES_PER_SECTOR + = ((VDO_SECTOR_SIZE - sizeof(PackedJournalSector)) + / sizeof(PackedRecoveryJournalEntry)), + /** The number of entries in the last sector when a block is full */ + RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR + = (RECOVERY_JOURNAL_ENTRIES_PER_BLOCK + % RECOVERY_JOURNAL_ENTRIES_PER_SECTOR), +}; + +/** + * Find the recovery journal sector from the block header and sector number. + * + * @param header The header of the recovery journal block + * @param sectorNumber The index of the sector (1-based) + * + * @return A packed recovery journal sector + **/ +__attribute__((warn_unused_result)) +static inline +PackedJournalSector *getJournalBlockSector(PackedJournalHeader *header, + int sectorNumber) +{ + char *sectorData = ((char *) header) + (VDO_SECTOR_SIZE * sectorNumber); + return (PackedJournalSector *) sectorData; +} + +/** + * Generate the packed representation of a recovery block header. + * + * @param header The header containing the values to encode + * @param packed The header into which to pack the values + **/ +static inline void packRecoveryBlockHeader(const RecoveryBlockHeader *header, + PackedJournalHeader *packed) +{ + storeUInt64LE(packed->fields.blockMapHead, header->blockMapHead); + storeUInt64LE(packed->fields.slabJournalHead, header->slabJournalHead); + storeUInt64LE(packed->fields.sequenceNumber, header->sequenceNumber); + storeUInt64LE(packed->fields.nonce, header->nonce); + storeUInt64LE(packed->fields.logicalBlocksUsed, header->logicalBlocksUsed); + storeUInt64LE(packed->fields.blockMapDataBlocks, header->blockMapDataBlocks); + storeUInt16LE(packed->fields.entryCount, header->entryCount); + + packed->fields.checkByte = header->checkByte; + packed->fields.recoveryCount = header->recoveryCount; + packed->fields.metadataType = header->metadataType; +} + +/** + * Decode the packed representation of a recovery block header. + * + * @param packed The packed header to decode + * @param header The header into which to unpack the values + **/ +static inline void unpackRecoveryBlockHeader(const PackedJournalHeader *packed, + RecoveryBlockHeader *header) +{ + *header = (RecoveryBlockHeader) { + .blockMapHead = getUInt64LE(packed->fields.blockMapHead), + .slabJournalHead = getUInt64LE(packed->fields.slabJournalHead), + .sequenceNumber = getUInt64LE(packed->fields.sequenceNumber), + .nonce = getUInt64LE(packed->fields.nonce), + .logicalBlocksUsed = getUInt64LE(packed->fields.logicalBlocksUsed), + .blockMapDataBlocks = getUInt64LE(packed->fields.blockMapDataBlocks), + .entryCount = getUInt16LE(packed->fields.entryCount), + .checkByte = packed->fields.checkByte, + .recoveryCount = packed->fields.recoveryCount, + .metadataType = packed->fields.metadataType, + }; +} + +#endif // PACKED_RECOVERY_JOURNAL_BLOCK_H diff --git a/source/vdo/base/packer.c b/source/vdo/base/packer.c new file mode 100644 index 0000000..efb4dd4 --- /dev/null +++ b/source/vdo/base/packer.c @@ -0,0 +1,1023 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/packer.c#8 $ + */ + +#include "packerInternals.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "adminState.h" +#include "allocatingVIO.h" +#include "allocationSelector.h" +#include "compressionState.h" +#include "dataVIO.h" +#include "hashLock.h" +#include "pbnLock.h" +#include "vdo.h" +#include "vdoInternal.h" + +/** + * Check that we are on the packer thread. + * + * @param packer The packer + * @param caller The function which is asserting + **/ +static inline void assertOnPackerThread(Packer *packer, const char *caller) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() == packer->threadID), + "%s() called from packer thread", caller); +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static inline InputBin *inputBinFromRingNode(RingNode *node) +{ + STATIC_ASSERT(offsetof(InputBin, ring) == 0); + return (InputBin *) node; +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static inline OutputBin *outputBinFromRingNode(RingNode *node) +{ + STATIC_ASSERT(offsetof(OutputBin, ring) == 0); + return (OutputBin *) node; +} + +/**********************************************************************/ +InputBin *nextBin(const Packer *packer, InputBin *bin) +{ + if (bin->ring.next == &packer->inputBins) { + return NULL; + } else { + return inputBinFromRingNode(bin->ring.next); + } +} + +/**********************************************************************/ +InputBin *getFullestBin(const Packer *packer) +{ + if (isRingEmpty(&packer->inputBins)) { + return NULL; + } else { + return inputBinFromRingNode(packer->inputBins.next); + } +} + +/** + * Insert an input bin to the list, which is in ascending order of free space. + * Since all bins are already in the list, this actually moves the bin to the + * correct position in the list. + * + * @param packer The packer + * @param bin The input bin to move to its sorted position + **/ +static void insertInSortedList(Packer *packer, InputBin *bin) +{ + for (InputBin *activeBin = getFullestBin(packer); + activeBin != NULL; + activeBin = nextBin(packer, activeBin)) { + if (activeBin->freeSpace > bin->freeSpace) { + pushRingNode(&activeBin->ring, &bin->ring); + return; + } + } + + pushRingNode(&packer->inputBins, &bin->ring); +} + +/** + * Allocate an input bin and put it into the packer's list. + * + * @param packer The packer + **/ +__attribute__((warn_unused_result)) +static int makeInputBin(Packer *packer) +{ + InputBin *bin; + int result = ALLOCATE_EXTENDED(InputBin, MAX_COMPRESSION_SLOTS, VIO *, + __func__, &bin); + if (result != VDO_SUCCESS) { + return result; + } + + bin->freeSpace = packer->binDataSize; + initializeRing(&bin->ring); + pushRingNode(&packer->inputBins, &bin->ring); + return VDO_SUCCESS; +} + +/** + * Push an output bin onto the stack of idle bins. 
+ * + * @param packer The packer + * @param bin The output bin + **/ +static void pushOutputBin(Packer *packer, OutputBin *bin) +{ + ASSERT_LOG_ONLY(!hasWaiters(&bin->outgoing), + "idle output bin has no waiters"); + packer->idleOutputBins[packer->idleOutputBinCount++] = bin; +} + +/** + * Pop an output bin off the end of the stack of idle bins. + * + * @param packer The packer + * + * @return an idle output bin, or NULL if there are no idle bins + **/ +__attribute__((warn_unused_result)) +static OutputBin *popOutputBin(Packer *packer) +{ + if (packer->idleOutputBinCount == 0) { + return NULL; + } + + size_t index = --packer->idleOutputBinCount; + OutputBin *bin = packer->idleOutputBins[index]; + packer->idleOutputBins[index] = NULL; + return bin; +} + +/** + * Allocate a new output bin and push it onto the packer's stack of idle bins. + * + * @param packer The packer + * @param layer The physical layer that will receive the compressed block + * writes from the output bin + * + * @return VDO_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int makeOutputBin(Packer *packer, PhysicalLayer *layer) +{ + OutputBin *output; + int result = ALLOCATE(1, OutputBin, __func__, &output); + if (result != VDO_SUCCESS) { + return result; + } + + // Add the bin to the stack even before it's fully initialized so it will + // be freed even if we fail to initialize it below. + initializeRing(&output->ring); + pushRingNode(&packer->outputBins, &output->ring); + pushOutputBin(packer, output); + + result = ALLOCATE_EXTENDED(CompressedBlock, packer->binDataSize, char, + "compressed block", &output->block); + if (result != VDO_SUCCESS) { + return result; + } + + return layer->createCompressedWriteVIO(layer, output, (char *) output->block, + &output->writer); +} + +/** + * Free an idle output bin and null out the reference to it. + * + * @param binPtr The reference to the output bin to free + **/ +static void freeOutputBin(OutputBin **binPtr) +{ + OutputBin *bin = *binPtr; + if (bin == NULL) { + return; + } + + unspliceRingNode(&bin->ring); + + VIO *vio = allocatingVIOAsVIO(bin->writer); + freeVIO(&vio); + FREE(bin->block); + FREE(bin); + *binPtr = NULL; +} + +/**********************************************************************/ +int makePacker(PhysicalLayer *layer, + BlockCount inputBinCount, + BlockCount outputBinCount, + const ThreadConfig *threadConfig, + Packer **packerPtr) +{ + Packer *packer; + int result = ALLOCATE_EXTENDED(Packer, outputBinCount, + OutputBin *, __func__, &packer); + if (result != VDO_SUCCESS) { + return result; + } + + packer->threadID = getPackerZoneThread(threadConfig); + packer->binDataSize = VDO_BLOCK_SIZE - sizeof(CompressedBlockHeader); + packer->size = inputBinCount; + packer->maxSlots = MAX_COMPRESSION_SLOTS; + packer->outputBinCount = outputBinCount; + initializeRing(&packer->inputBins); + initializeRing(&packer->outputBins); + + result = makeAllocationSelector(threadConfig->physicalZoneCount, + packer->threadID, &packer->selector); + if (result != VDO_SUCCESS) { + freePacker(&packer); + return result; + } + + for (BlockCount i = 0; i < inputBinCount; i++) { + int result = makeInputBin(packer); + if (result != VDO_SUCCESS) { + freePacker(&packer); + return result; + } + } + + /* + * The canceled bin can hold up to half the number of user VIOs. Every + * canceled VIO in the bin must have a canceler for which it is waiting, and + * any canceler will only have canceled one lock holder at a time. 
+ */ + result = ALLOCATE_EXTENDED(InputBin, MAXIMUM_USER_VIOS / 2, VIO *, __func__, + &packer->canceledBin); + if (result != VDO_SUCCESS) { + freePacker(&packer); + return result; + } + + for (BlockCount i = 0; i < outputBinCount; i++) { + int result = makeOutputBin(packer, layer); + if (result != VDO_SUCCESS) { + freePacker(&packer); + return result; + } + } + + *packerPtr = packer; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freePacker(Packer **packerPtr) +{ + Packer *packer = *packerPtr; + if (packer == NULL) { + return; + } + + InputBin *input; + while ((input = getFullestBin(packer)) != NULL) { + unspliceRingNode(&input->ring); + FREE(input); + } + + FREE(packer->canceledBin); + + OutputBin *output; + while ((output = popOutputBin(packer)) != NULL) { + freeOutputBin(&output); + } + + freeAllocationSelector(&packer->selector); + FREE(packer); + *packerPtr = NULL; +} + +/** + * Get the Packer from a DataVIO. + * + * @param dataVIO The DataVIO + * + * @return The Packer from the VDO to which the DataVIO belongs + **/ +static inline Packer *getPackerFromDataVIO(DataVIO *dataVIO) +{ + return getVDOFromDataVIO(dataVIO)->packer; +} + +/**********************************************************************/ +bool isSufficientlyCompressible(DataVIO *dataVIO) +{ + Packer *packer = getPackerFromDataVIO(dataVIO); + return (dataVIO->compression.size < packer->binDataSize); +} + +/**********************************************************************/ +ThreadID getPackerThreadID(Packer *packer) +{ + return packer->threadID; +} + +/**********************************************************************/ +PackerStatistics getPackerStatistics(const Packer *packer) +{ + /* + * This is called from getVDOStatistics(), which is called from outside the + * packer thread. These are just statistics with no semantics that could + * rely on memory order, so unfenced reads are sufficient. + */ + return (PackerStatistics) { + .compressedFragmentsWritten = relaxedLoad64(&packer->fragmentsWritten), + .compressedBlocksWritten = relaxedLoad64(&packer->blocksWritten), + .compressedFragmentsInPacker = relaxedLoad64(&packer->fragmentsPending), + }; +} + +/** + * Abort packing a DataVIO. + * + * @param dataVIO The DataVIO to abort + **/ +static void abortPacking(DataVIO *dataVIO) +{ + setCompressionDone(dataVIO); + relaxedAdd64(&getPackerFromDataVIO(dataVIO)->fragmentsPending, -1); + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + continueDataVIO(dataVIO, VDO_SUCCESS); +} + +/** + * This continues the VIO completion without packing the VIO. + * + * @param waiter The wait queue entry of the VIO to continue + * @param unused An argument required so this function may be called + * from notifyAllWaiters + **/ +static void continueVIOWithoutPacking(Waiter *waiter, + void *unused __attribute__((unused))) +{ + abortPacking(waiterAsDataVIO(waiter)); +} + +/** + * Check whether the packer has drained. + * + * @param packer The packer + **/ +static void checkForDrainComplete(Packer *packer) +{ + if (isDraining(&packer->state) + && (packer->canceledBin->slotsUsed == 0) + && (packer->idleOutputBinCount == packer->outputBinCount)) { + finishDraining(&packer->state); + } +} + +/**********************************************************************/ +static void writePendingBatches(Packer *packer); + +/** + * Ensure that a completion is running on the packer thread. 
+ * + * @param completion The compressed write VIO + * + * @return true if the completion is on the packer thread + **/ +__attribute__((warn_unused_result)) +static bool switchToPackerThread(VDOCompletion *completion) +{ + VIO *vio = asVIO(completion); + ThreadID threadID = vio->vdo->packer->threadID; + if (completion->callbackThreadID == threadID) { + return true; + } + + completion->callbackThreadID = threadID; + invokeCallback(completion); + return false; +} + +/** + * Finish processing an output bin whose write has completed. If there was + * an error, any DataVIOs waiting on the bin write will be notified. + * + * @param packer The packer which owns the bin + * @param bin The bin which has finished + **/ +static void finishOutputBin(Packer *packer, OutputBin *bin) +{ + if (hasWaiters(&bin->outgoing)) { + notifyAllWaiters(&bin->outgoing, continueVIOWithoutPacking, NULL); + } else { + // No waiters implies no error, so the compressed block was written. + relaxedAdd64(&packer->fragmentsPending, -bin->slotsUsed); + relaxedAdd64(&packer->fragmentsWritten, bin->slotsUsed); + relaxedAdd64(&packer->blocksWritten, 1); + } + + bin->slotsUsed = 0; + pushOutputBin(packer, bin); +} + +/** + * This finishes the bin write process after the bin is written to disk. This + * is the VIO callback function registered by writeOutputBin(). + * + * @param completion The compressed write VIO + **/ +static void completeOutputBin(VDOCompletion *completion) +{ + if (!switchToPackerThread(completion)) { + return; + } + + VIO *vio = asVIO(completion); + if (completion->result != VDO_SUCCESS) { + updateVIOErrorStats(vio, + "Completing compressed write VIO for physical block %" + PRIu64 " with error", + vio->physical); + } + + Packer *packer = vio->vdo->packer; + finishOutputBin(packer, completion->parent); + writePendingBatches(packer); + checkForDrainComplete(packer); +} + +/** + * Implements WaiterCallback. Continues the DataVIO waiter. + **/ +static void continueWaiter(Waiter *waiter, + void *context __attribute__((unused))) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + continueDataVIO(dataVIO, VDO_SUCCESS); +} + +/** + * Implements WaiterCallback. Updates the DataVIO waiter to refer to its slot + * in the compressed block, gives the DataVIO a share of the PBN lock on that + * block, and reserves a reference count increment on the lock. + **/ +static void shareCompressedBlock(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + OutputBin *bin = context; + + dataVIO->newMapped = (ZonedPBN) { + .pbn = bin->writer->allocation, + .zone = bin->writer->zone, + .state = getStateForSlot(dataVIO->compression.slot), + }; + dataVIOAsVIO(dataVIO)->physical = dataVIO->newMapped.pbn; + + shareCompressedWriteLock(dataVIO, bin->writer->allocationLock); + + // Wait again for all the waiters to get a share. + int result = enqueueWaiter(&bin->outgoing, waiter); + // Cannot fail since this waiter was just dequeued. + ASSERT_LOG_ONLY(result == VDO_SUCCESS, "impossible enqueueWaiter error"); +} + +/** + * Finish a compressed block write. This callback is registered in + * continueAfterAllocation(). + * + * @param completion The compressed write completion + **/ +static void finishCompressedWrite(VDOCompletion *completion) +{ + OutputBin *bin = completion->parent; + assertInPhysicalZone(bin->writer); + + if (completion->result != VDO_SUCCESS) { + releaseAllocationLock(bin->writer); + // Invokes completeOutputBin() on the packer thread, which will deal with + // the waiters. 
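+    // (Editorial note: the DataVIOs are still queued on bin->outgoing, so
+    // finishOutputBin() will see waiters and abort packing for each of them
+    // via continueVIOWithoutPacking().)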
+ vioDoneCallback(completion); + return; + } + + // First give every DataVIO/HashLock a share of the PBN lock to ensure it + // can't be released until they've all done their incRefs. + notifyAllWaiters(&bin->outgoing, shareCompressedBlock, bin); + + // The waiters now hold the (downgraded) PBN lock. + bin->writer->allocationLock = NULL; + + // Invokes the callbacks registered before entering the packer. + notifyAllWaiters(&bin->outgoing, continueWaiter, NULL); + + // Invokes completeOutputBin() on the packer thread. + vioDoneCallback(completion); +} + +/** + * Continue the write path for a compressed write AllocatingVIO now that block + * allocation is complete (the AllocatingVIO may or may not have actually + * received an allocation). + * + * @param allocatingVIO The AllocatingVIO which has finished the allocation + * process + **/ +static void continueAfterAllocation(AllocatingVIO *allocatingVIO) +{ + VIO *vio = allocatingVIOAsVIO(allocatingVIO); + VDOCompletion *completion = vioAsCompletion(vio); + if (allocatingVIO->allocation == ZERO_BLOCK) { + completion->requeue = true; + setCompletionResult(completion, VDO_NO_SPACE); + vioDoneCallback(completion); + return; + } + + setPhysicalZoneCallback(allocatingVIO, finishCompressedWrite, + THIS_LOCATION("$F(meta);cb=finishCompressedWrite")); + completion->layer->writeCompressedBlock(allocatingVIO); +} + +/** + * Launch an output bin. + * + * @param packer The packer which owns the bin + * @param bin The output bin to launch + **/ +static void launchCompressedWrite(Packer *packer, OutputBin *bin) +{ + if (isReadOnly(getVDOFromAllocatingVIO(bin->writer)->readOnlyNotifier)) { + finishOutputBin(packer, bin); + return; + } + + VIO *vio = allocatingVIOAsVIO(bin->writer); + resetCompletion(vioAsCompletion(vio)); + vio->callback = completeOutputBin; + vio->priority = VIO_PRIORITY_COMPRESSED_DATA; + allocateDataBlock(bin->writer, packer->selector, VIO_COMPRESSED_WRITE_LOCK, + continueAfterAllocation); +} + +/** + * Consume from the pending queue the next batch of VIOs that can be packed + * together in a single compressed block. VIOs that have been mooted since + * being placed in the pending queue will not be returned. + * + * @param packer The packer + * @param batch The counted array to fill with the next batch of VIOs + **/ +static void getNextBatch(Packer *packer, OutputBatch *batch) +{ + BlockSize spaceRemaining = packer->binDataSize; + batch->slotsUsed = 0; + + DataVIO *dataVIO; + while ((dataVIO = waiterAsDataVIO(getFirstWaiter(&packer->batchedDataVIOs))) + != NULL) { + // If there's not enough space for the next DataVIO, the batch is done. + if ((dataVIO->compression.size > spaceRemaining) + || (batch->slotsUsed == packer->maxSlots)) { + break; + } + + // Remove the next DataVIO from the queue and put it in the output batch. + dequeueNextWaiter(&packer->batchedDataVIOs); + batch->slots[batch->slotsUsed++] = dataVIO; + spaceRemaining -= dataVIO->compression.size; + } +} + +/** + * Pack the next batch of compressed VIOs from the batched queue into an + * output bin and write the output bin. + * + * @param packer The packer + * @param output The output bin to fill + * + * @return true if a write was issued for the output bin + **/ +__attribute__((warn_unused_result)) +static bool writeNextBatch(Packer *packer, OutputBin *output) +{ + OutputBatch batch; + getNextBatch(packer, &batch); + + if (batch.slotsUsed == 0) { + // The pending queue must now be empty (there may have been mooted VIOs). 
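+    // (Editorial note: returning false tells writePendingBatches() to push
+    // this unused output bin back onto the idle stack.)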
+ return false; + } + + // If the batch contains only a single VIO, then we save nothing by saving + // the compressed form. Continue processing the single VIO in the batch. + if (batch.slotsUsed == 1) { + abortPacking(batch.slots[0]); + return false; + } + + resetCompressedBlockHeader(&output->block->header); + + size_t spaceUsed = 0; + for (SlotNumber slot = 0; slot < batch.slotsUsed; slot++) { + DataVIO *dataVIO = batch.slots[slot]; + dataVIO->compression.slot = slot; + putCompressedBlockFragment(output->block, slot, spaceUsed, + dataVIO->compression.data, + dataVIO->compression.size); + spaceUsed += dataVIO->compression.size; + + int result = enqueueDataVIO(&output->outgoing, dataVIO, + THIS_LOCATION(NULL)); + if (result != VDO_SUCCESS) { + abortPacking(dataVIO); + continue; + } + + output->slotsUsed += 1; + } + + launchCompressedWrite(packer, output); + return true; +} + +/** + * Put a DataVIO in a specific InputBin in which it will definitely fit. + * + * @param bin The bin in which to put the DataVIO + * @param dataVIO The DataVIO to add + **/ +static void addToInputBin(InputBin *bin, DataVIO *dataVIO) +{ + dataVIO->compression.bin = bin; + dataVIO->compression.slot = bin->slotsUsed; + bin->incoming[bin->slotsUsed++] = dataVIO; +} + +/** + * Start a new batch of VIOs in an InputBin, moving the existing batch, if + * any, to the queue of pending batched VIOs in the packer. + * + * @param packer The packer + * @param bin The bin to prepare + **/ +static void startNewBatch(Packer *packer, InputBin *bin) +{ + // Move all the DataVIOs in the current batch to the batched queue so they + // will get packed into the next free output bin. + for (SlotNumber slot = 0; slot < bin->slotsUsed; slot++) { + DataVIO *dataVIO = bin->incoming[slot]; + dataVIO->compression.bin = NULL; + + if (!mayWriteCompressedDataVIO(dataVIO)) { + /* + * Compression of this DataVIO was canceled while it was waiting; put it + * in the canceled bin so it can be rendezvous with the canceling + * DataVIO. + */ + addToInputBin(packer->canceledBin, dataVIO); + continue; + } + + int result = enqueueDataVIO(&packer->batchedDataVIOs, dataVIO, + THIS_LOCATION(NULL)); + if (result != VDO_SUCCESS) { + // Impossible but we're required to check the result from enqueue. + abortPacking(dataVIO); + } + } + + // The bin is now empty. + bin->slotsUsed = 0; + bin->freeSpace = packer->binDataSize; +} + +/** + * Add a DataVIO to a bin's incoming queue, handle logical space change, and + * call physical space processor. + * + * @param packer The packer + * @param bin The bin to which to add the the DataVIO + * @param dataVIO The DataVIO to add to the bin's queue + **/ +static void addDataVIOToInputBin(Packer *packer, + InputBin *bin, + DataVIO *dataVIO) +{ + // If the selected bin doesn't have room, start a new batch to make room. + if (bin->freeSpace < dataVIO->compression.size) { + startNewBatch(packer, bin); + } + + addToInputBin(bin, dataVIO); + bin->freeSpace -= dataVIO->compression.size; + + // If we happen to exactly fill the bin, start a new input batch. + if ((bin->slotsUsed == packer->maxSlots) || (bin->freeSpace == 0)) { + startNewBatch(packer, bin); + } + + // Now that we've finished changing the free space, restore the sort order. + insertInSortedList(packer, bin); +} + +/** + * Move DataVIOs in pending batches from the batchedDataVIOs to all free output + * bins, issuing writes for the output bins as they are packed. 
This will loop + * until either the pending queue is drained or all output bins are busy + * writing a compressed block. + * + * @param packer The packer + **/ +static void writePendingBatches(Packer *packer) +{ + if (packer->writingBatches) { + /* + * We've attempted to re-enter this function recursively due to completion + * handling, which can lead to kernel stack overflow as in VDO-1340. It's + * perfectly safe to break the recursion and do nothing since we know any + * pending batches will eventually be handled by the earlier call. + */ + return; + } + + // Record that we are in this function for the above check. IMPORTANT: never + // return from this function without clearing this flag. + packer->writingBatches = true; + + OutputBin *output; + while (hasWaiters(&packer->batchedDataVIOs) + && ((output = popOutputBin(packer)) != NULL)) { + if (!writeNextBatch(packer, output)) { + // We didn't use the output bin to write, so push it back on the stack. + pushOutputBin(packer, output); + } + } + + packer->writingBatches = false; +} + +/** + * Select the input bin that should be used to pack the compressed data in a + * DataVIO with other DataVIOs. + * + * @param packer The packer + * @param dataVIO The DataVIO + **/ +__attribute__((warn_unused_result)) +static InputBin *selectInputBin(Packer *packer, DataVIO *dataVIO) +{ + // First best fit: select the bin with the least free space that has enough + // room for the compressed data in the DataVIO. + InputBin *fullestBin = getFullestBin(packer); + for (InputBin *bin = fullestBin; bin != NULL; bin = nextBin(packer, bin)) { + if (bin->freeSpace >= dataVIO->compression.size) { + return bin; + } + } + + /* + * None of the bins have enough space for the DataVIO. We're not allowed to + * create new bins, so we have to overflow one of the existing bins. It's + * pretty intuitive to select the fullest bin, since that "wastes" the least + * amount of free space in the compressed block. But if the space currently + * used in the fullest bin is smaller than the compressed size of the + * incoming block, it seems wrong to force that bin to write when giving up + * on compressing the incoming DataVIO would likewise "waste" the the least + * amount of free space. + */ + if (dataVIO->compression.size + >= (packer->binDataSize - fullestBin->freeSpace)) { + return NULL; + } + + // The fullest bin doesn't have room, but writing it out and starting a new + // batch with the incoming DataVIO will increase the packer's free space. + return fullestBin; +} + +/**********************************************************************/ +void attemptPacking(DataVIO *dataVIO) +{ + Packer *packer = getPackerFromDataVIO(dataVIO); + assertOnPackerThread(packer, __func__); + + VIOCompressionState state = getCompressionState(dataVIO); + int result = ASSERT((state.status == VIO_COMPRESSING), + "attempt to pack DataVIO not ready for packing, state: " + "%u", + state.status); + if (result != VDO_SUCCESS) { + return; + } + + /* + * Increment whether or not this DataVIO will be packed or not since + * abortPacking() always decrements the counter. + */ + relaxedAdd64(&packer->fragmentsPending, 1); + + // If packing of this DataVIO is disallowed for administrative reasons, give + // up before making any state changes. 
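+  // (Editorial note: this covers both a draining or suspended packer and a
+  // DataVIO left over from an earlier flush generation.)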
+ if (!isNormal(&packer->state) + || (dataVIO->flushGeneration < packer->flushGeneration)) { + abortPacking(dataVIO); + return; + } + + /* + * The check of mayBlockInPacker() here will set the DataVIO's compression + * state to VIO_PACKING if the DataVIO is allowed to be compressed (if it has + * already been canceled, we'll fall out here). Once the DataVIO is in the + * VIO_PACKING state, it must be guaranteed to be put in an input bin before + * any more requests can be processed by the packer thread. Otherwise, a + * canceling DataVIO could attempt to remove the canceled DataVIO from the + * packer and fail to rendezvous with it (VDO-2809). We must also make sure + * that we will actually bin the DataVIO and not give up on it as being + * larger than the space used in the fullest bin. Hence we must call + * selectInputBin() before calling mayBlockInPacker() (VDO-2826). + */ + InputBin *bin = selectInputBin(packer, dataVIO); + if ((bin == NULL) || !mayBlockInPacker(dataVIO)) { + abortPacking(dataVIO); + return; + } + + addDataVIOToInputBin(packer, bin, dataVIO); + writePendingBatches(packer); +} + +/** + * Force a pending write for all non-empty bins on behalf of a flush or + * suspend. + * + * @param packer The packer being flushed + **/ +static void writeAllNonEmptyBins(Packer *packer) +{ + for (InputBin *bin = getFullestBin(packer); + bin != NULL; + bin = nextBin(packer, bin)) { + startNewBatch(packer, bin); + // We don't need to re-sort the bin here since this loop will make every + // bin have the same amount of free space, so every ordering is sorted. + } + + writePendingBatches(packer); +} + +/**********************************************************************/ +void flushPacker(Packer *packer) +{ + assertOnPackerThread(packer, __func__); + if (isNormal(&packer->state)) { + writeAllNonEmptyBins(packer); + } +} + +/* + * This method is only exposed for unit tests and should not normally be called + * directly; use removeLockHolderFromPacker() instead. + */ +void removeFromPacker(DataVIO *dataVIO) +{ + InputBin *bin = dataVIO->compression.bin; + ASSERT_LOG_ONLY((bin != NULL), "DataVIO in packer has an input bin"); + + SlotNumber slot = dataVIO->compression.slot; + bin->slotsUsed--; + if (slot < bin->slotsUsed) { + bin->incoming[slot] = bin->incoming[bin->slotsUsed]; + bin->incoming[slot]->compression.slot = slot; + } + + dataVIO->compression.bin = NULL; + dataVIO->compression.slot = 0; + + Packer *packer = getPackerFromDataVIO(dataVIO); + if (bin != packer->canceledBin) { + bin->freeSpace += dataVIO->compression.size; + insertInSortedList(packer, bin); + } + + abortPacking(dataVIO); + checkForDrainComplete(packer); +} + +/**********************************************************************/ +void removeLockHolderFromPacker(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInPackerZone(dataVIO); + + DataVIO *lockHolder = dataVIO->compression.lockHolder; + dataVIO->compression.lockHolder = NULL; + removeFromPacker(lockHolder); +} + +/**********************************************************************/ +void incrementPackerFlushGeneration(Packer *packer) +{ + assertOnPackerThread(packer, __func__); + packer->flushGeneration++; + flushPacker(packer); +} + +/** + * Initiate a drain. + * + * Implements AdminInitiator. 
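+ * (Editorial note: this is the initiator registered with startDraining() by
+ * drainPacker() below.)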
+ **/ +static void initiateDrain(AdminState *state) +{ + Packer *packer = container_of(state, Packer, state); + writeAllNonEmptyBins(packer); + checkForDrainComplete(packer); +} + +/**********************************************************************/ +void drainPacker(Packer *packer, VDOCompletion *completion) +{ + assertOnPackerThread(packer, __func__); + startDraining(&packer->state, ADMIN_STATE_SUSPENDING, completion, + initiateDrain); +} + +/**********************************************************************/ +void resumePacker(Packer *packer, VDOCompletion *parent) +{ + assertOnPackerThread(packer, __func__); + finishCompletion(parent, resumeIfQuiescent(&packer->state)); +} + +/**********************************************************************/ +void resetSlotCount(Packer *packer, CompressedFragmentCount slots) +{ + if (slots > MAX_COMPRESSION_SLOTS) { + return; + } + + packer->maxSlots = slots; +} + +/**********************************************************************/ +static void dumpInputBin(const InputBin *bin, bool canceled) +{ + if (bin->slotsUsed == 0) { + // Don't dump empty input bins. + return; + } + + logInfo(" %sBin slotsUsed=%u freeSpace=%zu", + (canceled ? "Canceled" : "Input"), bin->slotsUsed, bin->freeSpace); + + // XXX dump VIOs in bin->incoming? The VIOs should have been dumped from the + // VIO pool. Maybe just dump their addresses so it's clear they're here? +} + +/**********************************************************************/ +static void dumpOutputBin(const OutputBin *bin) +{ + size_t count = countWaiters(&bin->outgoing); + if (bin->slotsUsed == 0) { + // Don't dump empty output bins. + return; + } + + logInfo(" OutputBin contains %zu outgoing waiters", count); + + // XXX dump VIOs in bin->outgoing? The VIOs should have been dumped from the + // VIO pool. Maybe just dump their addresses so it's clear they're here? + + // XXX dump writer VIO? +} + +/**********************************************************************/ +void dumpPacker(const Packer *packer) +{ + logInfo("Packer"); + logInfo(" flushGeneration=%llu state %s writingBatches=%s", + packer->flushGeneration, getAdminStateName(&packer->state), + boolToString(packer->writingBatches)); + + logInfo(" inputBinCount=%llu", packer->size); + for (InputBin *bin = getFullestBin(packer); + bin != NULL; + bin = nextBin(packer, bin)) { + dumpInputBin(bin, false); + } + + dumpInputBin(packer->canceledBin, true); + + logInfo(" outputBinCount=%zu idleOutputBinCount=%zu", + packer->outputBinCount, packer->idleOutputBinCount); + const RingNode *head = &packer->outputBins; + for (RingNode *node = head->next; node != head; node = node->next) { + dumpOutputBin(outputBinFromRingNode(node)); + } +} diff --git a/source/vdo/base/packer.h b/source/vdo/base/packer.h new file mode 100644 index 0000000..6661552 --- /dev/null +++ b/source/vdo/base/packer.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/packer.h#3 $ + */ + +#ifndef PACKER_H +#define PACKER_H + +#include "completion.h" +#include "physicalLayer.h" +#include "statistics.h" +#include "threadConfig.h" +#include "types.h" + +enum { + DEFAULT_PACKER_INPUT_BINS = 16, + DEFAULT_PACKER_OUTPUT_BINS = 256, +}; + +typedef struct packer Packer; + +/** + * Make a new block packer. + * + * @param [in] layer The physical layer to which compressed blocks + * will be written + * @param [in] inputBinCount The number of partial bins to keep in memory + * @param [in] outputBinCount The number of compressed blocks that can be + * written concurrently + * @param [in] threadConfig The thread configuration of the VDO + * @param [out] packerPtr A pointer to hold the new packer + * + * @return VDO_SUCCESS or an error + **/ +int makePacker(PhysicalLayer *layer, + BlockCount inputBinCount, + BlockCount outputBinCount, + const ThreadConfig *threadConfig, + Packer **packerPtr) + __attribute__((warn_unused_result)); + +/** + * Free a block packer and null out the reference to it. + * + * @param packerPtr A pointer to the packer to free + **/ +void freePacker(Packer **packerPtr); + +/** + * Check whether the compressed data in a DataVIO will fit in a packer bin. + * + * @param dataVIO The DataVIO + * + * @return true if the DataVIO will fit in a bin + **/ +bool isSufficientlyCompressible(DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Get the thread ID of the packer's zone. + * + * @param packer The packer + * + * @return The packer's thread ID + **/ +ThreadID getPackerThreadID(Packer *packer); + +/** + * Get the current statistics from the packer. + * + * @param packer The packer to query + * + * @return a copy of the current statistics for the packer + **/ +PackerStatistics getPackerStatistics(const Packer *packer) + __attribute__((warn_unused_result)); + +/** + * Attempt to rewrite the data in this DataVIO as part of a compressed block. + * + * @param dataVIO The DataVIO to pack + **/ +void attemptPacking(DataVIO *dataVIO); + +/** + * Request that the packer flush asynchronously. All bins with at least two + * compressed data blocks will be written out, and any solitary pending VIOs + * will be released from the packer. While flushing is in progress, any VIOs + * submitted to attemptPacking() will be continued immediately without + * attempting to pack them. + * + * @param packer The packer to flush + **/ +void flushPacker(Packer *packer); + +/** + * Remove a lock holder from the packer. + * + * @param completion The DataVIO which needs a lock held by a DataVIO in the + * packer. The dataVIO's compressedVIO.lockHolder field will + * point to the DataVIO to remove. + **/ +void removeLockHolderFromPacker(VDOCompletion *completion); + +/** + * Increment the flush generation in the packer. This will also cause the + * packer to flush so that any VIOs from previous generations will exit the + * packer. + * + * @param packer The packer + **/ +void incrementPackerFlushGeneration(Packer *packer); + +/** + * Drain the packer by preventing any more VIOs from entering the packer and + * then flushing. 
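+ * Illustrative pairing with resumePacker() for a suspend/resume cycle (the
+ * completion names here are hypothetical):
+ *
+ *   drainPacker(packer, suspendCompletion);
+ *   // ...after suspendCompletion has finished...
+ *   resumePacker(packer, resumeCompletion);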
+ * + * @param packer The packer to drain + * @param completion The completion to finish when the packer has drained + **/ +void drainPacker(Packer *packer, VDOCompletion *completion); + +/** + * Resume a packer which has been suspended. + * + * @param packer The packer to resume + * @param parent The completion to finish when the packer has resumed + * + * @return VDO_SUCCESS or an error + **/ +void resumePacker(Packer *packer, VDOCompletion *parent); + +/** + * Dump the packer, in a thread-unsafe fashion. + * + * @param packer The packer + **/ +void dumpPacker(const Packer *packer); + +#endif /* PACKER_H */ diff --git a/source/vdo/base/packerInternals.h b/source/vdo/base/packerInternals.h new file mode 100644 index 0000000..e5aa500 --- /dev/null +++ b/source/vdo/base/packerInternals.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/packerInternals.h#4 $ + */ + +#ifndef PACKER_INTERNALS_H +#define PACKER_INTERNALS_H + +#include "packer.h" + +#include "atomic.h" + +#include "adminState.h" +#include "compressedBlock.h" +#include "header.h" +#include "types.h" +#include "waitQueue.h" + +/** + * Each InputBin holds an incomplete batch of DataVIOs that only partially fill + * a compressed block. The InputBins are kept in a ring sorted by the amount of + * unused space so the first bin with enough space to hold a newly-compressed + * DataVIO can easily be found. When the bin fills up or is flushed, the + * incoming DataVIOs are moved to the Packer's batchedDataVIOs queue, from + * which they will eventually be routed to an idle OutputBin. + * + * There is one special input bin which is used to hold DataVIOs which have + * been canceled and removed from their input bin by the packer. These DataVIOs + * need to wait for the canceller to rendezvous with them (VDO-2809) and so + * they sit in this special bin. + **/ +struct inputBin { + /** List links for Packer.sortedBins */ + RingNode ring; + /** The number of items in the bin */ + SlotNumber slotsUsed; + /** The number of compressed block bytes remaining in the current batch */ + size_t freeSpace; + /** The current partial batch of DataVIOs, waiting for more */ + DataVIO *incoming[]; +}; + +/** + * Each OutputBin allows a single compressed block to be packed and written. + * When it is not idle, it holds a batch of DataVIOs that have been packed + * into the compressed block, written asynchronously, and are waiting for the + * write to complete. 
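+ * (Editorial summary of the code in packer.c: idle OutputBins sit on the
+ * Packer's idleOutputBins stack; writeNextBatch() fills one from the batched
+ * queue and launchCompressedWrite() issues the write.)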
+ **/ +typedef struct { + /** List links for Packer.outputBins */ + RingNode ring; + /** The storage for encoding the compressed block representation */ + CompressedBlock *block; + /** The AllocatingVIO wrapping the compressed block for writing */ + AllocatingVIO *writer; + /** The number of compression slots used in the compressed block */ + SlotNumber slotsUsed; + /** The DataVIOs packed into the block, waiting for the write to complete */ + WaitQueue outgoing; +} OutputBin; + +/** + * A counted array holding a batch of DataVIOs that should be packed into an + * output bin. + **/ +typedef struct { + size_t slotsUsed; + DataVIO *slots[MAX_COMPRESSION_SLOTS]; +} OutputBatch; + +struct packer { + /** The ID of the packer's callback thread */ + ThreadID threadID; + /** The selector for determining which physical zone to allocate from */ + AllocationSelector *selector; + /** The number of input bins */ + BlockCount size; + /** The block size minus header size */ + size_t binDataSize; + /** The number of compression slots */ + size_t maxSlots; + /** A ring of all InputBins, kept sorted by freeSpace */ + RingNode inputBins; + /** A ring of all OutputBins */ + RingNode outputBins; + /** + * A bin to hold DataVIOs which were canceled out of the packer and are + * waiting to rendezvous with the canceling DataVIO. + **/ + InputBin *canceledBin; + + /** The current flush generation */ + SequenceNumber flushGeneration; + + /** The administrative state of the packer */ + AdminState state; + /** True when writing batched DataVIOs */ + bool writingBatches; + + // Atomic counters corresponding to the fields of PackerStatistics: + + /** Number of compressed data items written since startup */ + Atomic64 fragmentsWritten; + /** Number of blocks containing compressed items written since startup */ + Atomic64 blocksWritten; + /** Number of DataVIOs that are pending in the packer */ + Atomic64 fragmentsPending; + + /** Queue of batched DataVIOs waiting to be packed */ + WaitQueue batchedDataVIOs; + + /** The total number of output bins allocated */ + size_t outputBinCount; + /** The number of idle output bins on the stack */ + size_t idleOutputBinCount; + /** The stack of idle output bins (0=bottom) */ + OutputBin *idleOutputBins[]; +}; + +/** + * This returns the first bin in the freeSpace-sorted list. + **/ +InputBin *getFullestBin(const Packer *packer); + +/** + * This returns the next bin in the freeSpace-sorted list. + **/ +InputBin *nextBin(const Packer *packer, InputBin *bin); + +/** + * Change the maxiumum number of compression slots the packer will use. The new + * number of slots must be less than or equal to MAX_COMPRESSION_SLOTS. Bins + * which already have fragments will not be resized until they are next written + * out. + * + * @param packer The packer + * @param slots The new number of slots + **/ +void resetSlotCount(Packer *packer, CompressedFragmentCount slots); + +/** + * Remove a DataVIO from the packer. This method is exposed for testing. + * + * @param dataVIO The DataVIO to remove + **/ +void removeFromPacker(DataVIO *dataVIO); + +#endif /* PACKER_INTERNALS_H */ diff --git a/source/vdo/base/partitionCopy.c b/source/vdo/base/partitionCopy.c new file mode 100644 index 0000000..d5fa6de --- /dev/null +++ b/source/vdo/base/partitionCopy.c @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/partitionCopy.c#2 $ + */ + +#include "partitionCopy.h" + +#include "memoryAlloc.h" + +#include "completion.h" +#include "constants.h" +#include "extent.h" +#include "numUtils.h" + +enum { + STRIDE_LENGTH = 2048 +}; + +/** + * A partition copy completion. + **/ +typedef struct { + /** completion header */ + VDOCompletion completion; + /** the source partition to copy from */ + Partition *source; + /** the target partition to copy to */ + Partition *target; + /** the current in-partition PBN the copy is beginning at */ + PhysicalBlockNumber currentIndex; + /** the last block to copy */ + PhysicalBlockNumber endingIndex; + /** the backing data used by the extent */ + char *data; + /** the extent being used to copy */ + VDOExtent *extent; +} CopyCompletion; + +/** + * Convert a VDOCompletion to a CopyCompletion. + * + * @param completion The completion to convert + * + * @return the completion as a CopyCompletion + **/ +__attribute__((warn_unused_result)) +static inline +CopyCompletion *asCopyCompletion(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(CopyCompletion, completion) == 0); + assertCompletionType(completion->type, PARTITION_COPY_COMPLETION); + return (CopyCompletion *) completion; +} + +/**********************************************************************/ +int makeCopyCompletion(PhysicalLayer *layer, VDOCompletion **completionPtr) +{ + CopyCompletion *copy; + int result = ALLOCATE(1, CopyCompletion, __func__, ©); + if (result != VDO_SUCCESS) { + return result; + } + initializeCompletion(©->completion, PARTITION_COPY_COMPLETION, layer); + + result = ALLOCATE((VDO_BLOCK_SIZE * STRIDE_LENGTH), char, + "partition copy extent", ©->data); + if (result != VDO_SUCCESS) { + VDOCompletion *completion = ©->completion; + freeCopyCompletion(&completion); + return result; + } + + result = createExtent(layer, VIO_TYPE_PARTITION_COPY, VIO_PRIORITY_HIGH, + STRIDE_LENGTH, copy->data, ©->extent); + if (result != VDO_SUCCESS) { + VDOCompletion *completion = ©->completion; + freeCopyCompletion(&completion); + return result; + } + + *completionPtr = ©->completion; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeCopyCompletion(VDOCompletion **completionPtr) +{ + if (*completionPtr == NULL) { + return; + } + + CopyCompletion *copy = asCopyCompletion(*completionPtr); + freeExtent(©->extent); + FREE(copy->data); + FREE(copy); + *completionPtr = NULL; +} + +/**********************************************************************/ +static void copyPartitionStride(CopyCompletion *copy); + +/** + * Determine the number of blocks to copy in the current stride. 
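+ * For example (illustrative figures): with STRIDE_LENGTH of 2048, a copy
+ * with 5000 blocks remaining proceeds in strides of 2048, 2048, and 904
+ * blocks.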
+ * + * @param copy The copy completion + * + * @return The number of blocks to copy in the current stride + **/ +static inline BlockCount getStrideSize(CopyCompletion *copy) +{ + return minBlockCount(STRIDE_LENGTH, copy->endingIndex - copy->currentIndex); +} + +/** + * Process a completed write during a partition copy. + * + * @param completion The extent which has just completed writing + **/ +static void completeWriteForCopy(VDOCompletion *completion) +{ + CopyCompletion *copy = asCopyCompletion(completion->parent); + copy->currentIndex += getStrideSize(copy); + if (copy->currentIndex >= copy->endingIndex) { + // We're done. + finishCompletion(completion->parent, VDO_SUCCESS); + return; + } + copyPartitionStride(copy); +} + +/** + * Process a completed read during a partition copy, and launch the + * corresponding write to the new partition. + * + * @param completion The extent which has just completed reading + **/ +static void completeReadForCopy(VDOCompletion *completion) +{ + CopyCompletion *copy = asCopyCompletion(completion->parent); + PhysicalBlockNumber layerStartBlock; + int result = translateToPBN(copy->target, copy->currentIndex, + &layerStartBlock); + if (result != VDO_SUCCESS) { + finishCompletion(completion->parent, result); + return; + } + + completion->callback = completeWriteForCopy; + writePartialMetadataExtent(asVDOExtent(completion), layerStartBlock, + getStrideSize(copy)); +} + +/** + * Copy a stride from one partition to the new partition. + * + * @param copy The CopyCompletion + **/ +static void copyPartitionStride(CopyCompletion *copy) +{ + PhysicalBlockNumber layerStartBlock; + int result = translateToPBN(copy->source, copy->currentIndex, + &layerStartBlock); + if (result != VDO_SUCCESS) { + finishCompletion(©->completion, result); + return; + } + + prepareCompletion(©->extent->completion, completeReadForCopy, + finishParentCallback, copy->completion.callbackThreadID, + ©->completion); + readPartialMetadataExtent(copy->extent, layerStartBlock, + getStrideSize(copy)); +} + +/** + * Verify that the source can be copied to the target safely. 
+ * + * @param source The source partition + * @param target The target partition + * + * @return VDO_SUCCESS or an error code + **/ +static int validatePartitionCopy(Partition *source, Partition *target) +{ + BlockCount sourceSize = getFixedLayoutPartitionSize(source); + BlockCount targetSize = getFixedLayoutPartitionSize(target); + + PhysicalBlockNumber sourceStart = getFixedLayoutPartitionOffset(source); + PhysicalBlockNumber sourceEnd = sourceStart + sourceSize; + PhysicalBlockNumber targetStart = getFixedLayoutPartitionOffset(target); + PhysicalBlockNumber targetEnd = targetStart + targetSize; + + int result = ASSERT(sourceSize <= targetSize, + "target partition must be not smaller than source" + " partition"); + if (result != UDS_SUCCESS) { + return result; + } + + return ASSERT(((sourceEnd <= targetStart) || (targetEnd <= sourceStart)), + "target partition must not overlap source partition"); +} + +/**********************************************************************/ +void copyPartitionAsync(VDOCompletion *completion, + Partition *source, + Partition *target, + VDOCompletion *parent) +{ + int result = validatePartitionCopy(source, target); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + CopyCompletion *copy = asCopyCompletion(completion); + prepareToFinishParent(©->completion, parent); + copy->source = source; + copy->target = target; + copy->currentIndex = 0; + copy->endingIndex = getFixedLayoutPartitionSize(source); + copyPartitionStride(copy); +} diff --git a/source/vdo/base/partitionCopy.h b/source/vdo/base/partitionCopy.h new file mode 100644 index 0000000..574ac13 --- /dev/null +++ b/source/vdo/base/partitionCopy.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/partitionCopy.h#2 $ + */ + +#ifndef PARTITION_COPY_H +#define PARTITION_COPY_H + +#include "fixedLayout.h" +#include "physicalLayer.h" +#include "types.h" + +/** + * Make a copy completion. + * + * @param [in] layer The layer on which the partitions reside + * @param [out] completionPtr A pointer to hold the copy completion + * + * @return VDO_SUCCESS or an error + **/ +int makeCopyCompletion(PhysicalLayer *layer, VDOCompletion **completionPtr) + __attribute__((warn_unused_result)); + +/** + * Free a copy completion and NULL out the reference to it. + * + * @param completionPtr A pointer to the complete to be freed + **/ +void freeCopyCompletion(VDOCompletion **completionPtr); + +/** + * Copy a partition. 
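+ * Illustrative call sequence (the partition and parent names are
+ * hypothetical):
+ *
+ *   VDOCompletion *copier;
+ *   int result = makeCopyCompletion(layer, &copier);
+ *   if (result == VDO_SUCCESS) {
+ *     copyPartitionAsync(copier, oldPartition, newPartition, parent);
+ *   }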
+ * + * @param completion The copy completion to use + * @param source The partition to copy from + * @param target The partition to copy to + * @param parent The parent to finish when the copy is complete + **/ +void copyPartitionAsync(VDOCompletion *completion, + Partition *source, + Partition *target, + VDOCompletion *parent); + +#endif /* PARTITION_COPY_H */ diff --git a/source/vdo/base/pbnLock.c b/source/vdo/base/pbnLock.c new file mode 100644 index 0000000..5e9a274 --- /dev/null +++ b/source/vdo/base/pbnLock.c @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pbnLock.c#3 $ + */ + +#include "pbnLock.h" + +#include "logger.h" + +#include "blockAllocator.h" +#include "referenceBlock.h" + +struct pbnLockImplementation { + PBNLockType type; + const char *name; + const char *releaseReason; +}; + +/** + * This array must have an entry for every PBNLockType value. + **/ +static const PBNLockImplementation LOCK_IMPLEMENTATIONS[] = { + [VIO_READ_LOCK] = { + .type = VIO_READ_LOCK, + .name = "read", + .releaseReason = "candidate duplicate", + }, + [VIO_WRITE_LOCK] = { + .type = VIO_WRITE_LOCK, + .name = "write", + .releaseReason = "newly allocated", + }, + [VIO_COMPRESSED_WRITE_LOCK] = { + .type = VIO_COMPRESSED_WRITE_LOCK, + .name = "compressed write", + .releaseReason = "failed compression", + }, + [VIO_BLOCK_MAP_WRITE_LOCK] = { + .type = VIO_BLOCK_MAP_WRITE_LOCK, + .name = "block map write", + .releaseReason = "block map write", + }, +}; + +/**********************************************************************/ +static inline bool hasLockType(const PBNLock *lock, PBNLockType type) +{ + return (lock->implementation == &LOCK_IMPLEMENTATIONS[type]); +} + +/**********************************************************************/ +bool isPBNReadLock(const PBNLock *lock) +{ + return hasLockType(lock, VIO_READ_LOCK); +} + +/**********************************************************************/ +static inline void setPBNLockType(PBNLock *lock, PBNLockType type) +{ + lock->implementation = &LOCK_IMPLEMENTATIONS[type]; +} + +/**********************************************************************/ +void initializePBNLock(PBNLock *lock, PBNLockType type) +{ + lock->holderCount = 0; + setPBNLockType(lock, type); +} + +/**********************************************************************/ +void downgradePBNWriteLock(PBNLock *lock) +{ + ASSERT_LOG_ONLY(!isPBNReadLock(lock), + "PBN lock must not already have been downgraded"); + ASSERT_LOG_ONLY(!hasLockType(lock, VIO_BLOCK_MAP_WRITE_LOCK), + "must not downgrade block map write locks"); + ASSERT_LOG_ONLY(lock->holderCount == 1, + "PBN write lock should have one holder but has %u", + lock->holderCount); + if (hasLockType(lock, VIO_WRITE_LOCK)) { + // DataVIO write 
locks are downgraded in place--the writer retains the + // hold on the lock. They've already had a single incRef journaled. + lock->incrementLimit = MAXIMUM_REFERENCE_COUNT - 1; + } else { + // Compressed block write locks are downgraded when they are shared with + // all their hash locks. The writer is releasing its hold on the lock. + lock->holderCount = 0; + lock->incrementLimit = MAXIMUM_REFERENCE_COUNT; + } + setPBNLockType(lock, VIO_READ_LOCK); +} + +/**********************************************************************/ +bool claimPBNLockIncrement(PBNLock *lock) +{ + /* + * Claim the next free reference atomically since hash locks from multiple + * hash zone threads might be concurrently deduplicating against a single + * PBN lock on compressed block. As long as hitting the increment limit will + * lead to the PBN lock being released in a sane time-frame, we won't + * overflow a 32-bit claim counter, allowing a simple add instead of a + * compare-and-swap. + */ + uint32_t claimNumber = atomicAdd32(&lock->incrementsClaimed, 1); + return (claimNumber <= lock->incrementLimit); +} + +/**********************************************************************/ +void assignProvisionalReference(PBNLock *lock) +{ + ASSERT_LOG_ONLY(!lock->hasProvisionalReference, + "lock does not have a provisional reference"); + lock->hasProvisionalReference = true; +} + +/**********************************************************************/ +void unassignProvisionalReference(PBNLock *lock) +{ + lock->hasProvisionalReference = false; +} + +/**********************************************************************/ +void releaseProvisionalReference(PBNLock *lock, + PhysicalBlockNumber lockedPBN, + BlockAllocator *allocator) +{ + if (hasProvisionalReference(lock)) { + releaseBlockReference(allocator, lockedPBN, + lock->implementation->releaseReason); + unassignProvisionalReference(lock); + } +} diff --git a/source/vdo/base/pbnLock.h b/source/vdo/base/pbnLock.h new file mode 100644 index 0000000..bd6512b --- /dev/null +++ b/source/vdo/base/pbnLock.h @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pbnLock.h#3 $ + */ + +#ifndef PBN_LOCK_H +#define PBN_LOCK_H + +#include "atomic.h" +#include "types.h" + +/** + * The type of a PBN lock. + **/ +typedef enum { + VIO_READ_LOCK = 0, + VIO_WRITE_LOCK, + VIO_COMPRESSED_WRITE_LOCK, + VIO_BLOCK_MAP_WRITE_LOCK, +} PBNLockType; + +typedef struct pbnLockImplementation PBNLockImplementation; + +/** + * A PBN lock. 
+ **/ +struct pbnLock { + /** The implementation of the lock */ + const PBNLockImplementation *implementation; + + /** The number of VIOs holding or sharing this lock */ + VIOCount holderCount; + /** + * The number of compressed block writers holding a share of this lock while + * they are acquiring a reference to the PBN. + **/ + uint8_t fragmentLocks; + + /** + * Whether the locked PBN has been provisionally referenced on behalf of the + * lock holder. + **/ + bool hasProvisionalReference; + + /** + * For read locks, the number of references that were known to be available + * on the locked block at the time the lock was acquired. + **/ + uint8_t incrementLimit; + + /** + * For read locks, the number of DataVIOs that have tried to claim one of + * the available increments during the lifetime of the lock. Each claim will + * first increment this counter, so it can exceed the increment limit. + **/ + Atomic32 incrementsClaimed; +}; + +/** + * Initialize a PBNLock. + * + * @param lock The lock to initialize + * @param type The type of the lock + **/ +void initializePBNLock(PBNLock *lock, PBNLockType type); + +/** + * Check whether a PBNLock is a read lock. + * + * @param lock The lock to check + * + * @return true if the lock is a read lock + **/ +bool isPBNReadLock(const PBNLock *lock) + __attribute__((warn_unused_result)); + +/** + * Downgrade a PBN write lock to a PBN read lock. The lock holder count is + * cleared and the caller is responsible for setting the new count. + * + * @param lock The PBN write lock to downgrade + **/ +void downgradePBNWriteLock(PBNLock *lock); + +/** + * Try to claim one of the available reference count increments on a read + * lock. Claims may be attempted from any thread. A claim is only valid until + * the PBN lock is released. + * + * @param lock The PBN read lock from which to claim an increment + * + * @return true if the claim succeeded, guaranteeing one + * increment can be made without overflowing the PBN's reference count + **/ +bool claimPBNLockIncrement(PBNLock *lock) + __attribute__((warn_unused_result)); + +/** + * Check whether a PBN lock has a provisional reference. + * + * @param lock The PBN lock + **/ +static inline bool hasProvisionalReference(PBNLock *lock) +{ + return ((lock != NULL) && lock->hasProvisionalReference); +} + +/** + * Inform a PBN lock that it is responsible for a provisional reference. + * + * @param lock The PBN lock + **/ +void assignProvisionalReference(PBNLock *lock); + +/** + * Inform a PBN lock that it is no longer responsible for a provisional + * reference. + * + * @param lock The PBN lock + **/ +void unassignProvisionalReference(PBNLock *lock); + +/** + * If the lock is responsible for a provisional reference, release that + * reference. This method is called when the lock is released. + * + * @param lock The lock + * @param lockedPBN The PBN covered by the lock + * @param allocator The block allocator from which to release the reference + **/ +void releaseProvisionalReference(PBNLock *lock, + PhysicalBlockNumber lockedPBN, + BlockAllocator *allocator); + +#endif /* PBN_LOCK_H */ diff --git a/source/vdo/base/pbnLockPool.c b/source/vdo/base/pbnLockPool.c new file mode 100644 index 0000000..38e2f32 --- /dev/null +++ b/source/vdo/base/pbnLockPool.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pbnLockPool.c#2 $ + */ + +#include "pbnLockPool.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +#include "ringNode.h" +#include "pbnLock.h" + +/** + * Unused (idle) PBN locks are kept in a ring. Just like in a malloc + * implementation, the lock structure is unused memory, so we can save a bit + * of space (and not pollute the lock structure proper) by using a union to + * overlay the lock structure with the free list. + **/ +typedef union idlePBNLock { + /** Only used while locks are in the pool */ + RingNode node; + /** Only used while locks are not in the pool */ + PBNLock lock; +} IdlePBNLock; + +/** + * The lock pool is little more than the memory allocated for the locks. + **/ +struct pbnLockPool { + /** The number of locks allocated for the pool */ + size_t capacity; + /** The number of locks currently borrowed from the pool */ + size_t borrowed; + /** A ring containing all idle PBN lock instances */ + RingNode idleRing; + /** The memory for all the locks allocated by this pool */ + IdlePBNLock locks[]; +}; + +/**********************************************************************/ +int makePBNLockPool(size_t capacity, PBNLockPool **poolPtr) +{ + PBNLockPool *pool; + int result = ALLOCATE_EXTENDED(PBNLockPool, capacity, IdlePBNLock, __func__, + &pool); + if (result != VDO_SUCCESS) { + return result; + } + + pool->capacity = capacity; + pool->borrowed = capacity; + initializeRing(&pool->idleRing); + + for (size_t i = 0; i < capacity; i++) { + PBNLock *lock = &pool->locks[i].lock; + returnPBNLockToPool(pool, &lock); + } + + *poolPtr = pool; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freePBNLockPool(PBNLockPool **poolPtr) +{ + if (*poolPtr == NULL) { + return; + } + + PBNLockPool *pool = *poolPtr; + ASSERT_LOG_ONLY(pool->borrowed == 0, + "All PBN locks must be returned to the pool before it is" + " freed, but %zu locks are still on loan", + pool->borrowed); + FREE(pool); + *poolPtr = NULL; +} + +/**********************************************************************/ +int borrowPBNLockFromPool(PBNLockPool *pool, + PBNLockType type, + PBNLock **lockPtr) +{ + if (pool->borrowed >= pool->capacity) { + return logErrorWithStringError(VDO_LOCK_ERROR, + "no free PBN locks left to borrow"); + } + pool->borrowed += 1; + + RingNode *idleNode = popRingNode(&pool->idleRing); + // The lock was zeroed when it was placed in the pool, but the overlapping + // ring pointers are non-zero after a pop. 
+ memset(idleNode, 0, sizeof(*idleNode)); + + STATIC_ASSERT(offsetof(IdlePBNLock, node) == offsetof(IdlePBNLock, lock)); + PBNLock *lock = (PBNLock *) idleNode; + initializePBNLock(lock, type); + + *lockPtr = lock; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void returnPBNLockToPool(PBNLockPool *pool, PBNLock **lockPtr) +{ + // Take what should be the last lock reference from the caller + PBNLock *lock = *lockPtr; + *lockPtr = NULL; + + // A bit expensive, but will promptly catch some use-after-free errors. + memset(lock, 0, sizeof(*lock)); + + RingNode *idleNode = (RingNode *) lock; + initializeRing(idleNode); + pushRingNode(&pool->idleRing, idleNode); + + ASSERT_LOG_ONLY(pool->borrowed > 0, "shouldn't return more than borrowed"); + pool->borrowed -= 1; +} diff --git a/source/vdo/base/pbnLockPool.h b/source/vdo/base/pbnLockPool.h new file mode 100644 index 0000000..6853f84 --- /dev/null +++ b/source/vdo/base/pbnLockPool.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pbnLockPool.h#1 $ + */ + +#ifndef PBN_LOCK_POOL_H +#define PBN_LOCK_POOL_H + +#include "pbnLock.h" +#include "types.h" + +typedef struct pbnLockPool PBNLockPool; + +/** + * Create a new PBN lock pool and all the lock instances it can loan out. + * + * @param [in] capacity The number of PBN locks to allocate for the pool + * @param [out] poolPtr A pointer to receive the new pool + * + * @return a VDO_SUCCESS or an error code + **/ +int makePBNLockPool(size_t capacity, PBNLockPool **poolPtr) + __attribute__((warn_unused_result)); + +/** + * Free a PBN lock pool null out the reference to it. This also frees all all + * the PBN locks it allocated, so the caller must ensure that all locks have + * been returned to the pool. + * + * @param [in,out] poolPtr The reference to the lock pool to free + **/ +void freePBNLockPool(PBNLockPool **poolPtr); + +/** + * Borrow a PBN lock from the pool and initialize it with the provided type. + * Pools do not grow on demand or allocate memory, so this will fail if the + * pool is empty. Borrowed locks are still associated with this pool and must + * be returned to only this pool. + * + * @param [in] pool The pool from which to borrow + * @param [in] type The type with which to initialize the lock + * @param [out] lockPtr A pointer to receive the borrowed lock + * + * @return VDO_SUCCESS, or VDO_LOCK_ERROR if the pool is empty + **/ +int borrowPBNLockFromPool(PBNLockPool *pool, + PBNLockType type, + PBNLock **lockPtr) + __attribute__((warn_unused_result)); + +/** + * Return to the pool a lock that was borrowed from it, and null out the + * caller's reference to it. 
It must be the last live reference, as if the + * memory were being freed (the lock memory will re-initialized or zeroed). + * + * @param [in] pool The pool from which the lock was borrowed + * @param [in,out] lockPtr The last reference to the lock being returned + **/ +void returnPBNLockToPool(PBNLockPool *pool, PBNLock **lockPtr); + +#endif // PBN_LOCK_POOL_H diff --git a/source/vdo/base/physicalLayer.c b/source/vdo/base/physicalLayer.c new file mode 100644 index 0000000..231a3bf --- /dev/null +++ b/source/vdo/base/physicalLayer.c @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/physicalLayer.c#1 $ + */ + +#include "physicalLayer.h" + +static PhysicalLayerGetter *physicalLayerGetter; + +/**********************************************************************/ +void registerPhysicalLayerGetter(PhysicalLayerGetter *getter) +{ + physicalLayerGetter = getter; +} + +/**********************************************************************/ +PhysicalLayer *getPhysicalLayer(void) +{ + if (physicalLayerGetter != NULL) { + return (*physicalLayerGetter)(); + } + return NULL; +} diff --git a/source/vdo/base/physicalLayer.h b/source/vdo/base/physicalLayer.h new file mode 100644 index 0000000..18d6a20 --- /dev/null +++ b/source/vdo/base/physicalLayer.h @@ -0,0 +1,427 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/physicalLayer.h#2 $ + */ + +#ifndef PHYSICAL_LAYER_H +#define PHYSICAL_LAYER_H + +#include "types.h" + +static const CRC32Checksum INITIAL_CHECKSUM = 0xffffffff; + +enum { + /* The size of a CRC-32 checksum */ + CHECKSUM_SIZE = sizeof(CRC32Checksum), +}; + +/** + * A function to destroy a physical layer and NULL out the reference to it. + * + * @param layerPtr A pointer to the layer to destroy + **/ +typedef void LayerDestructor(PhysicalLayer **layerPtr); + +/** + * A function to update a running CRC-32 checksum. 
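+ * Illustrative use, assuming the layer exposes such an updater (the
+ * updateCRC32 field name is an assumption, not shown in this excerpt):
+ *
+ *   CRC32Checksum crc = INITIAL_CHECKSUM;
+ *   crc = layer->updateCRC32(crc, (const byte *) data, VDO_BLOCK_SIZE);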
+ * + * @param crc The current value of the crc + * @param buffer The data to add to the checksum + * @param length The length of the data + * + * @return The updated value of the checksum + **/ +typedef uint32_t CRC32Updater(CRC32Checksum crc, + const byte *buffer, + size_t length); + +/** + * A function to report the block count of a physicalLayer. + * + * @param layer The layer + * + * @return The block count of the layer + **/ +typedef BlockCount BlockCountGetter(PhysicalLayer *layer); + +/** + * A function which can allocate a buffer suitable for use in an + * ExtentReader or ExtentWriter. + * + * @param [in] layer The physical layer in question + * @param [in] bytes The size of the buffer, in bytes. + * @param [in] why The occasion for allocating the buffer + * @param [out] bufferPtr A pointer to hold the buffer + * + * @return a success or error code + **/ +typedef int BufferAllocator(PhysicalLayer *layer, + size_t bytes, + const char *why, + char **bufferPtr); + +/** + * A function which can read an extent from a physicalLayer. + * + * @param [in] layer The physical layer from which to read + * @param [in] startBlock The physical block number of the start of the + * extent + * @param [in] blockCount The number of blocks in the extent + * @param [out] buffer A buffer to hold the extent + * @param [out] blocksRead A pointer to hold the number of blocks read (may be + * NULL) + * + * @return a success or error code + **/ +typedef int ExtentReader(PhysicalLayer *layer, + PhysicalBlockNumber startBlock, + size_t blockCount, + char *buffer, + size_t *blocksRead); + +/** + * A function which can write an extent to a physicalLayer. + * + * @param [in] layer The physical layer to which to write + * @param [in] startBlock The physical block number of the start of the + * extent + * @param [in] blockCount The number of blocks in the extent + * @param [in] buffer The buffer which contains the data + * @param [out] blocksWritten A pointer to hold the number of blocks written + * (may be NULL) + * + * @return a success or error code + **/ +typedef int ExtentWriter(PhysicalLayer *layer, + PhysicalBlockNumber startBlock, + size_t blockCount, + char *buffer, + size_t *blocksWritten); + +/** + * A function to allocate a metadata VIO. + * + * @param [in] layer The physical layer + * @param [in] vioType The type of VIO to create + * @param [in] priority The relative priority to assign to the VIOs + * @param [in] parent The parent of this VIO + * @param [in] data The buffer + * @param [out] vioPtr A pointer to hold the new VIO + * + * @return VDO_SUCCESS or an error + **/ +typedef int MetadataVIOCreator(PhysicalLayer *layer, + VIOType vioType, + VIOPriority priority, + void *parent, + char *data, + VIO **vioPtr); + +/** + * A function to allocate an AllocatingVIO for compressed writes. + * + * @param [in] layer The physical layer + * @param [in] parent The parent of this VIO + * @param [in] data The buffer + * @param [out] allocatingVIOPtr A pointer to hold the new AllocatingVIO + * + * @return VDO_SUCCESS or an error + **/ +typedef int CompressedWriteVIOCreator(PhysicalLayer *layer, + void *parent, + char *data, + AllocatingVIO **allocatingVIOPtr); + +/** + * A function to destroy a VIO. The pointer to the VIO will be nulled out. + * + * @param vioPtr A pointer to the VIO to destroy + **/ +typedef void VIODestructor(VIO **vioPtr); + +/** + * A function to zero the contents of a DataVIO. 
+ * + * @param dataVIO The DataVIO to zero + **/ +typedef AsyncDataOperation DataVIOZeroer; + +/** + * A function to copy the contents of a DataVIO into another DataVIO. + * + * @param source The dataVIO to copy from + * @param destination The dataVIO to copy to + **/ +typedef void DataCopier(DataVIO *source, DataVIO *destination); + +/** + * A function to apply a partial write to a DataVIO which has completed the + * read portion of a read-modify-write operation. + * + * @param dataVIO The dataVIO to modify + **/ +typedef AsyncDataOperation DataModifier; + +/** + * A function to asynchronously hash the block data, setting the chunk name of + * the DataVIO. This is asynchronous to allow the computation to be done on + * different threads. + * + * @param dataVIO The DataVIO to hash + **/ +typedef AsyncDataOperation DataHasher; + +/** + * A function to determine whether a block is a duplicate. This function + * expects the 'physical' field of the DataVIO to be set to the physical block + * where the block will be written if it is not a duplicate. If the block does + * turn out to be a duplicate, the DataVIO's 'isDuplicate' field will be set to + * true, and the DataVIO's 'advice' field will be set to the physical block and + * mapping state of the already stored copy of the block. + * + * @param dataVIO The DataVIO containing the block to check. + **/ +typedef AsyncDataOperation DuplicationChecker; + +/** + * A function to verify the duplication advice by examining an already-stored + * data block. This function expects the 'physical' field of the DataVIO to be + * set to the physical block where the block will be written if it is not a + * duplicate, and the 'duplicate' field to be set to the physical block and + * mapping state where a copy of the data may already exist. If the block is + * not a duplicate, the DataVIO's 'isDuplicate' field will be cleared. + * + * @param dataVIO The dataVIO containing the block to check. + **/ +typedef AsyncDataOperation DuplicationVerifier; + +/** + * A function to read a single DataVIO from the layer. + * + * If the DataVIO does not describe a read-modify-write operation, the + * physical layer may safely acknowledge the related user I/O request + * as complete. + * + * @param dataVIO The DataVIO to read + **/ +typedef AsyncDataOperation DataReader; + +/** + * A function to read a single metadata VIO from the layer. + * + * @param vio The vio to read + **/ +typedef AsyncOperation MetadataReader; + +/** + * A function to write a single DataVIO to the layer + * + * @param dataVIO The DataVIO to write + **/ +typedef AsyncDataOperation DataWriter; + +/** + * A function to write a single metadata VIO from the layer. + * + * @param vio The vio to write + **/ +typedef AsyncOperation MetadataWriter; + +/** + * A function to inform the layer that a DataVIO's related I/O request can be + * safely acknowledged as complete, even though the DataVIO itself may have + * further processing to do. + * + * @param dataVIO The DataVIO to acknowledge + **/ +typedef AsyncDataOperation DataAcknowledger; + +/** + * A function to compare the contents of a DataVIO to another DataVIO. + * + * @param first The first DataVIO to compare + * @param second The second DataVIO to compare + * + * @return true if the contents of the two DataVIOs are the same + **/ +typedef bool DataVIOComparator(DataVIO *first, DataVIO *second); + +/** + * A function to compress the data in a DataVIO. 
+ * + * @param dataVIO The DataVIO to compress + **/ +typedef AsyncDataOperation DataCompressor; + +/** + * Update albireo. + * + * @param dataVIO The DataVIO which needs to change the entry for its data + **/ +typedef AsyncDataOperation AlbireoUpdater; + +/** + * A function to finish flush requests + * + * @param vdoFlush The flush requests + **/ +typedef void FlushComplete(VDOFlush **vdoFlush); + +/** + * A function to query the write policy of the layer. + * + * @param layer The layer to query + * + * @return the write policy of the layer + **/ +typedef WritePolicy WritePolicyGetter(PhysicalLayer *layer); + +/** + * A function to create an object that can be enqueued to run in a specified + * thread. The Enqueueable will be put into the 'enqueueable' field of the + * supplied completion. + * + * @param completion The completion to invoke the callback of + * + * @return VDO_SUCCESS or an error code + **/ +typedef int EnqueueableCreator(VDOCompletion *completion); + +/** + * A function to destroy and deallocate an Enqueueable object. + * + * @param enqueueablePtr Pointer to the object pointer to be destroyed + **/ +typedef void EnqueueableDestructor(Enqueueable **enqueueablePtr); + +/** + * A function to enqueue the Enqueueable object to run on the thread specified + * by its associated completion. + * + * @param enqueueable The object to be enqueued + **/ +typedef void Enqueuer(Enqueueable *enqueueable); + +/** + * A function to wait for an admin operation to complete. This function should + * not be called from a base-code thread. + * + * @param layer The layer on which to wait + **/ +typedef void OperationWaiter(PhysicalLayer *layer); + +/** + * A function to inform the layer of the result of an admin operation. + * + * @param layer The layer to inform + **/ +typedef void OperationComplete(PhysicalLayer *layer); + +/** + * A function to get the id of the current thread. + * + * @return The id of the current thread + **/ +typedef ThreadID ThreadIDGetter(void); + +/** + * A function to return the physical layer pointer for the current thread. + * + * @return The physical layer pointer + **/ +typedef PhysicalLayer *PhysicalLayerGetter(void); + +/** + * An abstraction representing the underlying physical layer. 
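+ *
+ * Callers go through these function pointers rather than calling a concrete
+ * implementation directly. As an editorial sketch (the "startBlock" variable
+ * and the use of VDO_BLOCK_SIZE are illustrative assumptions, not part of
+ * this patch), a synchronous single-block read might look like:
+ *
+ *   char *buffer;
+ *   int result = layer->allocateIOBuffer(layer, VDO_BLOCK_SIZE,
+ *                                        "example read buffer", &buffer);
+ *   if (result == VDO_SUCCESS) {
+ *     result = layer->reader(layer, startBlock, 1, buffer, NULL);
+ *   }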
+ **/ +struct physicalLayer { + // Management interface + LayerDestructor *destroy; + + // Synchronous interface + CRC32Updater *updateCRC32; + BlockCountGetter *getBlockCount; + + // Synchronous IO interface + BufferAllocator *allocateIOBuffer; + ExtentReader *reader; + ExtentWriter *writer; + + WritePolicyGetter *getWritePolicy; + + // Synchronous interfaces (vio-based) + MetadataVIOCreator *createMetadataVIO; + CompressedWriteVIOCreator *createCompressedWriteVIO; + VIODestructor *freeVIO; + DataVIOZeroer *zeroDataVIO; + DataCopier *copyData; + DataModifier *applyPartialWrite; + + // Asynchronous interface (vio-based) + DataHasher *hashData; + DuplicationChecker *checkForDuplication; + DuplicationVerifier *verifyDuplication; + DataReader *readData; + DataWriter *writeData; + CompressedWriter *writeCompressedBlock; + MetadataReader *readMetadata; + MetadataWriter *writeMetadata; + MetadataWriter *flush; + DataAcknowledger *acknowledgeDataVIO; + DataVIOComparator *compareDataVIOs; + DataCompressor *compressDataVIO; + AlbireoUpdater *updateAlbireo; + + // Asynchronous interface (other) + FlushComplete *completeFlush; + EnqueueableCreator *createEnqueueable; + EnqueueableDestructor *destroyEnqueueable; + Enqueuer *enqueue; + OperationWaiter *waitForAdminOperation; + OperationComplete *completeAdminOperation; + + // Thread specific interface + ThreadIDGetter *getCurrentThreadID; +}; + +/** + * Register the layer-specific implementation of getPhysicalLayer(). + * + * @param getter The function to be called + **/ +void registerPhysicalLayerGetter(PhysicalLayerGetter *getter); + +/** + * Fetch the physical layer pointer for the current thread. + * + * @return The physical layer pointer + **/ +PhysicalLayer *getPhysicalLayer(void); + +/** + * Get the id of the callback thread on which a completion is current running. + * + * @return the current thread ID + **/ +static inline ThreadID getCallbackThreadID(void) +{ + return getPhysicalLayer()->getCurrentThreadID(); +} + +#endif // PHYSICAL_LAYER_H diff --git a/source/vdo/base/physicalZone.c b/source/vdo/base/physicalZone.c new file mode 100644 index 0000000..accb631 --- /dev/null +++ b/source/vdo/base/physicalZone.c @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/physicalZone.c#3 $ + */ + +#include "physicalZone.h" + +#include "memoryAlloc.h" + +#include "blockAllocator.h" +#include "blockMap.h" +#include "completion.h" +#include "constants.h" +#include "dataVIO.h" +#include "flush.h" +#include "hashLock.h" +#include "intMap.h" +#include "pbnLock.h" +#include "pbnLockPool.h" +#include "slabDepot.h" +#include "vdoInternal.h" + +enum { + // Each user DataVIO needs a PBN read lock and write lock, and each packer + // output bin has an AllocatingVIO that needs a PBN write lock. + LOCK_POOL_CAPACITY = 2 * MAXIMUM_USER_VIOS + DEFAULT_PACKER_OUTPUT_BINS, +}; + +struct physicalZone { + /** Which physical zone this is */ + ZoneCount zoneNumber; + /** The thread ID for this zone */ + ThreadID threadID; + /** In progress operations keyed by PBN */ + IntMap *pbnOperations; + /** Pool of unused PBNLock instances */ + PBNLockPool *lockPool; + /** The block allocator for this zone */ + BlockAllocator *allocator; +}; + +/**********************************************************************/ +int makePhysicalZone(VDO *vdo, ZoneCount zoneNumber, PhysicalZone **zonePtr) +{ + PhysicalZone *zone; + int result = ALLOCATE(1, PhysicalZone, __func__, &zone); + if (result != VDO_SUCCESS) { + return result; + } + + result = makeIntMap(LOCK_MAP_CAPACITY, 0, &zone->pbnOperations); + if (result != VDO_SUCCESS) { + freePhysicalZone(&zone); + return result; + } + + result = makePBNLockPool(LOCK_POOL_CAPACITY, &zone->lockPool); + if (result != VDO_SUCCESS) { + freePhysicalZone(&zone); + return result; + } + + zone->zoneNumber = zoneNumber; + zone->threadID = getPhysicalZoneThread(getThreadConfig(vdo), zoneNumber); + zone->allocator = getBlockAllocatorForZone(vdo->depot, zoneNumber); + + *zonePtr = zone; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freePhysicalZone(PhysicalZone **zonePtr) +{ + if (*zonePtr == NULL) { + return; + } + + PhysicalZone *zone = *zonePtr; + freePBNLockPool(&zone->lockPool); + freeIntMap(&zone->pbnOperations); + FREE(zone); + *zonePtr = NULL; +} + +/**********************************************************************/ +ZoneCount getPhysicalZoneNumber(const PhysicalZone *zone) +{ + return zone->zoneNumber; +} + +/**********************************************************************/ +ThreadID getPhysicalZoneThreadID(const PhysicalZone *zone) +{ + return zone->threadID; +} + +/**********************************************************************/ +BlockAllocator *getBlockAllocator(const PhysicalZone *zone) +{ + return zone->allocator; +} + +/**********************************************************************/ +PBNLock *getPBNLock(PhysicalZone *zone, PhysicalBlockNumber pbn) +{ + return ((zone == NULL) ? NULL : intMapGet(zone->pbnOperations, pbn)); +} + +/**********************************************************************/ +int attemptPBNLock(PhysicalZone *zone, + PhysicalBlockNumber pbn, + PBNLockType type, + PBNLock **lockPtr) +{ + // Borrow and prepare a lock from the pool so we don't have to do two IntMap + // accesses in the common case of no lock contention. 
+ PBNLock *newLock; + int result = borrowPBNLockFromPool(zone->lockPool, type, &newLock); + if (result != VDO_SUCCESS) { + ASSERT_LOG_ONLY(false, "must always be able to borrow a PBN lock"); + return result; + } + + PBNLock *lock; + result = intMapPut(zone->pbnOperations, pbn, newLock, false, + (void **) &lock); + if (result != VDO_SUCCESS) { + returnPBNLockToPool(zone->lockPool, &newLock); + return result; + } + + if (lock != NULL) { + // The lock is already held, so we don't need the borrowed lock. + returnPBNLockToPool(zone->lockPool, &newLock); + + result = ASSERT(lock->holderCount > 0, + "physical block %llu lock held", pbn); + if (result != VDO_SUCCESS) { + return result; + } + *lockPtr = lock; + } else { + *lockPtr = newLock; + } + return VDO_SUCCESS; +} + +/**********************************************************************/ +void releasePBNLock(PhysicalZone *zone, + PhysicalBlockNumber lockedPBN, + PBNLock **lockPtr) +{ + PBNLock *lock = *lockPtr; + if (lock == NULL) { + return; + } + *lockPtr = NULL; + + ASSERT_LOG_ONLY(lock->holderCount > 0, + "should not be releasing a lock that is not held"); + + lock->holderCount -= 1; + if (lock->holderCount > 0) { + // The lock was shared and is still referenced, so don't release it yet. + return; + } + + PBNLock *holder = intMapRemove(zone->pbnOperations, lockedPBN); + ASSERT_LOG_ONLY((lock == holder), + "physical block lock mismatch for block %llu", + lockedPBN); + + releaseProvisionalReference(lock, lockedPBN, zone->allocator); + + returnPBNLockToPool(zone->lockPool, &lock); +} + +/**********************************************************************/ +void dumpPhysicalZone(const PhysicalZone *zone) +{ + dumpBlockAllocator(zone->allocator); +} diff --git a/source/vdo/base/physicalZone.h b/source/vdo/base/physicalZone.h new file mode 100644 index 0000000..2c02bbe --- /dev/null +++ b/source/vdo/base/physicalZone.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/physicalZone.h#1 $ + */ + +#ifndef PHYSICAL_ZONE_H +#define PHYSICAL_ZONE_H + +#include "pbnLock.h" +#include "types.h" + +/** + * Create a physical zone. + * + * @param [in] vdo The VDO to which the zone will belong + * @param [in] zoneNumber The number of the zone to create + * @param [out] zonePtr A pointer to hold the new PhysicalZone + * + * @return VDO_SUCCESS or an error code + **/ +int makePhysicalZone(VDO *vdo, ZoneCount zoneNumber, PhysicalZone **zonePtr) + __attribute__((warn_unused_result)); + +/** + * Free a physical zone and null out the reference to it. + * + * @param zonePtr A pointer to the zone to free + **/ +void freePhysicalZone(PhysicalZone **zonePtr); + +/** + * Get the zone number of a physical zone. 
+ * + * @param zone The zone + * + * @return The number of the zone + **/ +ZoneCount getPhysicalZoneNumber(const PhysicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the ID of a physical zone's thread. + * + * @param zone The zone + * + * @return The zone's thread ID + **/ +ThreadID getPhysicalZoneThreadID(const PhysicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the block allocator from a physical zone. + * + * @param zone The zone + * + * @return The zone's allocator + **/ +BlockAllocator *getBlockAllocator(const PhysicalZone *zone) + __attribute__((warn_unused_result)); + +/** + * Get the lock on a PBN if one exists. + * + * @param zone The physical zone responsible for the PBN + * @param pbn The physical block number whose lock is desired + * + * @return The lock or NULL if the PBN is not locked + **/ +PBNLock *getPBNLock(PhysicalZone *zone, PhysicalBlockNumber pbn) + __attribute__((warn_unused_result)); + +/** + * Attempt to lock a physical block in the zone responsible for it. If the PBN + * is already locked, the existing lock will be returned. Otherwise, a new + * lock instance will be borrowed from the pool, initialized, and returned. + * The lock owner will be NULL for a new lock acquired by the caller, who is + * responsible for setting that field promptly. The lock owner will be + * non-NULL when there is already an existing lock on the PBN. + * + * @param [in] zone The physical zone responsible for the PBN + * @param [in] pbn The physical block number to lock + * @param [in] type The type with which to initialize a new lock + * @param [out] lockPtr A pointer to receive the lock, existing or new + * + * @return VDO_SUCCESS or an error + **/ +int attemptPBNLock(PhysicalZone *zone, + PhysicalBlockNumber pbn, + PBNLockType type, + PBNLock **lockPtr) + __attribute__((warn_unused_result)); + +/** + * Release a physical block lock if it is held, return it to the lock pool, + * and null out the caller's reference to it. It must be the last live + * reference, as if the memory were being freed (the lock memory will + * re-initialized or zeroed). + * + * @param [in] zone The physical zone in which the lock was obtained + * @param [in] lockedPBN The physical block number to unlock + * @param [in,out] lockPtr The last reference to the lock being released + **/ +void releasePBNLock(PhysicalZone *zone, + PhysicalBlockNumber lockedPBN, + PBNLock **lockPtr); + +/** + * Dump information about a physical zone to the log for debugging. + * + * @param zone The zone to dump + **/ +void dumpPhysicalZone(const PhysicalZone *zone); + +#endif // PHYSICAL_ZONE_H diff --git a/source/vdo/base/pointerMap.c b/source/vdo/base/pointerMap.c new file mode 100644 index 0000000..395f266 --- /dev/null +++ b/source/vdo/base/pointerMap.c @@ -0,0 +1,633 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pointerMap.c#1 $
+ */
+
+/**
+ * Hash table implementation of a map from pointer keys to pointer values,
+ * implemented using the Hopscotch Hashing algorithm by Herlihy, Shavit, and
+ * Tzafrir (see http://en.wikipedia.org/wiki/Hopscotch_hashing). This
+ * implementation does not contain any of the locking/concurrency features of
+ * the algorithm, just the collision resolution scheme.
+ *
+ * Hopscotch Hashing is based on hashing with open addressing and linear
+ * probing. All the entries are stored in a fixed array of buckets, with no
+ * dynamic allocation for collisions. Unlike linear probing, all the entries
+ * that hash to a given bucket are stored within a fixed neighborhood starting
+ * at that bucket. Chaining is effectively represented as a bit vector
+ * relative to each bucket instead of as pointers or explicit offsets.
+ *
+ * When an empty bucket cannot be found within a given neighborhood,
+ * subsequent neighborhoods are searched, and one or more entries will "hop"
+ * into those neighborhoods. When this process works, an empty bucket will
+ * move into the desired neighborhood, allowing the entry to be added. When
+ * that process fails (typically when the buckets are around 90% full), the
+ * table must be resized and all the entries rehashed and added to the
+ * expanded table.
+ *
+ * Unlike linear probing, the number of buckets that must be searched in the
+ * worst case has a fixed upper bound (the size of the neighborhood). Those
+ * entries occupy a small number of memory cache lines, leading to improved
+ * use of the cache (fewer misses on both successful and unsuccessful
+ * searches). Hopscotch hashing outperforms linear probing at much higher load
+ * factors, so even with the increased memory burden for maintaining the hop
+ * vectors, less memory is needed to achieve that performance. Hopscotch is
+ * also immune to "contamination" from deleting entries since entries are
+ * genuinely removed instead of being replaced by a placeholder.
+ *
+ * The published description of the algorithm used a bit vector, but the paper
+ * alludes to an offset scheme which is used by this implementation. Since the
+ * entries in the neighborhood are within N entries of the hash bucket at the
+ * start of the neighborhood, a pair of small offset fields each log2(N) bits
+ * wide is all that's needed to maintain the hops as a linked list. In order
+ * to encode "no next hop" (i.e. NULL) as the natural initial value of zero,
+ * the offsets are biased by one (i.e. 0 => NULL, 1 => offset=0, 2 =>
+ * offset=1, etc.) We can represent neighborhoods of up to 255 entries with
+ * just 8+8=16 bits per entry. The hop list is sorted by hop offset so the
+ * first entry in the list is always the bucket closest to the start of the
+ * neighborhood.
+ *
+ * While individual accesses tend to be very fast, the table resize operations
+ * are very expensive. If an upper bound on the latency of adding an entry to
+ * the table is needed, we either need to ensure the table is pre-sized to be
+ * large enough so no resize is ever needed, or we'll need to develop an
+ * approach to incrementally resize the table.
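+ *
+ * As a small worked illustration of the biased encoding (an editorial
+ * addition): a hash bucket with firstHop == 3 starts its hop list at the
+ * bucket 2 slots past it, since dereferencing a hop uses
+ * &neighborhood[hopOffset - 1]; if that entry's nextHop is 5, the next entry
+ * is 4 slots past the hash bucket, and a nextHop of 0 (NULL_HOP_OFFSET)
+ * terminates the list.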
+ **/ + +#include "pointerMap.h" + +#include "errors.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" +#include "permassert.h" + +enum { + DEFAULT_CAPACITY = 16, // the number of neighborhoods in a new table + NEIGHBORHOOD = 255, // the number of buckets in each neighborhood + MAX_PROBES = 1024, // limit on the number of probes for a free bucket + NULL_HOP_OFFSET = 0, // the hop offset value terminating the hop list + DEFAULT_LOAD = 75 // a compromise between memory use and performance +}; + +/** + * Buckets are packed together to reduce memory usage and improve cache + * efficiency. It would be tempting to encode the hop offsets separately and + * maintain alignment of key/value pairs, but it's crucial to keep the hop + * fields near the buckets that they use them so they'll tend to share cache + * lines. + **/ +typedef struct __attribute__((packed)) bucket { + uint8_t firstHop; // the biased offset of the first entry in the hop list + // of the neighborhood that hashes to this bucket + uint8_t nextHop; // the biased offset of the next bucket in the hop list + + const void *key; // the key stored in this bucket + void *value; // the value stored in this bucket (NULL if empty) +} Bucket; + +/** + * The concrete definition of the opaque PointerMap type. To avoid having to + * wrap the neighborhoods of the last entries back around to the start of the + * bucket array, we allocate a few more buckets at the end of the array + * instead, which is why capacity and bucketCount are different. + **/ +struct pointerMap { + /** the number of entries stored in the map */ + size_t size; + /** the number of neighborhoods in the map */ + size_t capacity; + /** the number of buckets in the bucket array */ + size_t bucketCount; + /** the array of hash buckets */ + Bucket *buckets; + /** the function for comparing keys for equality */ + PointerKeyComparator *comparator; + /** the function for getting a hash code from a key */ + PointerKeyHasher *hasher; +}; + +/** + * Initialize a PointerMap. + * + * @param map the map to initialize + * @param capacity the initial capacity of the map + * + * @return UDS_SUCCESS or an error code + **/ +static int allocateBuckets(PointerMap *map, size_t capacity) +{ + map->size = 0; + map->capacity = capacity; + + // Allocate NEIGHBORHOOD - 1 extra buckets so the last bucket can have a + // full neighborhood without have to wrap back around to element zero. + map->bucketCount = capacity + (NEIGHBORHOOD - 1); + return ALLOCATE(map->bucketCount, Bucket, "PointerMap buckets", + &map->buckets); +} + +/**********************************************************************/ +int makePointerMap(size_t initialCapacity, + unsigned int initialLoad, + PointerKeyComparator comparator, + PointerKeyHasher hasher, + PointerMap **mapPtr) +{ + // Use the default initial load if the caller did not specify one. + if (initialLoad == 0) { + initialLoad = DEFAULT_LOAD; + } + if (initialLoad > 100) { + return UDS_INVALID_ARGUMENT; + } + + PointerMap *map; + int result = ALLOCATE(1, PointerMap, "PointerMap", &map); + if (result != UDS_SUCCESS) { + return result; + } + + map->hasher = hasher; + map->comparator = comparator; + + // Use the default capacity if the caller did not specify one. + size_t capacity = (initialCapacity > 0) ? initialCapacity : DEFAULT_CAPACITY; + + // Scale up the capacity by the specified initial load factor. 
+ // (i.e to hold 1000 entries at 80% load we need a capacity of 1250) + capacity = capacity * 100 / initialLoad; + + result = allocateBuckets(map, capacity); + if (result != UDS_SUCCESS) { + freePointerMap(&map); + return result; + } + + *mapPtr = map; + return UDS_SUCCESS; +} + +/** + * Free the bucket array for the map. + * + * @param map the map whose bucket array is to be freed + **/ +static void freeBuckets(PointerMap *map) +{ + FREE(map->buckets); + map->buckets = NULL; +} + +/**********************************************************************/ +void freePointerMap(PointerMap **mapPtr) +{ + if (*mapPtr != NULL) { + freeBuckets(*mapPtr); + FREE(*mapPtr); + *mapPtr = NULL; + } +} + +/**********************************************************************/ +size_t pointerMapSize(const PointerMap *map) +{ + return map->size; +} + +/** + * Convert a biased hop offset within a neighborhood to a pointer to the + * bucket it references. + * + * @param neighborhood the first bucket in the neighborhood + * @param hopOffset the biased hop offset to the desired bucket + * + * @return NULL if hopOffset is zero, otherwise a pointer to + * the bucket in the neighborhood at hopOffset - 1 + **/ +static Bucket *dereferenceHop(Bucket *neighborhood, unsigned int hopOffset) +{ + if (hopOffset == NULL_HOP_OFFSET) { + return NULL; + } + + STATIC_ASSERT(NULL_HOP_OFFSET == 0); + return &neighborhood[hopOffset - 1]; +} + +/** + * Add a bucket into the hop list for the neighborhood, inserting it into the + * list so the hop list remains sorted by hop offset. + * + * @param neighborhood the first bucket in the neighborhood + * @param newBucket the bucket to add to the hop list + **/ +static void insertInHopList(Bucket *neighborhood, Bucket *newBucket) +{ + // Zero indicates a NULL hop offset, so bias the hop offset by one. + int hopOffset = 1 + (newBucket - neighborhood); + + // Handle the special case of adding a bucket at the start of the list. + int nextHop = neighborhood->firstHop; + if ((nextHop == NULL_HOP_OFFSET) || (nextHop > hopOffset)) { + newBucket->nextHop = nextHop; + neighborhood->firstHop = hopOffset; + return; + } + + // Search the hop list for the insertion point that maintains the sort + // order. + for (;;) { + Bucket *bucket = dereferenceHop(neighborhood, nextHop); + nextHop = bucket->nextHop; + + if ((nextHop == NULL_HOP_OFFSET) || (nextHop > hopOffset)) { + newBucket->nextHop = nextHop; + bucket->nextHop = hopOffset; + return; + } + } +} + +/** + * Select and return the hash bucket for a given search key. + * + * @param map the map to search + * @param key the mapping key + **/ +static Bucket *selectBucket(const PointerMap *map, const void *key) +{ + /* + * Scale the 32-bit hash to a bucket index by treating it as a binary + * fraction and multiplying that by the capacity. If the hash is uniformly + * distributed over [0 .. 2^32-1], then (hash * capacity / 2^32) should be + * uniformly distributed over [0 .. capacity-1]. The multiply and shift is + * much faster than a divide (modulus) on X86 CPUs. + */ + uint64_t hash = map->hasher(key); + return &map->buckets[(hash * map->capacity) >> 32]; +} + +/** + * Search the hop list associated with given hash bucket for a given search + * key. If the key is found, returns a pointer to the entry (bucket or + * collision), otherwise returns NULL. 
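+ *
+ * (Editorial note, not in the original comment: the previousPtr output
+ * exists so that a caller such as pointerMapRemove() can splice the matching
+ * bucket out of the singly-linked hop list without traversing it a second
+ * time.)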
+ * + * @param [in] map the map being searched + * @param [in] bucket the map bucket to search for the key + * @param [in] key the mapping key + * @param [out] previousPtr if not NULL, a pointer in which to + * store the bucket in the list preceding the one + * that had the matching key + * + * @return an entry that matches the key, or NULL if not found + **/ +static Bucket *searchHopList(PointerMap *map, + Bucket *bucket, + const void *key, + Bucket **previousPtr) +{ + Bucket *previous = NULL; + unsigned int nextHop = bucket->firstHop; + while (nextHop != NULL_HOP_OFFSET) { + // Check the neighboring bucket indexed by the offset for the desired key. + Bucket *entry = dereferenceHop(bucket, nextHop); + if ((entry->value != NULL) && map->comparator(key, entry->key)) { + if (previousPtr != NULL) { + *previousPtr = previous; + } + return entry; + } + nextHop = entry->nextHop; + previous = entry; + } + return NULL; +} + +/**********************************************************************/ +void *pointerMapGet(PointerMap *map, const void *key) +{ + Bucket *match = searchHopList(map, selectBucket(map, key), key, NULL); + return ((match != NULL) ? match->value : NULL); +} + +/** + * Increase the number of hash buckets and rehash all the existing entries, + * storing them in the new buckets. + * + * @param map the map to resize + **/ +static int resizeBuckets(PointerMap *map) +{ + // Copy the top-level map data to the stack. + PointerMap oldMap = *map; + + // Re-initialize the map to be empty and 50% larger. + size_t newCapacity = map->capacity / 2 * 3; + logInfo("%s: attempting resize from %zu to %zu, current size=%zu", + __func__, map->capacity, newCapacity, map->size); + int result = allocateBuckets(map, newCapacity); + if (result != UDS_SUCCESS) { + *map = oldMap; + return result; + } + + // Populate the new hash table from the entries in the old bucket array. + for (size_t i = 0; i < oldMap.bucketCount; i++) { + Bucket *entry = &oldMap.buckets[i]; + if (entry->value == NULL) { + continue; + } + + result = pointerMapPut(map, entry->key, entry->value, true, NULL); + if (result != UDS_SUCCESS) { + // Destroy the new partial map and restore the map from the stack. + freeBuckets(map); + *map = oldMap; + return result; + } + } + + // Destroy the old bucket array. + freeBuckets(&oldMap); + return UDS_SUCCESS; +} + +/** + * Probe the bucket array starting at the given bucket for the next empty + * bucket, returning a pointer to it. NULL will be returned if + * the search reaches the end of the bucket array or if the number of linear + * probes exceeds a specified limit. + * + * @param map the map containing the buckets to search + * @param bucket the bucket at which to start probing + * @param maxProbes the maximum number of buckets to search + * + * @return the next empty bucket, or NULL if the search failed + **/ +static Bucket *findEmptyBucket(PointerMap *map, + Bucket *bucket, + unsigned int maxProbes) +{ + // Limit the search to either the nearer of the end of the bucket array or a + // fixed distance beyond the initial bucket. + size_t remaining = &map->buckets[map->bucketCount] - bucket; + Bucket *sentinel = &bucket[minSizeT(remaining, maxProbes)]; + + for (Bucket *entry = bucket; entry < sentinel; entry++) { + if (entry->value == NULL) { + return entry; + } + } + return NULL; +} + +/** + * Move an empty bucket closer to the start of the bucket array. This searches + * the neighborhoods that contain the empty bucket for a non-empty bucket + * closer to the start of the array. 
If such a bucket is found, this swaps the + * two buckets by moving the entry to the empty bucket. + * + * @param map the map containing the bucket + * @param hole the empty bucket to fill with an entry that precedes it in one + * of its enclosing neighborhoods + * + * @return the bucket that was vacated by moving its entry to the provided + * hole, or NULL if no entry could be moved + **/ +static Bucket *moveEmptyBucket(PointerMap *map __attribute__((unused)), + Bucket *hole) +{ + /* + * Examine every neighborhood that the empty bucket is part of, starting + * with the one in which it is the last bucket. No boundary check is needed + * for the negative array arithmetic since this function is only called when + * hole is at least NEIGHBORHOOD cells deeper into the array than a valid + * bucket. + */ + for (Bucket *bucket = &hole[1 - NEIGHBORHOOD]; bucket < hole; bucket++) { + // Find the entry that is nearest to the bucket, which means it will be + // nearest to the hash bucket whose neighborhood is full. + Bucket *newHole = dereferenceHop(bucket, bucket->firstHop); + if (newHole == NULL) { + // There are no buckets in this neighborhood that are in use by this one + // (they must all be owned by overlapping neighborhoods). + continue; + } + + // Skip this bucket if its first entry is actually further away than the + // hole that we're already trying to fill. + if (hole < newHole) { + continue; + } + + /* + * We've found an entry in this neighborhood that we can "hop" further + * away, moving the hole closer to the hash bucket, if not all the way + * into its neighborhood. + */ + + // The entry that will be the new hole is the first bucket in the list, + // so setting firstHop is all that's needed remove it from the list. + bucket->firstHop = newHole->nextHop; + newHole->nextHop = NULL_HOP_OFFSET; + + // Move the entry into the original hole. + hole->key = newHole->key; + hole->value = newHole->value; + newHole->value = NULL; + + // Insert the filled hole into the hop list for the neighborhood. + insertInHopList(bucket, hole); + return newHole; + } + + // We couldn't find an entry to relocate to the hole. + return NULL; +} + +/** + * Find and update any existing mapping for a given key, returning the value + * associated with the key in the provided pointer. + * + * @param [in] map the PointerMap to attempt to modify + * @param [in] neighborhood the first bucket in the neighborhood that + * would contain the search key + * @param [in] key the key with which to associate the new value + * @param [in] newValue the value to be associated with the key + * @param [in] update whether to overwrite an existing value + * @param [out] oldValuePtr a pointer in which to store the old value + * (unmodified if no mapping was found) + * + * @return true if the map contains a mapping for the key + * false if it does not + **/ +static bool updateMapping(PointerMap *map, + Bucket *neighborhood, + const void *key, + void *newValue, + bool update, + void **oldValuePtr) +{ + Bucket *bucket = searchHopList(map, neighborhood, key, NULL); + if (bucket == NULL) { + // There is no bucket containing the key in the neighborhood. + return false; + } + + // Return the value of the current mapping (if desired) and update the + // mapping with the new value (if desired). + if (oldValuePtr != NULL) { + *oldValuePtr = bucket->value; + } + if (update) { + // We're dropping the old key pointer on the floor here, assuming it's a + // property of the value or that it's otherwise safe to just forget. 
+ bucket->key = key; + bucket->value = newValue; + } + return true; +} + +/** + * Find an empty bucket in a specified neighborhood for a new mapping or + * attempt to re-arrange mappings so there is such a bucket. This operation + * may fail (returning NULL) if an empty bucket is not available or could not + * be relocated to the neighborhood. + * + * @param map the PointerMap to search or modify + * @param neighborhood the first bucket in the neighborhood in which + * an empty bucket is needed for a new mapping + * + * @return a pointer to an empty bucket in the desired neighborhood, or + * NULL if a vacancy could not be found or arranged + **/ +static Bucket *findOrMakeVacancy(PointerMap *map, Bucket *neighborhood) +{ + // Probe within and beyond the neighborhood for the first empty bucket. + Bucket *hole = findEmptyBucket(map, neighborhood, MAX_PROBES); + + // Keep trying until the empty bucket is in the bucket's neighborhood or we + // are unable to move it any closer by swapping it with a filled bucket. + while (hole != NULL) { + int distance = hole - neighborhood; + if (distance < NEIGHBORHOOD) { + // We've found or relocated an empty bucket close enough to the initial + // hash bucket to be referenced by its hop vector. + return hole; + } + + // The nearest empty bucket isn't within the neighborhood that must + // contain the new entry, so try to swap it with bucket that is closer. + hole = moveEmptyBucket(map, hole); + } + + return NULL; +} + +/**********************************************************************/ +int pointerMapPut(PointerMap *map, + const void *key, + void *newValue, + bool update, + void **oldValuePtr) +{ + if (newValue == NULL) { + return UDS_INVALID_ARGUMENT; + } + + // Select the bucket at the start of the neighborhood that must contain any + // entry for the provided key. + Bucket *neighborhood = selectBucket(map, key); + + // Check whether the neighborhood already contains an entry for the key, in + // which case we optionally update it, returning the old value. + if (updateMapping(map, neighborhood, key, newValue, update, oldValuePtr)) { + return UDS_SUCCESS; + } + + /* + * Find an empty bucket in the desired neighborhood for the new entry or + * re-arrange entries in the map so there is such a bucket. This operation + * will usually succeed; the loop body will only be executed on the rare + * occasions that we have to resize the map. + */ + Bucket *bucket; + while ((bucket = findOrMakeVacancy(map, neighborhood)) == NULL) { + /* + * There is no empty bucket in which to put the new entry in the current + * map, so we're forced to allocate a new bucket array with a larger + * capacity, re-hash all the entries into those buckets, and try again (a + * very expensive operation for large maps). + */ + int result = resizeBuckets(map); + if (result != UDS_SUCCESS) { + return result; + } + + // Resizing the map invalidates all pointers to buckets, so recalculate + // the neighborhood pointer. + neighborhood = selectBucket(map, key); + } + + // Put the new entry in the empty bucket, adding it to the neighborhood. + bucket->key = key; + bucket->value = newValue; + insertInHopList(neighborhood, bucket); + map->size += 1; + + // There was no existing entry, so there was no old value to be returned. 
+ if (oldValuePtr != NULL) { + *oldValuePtr = NULL; + } + return UDS_SUCCESS; +} + +/**********************************************************************/ +void *pointerMapRemove(PointerMap *map, const void *key) +{ + // Select the bucket to search and search it for an existing entry. + Bucket *bucket = selectBucket(map, key); + Bucket *previous; + Bucket *victim = searchHopList(map, bucket, key, &previous); + + if (victim == NULL) { + // There is no matching entry to remove. + return NULL; + } + + // We found an entry to remove. Save the mapped value to return later and + // empty the bucket. + map->size -= 1; + void *value = victim->value; + victim->value = NULL; + victim->key = 0; + + // The victim bucket is now empty, but it still needs to be spliced out of + // the hop list. + if (previous == NULL) { + // The victim is the head of the list, so swing firstHop. + bucket->firstHop = victim->nextHop; + } else { + previous->nextHop = victim->nextHop; + } + victim->nextHop = NULL_HOP_OFFSET; + + return value; +} diff --git a/source/vdo/base/pointerMap.h b/source/vdo/base/pointerMap.h new file mode 100644 index 0000000..1bd0bd2 --- /dev/null +++ b/source/vdo/base/pointerMap.h @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pointerMap.h#1 $ + */ + +#ifndef POINTER_MAP_H +#define POINTER_MAP_H + +#include "common.h" + +/** + * PointerMap associates pointer values (void *) with the data + * referenced by pointer keys (void *). NULL pointer + * values are not supported. A NULL key value is supported when + * the instance's key comparator and hasher functions support it. + * + * The map is implemented as hash table, which should provide constant-time + * insert, query, and remove operations, although the insert may occasionally + * grow the table, which is linear in the number of entries in the map. The + * table will grow as needed to hold new entries, but will not shrink as + * entries are removed. + * + * The key and value pointers passed to the map are retained and used by the + * map, but are not owned by the map. Freeing the map does not attempt to free + * the pointers. The client is entirely responsible for the memory managment + * of the keys and values. The current interface and implementation assume + * that keys will be properties of the values, or that keys will not be memory + * managed, or that keys will not need to be freed as a result of being + * replaced when a key is re-mapped. + **/ + +typedef struct pointerMap PointerMap; + +/** + * The prototype of functions that compare the referents of two pointer keys + * for equality. If two keys are equal, then both keys must have the same the + * hash code associated with them by the hasher function defined below. 
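+ *
+ * As an editorial illustration (hypothetical functions, not part of this
+ * patch), a map keyed by C strings could pair a strcmp()-based comparator
+ * with a simple FNV-1a hasher, both satisfying this contract:
+ *
+ *   static bool compareStringKeys(const void *thisKey, const void *thatKey)
+ *   {
+ *     return (strcmp(thisKey, thatKey) == 0);
+ *   }
+ *
+ *   static uint32_t hashStringKey(const void *key)
+ *   {
+ *     uint32_t hash = 2166136261u;
+ *     for (const char *s = key; *s != '\0'; s++) {
+ *       hash = (hash ^ (uint8_t) *s) * 16777619u;
+ *     }
+ *     return hash;
+ *   }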
+ + * @param thisKey The first element to compare + * @param thatKey The second element to compare + * + * @return true if and only if the referents of the two + * key pointers are to be treated as the same key by the map + **/ +typedef bool PointerKeyComparator(const void *thisKey, const void *thatKey); + +/** + * The prototype of functions that get or calculate a hash code associated + * with the referent of pointer key. The hash code must be uniformly + * distributed over all uint32_t values. The hash code associated with a given + * key must not change while the key is in the map. If the comparator function + * says two keys are equal, then this function must return the same hash code + * for both keys. This function may be called many times for a key while an + * entry is stored for it in the map. + * + * @param key The pointer key to hash + * + * @return the hash code for the key + **/ +typedef uint32_t PointerKeyHasher(const void *key); + +/** + * Allocate and initialize a PointerMap. + * + * @param [in] initialCapacity The number of entries the map should + * initially be capable of holding (zero tells + * the map to use its own small default) + * @param [in] initialLoad The load factor of the map, expressed as an + * integer percentage (typically in the range + * 50 to 90, with zero telling the map to use + * its own default) + * @param [in] comparator The function to use to compare the referents + * of two pointer keys for equality + * @param [in] hasher The function to use obtain the hash code + * associated with each pointer key + * @param [out] mapPtr A pointer to hold the new PointerMap + * + * @return UDS_SUCCESS or an error code + **/ +int makePointerMap(size_t initialCapacity, + unsigned int initialLoad, + PointerKeyComparator comparator, + PointerKeyHasher hasher, + PointerMap **mapPtr) + __attribute__((warn_unused_result)); + +/** + * Free a PointerMap and null out the reference to it. NOTE: The map does not + * own the pointer keys and values stored in the map and they are not freed by + * this call. + * + * @param [in,out] mapPtr The reference to the PointerMap to free + **/ +void freePointerMap(PointerMap **mapPtr); + +/** + * Get the number of entries stored in a PointerMap. + * + * @param map The PointerMap to query + * + * @return the number of entries in the map + **/ +size_t pointerMapSize(const PointerMap *map); + +/** + * Retrieve the value associated with a given key from the PointerMap. + * + * @param map The PointerMap to query + * @param key The key to look up (may be NULL if the + * comparator and hasher functions support it) + * + * @return the value associated with the given key, or NULL + * if the key is not mapped to any value + **/ +void *pointerMapGet(PointerMap *map, const void *key); + +/** + * Try to associate a value (a pointer) with an integer in a PointerMap. + * If the map already contains a mapping for the provided key, the old value is + * only replaced with the specified value if update is true. In either case + * the old value is returned. If the map does not already contain a value for + * the specified key, the new value is added regardless of the value of update. + * + * If the value stored in the map is updated, then the key stored in the map + * will also be updated with the key provided by this call. The old key will + * not be returned due to the memory managment assumptions described in the + * interface header comment. 
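+ *
+ * A minimal usage sketch (editorial addition; "map", "key", and "value" are
+ * hypothetical caller variables):
+ *
+ *   void *oldValue = NULL;
+ *   int result = pointerMapPut(map, key, value, true, &oldValue);
+ *   // On UDS_SUCCESS, oldValue is NULL if the key was not already mapped,
+ *   // otherwise it is the value that was displaced.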
+ * + * @param [in] map The PointerMap to attempt to modify + * @param [in] key The key with which to associate the new value + * (may be NULL if the comparator and + * hasher functions support it) + * @param [in] newValue The value to be associated with the key + * @param [in] update Whether to overwrite an existing value + * @param [out] oldValuePtr A pointer in which to store either the old value + * (if the key was already mapped) or + * NULL if the map did not contain the + * key; NULL may be provided if the + * caller does not need to know the old value + * + * @return UDS_SUCCESS or an error code + **/ +int pointerMapPut(PointerMap *map, + const void *key, + void *newValue, + bool update, + void **oldValuePtr) + __attribute__((warn_unused_result)); + +/** + * Remove the mapping for a given key from the PointerMap. + * + * @param map The PointerMap from which to remove the mapping + * @param key The key whose mapping is to be removed (may be NULL + * if the comparator and hasher functions support it) + * + * @return the value that was associated with the key, or + * NULL if it was not mapped + **/ +void *pointerMapRemove(PointerMap *map, const void *key); + +#endif /* POINTER_MAP_H */ diff --git a/source/vdo/base/priorityTable.c b/source/vdo/base/priorityTable.c new file mode 100644 index 0000000..deb423b --- /dev/null +++ b/source/vdo/base/priorityTable.c @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/priorityTable.c#1 $ + */ + +#include "priorityTable.h" + +#include "errors.h" +#include "memoryAlloc.h" +#include "numUtils.h" + +#include "statusCodes.h" + +/** We use a single 64-bit search vector, so the maximum priority is 63 */ +enum { MAX_PRIORITY = 63 }; + +/** + * All the entries with the same priority are queued in a circular list in a + * bucket for that priority. The table is essentially an array of buckets. + **/ +typedef struct bucket { + /** The head of a queue of table entries, all having the same priority */ + RingNode queue; + /** The priority of all the entries in this bucket */ + unsigned int priority; +} Bucket; + +/** + * A priority table is an array of buckets, indexed by priority. New entries + * are added to the end of the queue in the appropriate bucket. The dequeue + * operation finds the highest-priority non-empty bucket by searching a bit + * vector represented as a single 8-byte word, which is very fast with + * compiler and CPU support. 
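+ *
+ * As an editorial illustration: if only buckets 3 and 17 are non-empty, the
+ * search vector is (1ULL << 3) | (1ULL << 17), and a dequeue locates the
+ * highest-priority bucket by finding the highest-order set bit (17) rather
+ * than scanning the bucket array.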
+ **/ +struct priorityTable { + /** The maximum priority of entries that may be stored in this table */ + unsigned int maxPriority; + /** A bit vector flagging all buckets that are currently non-empty */ + uint64_t searchVector; + /** The array of all buckets, indexed by priority */ + Bucket buckets[]; +}; + +/** + * Convert a queue head to to the bucket that contains it. + * + * @param head The bucket queue ring head pointer to convert + * + * @return the enclosing bucket + **/ +static inline Bucket *asBucket(RingNode *head) +{ + STATIC_ASSERT(offsetof(Bucket, queue) == 0); + return (Bucket *) head; +} + +/**********************************************************************/ +int makePriorityTable(unsigned int maxPriority, PriorityTable **tablePtr) +{ + if (maxPriority > MAX_PRIORITY) { + return UDS_INVALID_ARGUMENT; + } + + PriorityTable *table; + int result = ALLOCATE_EXTENDED(PriorityTable, maxPriority + 1, Bucket, + __func__, &table); + if (result != VDO_SUCCESS) { + return result; + } + + for (unsigned int priority = 0; priority <= maxPriority; priority++) { + Bucket *bucket = &table->buckets[priority]; + bucket->priority = priority; + initializeRing(&bucket->queue); + } + + table->maxPriority = maxPriority; + table->searchVector = 0; + + *tablePtr = table; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freePriorityTable(PriorityTable **tablePtr) +{ + PriorityTable *table = *tablePtr; + if (table == NULL) { + return; + } + + // Unlink the buckets from any entries still in the table so the entries + // won't be left with dangling pointers to freed memory. + resetPriorityTable(table); + + FREE(table); + *tablePtr = NULL; +} + +/**********************************************************************/ +void resetPriorityTable(PriorityTable *table) +{ + table->searchVector = 0; + for (unsigned int priority = 0; priority <= table->maxPriority; priority++) { + unspliceRingNode(&table->buckets[priority].queue); + } +} + +/**********************************************************************/ +void priorityTableEnqueue(PriorityTable *table, + unsigned int priority, + RingNode *entry) +{ + ASSERT_LOG_ONLY((priority <= table->maxPriority), + "entry priority must be valid for the table"); + + // Append the entry to the queue in the specified bucket. + pushRingNode(&table->buckets[priority].queue, entry); + + // Flag the bucket in the search vector since it must be non-empty. + table->searchVector |= (1ULL << priority); +} + +/**********************************************************************/ +static inline void markBucketEmpty(PriorityTable *table, Bucket *bucket) +{ + table->searchVector &= ~(1ULL << bucket->priority); +} + +/**********************************************************************/ +RingNode *priorityTableDequeue(PriorityTable *table) +{ + // Find the highest priority non-empty bucket by finding the highest-order + // non-zero bit in the search vector. + int topPriority = logBaseTwo(table->searchVector); + + if (topPriority < 0) { + // All buckets are empty. + return NULL; + } + + // Dequeue the first entry in the bucket. + Bucket *bucket = &table->buckets[topPriority]; + RingNode *entry = unspliceRingNode(bucket->queue.next); + + // Clear the bit in the search vector if the bucket has been emptied. 
+ if (isRingEmpty(&bucket->queue)) { + markBucketEmpty(table, bucket); + } + + return entry; +} + +/**********************************************************************/ +void priorityTableRemove(PriorityTable *table, RingNode *entry) +{ + // We can't guard against calls where the entry is on a ring for a different + // table, but it's easy to deal with an entry not in any table or ring. + if (isRingEmpty(entry)) { + return; + } + + // Remove the entry from the bucket ring, remembering a pointer to another + // entry in the ring. + RingNode *nextNode = entry->next; + unspliceRingNode(entry); + + // If the rest of the ring is now empty, the next node must be the ring head + // in the bucket and we can use it to update the search vector. + if (isRingEmpty(nextNode)) { + markBucketEmpty(table, asBucket(nextNode)); + } +} + +/**********************************************************************/ +bool isPriorityTableEmpty(PriorityTable *table) +{ + return (table->searchVector == 0); +} diff --git a/source/vdo/base/priorityTable.h b/source/vdo/base/priorityTable.h new file mode 100644 index 0000000..d48a570 --- /dev/null +++ b/source/vdo/base/priorityTable.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/priorityTable.h#2 $ + */ + +#ifndef PRIORITY_TABLE_H +#define PRIORITY_TABLE_H + +#include "ringNode.h" + +/** + * A PriorityTable is a simple implementation of a priority queue for entries + * with priorities that are small non-negative integer values. It implements + * the obvious priority queue operations of enqueuing an entry and dequeuing + * an entry with the maximum priority. It also supports removing an arbitrary + * entry. The priority of an entry already in the table can be changed by + * removing it and re-enqueuing it with a different priority. All operations + * have O(1) complexity. + * + * The links for the table entries must be embedded in the entries themselves. + * RingNode is used to link entries in the table and no wrapper type is + * declared, so an existing RingNode link in an object can also be used to + * queue it in a PriorityTable, assuming the field is not used for anything + * else while so queued. + * + * The table is implemented as an array of queues (circular lists) indexed by + * priority, along with a hint for which queues are non-empty. Steven Skiena + * calls a very similar structure a "bounded height priority queue", but given + * the resemblance to a hash table, "priority table" seems both shorter and + * more apt, if somewhat novel. + **/ + +typedef struct priorityTable PriorityTable; + +/** + * Allocate and initialize a new PriorityTable. 
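+ *
+ * A brief usage sketch (editorial addition; the "item" object and its
+ * embedded "node" field are hypothetical, not part of this patch):
+ *
+ *   PriorityTable *table;
+ *   int result = makePriorityTable(63, &table);
+ *   if (result == VDO_SUCCESS) {
+ *     initializeRing(&item->node);
+ *     priorityTableEnqueue(table, 10, &item->node);
+ *     RingNode *first = priorityTableDequeue(table);
+ *   }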
+ * + * @param [in] maxPriority The maximum priority value for table entries + * @param [out] tablePtr A pointer to hold the new table + * + * @return VDO_SUCCESS or an error code + **/ +int makePriorityTable(unsigned int maxPriority, PriorityTable **tablePtr) + __attribute__((warn_unused_result)); + +/** + * Free a PriorityTable and null out the reference to it. NOTE: The table does + * not own the entries stored in it and they are not freed by this call. + * + * @param [in,out] tablePtr The reference to the table to free + **/ +void freePriorityTable(PriorityTable **tablePtr); + +/** + * Add a new entry to the priority table, appending it to the queue for + * entries with the specified priority. + * + * @param table The table in which to store the entry + * @param priority The priority of the entry + * @param entry The RingNode embedded in the entry to store in the table + * (the caller must have initialized it) + **/ +void priorityTableEnqueue(PriorityTable *table, + unsigned int priority, + RingNode *entry); + +/** + * Reset a priority table, leaving it in the same empty state as when newly + * constructed. NOTE: The table does not own the entries stored in it and they + * are not freed (or even unlinked from each other) by this call. + * + * @param table The table to reset + **/ +void resetPriorityTable(PriorityTable *table); + +/** + * Find the highest-priority entry in the table, remove it from the table, and + * return it. If there are multiple entries with the same priority, the one + * that has been in the table with that priority the longest will be returned. + * + * @param table The priority table from which to remove an entry + * + * @return the dequeued entry, or NULL if the table is currently empty + **/ +RingNode *priorityTableDequeue(PriorityTable *table) + __attribute__((warn_unused_result)); + +/** + * Remove a specified entry from its priority table. + * + * @param table The table from which to remove the entry + * @param entry The entry to remove from the table + **/ +void priorityTableRemove(PriorityTable *table, RingNode *entry); + +/** + * Return whether the priority table is empty. + * + * @param table The table to check + * + * @return true if the table is empty + **/ +bool isPriorityTableEmpty(PriorityTable *table) + __attribute__((warn_unused_result)); + +#endif /* PRIORITY_TABLE_H */ diff --git a/source/vdo/base/readOnlyNotifier.c b/source/vdo/base/readOnlyNotifier.c new file mode 100644 index 0000000..ba837ac --- /dev/null +++ b/source/vdo/base/readOnlyNotifier.c @@ -0,0 +1,393 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/readOnlyNotifier.c#5 $ + */ + +#include "readOnlyNotifier.h" + +#include "atomic.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +#include "completion.h" +#include "physicalLayer.h" +#include "threadConfig.h" + +/** + * A ReadOnlyNotifier has a single completion which is used to perform + * read-only notifications, however, enterReadOnlyMode() may be called from any + * base thread. A pair of atomic fields are used to control the read-only mode + * entry process. The first field holds the read-only error. The second is the + * state field, which may hold any of the four special values enumerated here. + * + * When enterReadOnlyMode() is called from some base thread, a compare-and-swap + * is done on the readOnlyError, setting it to the supplied error if the value + * was VDO_SUCCESS. If this fails, some other thread has already intiated + * read-only entry or scheduled a pending entry, so the call exits. Otherwise, + * a compare-and-swap is done on the state, setting it to NOTIFYING if the + * value was MAY_NOTIFY. If this succeeds, the caller initiates the + * notification. If this failed due to notifications being disallowed, the + * notifier will be in the MAY_NOT_NOTIFY state but readOnlyError will not be + * VDO_SUCCESS. This configuration will indicate to allowReadOnlyModeEntry() + * that there is a pending notification to perform. + **/ +enum { + /** Notifications are allowed but not in progress */ + MAY_NOTIFY = 0, + /** A notification is in progress */ + NOTIFYING, + /** Notifications are not allowed */ + MAY_NOT_NOTIFY, + /** A notification has completed */ + NOTIFIED, +}; + +/** + * An object to be notified when the VDO enters read-only mode + **/ +typedef struct readOnlyListener ReadOnlyListener; + +struct readOnlyListener { + /** The listener */ + void *listener; + /** The method to call to notifiy the listener */ + ReadOnlyNotification *notify; + /** A pointer to the next listener */ + ReadOnlyListener *next; +}; + +/** + * Data associated with each base code thread. + **/ +typedef struct threadData { + /** + * Each thread maintains its own notion of whether the VDO is read-only so + * that the read-only state can be checked from any base thread without + * worrying about synchronization or thread safety. This does mean that + * knowledge of the VDO going read-only does not occur simultaneously across + * the VDO's threads, but that does not seem to cause any problems. + */ + bool isReadOnly; + /** + * A list of objects waiting to be notified on this thread that the VDO has + * entered read-only mode. + **/ + ReadOnlyListener *listeners; +} ThreadData; + +struct readOnlyNotifier { + /** The completion for entering read-only mode */ + VDOCompletion completion; + /** A completion waiting for notifications to be drained or enabled */ + VDOCompletion *waiter; + /** The code of the error which put the VDO into read-only mode */ + Atomic32 readOnlyError; + /** The current state of the notifier (values described above) */ + Atomic32 state; + /** The thread config of the VDO */ + const ThreadConfig *threadConfig; + /** The array of per-thread data */ + ThreadData threadData[]; +}; + +/** + * Convert a generic VDOCompletion to a ReadOnlyNotifier. 
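+ *
+ * (Editor's note: this downcast is only safe because the completion is the
+ * first member of struct readOnlyNotifier, which the STATIC_ASSERT below
+ * verifies at compile time.)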
+ * + * @param completion The completion to convert + * + * @return The completion as a ReadOnlyNotifier + **/ +static inline ReadOnlyNotifier *asNotifier(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(ReadOnlyNotifier, completion) == 0); + assertCompletionType(completion->type, READ_ONLY_MODE_COMPLETION); + return (ReadOnlyNotifier *) completion; +} + +/**********************************************************************/ +int makeReadOnlyNotifier(bool isReadOnly, + const ThreadConfig *threadConfig, + PhysicalLayer *layer, + ReadOnlyNotifier **notifierPtr) +{ + ReadOnlyNotifier *notifier; + int result = ALLOCATE_EXTENDED(ReadOnlyNotifier, + threadConfig->baseThreadCount, ThreadData, + __func__, ¬ifier); + if (result != VDO_SUCCESS) { + return result; + } + + notifier->threadConfig = threadConfig; + if (isReadOnly) { + atomicStore32(¬ifier->readOnlyError, (uint32_t) VDO_READ_ONLY); + atomicStore32(¬ifier->state, NOTIFIED); + } else { + atomicStore32(¬ifier->state, MAY_NOTIFY); + } + result = initializeEnqueueableCompletion(¬ifier->completion, + READ_ONLY_MODE_COMPLETION, layer); + if (result != VDO_SUCCESS) { + freeReadOnlyNotifier(¬ifier); + return result; + } + + for (ThreadCount id = 0; id < threadConfig->baseThreadCount; id++) { + notifier->threadData[id].isReadOnly = isReadOnly; + } + + *notifierPtr = notifier; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeReadOnlyNotifier(ReadOnlyNotifier **notifierPtr) +{ + ReadOnlyNotifier *notifier = *notifierPtr; + if (notifier == NULL) { + return; + } + + for (ThreadCount id = 0; id < notifier->threadConfig->baseThreadCount; + id++) { + ThreadData *threadData = ¬ifier->threadData[id]; + ReadOnlyListener *listener = threadData->listeners; + while (listener != NULL) { + ReadOnlyListener *toFree = listener; + listener = listener->next; + FREE(toFree); + } + } + + destroyEnqueueable(¬ifier->completion); + FREE(notifier); + *notifierPtr = NULL; +} + +/** + * Check that a function was called on the admin thread. + * + * @param notifier The notifier + * @param caller The name of the function (for logging) + **/ +static void assertOnAdminThread(ReadOnlyNotifier *notifier, const char *caller) +{ + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((getAdminThread(notifier->threadConfig) == threadID), + "%s called on admin thread", caller); +} + + +/**********************************************************************/ +void waitUntilNotEnteringReadOnlyMode(ReadOnlyNotifier *notifier, + VDOCompletion *parent) +{ + if (notifier == NULL) { + finishCompletion(parent, VDO_SUCCESS); + return; + } + + assertOnAdminThread(notifier, __func__); + if (notifier->waiter != NULL) { + finishCompletion(parent, VDO_COMPONENT_BUSY); + return; + } + + uint32_t state = atomicLoad32(¬ifier->state); + if ((state == MAY_NOT_NOTIFY) || (state == NOTIFIED)) { + // Notifications are already done or disallowed. + completeCompletion(parent); + return; + } + + if (compareAndSwap32(¬ifier->state, MAY_NOTIFY, MAY_NOT_NOTIFY)) { + // A notification was not in progress, and now they are disallowed. + completeCompletion(parent); + return; + } + + /* + * A notification is in progress, so wait for it to finish. There is no race + * here since the notification can't finish while the admin thread is in this + * method. + */ + notifier->waiter = parent; +} + +/** + * Complete the process of entering read only mode. 
+ * + * @param completion The read-only mode completion + **/ +static void finishEnteringReadOnlyMode(VDOCompletion *completion) +{ + ReadOnlyNotifier *notifier = asNotifier(completion); + assertOnAdminThread(notifier, __func__); + atomicStore32(¬ifier->state, NOTIFIED); + + VDOCompletion *waiter = notifier->waiter; + if (waiter != NULL) { + notifier->waiter = NULL; + finishCompletion(waiter, completion->result); + } +} + +/** + * Inform each thread that the VDO is in read-only mode. + * + * @param completion The read-only mode completion + **/ +static void makeThreadReadOnly(VDOCompletion *completion) +{ + ThreadID threadID = completion->callbackThreadID; + ReadOnlyNotifier *notifier = asNotifier(completion); + ReadOnlyListener *listener = completion->parent; + if (listener == NULL) { + // This is the first call on this thread + ThreadData *threadData = ¬ifier->threadData[threadID]; + threadData->isReadOnly = true; + listener = threadData->listeners; + if (threadID == 0) { + // Note: This message must be recognizable by Permabit::UserMachine. + logErrorWithStringError((int) atomicLoad32(¬ifier->readOnlyError), + "Unrecoverable error, entering read-only mode"); + } + } else { + // We've just finished notifying a listener + listener = listener->next; + } + + if (listener != NULL) { + // We have a listener to notify + prepareCompletion(completion, makeThreadReadOnly, makeThreadReadOnly, + threadID, listener); + listener->notify(listener->listener, completion); + return; + } + + // We're done with this thread + if (++threadID >= notifier->threadConfig->baseThreadCount) { + // There are no more threads + prepareCompletion(completion, finishEnteringReadOnlyMode, + finishEnteringReadOnlyMode, + getAdminThread(notifier->threadConfig), NULL); + } else { + prepareCompletion(completion, makeThreadReadOnly, makeThreadReadOnly, + threadID, NULL); + } + + invokeCallback(completion); +} + +/**********************************************************************/ +void allowReadOnlyModeEntry(ReadOnlyNotifier *notifier, VDOCompletion *parent) +{ + assertOnAdminThread(notifier, __func__); + if (notifier->waiter != NULL) { + finishCompletion(parent, VDO_COMPONENT_BUSY); + return; + } + + if (!compareAndSwap32(¬ifier->state, MAY_NOT_NOTIFY, MAY_NOTIFY)) { + // Notifications were already allowed or complete + completeCompletion(parent); + return; + } + + if ((int) atomicLoad32(¬ifier->readOnlyError) == VDO_SUCCESS) { + // We're done + completeCompletion(parent); + return; + } + + // There may have been a pending notification + if (!compareAndSwap32(¬ifier->state, MAY_NOTIFY, NOTIFYING)) { + /* + * There wasn't, the error check raced with a thread calling + * enterReadOnlyMode() after we set the state to MAY_NOTIFY. It has already + * started the notification. + */ + completeCompletion(parent); + return; + } + + // Do the pending notification. + notifier->waiter = parent; + makeThreadReadOnly(¬ifier->completion); +} + +/**********************************************************************/ +void enterReadOnlyMode(ReadOnlyNotifier *notifier, int errorCode) +{ + ThreadData *threadData = ¬ifier->threadData[getCallbackThreadID()]; + if (threadData->isReadOnly) { + // This thread has already gone read-only. + return; + } + + // Record for this thread that the VDO is read-only. 
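+  // (Editorial note: the compareAndSwap32 on readOnlyError below ensures
+  // that only the first reported error is recorded and that only one thread
+  // launches the notification; later callers simply return.)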
+ threadData->isReadOnly = true; + + if (!compareAndSwap32(¬ifier->readOnlyError, (uint32_t) VDO_SUCCESS, + (uint32_t) errorCode)) { + // The notifier is already aware of a read-only error + return; + } + + if (compareAndSwap32(¬ifier->state, MAY_NOTIFY, NOTIFYING)) { + // Initiate a notification starting on the lowest numbered thread. + launchCallback(¬ifier->completion, makeThreadReadOnly, 0); + } +} + +/**********************************************************************/ +bool isReadOnly(ReadOnlyNotifier *notifier) +{ + return notifier->threadData[getCallbackThreadID()].isReadOnly; +} + +/**********************************************************************/ +bool isOrWillBeReadOnly(ReadOnlyNotifier *notifier) +{ + return (((int) relaxedLoad32(¬ifier->readOnlyError)) != VDO_SUCCESS); +} + +/**********************************************************************/ +int registerReadOnlyListener(ReadOnlyNotifier *notifier, + void *listener, + ReadOnlyNotification *notification, + ThreadID threadID) +{ + ReadOnlyListener *readOnlyListener; + int result = ALLOCATE(1, ReadOnlyListener, __func__, &readOnlyListener); + if (result != VDO_SUCCESS) { + return result; + } + + ThreadData *threadData = ¬ifier->threadData[threadID]; + *readOnlyListener = (ReadOnlyListener) { + .listener = listener, + .notify = notification, + .next = threadData->listeners, + }; + + threadData->listeners = readOnlyListener; + return VDO_SUCCESS; +} diff --git a/source/vdo/base/readOnlyNotifier.h b/source/vdo/base/readOnlyNotifier.h new file mode 100644 index 0000000..b5eb322 --- /dev/null +++ b/source/vdo/base/readOnlyNotifier.h @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/readOnlyNotifier.h#3 $ + */ + +/* + * A ReadOnlyNotifier is responsible for propogating the fact that the VDO + * has encountered an unrecoverable error to all base threads. It also persists + * the read-only state to the super block. + * + * The notifier also provides the ability to wait for any notifications to be + * complete in order to not cause super block write races when shutting down + * the VDO. + */ + +#ifndef READ_ONLY_NOTIFIER_H +#define READ_ONLY_NOTIFIER_H + +#include "completion.h" + +/** + * A function to notify a listener that the VDO has gone read-only. + * + * @param listener The object to notify + * @param parent The completion to notify in order to acknowledge the + * notification + **/ +typedef void ReadOnlyNotification(void *listener, VDOCompletion *parent); + +/** + * Create a read-only notifer. 
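+ *
+ * Illustrative usage sketch (editorial addition, not in the original source;
+ * myComponent and notifyMyComponent() are hypothetical, and threadConfig and
+ * layer are assumed to already exist):
+ *
+ *   ReadOnlyNotifier *notifier;
+ *   if (makeReadOnlyNotifier(false, threadConfig, layer, &notifier)
+ *       == VDO_SUCCESS) {
+ *     // Be notified on base thread 0 when the VDO goes read-only.
+ *     registerReadOnlyListener(notifier, myComponent, notifyMyComponent, 0);
+ *   }
+ *
+ *   // Later, from any base thread, on detecting an unrecoverable error:
+ *   enterReadOnlyMode(notifier, VDO_READ_ONLY);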
+ * + * @param [in] isReadOnly Whether the VDO is already read-only + * @param [in] threadConfig The thread configuration of the VDO + * @param [in] layer The physical layer of the VDO + * @param [out] notifierPtr A pointer to receive the new notifier + * + * @return VDO_SUCCESS or an error + **/ +int makeReadOnlyNotifier(bool isReadOnly, + const ThreadConfig *threadConfig, + PhysicalLayer *layer, + ReadOnlyNotifier **notifierPtr) + __attribute__((warn_unused_result)); + +/** + * Free a ReadOnlyNotifier and null out the reference to it. + * + * @param notifierPtr The reference to the notifier to free + **/ +void freeReadOnlyNotifier(ReadOnlyNotifier **notifierPtr); + +/** + * Wait until no read-only notifications are in progress and prevent any + * subsequent notifications. Notifications may be re-enabled by calling + * allowReadOnlyModeEntry(). + * + * @param notifier The read-only notifier on which to wait + * @param parent The completion to notify when no threads are entering + * read-only mode + **/ +void waitUntilNotEnteringReadOnlyMode(ReadOnlyNotifier *notifier, + VDOCompletion *parent); + +/** + * Allow the notifier to put the VDO into read-only mode, reversing the effects + * of waitUntilNotEnteringReadOnlyMode(). If some thread tried to put the VDO + * into read-only mode while notifications were disallowed, it will be done + * when this method is called. If that happens, the parent will not be notified + * until the VDO has actually entered read-only mode and attempted to save the + * super block. + * + *

This method may only be called from the admin thread. + * + * @param notifier The notifier + * @param parent The object to notify once the operation is complete + **/ +void allowReadOnlyModeEntry(ReadOnlyNotifier *notifier, + VDOCompletion *parent); + +/** + * Put a VDO into read-only mode and save the read-only state in the super + * block. This method is a no-op if the VDO is already read-only. + * + * @param notifier The read-only notifier of the VDO + * @param errorCode The error which caused the VDO to enter read-only + * mode + **/ +void enterReadOnlyMode(ReadOnlyNotifier *notifier, int errorCode); + +/** + * Check whether the VDO is read-only. This method may be called from any + * thread, as opposed to examining the VDO's state field which is only safe + * to check from the admin thread. + * + * @param notifier The read-only notifier of the VDO + * + * @return true if the VDO is read-only + **/ +bool isReadOnly(ReadOnlyNotifier *notifier) + __attribute__((warn_unused_result)); + +/** + * Check whether the VDO is or will be read-only (i.e. some thread has started + * the process of entering read-only mode, but not all threads have been + * notified yet). This method should only be called in cases where the expense + * of reading atomic state is not a problem. It was introduced in order to allow + * suppresion of spurious error messages resulting from VIO cleanup racing with + * read-only notification. + * + * @param notifier The read-only notifier of the VDO + * + * @return true if the VDO has started (and possibly finished) + * the process of entering read-only mode + **/ +bool isOrWillBeReadOnly(ReadOnlyNotifier *notifier) + __attribute__((warn_unused_result)); + +/** + * Register a listener to be notified when the VDO goes read-only. + * + * @param notifier The notifier to register with + * @param listener The object to notify + * @param notification The function to call to send the notification + * @param threadID The id of the thread on which to send the notification + * + * @return VDO_SUCCESS or an error + **/ +int registerReadOnlyListener(ReadOnlyNotifier *notifier, + void *listener, + ReadOnlyNotification *notification, + ThreadID threadID); + +#endif /* READ_ONLY_NOTIFIER_H */ diff --git a/source/vdo/base/readOnlyRebuild.c b/source/vdo/base/readOnlyRebuild.c new file mode 100644 index 0000000..7e9df0c --- /dev/null +++ b/source/vdo/base/readOnlyRebuild.c @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/readOnlyRebuild.c#9 $ + */ + +#include "readOnlyRebuild.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "blockMapInternals.h" +#include "blockMapRecovery.h" +#include "completion.h" +#include "numUtils.h" +#include "packedRecoveryJournalBlock.h" +#include "recoveryJournalInternals.h" +#include "recoveryUtils.h" +#include "referenceCountRebuild.h" +#include "slabDepot.h" +#include "vdoInternal.h" +#include "vdoPageCache.h" + +typedef struct { + /** The completion header */ + VDOCompletion completion; + /** A sub task completion */ + VDOCompletion subTaskCompletion; + /** The VDO in question */ + VDO *vdo; + /** A buffer to hold the data read off disk */ + char *journalData; + /** The entry data for the block map rebuild */ + NumberedBlockMapping *entries; + /** The number of entries in the entry array */ + size_t entryCount; + /** The sequence number of the first valid block of the journal (if known) */ + SequenceNumber head; + /** The sequence number of the last valid block of the journal (if known) */ + SequenceNumber tail; + /** The number of logical blocks in use */ + BlockCount logicalBlocksUsed; + /** The number of allocated block map pages */ + BlockCount blockMapDataBlocks; +} ReadOnlyRebuildCompletion; + +/** + * Convert a generic completion to a ReadOnlyRebuildCompletion. + * + * @param completion The completion to convert + * + * @return the journal rebuild completion + **/ +__attribute__((warn_unused_result)) +static inline ReadOnlyRebuildCompletion * +asReadOnlyRebuildCompletion(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(ReadOnlyRebuildCompletion, completion) == 0); + assertCompletionType(completion->type, READ_ONLY_REBUILD_COMPLETION); + return (ReadOnlyRebuildCompletion *) completion; +} + +/** + * Free a rebuild completion and all underlying structures. + * + * @param rebuildPtr A pointer to the rebuild completion to free + */ +static void freeRebuildCompletion(ReadOnlyRebuildCompletion **rebuildPtr) +{ + ReadOnlyRebuildCompletion *rebuild = *rebuildPtr; + if (rebuild == NULL) { + return; + } + + destroyEnqueueable(&rebuild->subTaskCompletion); + FREE(rebuild->journalData); + FREE(rebuild->entries); + FREE(rebuild); + *rebuildPtr = NULL; +} + +/** + * Allocate and initialize a read only rebuild completion. + * + * @param [in] vdo The VDO in question + * @param [out] rebuildPtr A pointer to return the created rebuild completion + * + * @return VDO_SUCCESS or an error code + **/ +static int makeRebuildCompletion(VDO *vdo, + ReadOnlyRebuildCompletion **rebuildPtr) +{ + ReadOnlyRebuildCompletion *rebuild; + int result = ALLOCATE(1, ReadOnlyRebuildCompletion, __func__, &rebuild); + if (result != VDO_SUCCESS) { + return result; + } + + initializeCompletion(&rebuild->completion, READ_ONLY_REBUILD_COMPLETION, + vdo->layer); + + result = initializeEnqueueableCompletion(&rebuild->subTaskCompletion, + SUB_TASK_COMPLETION, vdo->layer); + if (result != VDO_SUCCESS) { + freeRebuildCompletion(&rebuild); + return result; + } + + rebuild->vdo = vdo; + *rebuildPtr = rebuild; + return VDO_SUCCESS; +} + +/** + * Clean up the rebuild process, whether or not it succeeded, by freeing the + * rebuild completion and notifying the parent of the outcome. 
+ * + * @param completion The rebuild completion + **/ +static void completeRebuild(VDOCompletion *completion) +{ + VDOCompletion *parent = completion->parent; + int result = completion->result; + ReadOnlyRebuildCompletion *rebuild = asReadOnlyRebuildCompletion(completion); + VDO *vdo = rebuild->vdo; + setVDOPageCacheRebuildMode(getBlockMap(vdo)->zones[0].pageCache, false); + freeRebuildCompletion(&rebuild); + finishCompletion(parent, result); +} + +/** + * Finish rebuilding, free the rebuild completion and notify the parent. + * + * @param completion The rebuild completion + **/ +static void finishRebuild(VDOCompletion *completion) +{ + ReadOnlyRebuildCompletion *rebuild = asReadOnlyRebuildCompletion(completion); + initializeRecoveryJournalPostRebuild(rebuild->vdo->recoveryJournal, + rebuild->vdo->completeRecoveries, + rebuild->tail, + rebuild->logicalBlocksUsed, + rebuild->blockMapDataBlocks); + logInfo("Read-only rebuild complete"); + completeRebuild(completion); +} + +/** + * Handle a rebuild error. + * + * @param completion The rebuild completion + **/ +static void abortRebuild(VDOCompletion *completion) +{ + logInfo("Read-only rebuild aborted"); + completeRebuild(completion); +} + +/** + * Abort a rebuild if there is an error. + * + * @param result The result to check + * @param rebuild The journal rebuild completion + * + * @return true if the result was an error + **/ +__attribute__((warn_unused_result)) +static bool abortRebuildOnError(int result, + ReadOnlyRebuildCompletion *rebuild) +{ + if (result == VDO_SUCCESS) { + return false; + } + + finishCompletion(&rebuild->completion, result); + return true; +} + +/** + * Clean up after finishing the reference count rebuild. This callback is + * registered in launchReferenceCountRebuild(). + * + * @param completion The sub-task completion + **/ +static void finishReferenceCountRebuild(VDOCompletion *completion) +{ + ReadOnlyRebuildCompletion *rebuild = completion->parent; + VDO *vdo = rebuild->vdo; + assertOnAdminThread(vdo, __func__); + if (vdo->loadState != VDO_REBUILD_FOR_UPGRADE) { + // A "rebuild" for upgrade should not increment this count. + vdo->completeRecoveries++; + } + + logInfo("Saving rebuilt state"); + prepareToFinishParent(completion, &rebuild->completion); + drainSlabDepot(vdo->depot, ADMIN_STATE_REBUILDING, completion); +} + +/** + * Rebuild the reference counts from the block map now that all journal entries + * have been applied to the block map. This callback is registered in + * applyJournalEntries(). + * + * @param completion The sub-task completion + **/ +static void launchReferenceCountRebuild(VDOCompletion *completion) +{ + ReadOnlyRebuildCompletion *rebuild = completion->parent; + VDO *vdo = rebuild->vdo; + + // We must allocate RefCounts before we can rebuild them. + int result = allocateSlabRefCounts(vdo->depot); + if (abortRebuildOnError(result, rebuild)) { + return; + } + + prepareCompletion(completion, finishReferenceCountRebuild, + finishParentCallback, getAdminThread(getThreadConfig(vdo)), + completion->parent); + rebuildReferenceCounts(vdo, completion, &rebuild->logicalBlocksUsed, + &rebuild->blockMapDataBlocks); +} + +/** + * Append an array of recovery journal entries from a journal block sector to + * the array of numbered mappings in the rebuild completion, numbering each + * entry in the order they are appended. 
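+ *
+ * (Editor's note: only increment entries are kept here; they carry the new
+ * block map mappings, and the reference counts are later rebuilt from the
+ * block map itself, so decrement entries appear not to be needed.)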
+ * + * @param rebuild The journal rebuild completion + * @param sector The recovery journal sector with entries + * @param entryCount The number of entries to append + **/ +static void appendSectorEntries(ReadOnlyRebuildCompletion *rebuild, + PackedJournalSector *sector, + JournalEntryCount entryCount) +{ + for (JournalEntryCount i = 0; i < entryCount; i++) { + RecoveryJournalEntry entry + = unpackRecoveryJournalEntry(§or->entries[i]); + int result = validateRecoveryJournalEntry(rebuild->vdo, &entry); + if (result != VDO_SUCCESS) { + // When recovering from read-only mode, ignore damaged entries. + continue; + } + + if (isIncrementOperation(entry.operation)) { + rebuild->entries[rebuild->entryCount] = (NumberedBlockMapping) { + .blockMapSlot = entry.slot, + .blockMapEntry = packPBN(entry.mapping.pbn, entry.mapping.state), + .number = rebuild->entryCount, + }; + rebuild->entryCount++; + } + } +} + +/** + * Create an array of all valid journal entries, in order, and store + * it in the rebuild completion. + * + * @param rebuild The journal rebuild completion + * + * @return VDO_SUCCESS or an error code + **/ +static int extractJournalEntries(ReadOnlyRebuildCompletion *rebuild) +{ + VDO *vdo = rebuild->vdo; + RecoveryJournal *journal = vdo->recoveryJournal; + SequenceNumber first = rebuild->head; + SequenceNumber last = rebuild->tail; + BlockCount maxCount = ((last - first + 1) * journal->entriesPerBlock); + + // Allocate a NumberedBlockMapping array large enough to transcribe every + // PackedRecoveryJournalEntry from every valid journal block. + int result = ALLOCATE(maxCount, NumberedBlockMapping, __func__, + &rebuild->entries); + if (result != VDO_SUCCESS) { + return result; + } + + for (SequenceNumber i = first; i <= last; i++) { + PackedJournalHeader *packedHeader + = getJournalBlockHeader(journal, rebuild->journalData, i); + RecoveryBlockHeader header; + unpackRecoveryBlockHeader(packedHeader, &header); + + if (!isExactRecoveryJournalBlock(journal, &header, i)) { + // This block is invalid, so skip it. + continue; + } + + // Don't extract more than the expected maximum entries per block. + JournalEntryCount blockEntries = minBlock(journal->entriesPerBlock, + header.entryCount); + for (uint8_t j = 1; j < SECTORS_PER_BLOCK; j++) { + // Stop when all entries counted in the header are applied or skipped. + if (blockEntries == 0) { + break; + } + + PackedJournalSector *sector = getJournalBlockSector(packedHeader, j); + if (!isValidRecoveryJournalSector(&header, sector)) { + blockEntries -= minBlock(blockEntries, + RECOVERY_JOURNAL_ENTRIES_PER_SECTOR); + continue; + } + + // Don't extract more than the expected maximum entries per sector. + JournalEntryCount sectorEntries + = minBlock(sector->entryCount, RECOVERY_JOURNAL_ENTRIES_PER_SECTOR); + // Only extract as many as the block header calls for. + sectorEntries = minBlock(sectorEntries, blockEntries); + appendSectorEntries(rebuild, sector, sectorEntries); + // Even if the sector wasn't full, count it as full when counting up + // to the entry count the block header claims. + blockEntries -= minBlock(blockEntries, + RECOVERY_JOURNAL_ENTRIES_PER_SECTOR); + } + } + + return VDO_SUCCESS; +} + +/** + * Determine the limits of the valid recovery journal and apply all + * valid entries to the block map. This callback is registered in + * rebuildJournalAsync(). 
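+ *
+ * (Editor's note: as wired up in this file, the rebuild proceeds from
+ * launchRebuild() through loadSlabDepot(), loadJournal(), this callback,
+ * launchReferenceCountRebuild(), finishReferenceCountRebuild(), and finally
+ * finishRebuild(), or abortRebuild() on error.)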
+ * + * @param completion The sub-task completion + **/ +static void applyJournalEntries(VDOCompletion *completion) +{ + ReadOnlyRebuildCompletion *rebuild + = asReadOnlyRebuildCompletion(completion->parent); + VDO *vdo = rebuild->vdo; + + logInfo("Finished reading recovery journal"); + assertOnLogicalZoneThread(vdo, 0, __func__); + + bool foundEntries = findHeadAndTail(vdo->recoveryJournal, + rebuild->journalData, &rebuild->tail, + &rebuild->head, NULL); + if (foundEntries) { + int result = extractJournalEntries(rebuild); + if (abortRebuildOnError(result, rebuild)) { + return; + } + } + + // Suppress block map errors. + setVDOPageCacheRebuildMode(getBlockMap(vdo)->zones[0].pageCache, true); + + // Play the recovery journal into the block map. + prepareCompletion(completion, launchReferenceCountRebuild, + finishParentCallback, completion->callbackThreadID, + completion->parent); + recoverBlockMap(vdo, rebuild->entryCount, rebuild->entries, completion); +} + +/** + * Begin loading the journal. + * + * @param completion The sub task completion + **/ +static void loadJournal(VDOCompletion *completion) +{ + ReadOnlyRebuildCompletion *rebuild + = asReadOnlyRebuildCompletion(completion->parent); + VDO *vdo = rebuild->vdo; + assertOnLogicalZoneThread(vdo, 0, __func__); + + prepareCompletion(completion, applyJournalEntries, finishParentCallback, + completion->callbackThreadID, completion->parent); + loadJournalAsync(vdo->recoveryJournal, completion, &rebuild->journalData); +} + +/**********************************************************************/ +void launchRebuild(VDO *vdo, VDOCompletion *parent) +{ + // Note: These messages must be recognizable by Permabit::VDODeviceBase. + if (vdo->loadState == VDO_REBUILD_FOR_UPGRADE) { + logWarning("Rebuilding reference counts for upgrade"); + } else { + logWarning("Rebuilding reference counts to clear read-only mode"); + vdo->readOnlyRecoveries++; + } + + ReadOnlyRebuildCompletion *rebuild; + int result = makeRebuildCompletion(vdo, &rebuild); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + VDOCompletion *completion = &rebuild->completion; + prepareCompletion(completion, finishRebuild, abortRebuild, + parent->callbackThreadID, parent); + + VDOCompletion *subTaskCompletion = &rebuild->subTaskCompletion; + prepareCompletion(subTaskCompletion, loadJournal, finishParentCallback, + getLogicalZoneThread(getThreadConfig(vdo), 0), + completion); + loadSlabDepot(vdo->depot, ADMIN_STATE_LOADING_FOR_REBUILD, + subTaskCompletion, NULL); +} diff --git a/source/vdo/base/readOnlyRebuild.h b/source/vdo/base/readOnlyRebuild.h new file mode 100644 index 0000000..9f40ce6 --- /dev/null +++ b/source/vdo/base/readOnlyRebuild.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/readOnlyRebuild.h#1 $ + */ + +#ifndef READ_ONLY_REBUILD_H +#define READ_ONLY_REBUILD_H + +#include "completion.h" +#include "vdo.h" + +/** + * Construct a ReadOnlyRebuildCompletion and launch it. Apply all valid journal + * block entries to all VDO structures. Must be launched from logical zone 0. + * + * @param vdo The VDO to rebuild + * @param parent The completion to notify when the rebuild is complete + **/ +void launchRebuild(VDO *vdo, VDOCompletion *parent); + +#endif // READ_ONLY_REBUILD_H diff --git a/source/vdo/base/recoveryJournal.c b/source/vdo/base/recoveryJournal.c new file mode 100644 index 0000000..c44053c --- /dev/null +++ b/source/vdo/base/recoveryJournal.c @@ -0,0 +1,1403 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournal.c#30 $ + */ + +#include "recoveryJournal.h" +#include "recoveryJournalInternals.h" + +#include "buffer.h" +#include "logger.h" +#include "memoryAlloc.h" + +#include "blockMap.h" +#include "constants.h" +#include "dataVIO.h" +#include "extent.h" +#include "header.h" +#include "numUtils.h" +#include "packedRecoveryJournalBlock.h" +#include "recoveryJournalBlock.h" +#include "slabDepot.h" +#include "slabJournal.h" +#include "waitQueue.h" + +typedef struct { + SequenceNumber journalStart; // Sequence number to start the journal + BlockCount logicalBlocksUsed; // Number of logical blocks used by VDO + BlockCount blockMapDataBlocks; // Number of block map pages allocated +} __attribute__((packed)) RecoveryJournalState7_0; + +static const Header RECOVERY_JOURNAL_HEADER_7_0 = { + .id = RECOVERY_JOURNAL, + .version = { + .majorVersion = 7, + .minorVersion = 0, + }, + .size = sizeof(RecoveryJournalState7_0), +}; + +static const uint64_t RECOVERY_COUNT_MASK = 0xff; + +enum { + /* + * The number of reserved blocks must be large enough to prevent a + * new recovery journal block write from overwriting a block which + * appears to still be a valid head block of the journal. Currently, + * that means reserving enough space for all 2048 VIOs, or 8 blocks. + */ + RECOVERY_JOURNAL_RESERVED_BLOCKS = 8, +}; + +/**********************************************************************/ +const char *getJournalOperationName(JournalOperation operation) +{ + switch (operation) { + case DATA_DECREMENT: + return "data decrement"; + + case DATA_INCREMENT: + return "data increment"; + + case BLOCK_MAP_DECREMENT: + return "block map decrement"; + + case BLOCK_MAP_INCREMENT: + return "block map increment"; + + default: + return "unknown journal operation"; + } +} + +/** + * Get a block from the end of the free list. 
+ * + * @param journal The journal + * + * @return The block or NULL if the list is empty + **/ +static RecoveryJournalBlock *popFreeList(RecoveryJournal *journal) +{ + return blockFromRingNode(popRingNode(&journal->freeTailBlocks)); +} + +/** + * Get a block from the end of the active list. + * + * @param journal The journal + * + * @return The block or NULL if the list is empty + **/ +static RecoveryJournalBlock *popActiveList(RecoveryJournal *journal) +{ + return blockFromRingNode(popRingNode(&journal->activeTailBlocks)); +} + +/** + * Assert that we are running on the journal thread. + * + * @param journal The journal + * @param functionName The function doing the check (for logging) + **/ +static void assertOnJournalThread(RecoveryJournal *journal, + const char *functionName) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() == journal->threadID), + "%s() called on journal thread", functionName); +} + +/** + * WaiterCallback implementation invoked whenever a DataVIO is to be released + * from the journal, either because its entry was committed to disk, + * or because there was an error. + **/ +static void continueWaiter(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + dataVIOAddTraceRecord(dataVIO, + THIS_LOCATION("$F($j-$js);" + "cb=continueJournalWaiter($j-$js)")); + int waitResult = *((int *) context); + continueDataVIO(dataVIO, waitResult); +} + +/** + * Check whether the journal has any waiters on any blocks. + * + * @param journal The journal in question + * + * @return true if any block has a waiter + **/ +static inline bool hasBlockWaiters(RecoveryJournal *journal) +{ + // Either the first active tail block (if it exists) has waiters, + // or no active tail block has waiters. + if (isRingEmpty(&journal->activeTailBlocks)) { + return false; + } + + RecoveryJournalBlock *block + = blockFromRingNode(journal->activeTailBlocks.next); + return (hasWaiters(&block->entryWaiters) + || hasWaiters(&block->commitWaiters)); +} + +/**********************************************************************/ +static void recycleJournalBlocks(RecoveryJournal *block); +static void recycleJournalBlock(RecoveryJournalBlock *block); +static void notifyCommitWaiters(RecoveryJournal *journal); + +/** + * Check whether the journal has drained. + * + * @param journal The journal which may have just drained + **/ +static void checkForDrainComplete(RecoveryJournal *journal) +{ + int result = VDO_SUCCESS; + if (isReadOnly(journal->readOnlyNotifier)) { + result = VDO_READ_ONLY; + /* + * Clean up any full active blocks which were not written due to being + * in read-only mode. + * + * XXX: This would probably be better as a short-circuit in writeBlock(). + */ + notifyCommitWaiters(journal); + recycleJournalBlocks(journal); + + // Release any DataVIOs waiting to be assigned entries. 
+ notifyAllWaiters(&journal->decrementWaiters, continueWaiter, &result); + notifyAllWaiters(&journal->incrementWaiters, continueWaiter, &result); + } + + if (!isDraining(&journal->state) + || journal->reaping || hasBlockWaiters(journal) + || hasWaiters(&journal->incrementWaiters) + || hasWaiters(&journal->decrementWaiters) + || !suspendLockCounter(journal->lockCounter)) { + return; + } + + if (isSaving(&journal->state)) { + if (journal->activeBlock != NULL) { + ASSERT_LOG_ONLY(((result == VDO_READ_ONLY) + || !isRecoveryBlockDirty(journal->activeBlock)), + "journal being saved has clean active block"); + recycleJournalBlock(journal->activeBlock); + } + + ASSERT_LOG_ONLY(isRingEmpty(&journal->activeTailBlocks), + "all blocks in a journal being saved must be inactive"); + } + + finishDrainingWithResult(&journal->state, result); +} + +/** + * Notifiy a recovery journal that the VDO has gone read-only. + * + *

Implements ReadOnlyNotification. + * + * @param listener The journal + * @param parent The completion to notify in order to acknowledge the + * notification + **/ +static void notifyRecoveryJournalOfReadOnlyMode(void *listener, + VDOCompletion *parent) +{ + checkForDrainComplete(listener); + completeCompletion(parent); +} + +/** + * Put the journal in read-only mode. All attempts to add entries after + * this function is called will fail. All VIOs waiting for commits will be + * awakened with an error. + * + * @param journal The journal which has failed + * @param errorCode The error result triggering this call + **/ +static void enterJournalReadOnlyMode(RecoveryJournal *journal, int errorCode) +{ + enterReadOnlyMode(journal->readOnlyNotifier, errorCode); + checkForDrainComplete(journal); +} + +/**********************************************************************/ +SequenceNumber getCurrentJournalSequenceNumber(RecoveryJournal *journal) +{ + return journal->tail; +} + +/** + * Get the head of the recovery journal, which is the lowest sequence number of + * the block map head and the slab journal head. + * + * @param journal The journal + * + * @return the head of the journal + **/ +static inline SequenceNumber getRecoveryJournalHead(RecoveryJournal *journal) +{ + return minSequenceNumber(journal->blockMapHead, journal->slabJournalHead); +} + +/** + * Compute the recovery count byte for a given recovery count. + * + * @param recoveryCount The recovery count + * + * @return The byte corresponding to the recovery count + **/ +__attribute__((warn_unused_result)) +static inline uint8_t computeRecoveryCountByte(uint64_t recoveryCount) +{ + return (uint8_t) (recoveryCount & RECOVERY_COUNT_MASK); +} + +/** + * Check whether the journal is over the threshold, and if so, force the oldest + * slab journal tail block to commit. + * + * @param journal The journal + **/ +static void checkSlabJournalCommitThreshold(RecoveryJournal *journal) +{ + BlockCount currentLength = journal->tail - journal->slabJournalHead; + if (currentLength > journal->slabJournalCommitThreshold) { + journal->events.slabJournalCommitsRequested++; + commitOldestSlabJournalTailBlocks(journal->depot, + journal->slabJournalHead); + } +} + +/**********************************************************************/ +static void reapRecoveryJournal(RecoveryJournal *journal); +static void assignEntries(RecoveryJournal *journal); + +/** + * Finish reaping the journal. + * + * @param journal The journal being reaped + **/ +static void finishReaping(RecoveryJournal *journal) +{ + SequenceNumber oldHead = getRecoveryJournalHead(journal); + journal->blockMapHead = journal->blockMapReapHead; + journal->slabJournalHead = journal->slabJournalReapHead; + BlockCount blocksReaped = getRecoveryJournalHead(journal) - oldHead; + journal->availableSpace += blocksReaped * journal->entriesPerBlock; + journal->reaping = false; + checkSlabJournalCommitThreshold(journal); + assignEntries(journal); + checkForDrainComplete(journal); +} + +/** + * Finish reaping the journal after flushing the lower layer. This is the + * callback registered in reapRecoveryJournal(). + * + * @param completion The journal's flush VIO + **/ +static void completeReaping(VDOCompletion *completion) +{ + RecoveryJournal *journal = completion->parent; + finishReaping(journal); + + // Try reaping again in case more locks were released while flush was out. + reapRecoveryJournal(journal); +} + +/** + * Handle an error when flushing the lower layer due to reaping. 
+ * + * @param completion The journal's flush VIO + **/ +static void handleFlushError(VDOCompletion *completion) +{ + RecoveryJournal *journal = completion->parent; + journal->reaping = false; + enterJournalReadOnlyMode(journal, completion->result); +} + +/** + * Set all journal fields appropriately to start journaling from the current + * active block. + * + * @param journal The journal to be reset based on its active block + **/ +static void initializeJournalState(RecoveryJournal *journal) +{ + journal->appendPoint.sequenceNumber = journal->tail; + journal->lastWriteAcknowledged = journal->tail; + journal->blockMapHead = journal->tail; + journal->slabJournalHead = journal->tail; + journal->blockMapReapHead = journal->tail; + journal->slabJournalReapHead = journal->tail; + journal->blockMapHeadBlockNumber + = getRecoveryJournalBlockNumber(journal, journal->blockMapHead); + journal->slabJournalHeadBlockNumber + = getRecoveryJournalBlockNumber(journal, journal->slabJournalHead); +} + +/**********************************************************************/ +BlockCount getRecoveryJournalLength(BlockCount journalSize) +{ + BlockCount reservedBlocks = journalSize / 4; + if (reservedBlocks > RECOVERY_JOURNAL_RESERVED_BLOCKS) { + reservedBlocks = RECOVERY_JOURNAL_RESERVED_BLOCKS; + } + return (journalSize - reservedBlocks); +} + +/** + * Attempt to reap the journal now that all the locks on some journal block + * have been released. This is the callback registered with the lock counter. + * + * @param completion The lock counter completion + **/ +static void reapRecoveryJournalCallback(VDOCompletion *completion) +{ + RecoveryJournal *journal = (RecoveryJournal *) completion->parent; + // The acknowledgement must be done before reaping so that there is no + // race between acknowledging the notification and unlocks wishing to notify. + acknowledgeUnlock(journal->lockCounter); + + if (isQuiescing(&journal->state)) { + // Don't start reaping when the journal is trying to quiesce. Do check if + // this notification is the last thing the drain is waiting on. + checkForDrainComplete(journal); + return; + } + + reapRecoveryJournal(journal); + checkSlabJournalCommitThreshold(journal); +} + +/********************************************************************** + * Set the journal's tail sequence number. + * + * @param journal The journal whose tail is to be set + * @param tail The new tail value + **/ +static void setJournalTail(RecoveryJournal *journal, SequenceNumber tail) +{ + // VDO does not support sequence numbers above 1 << 48 in the slab journal. 
+ if (tail >= (1ULL << 48)) { + enterJournalReadOnlyMode(journal, VDO_JOURNAL_OVERFLOW); + } + + journal->tail = tail; +} + +/**********************************************************************/ +int makeRecoveryJournal(Nonce nonce, + PhysicalLayer *layer, + Partition *partition, + uint64_t recoveryCount, + BlockCount journalSize, + BlockCount tailBufferSize, + ReadOnlyNotifier *readOnlyNotifier, + const ThreadConfig *threadConfig, + RecoveryJournal **journalPtr) +{ + RecoveryJournal *journal; + int result = ALLOCATE(1, RecoveryJournal, __func__, &journal); + if (result != VDO_SUCCESS) { + return result; + } + + initializeRing(&journal->freeTailBlocks); + initializeRing(&journal->activeTailBlocks); + initializeWaitQueue(&journal->pendingWrites); + + journal->threadID = getJournalZoneThread(threadConfig); + journal->partition = partition; + journal->nonce = nonce; + journal->recoveryCount = computeRecoveryCountByte(recoveryCount); + journal->size = journalSize; + journal->readOnlyNotifier = readOnlyNotifier; + journal->tail = 1; + journal->slabJournalCommitThreshold = (journalSize * 2) / 3; + initializeJournalState(journal); + + journal->entriesPerBlock = RECOVERY_JOURNAL_ENTRIES_PER_BLOCK; + BlockCount journalLength = getRecoveryJournalLength(journalSize); + journal->availableSpace = journal->entriesPerBlock * journalLength; + + // Only make the tail buffer and VIO in normal operation since the formatter + // doesn't need them. + if (layer->createMetadataVIO != NULL) { + for (BlockCount i = 0; i < tailBufferSize; i++) { + RecoveryJournalBlock *block; + result = makeRecoveryBlock(layer, journal, &block); + if (result != VDO_SUCCESS) { + freeRecoveryJournal(&journal); + return result; + } + + pushRingNode(&journal->freeTailBlocks, &block->ringNode); + } + + result = makeLockCounter(layer, journal, reapRecoveryJournalCallback, + journal->threadID, threadConfig->logicalZoneCount, + threadConfig->physicalZoneCount, journal->size, + &journal->lockCounter); + if (result != VDO_SUCCESS) { + freeRecoveryJournal(&journal); + return result; + } + + result = ALLOCATE(VDO_BLOCK_SIZE, char, "journal flush data", + &journal->unusedFlushVIOData); + if (result != VDO_SUCCESS) { + freeRecoveryJournal(&journal); + return result; + } + + result = createVIO(layer, VIO_TYPE_RECOVERY_JOURNAL, VIO_PRIORITY_HIGH, + journal, journal->unusedFlushVIOData, + &journal->flushVIO); + if (result != VDO_SUCCESS) { + freeRecoveryJournal(&journal); + return result; + } + + result = registerReadOnlyListener(readOnlyNotifier, journal, + notifyRecoveryJournalOfReadOnlyMode, + journal->threadID); + if (result != VDO_SUCCESS) { + freeRecoveryJournal(&journal); + return result; + } + + journal->flushVIO->completion.callbackThreadID = journal->threadID; + } + + *journalPtr = journal; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeRecoveryJournal(RecoveryJournal **journalPtr) +{ + RecoveryJournal *journal = *journalPtr; + if (journal == NULL) { + return; + } + + freeLockCounter(&journal->lockCounter); + freeVIO(&journal->flushVIO); + FREE(journal->unusedFlushVIOData); + + // XXX: eventually, the journal should be constructed in a quiescent state + // which requires opening before use. 
+ if (!isQuiescent(&journal->state)) { + ASSERT_LOG_ONLY(isRingEmpty(&journal->activeTailBlocks), + "journal being freed has no active tail blocks"); + } else if (!isSaved(&journal->state) + && !isRingEmpty(&journal->activeTailBlocks)) { + logWarning("journal being freed has uncommited entries"); + } + + RecoveryJournalBlock *block; + while ((block = popActiveList(journal)) != NULL) { + freeRecoveryBlock(&block); + } + + while ((block = popFreeList(journal)) != NULL) { + freeRecoveryBlock(&block); + } + + FREE(journal); + *journalPtr = NULL; +} + +/**********************************************************************/ +void setRecoveryJournalPartition(RecoveryJournal *journal, + Partition *partition) +{ + journal->partition = partition; +} + +/**********************************************************************/ +void initializeRecoveryJournalPostRecovery(RecoveryJournal *journal, + uint64_t recoveryCount, + SequenceNumber tail) +{ + setJournalTail(journal, tail + 1); + journal->recoveryCount = computeRecoveryCountByte(recoveryCount); + initializeJournalState(journal); +} + +/**********************************************************************/ +void initializeRecoveryJournalPostRebuild(RecoveryJournal *journal, + uint64_t recoveryCount, + SequenceNumber tail, + BlockCount logicalBlocksUsed, + BlockCount blockMapDataBlocks) +{ + initializeRecoveryJournalPostRecovery(journal, recoveryCount, tail); + journal->logicalBlocksUsed = logicalBlocksUsed; + journal->blockMapDataBlocks = blockMapDataBlocks; +} + +/**********************************************************************/ +BlockCount getJournalBlockMapDataBlocksUsed(RecoveryJournal *journal) +{ + return journal->blockMapDataBlocks; +} + +/**********************************************************************/ +void setJournalBlockMapDataBlocksUsed(RecoveryJournal *journal, + BlockCount pages) +{ + journal->blockMapDataBlocks = pages; +} + +/**********************************************************************/ +ThreadID getRecoveryJournalThreadID(RecoveryJournal *journal) +{ + return journal->threadID; +} + +/**********************************************************************/ +void openRecoveryJournal(RecoveryJournal *journal, + SlabDepot *depot, + BlockMap *blockMap) +{ + journal->depot = depot; + journal->blockMap = blockMap; + journal->state.state = ADMIN_STATE_NORMAL_OPERATION; +} + +/**********************************************************************/ +size_t getRecoveryJournalEncodedSize(void) +{ + return ENCODED_HEADER_SIZE + sizeof(RecoveryJournalState7_0); +} + +/**********************************************************************/ +int encodeRecoveryJournal(RecoveryJournal *journal, Buffer *buffer) +{ + SequenceNumber journalStart; + if (isSaved(&journal->state)) { + // If the journal is saved, we should start one past the active block + // (since the active block is not guaranteed to be empty). + journalStart = journal->tail; + } else { + // When we're merely suspended or have gone read-only, we must record the + // first block that might have entries that need to be applied. 
+ journalStart = getRecoveryJournalHead(journal); + } + + int result = encodeHeader(&RECOVERY_JOURNAL_HEADER_7_0, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + size_t initialLength = contentLength(buffer); + + result = putUInt64LEIntoBuffer(buffer, journalStart); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, journal->logicalBlocksUsed); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, journal->blockMapDataBlocks); + if (result != UDS_SUCCESS) { + return result; + } + + size_t encodedSize = contentLength(buffer) - initialLength; + return ASSERT(RECOVERY_JOURNAL_HEADER_7_0.size == encodedSize, + "encoded recovery journal component size" + " must match header size"); +} + +/** + * Decode recovery journal component state version 7.0 from a buffer. + * + * @param buffer A buffer positioned at the start of the encoding + * @param state The state structure to receive the decoded values + * + * @return UDS_SUCCESS or an error code + **/ +static int decodeRecoveryJournalState_7_0(Buffer *buffer, + RecoveryJournalState7_0 *state) +{ + size_t initialLength = contentLength(buffer); + + SequenceNumber journalStart; + int result = getUInt64LEFromBuffer(buffer, &journalStart); + if (result != UDS_SUCCESS) { + return result; + } + + BlockCount logicalBlocksUsed; + result = getUInt64LEFromBuffer(buffer, &logicalBlocksUsed); + if (result != UDS_SUCCESS) { + return result; + } + + BlockCount blockMapDataBlocks; + result = getUInt64LEFromBuffer(buffer, &blockMapDataBlocks); + if (result != UDS_SUCCESS) { + return result; + } + + *state = (RecoveryJournalState7_0) { + .journalStart = journalStart, + .logicalBlocksUsed = logicalBlocksUsed, + .blockMapDataBlocks = blockMapDataBlocks, + }; + + size_t decodedSize = initialLength - contentLength(buffer); + return ASSERT(RECOVERY_JOURNAL_HEADER_7_0.size == decodedSize, + "decoded slab depot component size must match header size"); +} + +/**********************************************************************/ +int decodeRecoveryJournal(RecoveryJournal *journal, Buffer *buffer) +{ + Header header; + int result = decodeHeader(buffer, &header); + if (result != VDO_SUCCESS) { + return result; + } + + result = validateHeader(&RECOVERY_JOURNAL_HEADER_7_0, &header, + true, __func__); + if (result != VDO_SUCCESS) { + return result; + } + + RecoveryJournalState7_0 state; + result = decodeRecoveryJournalState_7_0(buffer, &state); + if (result != VDO_SUCCESS) { + return result; + } + + // Update recovery journal in-memory information. + setJournalTail(journal, state.journalStart); + journal->logicalBlocksUsed = state.logicalBlocksUsed; + journal->blockMapDataBlocks = state.blockMapDataBlocks; + initializeJournalState(journal); + + // XXX: this is a hack until we make initial resume of a VDO a real resume + journal->state.state = ADMIN_STATE_SUSPENDED; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int decodeSodiumRecoveryJournal(RecoveryJournal *journal, Buffer *buffer) +{ + // Sodium uses version 7.0, same as head, currently. + return decodeRecoveryJournal(journal, buffer); +} + +/** + * Advance the tail of the journal. 
+ * + * @param journal The journal whose tail should be advanced + * + * @return true if the tail was advanced + **/ +static bool advanceTail(RecoveryJournal *journal) +{ + journal->activeBlock = popFreeList(journal); + if (journal->activeBlock == NULL) { + return false; + } + + pushRingNode(&journal->activeTailBlocks, &journal->activeBlock->ringNode); + initializeRecoveryBlock(journal->activeBlock); + setJournalTail(journal, journal->tail + 1); + advanceBlockMapEra(journal->blockMap, journal->tail); + return true; +} + +/** + * Check whether there is space to make a given type of entry. + * + * @param journal The journal to check + * @param increment Set to true if the desired entry is an + * increment + * + * @return true if there is space in the journal to make an + * entry of the specified type + **/ +static bool checkForEntrySpace(RecoveryJournal *journal, bool increment) +{ + if (increment) { + return ((journal->availableSpace - journal->pendingDecrementCount) > 1); + } + + return (journal->availableSpace > 0); +} + +/** + * Prepare the currently active block to receive an entry and check whether + * an entry of the given type may be assigned at this time. + * + * @param journal The journal receiving an entry + * @param increment Set to true if the desired entry is an + * increment + * + * @return true if there is space in the journal to store an + * entry of the specified type + **/ +static bool prepareToAssignEntry(RecoveryJournal *journal, bool increment) +{ + if (!checkForEntrySpace(journal, increment)) { + if (!increment) { + // There must always be room to make a decrement entry. + logError("No space for decrement entry in recovery journal"); + enterJournalReadOnlyMode(journal, VDO_RECOVERY_JOURNAL_FULL); + } + return false; + } + + if (isRecoveryBlockFull(journal->activeBlock) && !advanceTail(journal)) { + return false; + } + + if (!isRecoveryBlockEmpty(journal->activeBlock)) { + return true; + } + + if ((journal->tail - getRecoveryJournalHead(journal)) > journal->size) { + // Cannot use this block since the journal is full. + journal->events.diskFull++; + return false; + } + + /* + * Don't allow the new block to be reaped until all of its entries have been + * committed to the block map and until the journal block has been fully + * committed as well. Because the block map update is done only after any + * slab journal entries have been made, the per-entry lock for the block map + * entry serves to protect those as well. + */ + initializeLockCount(journal->lockCounter, journal->activeBlock->blockNumber, + journal->entriesPerBlock + 1); + return true; +} + +/**********************************************************************/ +static void writeBlocks(RecoveryJournal *journal); + +/** + * Queue a block for writing. The block is expected to be full. If the block + * is currently writing, this is a noop as the block will be queued for + * writing when the write finishes. The block must not currently be queued + * for writing. 
+ * + * @param journal The journal in question + * @param block The block which is now ready to write + **/ +static void scheduleBlockWrite(RecoveryJournal *journal, + RecoveryJournalBlock *block) +{ + if (block->committing) { + return; + } + + int result = enqueueWaiter(&journal->pendingWrites, &block->writeWaiter); + if (result != VDO_SUCCESS) { + enterJournalReadOnlyMode(journal, result); + return; + } + + PhysicalLayer *layer = vioAsCompletion(journal->flushVIO)->layer; + if ((layer->getWritePolicy(layer) == WRITE_POLICY_ASYNC)) { + /* + * At the end of adding entries, or discovering this partial block + * is now full and ready to rewrite, we will call writeBlocks() and + * write a whole batch. + */ + return; + } + writeBlocks(journal); +} + +/** + * Release a reference to a journal block. + * + * @param block The journal block from which to release a reference + **/ +static void releaseJournalBlockReference(RecoveryJournalBlock *block) +{ + releaseJournalZoneReference(block->journal->lockCounter, block->blockNumber); +} + +/** + * Implements WaiterCallback. Assign an entry waiter to the active block. + **/ +static void assignEntry(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + RecoveryJournalBlock *block = (RecoveryJournalBlock *) context; + RecoveryJournal *journal = block->journal; + + // Record the point at which we will make the journal entry. + dataVIO->recoveryJournalPoint = (JournalPoint) { + .sequenceNumber = block->sequenceNumber, + .entryCount = block->entryCount, + }; + + switch (dataVIO->operation.type) { + case DATA_INCREMENT: + if (dataVIO->operation.state != MAPPING_STATE_UNMAPPED) { + journal->logicalBlocksUsed++; + } + journal->pendingDecrementCount++; + break; + + case DATA_DECREMENT: + if (dataVIO->operation.state != MAPPING_STATE_UNMAPPED) { + journal->logicalBlocksUsed--; + } + + // Per-entry locks need not be held for decrement entries since the lock + // held for the incref entry will protect this entry as well. + releaseJournalBlockReference(block); + ASSERT_LOG_ONLY((journal->pendingDecrementCount != 0), + "decrement follows increment"); + journal->pendingDecrementCount--; + break; + + case BLOCK_MAP_INCREMENT: + journal->blockMapDataBlocks++; + break; + + default: + logError("Invalid journal operation %u", dataVIO->operation.type); + enterJournalReadOnlyMode(journal, VDO_NOT_IMPLEMENTED); + continueDataVIO(dataVIO, VDO_NOT_IMPLEMENTED); + return; + } + + journal->availableSpace--; + int result = enqueueRecoveryBlockEntry(block, dataVIO); + if (result != VDO_SUCCESS) { + enterJournalReadOnlyMode(journal, result); + continueDataVIO(dataVIO, result); + } + + if (isRecoveryBlockFull(block)) { + // The block is full, so we can write it anytime henceforth. If it is + // already committing, we'll queue it for writing when it comes back. + scheduleBlockWrite(journal, block); + } + + // Force out slab journal tail blocks when threshold is reached. 
+ checkSlabJournalCommitThreshold(journal); +} + +/**********************************************************************/ +static bool assignEntriesFromQueue(RecoveryJournal *journal, + WaitQueue *queue, + bool increment) +{ + while (hasWaiters(queue)) { + if (!prepareToAssignEntry(journal, increment)) { + return false; + } + + notifyNextWaiter(queue, assignEntry, journal->activeBlock); + } + + return true; +} + +/**********************************************************************/ +static void assignEntries(RecoveryJournal *journal) +{ + if (journal->addingEntries) { + // Protect against re-entrancy. + return; + } + + journal->addingEntries = true; + if (assignEntriesFromQueue(journal, &journal->decrementWaiters, false)) { + assignEntriesFromQueue(journal, &journal->incrementWaiters, true); + } + + // Now that we've finished with entries, see if we have a batch of blocks to + // write. + writeBlocks(journal); + journal->addingEntries = false; +} + +/** + * Prepare an in-memory journal block to be reused now that it has been fully + * committed. + * + * @param block The block to be recycled + **/ +static void recycleJournalBlock(RecoveryJournalBlock *block) +{ + RecoveryJournal *journal = block->journal; + pushRingNode(&journal->freeTailBlocks, &block->ringNode); + + // Release any unused entry locks. + for (BlockCount i = block->entryCount; i < journal->entriesPerBlock; i++) { + releaseJournalBlockReference(block); + } + + // Release our own lock against reaping now that the block is completely + // committed, or we're giving up because we're in read-only mode. + if (block->entryCount > 0) { + releaseJournalBlockReference(block); + } + + if (block == journal->activeBlock) { + journal->activeBlock = NULL; + } +} + +/** + * WaiterCallback implementation invoked whenever a VIO is to be released + * from the journal because its entry was committed to disk. + **/ +static void continueCommittedWaiter(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + RecoveryJournal *journal = (RecoveryJournal *) context; + ASSERT_LOG_ONLY(beforeJournalPoint(&journal->commitPoint, + &dataVIO->recoveryJournalPoint), + "DataVIOs released from recovery journal in order. " + "Recovery journal point is (%llu, %" PRIu16 "), " + "but commit waiter point is (%llu, %" PRIu16 ")", + journal->commitPoint.sequenceNumber, + journal->commitPoint.entryCount, + dataVIO->recoveryJournalPoint.sequenceNumber, + dataVIO->recoveryJournalPoint.entryCount); + journal->commitPoint = dataVIO->recoveryJournalPoint; + + int result + = (isReadOnly(journal->readOnlyNotifier) ? VDO_READ_ONLY : VDO_SUCCESS); + continueWaiter(waiter, &result); +} + +/** + * Notify any VIOs whose entries have now committed. + * + * @param journal The recovery journal to update + **/ +static void notifyCommitWaiters(RecoveryJournal *journal) +{ + if (isRingEmpty(&journal->activeTailBlocks)) { + return; + } + + for (RingNode *node = journal->activeTailBlocks.next; + node != &journal->activeTailBlocks; + node = node->next) { + RecoveryJournalBlock *block = blockFromRingNode(node); + + if (block->committing) { + return; + } + + notifyAllWaiters(&block->commitWaiters, continueCommittedWaiter, journal); + if (isReadOnly(journal->readOnlyNotifier)) { + notifyAllWaiters(&block->entryWaiters, continueCommittedWaiter, journal); + } else if (isRecoveryBlockDirty(block) || !isRecoveryBlockFull(block)) { + // Stop at partially-committed or partially-filled blocks. 
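+      // Waiters must be released in journal-point order, so no later block
+      // can be notified until this one is fully committed.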
+ return; + } + } +} + +/** + * Recycle any journal blocks which have been fully committed. + * + * @param journal The recovery journal to update + **/ +static void recycleJournalBlocks(RecoveryJournal *journal) +{ + while (!isRingEmpty(&journal->activeTailBlocks)) { + RecoveryJournalBlock *block + = blockFromRingNode(journal->activeTailBlocks.next); + + if (block->committing) { + // Don't recycle committing blocks. + return; + } + + if (!isReadOnly(journal->readOnlyNotifier) + && (isRecoveryBlockDirty(block) + || !isRecoveryBlockFull(block))) { + // Don't recycle partially written or partially full + // blocks, except in read-only mode. + return; + } + recycleJournalBlock(block); + } +} + +/** + * Handle post-commit processing. This is the callback registered by + * writeBlock(). If more entries accumulated in the block being committed while + * the commit was in progress, another commit will be initiated. + * + * @param completion The completion of the VIO writing this block + **/ +static void completeWrite(VDOCompletion *completion) +{ + RecoveryJournalBlock *block = completion->parent; + RecoveryJournal *journal = block->journal; + assertOnJournalThread(journal, __func__); + + journal->pendingWriteCount -= 1; + journal->events.blocks.committed += 1; + journal->events.entries.committed += block->entriesInCommit; + block->uncommittedEntryCount -= block->entriesInCommit; + block->entriesInCommit = 0; + block->committing = false; + + // If this block is the latest block to be acknowledged, record that fact. + if (block->sequenceNumber > journal->lastWriteAcknowledged) { + journal->lastWriteAcknowledged = block->sequenceNumber; + } + + RecoveryJournalBlock *lastActiveBlock + = blockFromRingNode(journal->activeTailBlocks.next); + ASSERT_LOG_ONLY((block->sequenceNumber >= lastActiveBlock->sequenceNumber), + "completed journal write is still active"); + + notifyCommitWaiters(journal); + + // Is this block now full? Reaping, and adding entries, might have already + // sent it off for rewriting; else, queue it for rewrite. + if (isRecoveryBlockDirty(block) && isRecoveryBlockFull(block)) { + scheduleBlockWrite(journal, block); + } + + recycleJournalBlocks(journal); + writeBlocks(journal); + + checkForDrainComplete(journal); +} + +/**********************************************************************/ +static void handleWriteError(VDOCompletion *completion) +{ + RecoveryJournalBlock *block = completion->parent; + RecoveryJournal *journal = block->journal; + logErrorWithStringError(completion->result, + "cannot write recovery journal block %llu", + block->sequenceNumber); + enterJournalReadOnlyMode(journal, completion->result); + completeWrite(completion); +} + +/** + * Issue a block for writing. Implements WaiterCallback. + **/ +static void writeBlock(Waiter *waiter, void *context __attribute__((unused))) +{ + RecoveryJournalBlock *block = blockFromWaiter(waiter); + if (isReadOnly(block->journal->readOnlyNotifier)) { + return; + } + + int result = commitRecoveryBlock(block, completeWrite, handleWriteError); + if (result != VDO_SUCCESS) { + enterJournalReadOnlyMode(block->journal, result); + } +} + +/** + * Attempt to commit blocks, according to write policy. 
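+ *
+ * In sync and async-unsafe modes, full blocks are written as soon as they
+ * are queued; in async mode, full blocks are only written when no other
+ * journal write is outstanding.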
+ *
+ * @param journal The recovery journal
+ **/
+static void writeBlocks(RecoveryJournal *journal)
+{
+  assertOnJournalThread(journal, __func__);
+  /*
+   * In sync and async-unsafe modes, we call this function each time we queue
+   * a full block on pending writes; in addition, in all cases we call this
+   * function after adding entries to the journal and finishing a block write.
+   * Thus, when this function terminates we must either have no VIOs waiting
+   * in the journal or have some outstanding IO to provide a future wakeup.
+   *
+   * In all modes, if there are no outstanding writes and some unwritten
+   * entries, we must issue a block, even if it's the active block and it
+   * isn't full. Otherwise, in sync/async-unsafe modes, we want to issue
+   * all full blocks every time; since we call it each time we fill a block,
+   * this is equivalent to issuing every full block as soon as it is full. In
+   * async mode, we want to only issue full blocks if there are no
+   * pending writes.
+   */
+
+  PhysicalLayer *layer = vioAsCompletion(journal->flushVIO)->layer;
+  if ((layer->getWritePolicy(layer) != WRITE_POLICY_ASYNC)
+      || (journal->pendingWriteCount == 0)) {
+    // Write all the full blocks.
+    notifyAllWaiters(&journal->pendingWrites, writeBlock, NULL);
+  }
+
+  // Do we need to write the active block? Only if we have no outstanding
+  // writes, even after issuing all of the full writes.
+  if ((journal->pendingWriteCount == 0)
+      && canCommitRecoveryBlock(journal->activeBlock)) {
+    writeBlock(&journal->activeBlock->writeWaiter, NULL);
+  }
+}
+
+/**********************************************************************/
+void addRecoveryJournalEntry(RecoveryJournal *journal, DataVIO *dataVIO)
+{
+  assertOnJournalThread(journal, __func__);
+  if (!isNormal(&journal->state)) {
+    continueDataVIO(dataVIO, VDO_INVALID_ADMIN_STATE);
+    return;
+  }
+
+  if (isReadOnly(journal->readOnlyNotifier)) {
+    continueDataVIO(dataVIO, VDO_READ_ONLY);
+    return;
+  }
+
+  bool increment = isIncrementOperation(dataVIO->operation.type);
+  ASSERT_LOG_ONLY((!increment || (dataVIO->recoverySequenceNumber == 0)),
+                  "journal lock not held for increment");
+
+  advanceJournalPoint(&journal->appendPoint, journal->entriesPerBlock);
+  int result = enqueueDataVIO((increment
+                               ? &journal->incrementWaiters
+                               : &journal->decrementWaiters), dataVIO,
+                              THIS_LOCATION("$F($j-$js);io=journal($j-$js)"));
+  if (result != VDO_SUCCESS) {
+    enterJournalReadOnlyMode(journal, result);
+    continueDataVIO(dataVIO, result);
+    return;
+  }
+
+  assignEntries(journal);
+}
+
+/**
+ * Conduct a sweep on a recovery journal to reclaim unreferenced blocks.
+ *
+ * @param journal The recovery journal
+ **/
+static void reapRecoveryJournal(RecoveryJournal *journal)
+{
+  if (journal->reaping) {
+    // We already have an outstanding reap in progress. We need to wait for it
+    // to finish.
+    return;
+  }
+
+  if (isQuiescent(&journal->state)) {
+    // We are supposed to not do IO. Don't botch it by reaping.
+    return;
+  }
+
+  // Start reclaiming blocks only when the journal head has no references. Then
+  // stop when a block is referenced.
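+  // The block map and slab journal reap heads advance independently, each
+  // stopping at the first block which still holds a lock of its zone type.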
+ while ((journal->blockMapReapHead < journal->lastWriteAcknowledged) + && !isLocked(journal->lockCounter, journal->blockMapHeadBlockNumber, + ZONE_TYPE_LOGICAL)) { + journal->blockMapReapHead++; + if (++journal->blockMapHeadBlockNumber == journal->size) { + journal->blockMapHeadBlockNumber = 0; + } + } + + while ((journal->slabJournalReapHead < journal->lastWriteAcknowledged) + && !isLocked(journal->lockCounter, + journal->slabJournalHeadBlockNumber, + ZONE_TYPE_PHYSICAL)) { + journal->slabJournalReapHead++; + if (++journal->slabJournalHeadBlockNumber == journal->size) { + journal->slabJournalHeadBlockNumber = 0; + } + } + + if ((journal->blockMapReapHead == journal->blockMapHead) + && (journal->slabJournalReapHead == journal->slabJournalHead)) { + // Nothing happened. + return; + } + + PhysicalLayer *layer = vioAsCompletion(journal->flushVIO)->layer; + if (layer->getWritePolicy(layer) != WRITE_POLICY_SYNC) { + /* + * If the block map head will advance, we must flush any block map page + * modified by the entries we are reaping. If the slab journal head will + * advance, we must flush the slab summary update covering the slab journal + * that just released some lock. + * + * In sync mode, this is unnecessary because we won't record these numbers + * on disk until the next journal block write, and in sync mode every + * journal block write is preceded by a flush, which does the block map + * page and slab summary update flushing itself. + */ + journal->reaping = true; + launchFlush(journal->flushVIO, completeReaping, handleFlushError); + return; + } + + finishReaping(journal); +} + +/**********************************************************************/ +void acquireRecoveryJournalBlockReference(RecoveryJournal *journal, + SequenceNumber sequenceNumber, + ZoneType zoneType, + ZoneCount zoneID) +{ + if (sequenceNumber == 0) { + return; + } + + BlockCount blockNumber + = getRecoveryJournalBlockNumber(journal, sequenceNumber); + acquireLockCountReference(journal->lockCounter, blockNumber, zoneType, + zoneID); +} + +/**********************************************************************/ +void releaseRecoveryJournalBlockReference(RecoveryJournal *journal, + SequenceNumber sequenceNumber, + ZoneType zoneType, + ZoneCount zoneID) +{ + if (sequenceNumber == 0) { + return; + } + + BlockCount blockNumber + = getRecoveryJournalBlockNumber(journal, sequenceNumber); + releaseLockCountReference(journal->lockCounter, blockNumber, zoneType, + zoneID); +} + +/**********************************************************************/ +void releasePerEntryLockFromOtherZone(RecoveryJournal *journal, + SequenceNumber sequenceNumber) +{ + if (sequenceNumber == 0) { + return; + } + + BlockCount blockNumber + = getRecoveryJournalBlockNumber(journal, sequenceNumber); + releaseJournalZoneReferenceFromOtherZone(journal->lockCounter, blockNumber); +} + +/** + * Initiate a drain. + * + * Implements AdminInitiator. 
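+ *
+ * Registered with startDraining() in drainRecoveryJournal() below.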
+ **/ +static void initiateDrain(AdminState *state) +{ + checkForDrainComplete(container_of(state, RecoveryJournal, state)); +} + +/**********************************************************************/ +void drainRecoveryJournal(RecoveryJournal *journal, + AdminStateCode operation, + VDOCompletion *parent) +{ + assertOnJournalThread(journal, __func__); + startDraining(&journal->state, operation, parent, initiateDrain); +} + +/**********************************************************************/ +void resumeRecoveryJournal(RecoveryJournal *journal, VDOCompletion *parent) +{ + assertOnJournalThread(journal, __func__); + bool saved = isSaved(&journal->state); + setCompletionResult(parent, resumeIfQuiescent(&journal->state)); + + if (isReadOnly(journal->readOnlyNotifier)) { + finishCompletion(parent, VDO_READ_ONLY); + return; + } + + if (saved) { + initializeJournalState(journal); + } + + if (resumeLockCounter(journal->lockCounter)) { + // We might have missed a notification. + reapRecoveryJournal(journal); + } + + completeCompletion(parent); +} + +/**********************************************************************/ +BlockCount getJournalLogicalBlocksUsed(const RecoveryJournal *journal) +{ + return journal->logicalBlocksUsed; +} + +/**********************************************************************/ +RecoveryJournalStatistics +getRecoveryJournalStatistics(const RecoveryJournal *journal) +{ + return journal->events; +} + +/**********************************************************************/ +void dumpRecoveryJournalStatistics(const RecoveryJournal *journal) +{ + RecoveryJournalStatistics stats = getRecoveryJournalStatistics(journal); + logInfo("Recovery Journal"); + logInfo(" blockMapHead=%llu slabJournalHead=%" PRIu64 + " lastWriteAcknowledged=%llu tail=%" PRIu64 + " blockMapReapHead=%llu slabJournalReapHead=%" PRIu64 + " diskFull=%llu slabJournalCommitsRequested=%" PRIu64 + " incrementWaiters=%zu decrementWaiters=%zu", + journal->blockMapHead, journal->slabJournalHead, + journal->lastWriteAcknowledged, journal->tail, + journal->blockMapReapHead, journal->slabJournalReapHead, + stats.diskFull, stats.slabJournalCommitsRequested, + countWaiters(&journal->incrementWaiters), + countWaiters(&journal->decrementWaiters)); + logInfo(" entries: started=%llu written=%llu committed=%" + PRIu64, + stats.entries.started, stats.entries.written, + stats.entries.committed); + logInfo(" blocks: started=%llu written=%llu committed=%" + PRIu64, + stats.blocks.started, stats.blocks.written, + stats.blocks.committed); + + logInfo(" active blocks:"); + const RingNode *head = &journal->activeTailBlocks; + for (RingNode *node = head->next; node != head; node = node->next) { + dumpRecoveryBlock(blockFromRingNode(node)); + } +} diff --git a/source/vdo/base/recoveryJournal.h b/source/vdo/base/recoveryJournal.h new file mode 100644 index 0000000..8ae7de0 --- /dev/null +++ b/source/vdo/base/recoveryJournal.h @@ -0,0 +1,416 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournal.h#5 $ + */ + +#ifndef RECOVERY_JOURNAL_H +#define RECOVERY_JOURNAL_H + +#include "buffer.h" + +#include "adminState.h" +#include "completion.h" +#include "fixedLayout.h" +#include "flush.h" +#include "readOnlyNotifier.h" +#include "statistics.h" +#include "trace.h" +#include "types.h" + +/** + * The RecoveryJournal provides a log of all block mapping changes + * which have not yet been stably written to the block map. It exists + * to help provide resiliency guarantees by allowing synchronous + * writes to be acknowledged as soon as the corresponding journal + * entry is committed instead of having to wait for the block map + * update. For asynchronous writes, the journal aids in meeting the + * five second data loss window by ensuring that writes will not be + * lost as long as they are committed to the journal before the window + * expires. This should be less work than committing all of the + * required block map pages. + * + * The journal consists of a set of on-disk blocks arranged as a + * circular log with monotonically increasing sequence numbers. Three + * sequence numbers serve to define the active extent of the + * journal. The 'head' is the oldest active block in the journal. The + * 'tail' is the end of the half-open interval containing the active + * blocks. 'active' is the number of the block actively receiving + * entries. In an empty journal, head == active == tail. Once any + * entries are added, tail = active + 1, and head may be any value in + * the interval [tail - size, active]. + * + * The journal also contains a set of in-memory blocks which are used + * to buffer up entries until they can be committed. In general the + * number of in-memory blocks ('tailBufferCount') will be less than + * the on-disk size. Each in-memory block is also a VDOCompletion. + * Each in-memory block has a VDOExtent which is used to commit that + * block to disk. The extent's data is a PackedJournalBlock (which is a + * formatted journal block). In addition each in-memory block has a + * buffer which is used to accumulate entries while a partial commit + * of the block is in progress. In-memory blocks are kept on two + * rings. Free blocks live on the 'freeTailBlocks' ring. When a block + * becomes active (see below) it is moved to the 'activeTailBlocks' + * ring. When a block is fully committed, it is moved back to the + * 'freeTailBlocks' ring. + * + * When entries are added to the journal, they are added to the active + * in-memory block, as indicated by the 'activeBlock' field. If the + * caller wishes to wait for the entry to be committed, the requesting + * VIO will be attached to the in-memory block to which the caller's + * entry was added. If the caller does wish to wait, or if the entry + * filled the active block, an attempt will be made to commit that + * block to disk. If there is already another commit in progress, the + * attempt will be ignored and then automatically retried when the + * in-progress commit completes. If there is no commit in progress, + * any VIOs waiting on the block are transferred to the extent. The + * extent is then written, automatically waking all of the waiters + * when it completes. 
When the extent completes, any entries which + * accumulated in the block are copied to the extent's data buffer. + * + * Finally, the journal maintains a set of counters, one for each on + * disk journal block. These counters are used as locks to prevent + * premature reaping of journal blocks. Each time a new sequence + * number is used, the counter for the corresponding block is + * incremented. The counter is subsequently decremented when that + * block is filled and then committed for the last time. This prevents + * blocks from being reaped while they are still being updated. The + * counter is also incremented once for each entry added to a block, + * and decremented once each time the block map is updated in memory + * for that request. This prevents blocks from being reaped while + * their VIOs are still active. Finally, each in-memory block map page + * tracks the oldest journal block that contains entries corresponding to + * uncommitted updates to that block map page. Each time an in-memory block + * map page is updated, it checks if the journal block for the VIO + * is earlier than the one it references, in which case it increments + * the count on the earlier journal block and decrements the count on the + * later journal block, maintaining a lock on the oldest journal block + * containing entries for that page. When a block map page has been flushed + * from the cache, the counter for the journal block it references is + * decremented. Whenever the counter for the head block goes to 0, the + * head is advanced until it comes to a block whose counter is not 0 + * or until it reaches the active block. This is the mechanism for + * reclaiming journal space on disk. + * + * If there is no in-memory space when a VIO attempts to add an entry, + * the VIO will be attached to the 'commitCompletion' and will be + * woken the next time a full block has committed. If there is no + * on-disk space when a VIO attempts to add an entry, the VIO will be + * attached to the 'reapCompletion', and will be woken the next time a + * journal block is reaped. + **/ + +/** + * Return whether a given JournalOperation is an increment type. + * + * @param operation The operation in question + * + * @return true if the type is an increment type + **/ +static inline bool isIncrementOperation(JournalOperation operation) +{ + return ((operation == DATA_INCREMENT) || (operation == BLOCK_MAP_INCREMENT)); +} + +/** + * Get the name of a journal operation. + * + * @param operation The operation to name + * + * @return The name of the operation + **/ +const char *getJournalOperationName(JournalOperation operation) + __attribute__((warn_unused_result)); + +/** + * Create a recovery journal. 
+ * + * @param [in] nonce the nonce of the VDO + * @param [in] layer the physical layer for the journal + * @param [in] partition the partition for the journal + * @param [in] recoveryCount The VDO's number of completed recoveries + * @param [in] journalSize the number of blocks in the journal on disk + * @param [in] tailBufferSize the number of blocks for tail buffer + * @param [in] readOnlyNotifier the read-only mode notifier + * @param [in] threadConfig the thread configuration of the VDO + * @param [out] journalPtr the pointer to hold the new recovery journal + * + * @return a success or error code + **/ +int makeRecoveryJournal(Nonce nonce, + PhysicalLayer *layer, + Partition *partition, + uint64_t recoveryCount, + BlockCount journalSize, + BlockCount tailBufferSize, + ReadOnlyNotifier *readOnlyNotifier, + const ThreadConfig *threadConfig, + RecoveryJournal **journalPtr) + __attribute__((warn_unused_result)); + +/** + * Free a recovery journal and null out the reference to it. + * + * @param [in,out] journalPtr The reference to the recovery journal to free + **/ +void freeRecoveryJournal(RecoveryJournal **journalPtr); + +/** + * Move the backing partition pointer of the recovery journal. + * Assumes that the data in the old and the new partitions is identical. + * + * @param journal the journal being moved + * @param partition the new journal partition + **/ +void setRecoveryJournalPartition(RecoveryJournal *journal, + Partition *partition); + +/** + * Initialize the journal after a recovery. + * + * @param journal The journal in question + * @param recoveryCount The number of completed recoveries + * @param tail The new tail block sequence number + **/ +void initializeRecoveryJournalPostRecovery(RecoveryJournal *journal, + uint64_t recoveryCount, + SequenceNumber tail); + +/** + * Initialize the journal after a rebuild. + * + * @param journal The journal in question + * @param recoveryCount The number of completed recoveries + * @param tail The new tail block sequence number + * @param logicalBlocksUsed The new number of logical blocks used + * @param blockMapDataBlocks The new number of block map data blocks + **/ +void initializeRecoveryJournalPostRebuild(RecoveryJournal *journal, + uint64_t recoveryCount, + SequenceNumber tail, + BlockCount logicalBlocksUsed, + BlockCount blockMapDataBlocks); + +/** + * Get the number of block map pages, allocated from data blocks, currently + * in use. + * + * @param journal The journal in question + * + * @return The number of block map pages allocated from slabs + **/ +BlockCount getJournalBlockMapDataBlocksUsed(RecoveryJournal *journal) + __attribute__((warn_unused_result)); + +/** + * Set the number of block map pages, allocated from data blocks, currently + * in use. + * + * @param journal The journal in question + * @param pages The number of block map pages allocated from slabs + **/ +void setJournalBlockMapDataBlocksUsed(RecoveryJournal *journal, + BlockCount pages); + +/** + * Get the ID of a recovery journal's thread. + * + * @param journal The journal to query + * + * @return The ID of the journal's thread. + **/ +ThreadID getRecoveryJournalThreadID(RecoveryJournal *journal) + __attribute__((warn_unused_result)); + +/** + * Prepare the journal for new entries. 
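+ *
+ * This also moves the journal into normal operation.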
+ * + * @param journal The journal in question + * @param depot The slab depot for this VDO + * @param blockMap The block map for this VDO + **/ +void openRecoveryJournal(RecoveryJournal *journal, + SlabDepot *depot, + BlockMap *blockMap); + +/** + * Obtain the recovery journal's current sequence number. Exposed only so + * the block map can be initialized therefrom. + * + * @param journal The journal in question + * + * @return the sequence number of the tail block + **/ +SequenceNumber getCurrentJournalSequenceNumber(RecoveryJournal *journal); + +/** + * Get the number of usable recovery journal blocks. + * + * @param journalSize The size of the recovery journal in blocks + * + * @return the number of recovery journal blocks usable for entries + **/ +BlockCount getRecoveryJournalLength(BlockCount journalSize) + __attribute__((warn_unused_result)); + +/** + * Get the size of the encoded state of a recovery journal. + * + * @return the encoded size of the journal's state + **/ +size_t getRecoveryJournalEncodedSize(void) + __attribute__((warn_unused_result)); + +/** + * Encode the state of a recovery journal. + * + * @param journal the recovery journal + * @param buffer the buffer to encode into + * + * @return VDO_SUCCESS or an error code + **/ +int encodeRecoveryJournal(RecoveryJournal *journal, Buffer *buffer) + __attribute__((warn_unused_result)); + +/** + * Decode the state of a recovery journal saved in a buffer. + * + * @param journal the recovery journal + * @param buffer the buffer containing the saved state + * + * @return VDO_SUCCESS or an error code + **/ +int decodeRecoveryJournal(RecoveryJournal *journal, Buffer *buffer) + __attribute__((warn_unused_result)); + +/** + * Decode the state of a Sodium recovery journal saved in a buffer. + * + * @param journal the recovery journal + * @param buffer the buffer containing the saved state + * + * @return VDO_SUCCESS or an error code + **/ +int decodeSodiumRecoveryJournal(RecoveryJournal *journal, Buffer *buffer) + __attribute__((warn_unused_result)); + +/** + * Add an entry to a recovery journal. This method is asynchronous. The DataVIO + * will not be called back until the entry is committed to the on-disk journal. + * + * @param journal The journal in which to make an entry + * @param dataVIO The DataVIO for which to add the entry. The entry will be + * taken from the logical and newMapped fields of the + * DataVIO. The DataVIO's recoverySequenceNumber field will + * be set to the sequence number of the journal block in + * which the entry was made. + **/ +void addRecoveryJournalEntry(RecoveryJournal *journal, DataVIO *dataVIO); + +/** + * Acquire a reference to a recovery journal block from somewhere other than + * the journal itself. + * + * @param journal The recovery journal + * @param sequenceNumber The journal sequence number of the referenced block + * @param zoneType The type of the zone making the adjustment + * @param zoneID The ID of the zone making the adjustment + **/ +void acquireRecoveryJournalBlockReference(RecoveryJournal *journal, + SequenceNumber sequenceNumber, + ZoneType zoneType, + ZoneCount zoneID); + + +/** + * Release a reference to a recovery journal block from somewhere other than + * the journal itself. If this is the last reference for a given zone type, + * an attempt will be made to reap the journal. 
+ * + * @param journal The recovery journal + * @param sequenceNumber The journal sequence number of the referenced block + * @param zoneType The type of the zone making the adjustment + * @param zoneID The ID of the zone making the adjustment + **/ +void releaseRecoveryJournalBlockReference(RecoveryJournal *journal, + SequenceNumber sequenceNumber, + ZoneType zoneType, + ZoneCount zoneID); + +/** + * Release a single per-entry reference count for a recovery journal block. This + * method may be called from any zone (but shouldn't be called from the journal + * zone as it would be inefficient). + * + * @param journal The recovery journal + * @param sequenceNumber The journal sequence number of the referenced block + **/ +void releasePerEntryLockFromOtherZone(RecoveryJournal *journal, + SequenceNumber sequenceNumber); + +/** + * Drain recovery journal I/O. All uncommitted entries will be written out. + * + * @param journal The journal to drain + * @param operation The drain operation (suspend or save) + * @param parent The completion to finish once the journal is drained + **/ +void drainRecoveryJournal(RecoveryJournal *journal, + AdminStateCode operation, + VDOCompletion *parent); + +/** + * Resume a recovery journal which has been drained. + * + * @param journal The journal to resume + * @param parent The completion to finish once the journal is resumed + * + * @return VDO_SUCCESS or an error + **/ +void resumeRecoveryJournal(RecoveryJournal *journal, VDOCompletion *parent); + +/** + * Get the number of logical blocks in use by the VDO + * + * @param journal the journal + * + * @return the number of logical blocks in use by the VDO + **/ +BlockCount getJournalLogicalBlocksUsed(const RecoveryJournal *journal) + __attribute__((warn_unused_result)); + +/** + * Get the current statistics from the recovery journal. + * + * @param journal The recovery journal to query + * + * @return a copy of the current statistics for the journal + **/ +RecoveryJournalStatistics +getRecoveryJournalStatistics(const RecoveryJournal *journal) + __attribute__((warn_unused_result)); + +/** + * Dump some current statistics and other debug info from the recovery + * journal. + * + * @param journal The recovery journal to dump + **/ +void dumpRecoveryJournalStatistics(const RecoveryJournal *journal); + +#endif // RECOVERY_JOURNAL_H diff --git a/source/vdo/base/recoveryJournalBlock.c b/source/vdo/base/recoveryJournalBlock.c new file mode 100644 index 0000000..1bbacfc --- /dev/null +++ b/source/vdo/base/recoveryJournalBlock.c @@ -0,0 +1,341 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournalBlock.c#13 $ + */ + +#include "recoveryJournalBlock.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "dataVIO.h" +#include "fixedLayout.h" +#include "packedRecoveryJournalBlock.h" +#include "recoveryJournalEntry.h" +#include "recoveryJournalInternals.h" +#include "ringNode.h" +#include "vio.h" +#include "waitQueue.h" + +/**********************************************************************/ +int makeRecoveryBlock(PhysicalLayer *layer, + RecoveryJournal *journal, + RecoveryJournalBlock **blockPtr) +{ + // Ensure that a block is large enough to store + // RECOVERY_JOURNAL_ENTRIES_PER_BLOCK entries. + STATIC_ASSERT(RECOVERY_JOURNAL_ENTRIES_PER_BLOCK + <= ((VDO_BLOCK_SIZE - sizeof(PackedJournalHeader)) + / sizeof(PackedRecoveryJournalEntry))); + + RecoveryJournalBlock *block; + int result = ALLOCATE(1, RecoveryJournalBlock, __func__, &block); + if (result != VDO_SUCCESS) { + return result; + } + + // Allocate a full block for the journal block even though not all of the + // space is used since the VIO needs to write a full disk block. + result = ALLOCATE(VDO_BLOCK_SIZE, char, "PackedJournalBlock", &block->block); + if (result != VDO_SUCCESS) { + freeRecoveryBlock(&block); + return result; + } + + result = createVIO(layer, VIO_TYPE_RECOVERY_JOURNAL, VIO_PRIORITY_HIGH, + block, block->block, &block->vio); + if (result != VDO_SUCCESS) { + freeRecoveryBlock(&block); + return result; + } + + block->vio->completion.callbackThreadID = journal->threadID; + initializeRing(&block->ringNode); + block->journal = journal; + + *blockPtr = block; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeRecoveryBlock(RecoveryJournalBlock **blockPtr) +{ + RecoveryJournalBlock *block = *blockPtr; + if (block == NULL) { + return; + } + + FREE(block->block); + freeVIO(&block->vio); + FREE(block); + *blockPtr = NULL; +} + +/** + * Get a pointer to the packed journal block header in the block buffer. + * + * @param block The recovery block + * + * @return The block's header + **/ +static inline +PackedJournalHeader *getBlockHeader(const RecoveryJournalBlock *block) +{ + return (PackedJournalHeader *) block->block; +} + +/** + * Set the current sector of the current block and initialize it. 
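+ *
+ * The new sector copies the check byte from the block header and the
+ * recovery count from the journal, and starts with an entry count of zero.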
+ * + * @param block The block to update + * @param sector A pointer to the first byte of the new sector + **/ +static void setActiveSector(RecoveryJournalBlock *block, void *sector) +{ + block->sector = (PackedJournalSector *) sector; + block->sector->checkByte = getBlockHeader(block)->fields.checkByte; + block->sector->recoveryCount = block->journal->recoveryCount; + block->sector->entryCount = 0; +} + +/**********************************************************************/ +void initializeRecoveryBlock(RecoveryJournalBlock *block) +{ + memset(block->block, 0x0, VDO_BLOCK_SIZE); + + RecoveryJournal *journal = block->journal; + block->sequenceNumber = journal->tail; + block->entryCount = 0; + block->uncommittedEntryCount = 0; + + block->blockNumber = getRecoveryJournalBlockNumber(journal, journal->tail); + + RecoveryBlockHeader unpacked = { + .metadataType = VDO_METADATA_RECOVERY_JOURNAL, + .blockMapDataBlocks = journal->blockMapDataBlocks, + .logicalBlocksUsed = journal->logicalBlocksUsed, + .nonce = journal->nonce, + .recoveryCount = journal->recoveryCount, + .sequenceNumber = journal->tail, + .checkByte = computeRecoveryCheckByte(journal, journal->tail), + }; + PackedJournalHeader *header = getBlockHeader(block); + packRecoveryBlockHeader(&unpacked, header); + + setActiveSector(block, getJournalBlockSector(header, 1)); +} + +/**********************************************************************/ +int enqueueRecoveryBlockEntry(RecoveryJournalBlock *block, DataVIO *dataVIO) +{ + // First queued entry indicates this is a journal block we've just opened + // or a committing block we're extending and will have to write again. + bool newBatch = !hasWaiters(&block->entryWaiters); + + // Enqueue the DataVIO to wait for its entry to commit. + int result = enqueueDataVIO(&block->entryWaiters, dataVIO, + THIS_LOCATION("$F($j-$js)")); + if (result != VDO_SUCCESS) { + return result; + } + + block->entryCount++; + block->uncommittedEntryCount++; + + // Update stats to reflect the journal entry we're going to write. + if (newBatch) { + block->journal->events.blocks.started++; + } + block->journal->events.entries.started++; + + return VDO_SUCCESS; +} + +/** + * Check whether the current sector of a block is full. + * + * @param block The block to check + * + * @return true if the sector is full + **/ +__attribute__((warn_unused_result)) +static bool isSectorFull(const RecoveryJournalBlock *block) +{ + return (block->sector->entryCount == RECOVERY_JOURNAL_ENTRIES_PER_SECTOR); +} + +/** + * Actually add entries from the queue to the given block. + * + * @param block The journal block + * + * @return VDO_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int addQueuedRecoveryEntries(RecoveryJournalBlock *block) +{ + while (hasWaiters(&block->entryWaiters)) { + DataVIO *dataVIO + = waiterAsDataVIO(dequeueNextWaiter(&block->entryWaiters)); + if (dataVIO->operation.type == DATA_INCREMENT) { + // In order to not lose committed sectors of this partial write, we must + // flush before the partial write entries are committed. + block->hasPartialWriteEntry = (block->hasPartialWriteEntry + || dataVIO->isPartialWrite); + /* + * In order to not lose acknowledged writes with the FUA flag set, we + * must issue a flush to cover the data write and also all previous + * journal writes, and we must issue a FUA on the journal write. + */ + block->hasFUAEntry = (block->hasFUAEntry + || vioRequiresFlushAfter(dataVIOAsVIO(dataVIO))); + } + + // Compose and encode the entry. 
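+    // (The on-disk bit layout is defined by packRecoveryJournalEntry() in
+    // recoveryJournalEntry.h.)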
+ PackedRecoveryJournalEntry *packedEntry + = &block->sector->entries[block->sector->entryCount++]; + TreeLock *lock = &dataVIO->treeLock; + RecoveryJournalEntry newEntry = { + .mapping = { + .pbn = dataVIO->operation.pbn, + .state = dataVIO->operation.state, + }, + .operation = dataVIO->operation.type, + .slot = lock->treeSlots[lock->height].blockMapSlot, + }; + *packedEntry = packRecoveryJournalEntry(&newEntry); + + if (isIncrementOperation(dataVIO->operation.type)) { + dataVIO->recoverySequenceNumber = block->sequenceNumber; + } + + // Enqueue the DataVIO to wait for its entry to commit. + int result = enqueueDataVIO(&block->commitWaiters, dataVIO, + THIS_LOCATION("$F($j-$js)")); + if (result != VDO_SUCCESS) { + continueDataVIO(dataVIO, result); + return result; + } + + if (isSectorFull(block)) { + setActiveSector(block, (char *) block->sector + VDO_SECTOR_SIZE); + } + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int getRecoveryBlockPBN(RecoveryJournalBlock *block, + PhysicalBlockNumber *pbnPtr) +{ + RecoveryJournal *journal = block->journal; + int result = translateToPBN(journal->partition, block->blockNumber, pbnPtr); + if (result != VDO_SUCCESS) { + logErrorWithStringError(result, + "Error translating recovery journal block " + "number %llu", block->blockNumber); + } + return result; +} + +/**********************************************************************/ +bool canCommitRecoveryBlock(RecoveryJournalBlock *block) +{ + // Cannot commit in read-only mode, if already committing the block, or + // if there are no entries to commit. + return ((block != NULL) + && !block->committing + && hasWaiters(&block->entryWaiters) + && !isReadOnly(block->journal->readOnlyNotifier)); +} + +/**********************************************************************/ +int commitRecoveryBlock(RecoveryJournalBlock *block, + VDOAction *callback, + VDOAction *errorHandler) +{ + int result = ASSERT(canCommitRecoveryBlock(block), "should never call %s" + " when the block can't be committed", __func__); + if (result != VDO_SUCCESS) { + return result; + } + + PhysicalBlockNumber blockPBN; + result = getRecoveryBlockPBN(block, &blockPBN); + if (result != VDO_SUCCESS) { + return result; + } + + block->entriesInCommit = countWaiters(&block->entryWaiters); + result = addQueuedRecoveryEntries(block); + if (result != VDO_SUCCESS) { + return result; + } + + RecoveryJournal *journal = block->journal; + PackedJournalHeader *header = getBlockHeader(block); + + // Update stats to reflect the block and entries we're about to write. + journal->pendingWriteCount += 1; + journal->events.blocks.written += 1; + journal->events.entries.written += block->entriesInCommit; + + storeUInt64LE(header->fields.blockMapHead, journal->blockMapHead); + storeUInt64LE(header->fields.slabJournalHead, journal->slabJournalHead); + storeUInt16LE(header->fields.entryCount, block->entryCount); + + block->committing = true; + + /* + * In sync or async mode, when we are writing an increment entry for a + * request with FUA, or when making the increment entry for a partial + * write, we need to make sure all the data being mapped to by this block + * is stable on disk and also that the recovery journal is stable up to + * the current block, so we must flush before writing. + * + * In sync mode, and for FUA, we also need to make sure that the write we + * are doing is stable, so we issue the write with FUA. 
+ */ + PhysicalLayer *layer = vioAsCompletion(block->vio)->layer; + bool fua = (block->hasFUAEntry + || (layer->getWritePolicy(layer) == WRITE_POLICY_SYNC)); + bool flush = (block->hasFUAEntry + || (layer->getWritePolicy(layer) != WRITE_POLICY_ASYNC_UNSAFE) + || block->hasPartialWriteEntry); + block->hasFUAEntry = false; + block->hasPartialWriteEntry = false; + launchWriteMetadataVIOWithFlush(block->vio, blockPBN, callback, errorHandler, + flush, fua); + + return VDO_SUCCESS; +} + +/**********************************************************************/ +void dumpRecoveryBlock(const RecoveryJournalBlock *block) +{ + logInfo(" sequence number %llu; entries %" PRIu16 + "; %s; %zu entry waiters; %zu commit waiters", + block->sequenceNumber, + block->entryCount, + (block->committing ? "committing" : "waiting"), + countWaiters(&block->entryWaiters), + countWaiters(&block->commitWaiters)); +} diff --git a/source/vdo/base/recoveryJournalBlock.h b/source/vdo/base/recoveryJournalBlock.h new file mode 100644 index 0000000..f26f8e8 --- /dev/null +++ b/source/vdo/base/recoveryJournalBlock.h @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournalBlock.h#8 $
+ */
+
+#ifndef RECOVERY_JOURNAL_BLOCK_H
+#define RECOVERY_JOURNAL_BLOCK_H
+
+#include "permassert.h"
+
+#include "packedRecoveryJournalBlock.h"
+#include "recoveryJournalInternals.h"
+#include "ringNode.h"
+#include "types.h"
+#include "waitQueue.h"
+
+struct recoveryJournalBlock {
+  /** The doubly linked pointers for the free or active lists */
+  RingNode ringNode;
+  /** The waiter for the pending full block list */
+  Waiter writeWaiter;
+  /** The journal to which this block belongs */
+  RecoveryJournal *journal;
+  /** A pointer to a block-sized buffer holding the packed block data */
+  char *block;
+  /** A pointer to the current sector in the packed block buffer */
+  PackedJournalSector *sector;
+  /** The VIO for writing this block */
+  VIO *vio;
+  /** The sequence number for this block */
+  SequenceNumber sequenceNumber;
+  /** The location of this block in the on-disk journal */
+  PhysicalBlockNumber blockNumber;
+  /** Whether this block is being committed */
+  bool committing;
+  /** Whether this block has an uncommitted increment for a partial write */
+  bool hasPartialWriteEntry;
+  /** Whether this block has an uncommitted increment for a write with FUA */
+  bool hasFUAEntry;
+  /** The total number of entries in this block */
+  JournalEntryCount entryCount;
+  /** The total number of uncommitted entries (queued or committing) */
+  JournalEntryCount uncommittedEntryCount;
+  /** The number of new entries in the current commit */
+  JournalEntryCount entriesInCommit;
+  /** The queue of VIOs which will make entries for the next commit */
+  WaitQueue entryWaiters;
+  /** The queue of VIOs waiting for the current commit */
+  WaitQueue commitWaiters;
+};
+
+/**
+ * Return the block associated with a ring node.
+ *
+ * @param node The ring node to recast as a block
+ *
+ * @return The block
+ **/
+static inline RecoveryJournalBlock *blockFromRingNode(RingNode *node)
+{
+  STATIC_ASSERT(offsetof(RecoveryJournalBlock, ringNode) == 0);
+  return (RecoveryJournalBlock *) node;
+}
+
+/**
+ * Return the block associated with a waiter
+ *
+ * @param waiter The waiter to recast as a block
+ *
+ * @return The block
+ **/
+static inline RecoveryJournalBlock *blockFromWaiter(Waiter *waiter)
+{
+  return (RecoveryJournalBlock *)
+    ((uintptr_t) waiter - offsetof(RecoveryJournalBlock, writeWaiter));
+}
+
+/**
+ * Check whether a recovery block is dirty, indicating it has any uncommitted
+ * entries, which includes both entries not written and entries written but
+ * not yet acknowledged.
+ *
+ * @param block The block to check
+ *
+ * @return true if the block has any uncommitted entries
+ **/
+__attribute__((warn_unused_result))
+static inline bool isRecoveryBlockDirty(const RecoveryJournalBlock *block)
+{
+  return (block->uncommittedEntryCount > 0);
+}
+
+/**
+ * Check whether a journal block is empty.
+ *
+ * @param block The block to check
+ *
+ * @return true if the block has no entries
+ **/
+__attribute__((warn_unused_result))
+static inline bool isRecoveryBlockEmpty(const RecoveryJournalBlock *block)
+{
+  return (block->entryCount == 0);
+}
+
+/**
+ * Check whether a journal block is full.
+ *
+ * @param block The block to check
+ *
+ * @return true if the block is full
+ **/
+__attribute__((warn_unused_result))
+static inline bool isRecoveryBlockFull(const RecoveryJournalBlock *block)
+{
+  return ((block == NULL)
+          || (block->journal->entriesPerBlock == block->entryCount));
+}
+
+/**
+ * Construct a journal block.
+ * + * @param [in] layer The layer from which to construct VIOs + * @param [in] journal The journal to which the block will belong + * @param [out] blockPtr A pointer to receive the new block + * + * @return VDO_SUCCESS or an error + **/ +int makeRecoveryBlock(PhysicalLayer *layer, + RecoveryJournal *journal, + RecoveryJournalBlock **blockPtr) + __attribute__((warn_unused_result)); + +/** + * Free a tail block and null out the reference to it. + * + * @param blockPtr The reference to the tail block to free + **/ +void freeRecoveryBlock(RecoveryJournalBlock **blockPtr); + +/** + * Initialize the next active recovery journal block. + * + * @param block The journal block to initialize + **/ +void initializeRecoveryBlock(RecoveryJournalBlock *block); + +/** + * Enqueue a DataVIO to asynchronously encode and commit its next recovery + * journal entry in this block. The DataVIO will not be continued until the + * entry is committed to the on-disk journal. The caller is responsible for + * ensuring the block is not already full. + * + * @param block The journal block in which to make an entry + * @param dataVIO The DataVIO to enqueue + * + * @return VDO_SUCCESS or an error code if the DataVIO could not be enqueued + **/ +int enqueueRecoveryBlockEntry(RecoveryJournalBlock *block, DataVIO *dataVIO) + __attribute__((warn_unused_result)); + +/** + * Attempt to commit a block. If the block is not the oldest block with + * uncommitted entries or if it is already being committed, nothing will be + * done. + * + * @param block The block to write + * @param callback The function to call when the write completes + * @param errorHandler The handler for flush or write errors + * + * @return VDO_SUCCESS, or an error if the write could not be launched + **/ +int commitRecoveryBlock(RecoveryJournalBlock *block, + VDOAction *callback, + VDOAction *errorHandler) + __attribute__((warn_unused_result)); + +/** + * Dump the contents of the recovery block to the log. + * + * @param block The block to dump + **/ +void dumpRecoveryBlock(const RecoveryJournalBlock *block); + +/** + * Check whether a journal block can be committed. + * + * @param block The journal block in question + * + * @return true if the block can be committed now + **/ +bool canCommitRecoveryBlock(RecoveryJournalBlock *block) + __attribute__((warn_unused_result)); + +#endif // RECOVERY_JOURNAL_BLOCK_H diff --git a/source/vdo/base/recoveryJournalEntry.h b/source/vdo/base/recoveryJournalEntry.h new file mode 100644 index 0000000..bf2a3e0 --- /dev/null +++ b/source/vdo/base/recoveryJournalEntry.h @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournalEntry.h#1 $
+ */
+
+#ifndef RECOVERY_JOURNAL_ENTRY_H
+#define RECOVERY_JOURNAL_ENTRY_H
+
+#include "numeric.h"
+
+#include "blockMapEntry.h"
+#include "journalPoint.h"
+#include "types.h"
+
+/**
+ * A recovery journal entry stores two physical locations: a data location
+ * that is the value of a single mapping in the block map tree, and the
+ * location of the block map page and slot that is either acquiring or
+ * releasing a reference to the data location. The journal entry also stores
+ * an operation code that says whether the reference is being acquired (an
+ * increment) or released (a decrement), and whether the mapping is for a
+ * logical block or for the block map tree itself.
+ **/
+typedef struct {
+  BlockMapSlot slot;
+  DataLocation mapping;
+  JournalOperation operation;
+} RecoveryJournalEntry;
+
+/** The packed, on-disk representation of a recovery journal entry. */
+typedef union __attribute__((packed)) {
+  struct __attribute__((packed)) {
+    /**
+     * In little-endian bit order:
+     * Bits 15..12: The four highest bits of the 36-bit physical block number
+     *              of the block map tree page
+     * Bits 11..2:  The 10-bit block map page slot number
+     * Bits 1..0:   The 2-bit JournalOperation of the entry
+     **/
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+    unsigned operation : 2;
+    unsigned slotLow : 6;
+    unsigned slotHigh : 4;
+    unsigned pbnHighNibble : 4;
+#else
+    unsigned slotLow : 6;
+    unsigned operation : 2;
+    unsigned pbnHighNibble : 4;
+    unsigned slotHigh : 4;
+#endif
+
+    /**
+     * Bits 47..16: The 32 low-order bits of the block map page PBN,
+     *              in little-endian byte order
+     **/
+    byte pbnLowWord[4];
+
+    /**
+     * Bits 87..48: The five-byte block map entry encoding the location that
+     *              was or will be stored in the block map page slot
+     **/
+    BlockMapEntry blockMapEntry;
+  } fields;
+
+  // A raw view of the packed encoding.
+  uint8_t raw[11];
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  // This view is only valid on little-endian machines and is only present for
+  // ease of directly examining packed entries in GDB.
+  struct __attribute__((packed)) {
+    unsigned operation : 2;
+    unsigned slot : 10;
+    unsigned pbnHighNibble : 4;
+    uint32_t pbnLowWord;
+    BlockMapEntry blockMapEntry;
+  } littleEndian;
+#endif
+} PackedRecoveryJournalEntry;
+
+/**
+ * Return the packed, on-disk representation of a recovery journal entry.
+ *
+ * @param entry The journal entry to pack
+ *
+ * @return The packed representation of the journal entry
+ **/
+static inline PackedRecoveryJournalEntry
+packRecoveryJournalEntry(const RecoveryJournalEntry *entry)
+{
+  PackedRecoveryJournalEntry packed = {
+    .fields = {
+      .operation = entry->operation,
+      .slotLow = entry->slot.slot & 0x3F,
+      .slotHigh = (entry->slot.slot >> 6) & 0x0F,
+      .pbnHighNibble = (entry->slot.pbn >> 32) & 0x0F,
+      .blockMapEntry = packPBN(entry->mapping.pbn, entry->mapping.state),
+    }
+  };
+  storeUInt32LE(packed.fields.pbnLowWord, entry->slot.pbn & UINT_MAX);
+  return packed;
+}
+
+/**
+ * Unpack the on-disk representation of a recovery journal entry.
+ * + * @param entry The recovery journal entry to unpack + * + * @return The unpacked entry + **/ +static inline RecoveryJournalEntry +unpackRecoveryJournalEntry(const PackedRecoveryJournalEntry *entry) +{ + PhysicalBlockNumber low32 = getUInt32LE(entry->fields.pbnLowWord); + PhysicalBlockNumber high4 = entry->fields.pbnHighNibble; + return (RecoveryJournalEntry) { + .operation = entry->fields.operation, + .slot = { + .pbn = ((high4 << 32) | low32), + .slot = (entry->fields.slotLow | (entry->fields.slotHigh << 6)), + }, + .mapping = unpackBlockMapEntry(&entry->fields.blockMapEntry), + }; +} + +#endif // RECOVERY_JOURNAL_ENTRY_H diff --git a/source/vdo/base/recoveryJournalInternals.h b/source/vdo/base/recoveryJournalInternals.h new file mode 100644 index 0000000..0266990 --- /dev/null +++ b/source/vdo/base/recoveryJournalInternals.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournalInternals.h#10 $ + */ + +#ifndef RECOVERY_JOURNAL_INTERNALS_H +#define RECOVERY_JOURNAL_INTERNALS_H + +#include "numeric.h" + +#include "adminState.h" +#include "fixedLayout.h" +#include "journalPoint.h" +#include "lockCounter.h" +#include "recoveryJournal.h" +#include "ringNode.h" +#include "statistics.h" +#include "types.h" +#include "waitQueue.h" + +typedef struct recoveryJournalBlock RecoveryJournalBlock; + +struct recoveryJournal { + /** The thread ID of the journal zone */ + ThreadID threadID; + /** The slab depot which can hold locks on this journal */ + SlabDepot *depot; + /** The block map which can hold locks on this journal */ + BlockMap *blockMap; + /** The queue of VIOs waiting to make increment entries */ + WaitQueue incrementWaiters; + /** The queue of VIOs waiting to make decrement entries */ + WaitQueue decrementWaiters; + /** The number of free entries in the journal */ + uint64_t availableSpace; + /** The number of decrement entries which need to be made */ + VIOCount pendingDecrementCount; + /** + * Whether the journal is adding entries from the increment or + * decrement waiters queues + **/ + bool addingEntries; + /** The notifier for read-only mode */ + ReadOnlyNotifier *readOnlyNotifier; + /** The administrative state of the journal */ + AdminState state; + /** Whether a reap is in progress */ + bool reaping; + /** The partition which holds the journal on disk */ + Partition *partition; + /** The oldest active block in the journal on disk for block map rebuild */ + SequenceNumber blockMapHead; + /** The oldest active block in the journal on disk for slab journal replay */ + SequenceNumber slabJournalHead; + /** The newest block in the journal on disk to which a write has finished */ + SequenceNumber lastWriteAcknowledged; + /** The end of the half-open interval of the active journal 
*/ + SequenceNumber tail; + /** The point at which the last entry will have been added */ + JournalPoint appendPoint; + /** The journal point of the VIO most recently released from the journal */ + JournalPoint commitPoint; + /** The nonce of the VDO */ + Nonce nonce; + /** The number of recoveries completed by the VDO */ + uint8_t recoveryCount; + /** The number of entries which fit in a single block */ + JournalEntryCount entriesPerBlock; + /** Unused in-memory journal blocks */ + RingNode freeTailBlocks; + /** In-memory journal blocks with records */ + RingNode activeTailBlocks; + /** A pointer to the active block (the one we are adding entries to now) */ + RecoveryJournalBlock *activeBlock; + /** Journal blocks that need writing */ + WaitQueue pendingWrites; + /** The new block map reap head after reaping */ + SequenceNumber blockMapReapHead; + /** The head block number for the block map rebuild range */ + BlockCount blockMapHeadBlockNumber; + /** The new slab journal reap head after reaping */ + SequenceNumber slabJournalReapHead; + /** The head block number for the slab journal replay range */ + BlockCount slabJournalHeadBlockNumber; + /** The VIO on which we can call flush (less ick, but still ick) */ + VIO *flushVIO; + /** The data block which must live in the VIO in the flush extent */ + char *unusedFlushVIOData; + /** The number of blocks in the on-disk journal */ + BlockCount size; + /** The number of logical blocks that are in-use */ + BlockCount logicalBlocksUsed; + /** The number of block map pages that are allocated */ + BlockCount blockMapDataBlocks; + /** The number of journal blocks written but not yet acknowledged */ + BlockCount pendingWriteCount; + /** The threshold at which slab journal tail blocks will be written out */ + BlockCount slabJournalCommitThreshold; + /** Counters for events in the journal that are reported as statistics */ + RecoveryJournalStatistics events; + /** The locks for each on-disk block */ + LockCounter *lockCounter; +}; + +/** + * Get the physical block number for a given sequence number. + * + * @param journal The journal + * @param sequence The sequence number of the desired block + * + * @return The block number corresponding to the sequence number + **/ +__attribute__((warn_unused_result)) +static inline PhysicalBlockNumber +getRecoveryJournalBlockNumber(const RecoveryJournal *journal, + SequenceNumber sequence) +{ + // Since journal size is a power of two, the block number modulus can just + // be extracted from the low-order bits of the sequence. + return (sequence & (journal->size - 1)); +} + +/** + * Compute the checkByte for a given sequence number. + * + * @param journal The journal + * @param sequence The sequence number + * + * @return The check byte corresponding to the sequence number + **/ +__attribute__((warn_unused_result)) +static inline uint8_t computeRecoveryCheckByte(const RecoveryJournal *journal, + SequenceNumber sequence) +{ + // The check byte must change with each trip around the journal. + return (((sequence / journal->size) & 0x7F) | 0x80); +} + +#endif // RECOVERY_JOURNAL_INTERNALS_H diff --git a/source/vdo/base/recoveryUtils.c b/source/vdo/base/recoveryUtils.c new file mode 100644 index 0000000..44f16ee --- /dev/null +++ b/source/vdo/base/recoveryUtils.c @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryUtils.c#4 $ + */ + +#include "recoveryUtils.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "completion.h" +#include "extent.h" +#include "packedRecoveryJournalBlock.h" +#include "recoveryJournalEntry.h" +#include "recoveryJournalInternals.h" +#include "slabDepot.h" +#include "vdoInternal.h" + +/** + * Finish loading the journal by freeing the extent and notifying the parent. + * This callback is registered in loadJournalAsync(). + * + * @param completion The load extent + **/ +static void finishJournalLoad(VDOCompletion *completion) +{ + int result = completion->result; + VDOCompletion *parent = completion->parent; + VDOExtent *extent = asVDOExtent(completion); + freeExtent(&extent); + finishCompletion(parent, result); +} + +/**********************************************************************/ +void loadJournalAsync(RecoveryJournal *journal, + VDOCompletion *parent, + char **journalDataPtr) +{ + int result = ALLOCATE(journal->size * VDO_BLOCK_SIZE, char, __func__, + journalDataPtr); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + VDOExtent *extent; + result = createExtent(parent->layer, VIO_TYPE_RECOVERY_JOURNAL, + VIO_PRIORITY_METADATA, journal->size, + *journalDataPtr, &extent); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + prepareCompletion(&extent->completion, finishJournalLoad, finishJournalLoad, + parent->callbackThreadID, parent); + readMetadataExtent(extent, + getFixedLayoutPartitionOffset(journal->partition)); +} + +/** + * Determine whether the given header describe a valid block for the + * given journal that could appear at the given offset in the journal. 
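+ *
+ * As a worked example (numbers chosen only for illustration): because the
+ * journal size is a power of two, a header with sequenceNumber 200 in a
+ * 64-block journal can only be congruent with offset 200 & 63 == 8; at any
+ * other offset the block is stale or misplaced.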
+ * + * @param journal The journal to use + * @param header The unpacked block header to check + * @param offset An offset indicating where the block was in the journal + * + * @return True if the header matches + **/ +__attribute__((warn_unused_result)) +static bool isCongruentRecoveryJournalBlock(RecoveryJournal *journal, + const RecoveryBlockHeader *header, + PhysicalBlockNumber offset) +{ + PhysicalBlockNumber expectedOffset + = getRecoveryJournalBlockNumber(journal, header->sequenceNumber); + return ((expectedOffset == offset) + && isValidRecoveryJournalBlock(journal, header)); +} + +/**********************************************************************/ +bool findHeadAndTail(RecoveryJournal *journal, + char *journalData, + SequenceNumber *tailPtr, + SequenceNumber *blockMapHeadPtr, + SequenceNumber *slabJournalHeadPtr) +{ + SequenceNumber highestTail = journal->tail; + SequenceNumber blockMapHeadMax = 0; + SequenceNumber slabJournalHeadMax = 0; + bool foundEntries = false; + for (PhysicalBlockNumber i = 0; i < journal->size; i++) { + PackedJournalHeader *packedHeader + = getJournalBlockHeader(journal, journalData, i); + RecoveryBlockHeader header; + unpackRecoveryBlockHeader(packedHeader, &header); + + if (!isCongruentRecoveryJournalBlock(journal, &header, i)) { + // This block is old, unformatted, or doesn't belong at this location. + continue; + } + + if (header.sequenceNumber >= highestTail) { + foundEntries = true; + highestTail = header.sequenceNumber; + } + if (header.blockMapHead > blockMapHeadMax) { + blockMapHeadMax = header.blockMapHead; + } + if (header.slabJournalHead > slabJournalHeadMax) { + slabJournalHeadMax = header.slabJournalHead; + } + } + + *tailPtr = highestTail; + if (!foundEntries) { + return false; + } + + *blockMapHeadPtr = blockMapHeadMax; + if (slabJournalHeadPtr != NULL) { + *slabJournalHeadPtr = slabJournalHeadMax; + } + return true; +} + +/**********************************************************************/ +int validateRecoveryJournalEntry(const VDO *vdo, + const RecoveryJournalEntry *entry) +{ + if ((entry->slot.pbn >= vdo->config.physicalBlocks) + || (entry->slot.slot >= BLOCK_MAP_ENTRIES_PER_PAGE) + || !isValidLocation(&entry->mapping) + || !isPhysicalDataBlock(vdo->depot, entry->mapping.pbn)) { + return logErrorWithStringError(VDO_CORRUPT_JOURNAL, "Invalid entry:" + " (%llu, %" PRIu16 ") to %" PRIu64 + " (%s) is not within bounds", + entry->slot.pbn, entry->slot.slot, + entry->mapping.pbn, + getJournalOperationName(entry->operation)); + } + + if ((entry->operation == BLOCK_MAP_INCREMENT) + && (isCompressed(entry->mapping.state) + || (entry->mapping.pbn == ZERO_BLOCK))) { + return logErrorWithStringError(VDO_CORRUPT_JOURNAL, "Invalid entry:" + " (%llu, %" PRIu16 ") to %" PRIu64 + " (%s) is not a valid tree mapping", + entry->slot.pbn, entry->slot.slot, + entry->mapping.pbn, + getJournalOperationName(entry->operation)); + } + + return VDO_SUCCESS; +} diff --git a/source/vdo/base/recoveryUtils.h b/source/vdo/base/recoveryUtils.h new file mode 100644 index 0000000..6778af9 --- /dev/null +++ b/source/vdo/base/recoveryUtils.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryUtils.h#5 $ + */ + +#ifndef RECOVERY_UTILS_H +#define RECOVERY_UTILS_H + +#include "constants.h" +#include "packedRecoveryJournalBlock.h" +#include "recoveryJournalEntry.h" +#include "recoveryJournalInternals.h" +#include "types.h" + +/** + * Get the block header for a block at a position in the journal data. + * + * @param journal The recovery journal + * @param journalData The recovery journal data + * @param sequence The sequence number + * + * @return A pointer to a packed recovery journal block header. + **/ +__attribute__((warn_unused_result)) +static inline +PackedJournalHeader *getJournalBlockHeader(RecoveryJournal *journal, + char *journalData, + SequenceNumber sequence) +{ + off_t blockOffset = (getRecoveryJournalBlockNumber(journal, sequence) + * VDO_BLOCK_SIZE); + return (PackedJournalHeader *) &journalData[blockOffset]; +} + +/** + * Determine whether the given header describes a valid block for the + * given journal. A block is not valid if it is unformatted, or if it + * is older than the last successful recovery or reformat. + * + * @param journal The journal to use + * @param header The unpacked block header to check + * + * @return True if the header is valid + **/ +__attribute__((warn_unused_result)) +static inline +bool isValidRecoveryJournalBlock(const RecoveryJournal *journal, + const RecoveryBlockHeader *header) +{ + return ((header->metadataType == VDO_METADATA_RECOVERY_JOURNAL) + && (header->nonce == journal->nonce) + && (header->recoveryCount == journal->recoveryCount)); +} + +/** + * Determine whether the given header describes the exact block indicated. + * + * @param journal The journal to use + * @param header The unpacked block header to check + * @param sequence The expected sequence number + * + * @return True if the block matches + **/ +__attribute__((warn_unused_result)) +static inline +bool isExactRecoveryJournalBlock(const RecoveryJournal *journal, + const RecoveryBlockHeader *header, + SequenceNumber sequence) +{ + return ((header->sequenceNumber == sequence) + && isValidRecoveryJournalBlock(journal, header)); +} + +/** + * Determine whether the header of the given sector could describe a + * valid sector for the given journal block header. + * + * @param header The unpacked block header to compare against + * @param sector The packed sector to check + * + * @return True if the sector matches the block header + **/ +__attribute__((warn_unused_result)) +static inline +bool isValidRecoveryJournalSector(const RecoveryBlockHeader *header, + const PackedJournalSector *sector) +{ + return ((header->checkByte == sector->checkByte) + && (header->recoveryCount == sector->recoveryCount)); +} + +/** + * Load the journal data off the disk. 
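+ *
+ * A minimal sketch of the expected recovery flow (parent is assumed to be a
+ * prepared completion; error handling omitted):
+ *
+ *   char *journalData;
+ *   loadJournalAsync(journal, parent, &journalData);
+ *   // ...once the parent completion has been notified:
+ *   SequenceNumber tail, blockMapHead, slabJournalHead;
+ *   if (findHeadAndTail(journal, journalData, &tail, &blockMapHead,
+ *                       &slabJournalHead)) {
+ *     // replay the entries in blocks [blockMapHead, tail)
+ *   }
+ *   FREE(journalData);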
+ * + * @param [in] journal The recovery journal to load + * @param [in] parent The completion to notify when the load is + * complete + * @param [out] journalDataPtr A pointer to the journal data buffer (it is the + * caller's responsibility to free this buffer) + **/ +void loadJournalAsync(RecoveryJournal *journal, + VDOCompletion *parent, + char **journalDataPtr); + +/** + * Find the tail and the head of the journal by searching for the highest + * sequence number in a block with a valid nonce, and the highest head value + * among the blocks with valid nonces. + * + * @param [in] journal The recovery journal + * @param [in] journalData The journal data read from disk + * @param [out] tailPtr A pointer to return the tail found, or if + * no higher block is found, the value + * currently in the journal + * @param [out] blockMapHeadPtr A pointer to return the block map head + * @param [out] slabJournalHeadPtr An optional pointer to return the slab + * journal head + * + * @return True if there were valid journal blocks + **/ +bool findHeadAndTail(RecoveryJournal *journal, + char *journalData, + SequenceNumber *tailPtr, + SequenceNumber *blockMapHeadPtr, + SequenceNumber *slabJournalHeadPtr); + +/** + * Validate a recovery journal entry. + * + * @param vdo The VDO + * @param entry The entry to validate + * + * @return VDO_SUCCESS or an error + **/ +int validateRecoveryJournalEntry(const VDO *vdo, + const RecoveryJournalEntry *entry) + __attribute__((warn_unused_result)); + +#endif // RECOVERY_UTILS_H diff --git a/source/vdo/base/refCounts.c b/source/vdo/base/refCounts.c new file mode 100644 index 0000000..daf04c4 --- /dev/null +++ b/source/vdo/base/refCounts.c @@ -0,0 +1,1451 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/refCounts.c#9 $ + */ + +#include "refCounts.h" +#include "refCountsInternals.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" +#include "permassert.h" + +#include "adminState.h" +#include "blockAllocatorInternals.h" +#include "completion.h" +#include "extent.h" +#include "header.h" +#include "journalPoint.h" +#include "numUtils.h" +#include "pbnLock.h" +#include "readOnlyNotifier.h" +#include "referenceBlock.h" +#include "referenceOperation.h" +#include "slab.h" +#include "slabJournal.h" +#include "slabJournalInternals.h" +#include "slabSummary.h" +#include "statusCodes.h" +#include "stringUtils.h" +#include "vdo.h" +#include "vioPool.h" +#include "waitQueue.h" + +static const uint64_t BYTES_PER_WORD = sizeof(uint64_t); +static const bool NORMAL_OPERATION = true; + +/** + * Return the RefCounts from the RefCounts waiter. 
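+ *
+ * This is the usual container-of idiom: the waiter is assumed to be the
+ * slabSummaryWaiter embedded in a RefCounts, so subtracting that member's
+ * offset recovers the enclosing structure.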
+ * + * @param waiter The waiter to convert + * + * @return The RefCounts + **/ +__attribute__((warn_unused_result)) +static inline RefCounts *refCountsFromWaiter(Waiter *waiter) +{ + if (waiter == NULL) { + return NULL; + } + return (RefCounts *) + ((uintptr_t) waiter - offsetof(RefCounts, slabSummaryWaiter)); +} + +/** + * Convert the index of a reference counter back to the block number of the + * physical block for which it is counting references. The index is assumed to + * be valid and in-range. + * + * @param refCounts The reference counts object + * @param index The array index of the reference counter + * + * @return the physical block number corresponding to the index + **/ +static PhysicalBlockNumber indexToPBN(const RefCounts *refCounts, + uint64_t index) +{ + return (refCounts->slab->start + index); +} + +/** + * Convert a block number to the index of a reference counter for that block. + * Out of range values are pinned to the beginning or one past the end of the + * array. + * + * @param refCounts The reference counts object + * @param pbn The physical block number + * + * @return the index corresponding to the physical block number + **/ +static uint64_t pbnToIndex(const RefCounts *refCounts, PhysicalBlockNumber pbn) +{ + if (pbn < refCounts->slab->start) { + return 0; + } + uint64_t index = (pbn - refCounts->slab->start); + return minBlock(index, refCounts->blockCount); +} + +/**********************************************************************/ +ReferenceStatus referenceCountToStatus(ReferenceCount count) +{ + if (count == EMPTY_REFERENCE_COUNT) { + return RS_FREE; + } else if (count == 1) { + return RS_SINGLE; + } else if (count == PROVISIONAL_REFERENCE_COUNT) { + return RS_PROVISIONAL; + } else { + return RS_SHARED; + } +} + +/** + * Reset the free block search back to the first reference counter + * in the first reference block. + * + * @param refCounts The RefCounts object containing the search cursor + **/ +static void resetSearchCursor(RefCounts *refCounts) +{ + SearchCursor *cursor = &refCounts->searchCursor; + + cursor->block = cursor->firstBlock; + cursor->index = 0; + // Unit tests have slabs with only one reference block (and it's a runt). + cursor->endIndex = minBlock(COUNTS_PER_BLOCK, refCounts->blockCount); +} + +/** + * Advance the search cursor to the start of the next reference block, + * wrapping around to the first reference block if the current block is the + * last reference block. + * + * @param refCounts The RefCounts object containing the search cursor + * + * @return true unless the cursor was at the last reference block + **/ +static bool advanceSearchCursor(RefCounts *refCounts) +{ + SearchCursor *cursor = &refCounts->searchCursor; + + // If we just finished searching the last reference block, then wrap back + // around to the start of the array. + if (cursor->block == cursor->lastBlock) { + resetSearchCursor(refCounts); + return false; + } + + // We're not already at the end, so advance to cursor to the next block. + cursor->block++; + cursor->index = cursor->endIndex; + + if (cursor->block == cursor->lastBlock) { + // The last reference block will usually be a runt. 
+ cursor->endIndex = refCounts->blockCount; + } else { + cursor->endIndex += COUNTS_PER_BLOCK; + } + return true; +} + +/**********************************************************************/ +int makeRefCounts(BlockCount blockCount, + Slab *slab, + PhysicalBlockNumber origin, + ReadOnlyNotifier *readOnlyNotifier, + RefCounts **refCountsPtr) +{ + BlockCount refBlockCount = getSavedReferenceCountSize(blockCount); + RefCounts *refCounts; + int result = ALLOCATE_EXTENDED(RefCounts, refBlockCount, ReferenceBlock, + "ref counts structure", &refCounts); + if (result != UDS_SUCCESS) { + return result; + } + + // Allocate such that the runt slab has a full-length memory array, + // plus a little padding so we can word-search even at the very end. + size_t bytes = ((refBlockCount * COUNTS_PER_BLOCK) + (2 * BYTES_PER_WORD)); + result = ALLOCATE(bytes, ReferenceCount, "ref counts array", + &refCounts->counters); + if (result != UDS_SUCCESS) { + freeRefCounts(&refCounts); + return result; + } + + refCounts->slab = slab; + refCounts->blockCount = blockCount; + refCounts->freeBlocks = blockCount; + refCounts->origin = origin; + refCounts->referenceBlockCount = refBlockCount; + refCounts->readOnlyNotifier = readOnlyNotifier; + refCounts->statistics = &slab->allocator->refCountStatistics; + refCounts->searchCursor.firstBlock = &refCounts->blocks[0]; + refCounts->searchCursor.lastBlock = &refCounts->blocks[refBlockCount - 1]; + resetSearchCursor(refCounts); + + for (size_t index = 0; index < refBlockCount; index++) { + refCounts->blocks[index] = (ReferenceBlock) { + .refCounts = refCounts, + }; + } + + *refCountsPtr = refCounts; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeRefCounts(RefCounts **refCountsPtr) +{ + RefCounts *refCounts = *refCountsPtr; + if (refCounts == NULL) { + return; + } + + FREE(refCounts->counters); + FREE(refCounts); + *refCountsPtr = NULL; +} + +/** + * Check whether a RefCounts has active I/O. + * + * @param refCounts The RefCounts to check + * + * @return true if there is reference block I/O or a summary + * update in progress + **/ +__attribute__((warn_unused_result)) +static bool hasActiveIO(RefCounts *refCounts) +{ + return ((refCounts->activeCount > 0) || refCounts->updatingSlabSummary); +} + +/**********************************************************************/ +bool areRefCountsActive(RefCounts *refCounts) +{ + if (hasActiveIO(refCounts)) { + return true; + } + + // When not suspending or recovering, the refCounts must be clean. + AdminStateCode code = refCounts->slab->state.state; + return (hasWaiters(&refCounts->dirtyBlocks) + && (code != ADMIN_STATE_SUSPENDING) + && (code != ADMIN_STATE_RECOVERING)); +} + +/**********************************************************************/ +static void enterRefCountsReadOnlyMode(RefCounts *refCounts, int result) +{ + enterReadOnlyMode(refCounts->readOnlyNotifier, result); + checkIfSlabDrained(refCounts->slab); +} + +/** + * Enqueue a block on the dirty queue. + * + * @param block The block to enqueue + **/ +static void enqueueDirtyBlock(ReferenceBlock *block) +{ + int result = enqueueWaiter(&block->refCounts->dirtyBlocks, &block->waiter); + if (result != VDO_SUCCESS) { + // This should never happen. + enterRefCountsReadOnlyMode(block->refCounts, result); + } +} + +/** + * Mark a reference count block as dirty, potentially adding it to the dirty + * queue if it wasn't already dirty. 
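+ *
+ * Note that a block which is currently being written is marked dirty but not
+ * re-enqueued here; the write completion re-enqueues it, so a block never
+ * appears on the dirty queue twice.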
+ * + * @param block The reference block to mark as dirty + **/ +static void dirtyBlock(ReferenceBlock *block) +{ + if (block->isDirty) { + return; + } + + block->isDirty = true; + if (block->isWriting) { + // The conclusion of the current write will enqueue the block again. + return; + } + + enqueueDirtyBlock(block); +} + +/**********************************************************************/ +BlockCount getUnreferencedBlockCount(RefCounts *refCounts) +{ + return refCounts->freeBlocks; +} + +/**********************************************************************/ +ReferenceBlock *getReferenceBlock(RefCounts *refCounts, SlabBlockNumber index) +{ + return &refCounts->blocks[index / COUNTS_PER_BLOCK]; +} + +/** + * Get the reference counter that covers the given physical block number. + * + * @param [in] refCounts The refcounts object + * @param [in] pbn The physical block number + * @param [out] counterPtr A pointer to the reference counter + + **/ +static int getReferenceCounter(RefCounts *refCounts, + PhysicalBlockNumber pbn, + ReferenceCount **counterPtr) +{ + SlabBlockNumber index; + int result = slabBlockNumberFromPBN(refCounts->slab, pbn, &index); + if (result != VDO_SUCCESS) { + return result; + } + + *counterPtr = &refCounts->counters[index]; + + return VDO_SUCCESS; +} + +/**********************************************************************/ +uint8_t getAvailableReferences(RefCounts *refCounts, PhysicalBlockNumber pbn) +{ + ReferenceCount *counterPtr = NULL; + int result = getReferenceCounter(refCounts, pbn, &counterPtr); + if (result != VDO_SUCCESS) { + return 0; + } + + if (*counterPtr == PROVISIONAL_REFERENCE_COUNT) { + return (MAXIMUM_REFERENCE_COUNT - 1); + } + + return (MAXIMUM_REFERENCE_COUNT - *counterPtr); +} + +/** + * Increment the reference count for a data block. + * + * @param [in] refCounts The refCounts responsible for the block + * @param [in] block The reference block which contains the + * block being updated + * @param [in] slabBlockNumber The block to update + * @param [in] oldStatus The reference status of the data block + * before this increment + * @param [in] lock The PBNLock associated with this + * increment (may be NULL) + * @param [in,out] counterPtr A pointer to the count for the data block + * @param [out] freeStatusChanged A pointer which will be set to true if + * this update changed the free status of + * the block + * + * @return VDO_SUCCESS or an error + **/ +static int incrementForData(RefCounts *refCounts, + ReferenceBlock *block, + SlabBlockNumber slabBlockNumber, + ReferenceStatus oldStatus, + PBNLock *lock, + ReferenceCount *counterPtr, + bool *freeStatusChanged) +{ + switch (oldStatus) { + case RS_FREE: + *counterPtr = 1; + block->allocatedCount++; + refCounts->freeBlocks--; + *freeStatusChanged = true; + break; + + case RS_PROVISIONAL: + *counterPtr = 1; + *freeStatusChanged = false; + break; + + default: + // Single or shared + if (*counterPtr >= MAXIMUM_REFERENCE_COUNT) { + return logErrorWithStringError(VDO_REF_COUNT_INVALID, + "Incrementing a block already having" + " 254 references (slab %u, offset %" + PRIu32 ")", + refCounts->slab->slabNumber, + slabBlockNumber); + } + (*counterPtr)++; + *freeStatusChanged = false; + } + + if (lock != NULL) { + unassignProvisionalReference(lock); + } + return VDO_SUCCESS; +} + +/** + * Decrement the reference count for a data block. 
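+ *
+ * For reference, the transitions implemented here and in incrementForData()
+ * above are (summarized from the code, not normative):
+ *
+ *   increment:  FREE -> 1, PROVISIONAL -> 1, n -> n + 1 (an error once the
+ *               count is already 254)
+ *   decrement:  1 or PROVISIONAL -> PROVISIONAL when a read lock is held,
+ *               otherwise -> FREE; shared counts go n -> n - 1;
+ *               decrementing a FREE block is an error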
+ * + * @param [in] refCounts The refCounts responsible for the block + * @param [in] block The reference block which contains the + * block being updated + * @param [in] slabBlockNumber The block to update + * @param [in] oldStatus The reference status of the data block + * before this decrement + * @param [in] lock The PBNLock associated with the block + * being decremented (may be NULL) + * @param [in,out] counterPtr A pointer to the count for the data block + * @param [out] freeStatusChanged A pointer which will be set to true if + * this update changed the free status of + * the block + * + * @return VDO_SUCCESS or an error + **/ +static int decrementForData(RefCounts *refCounts, + ReferenceBlock *block, + SlabBlockNumber slabBlockNumber, + ReferenceStatus oldStatus, + PBNLock *lock, + ReferenceCount *counterPtr, + bool *freeStatusChanged) +{ + switch (oldStatus) { + case RS_FREE: + return logErrorWithStringError(VDO_REF_COUNT_INVALID, + "Decrementing free block at offset %" + PRIu32 " in slab %u", slabBlockNumber, + refCounts->slab->slabNumber); + + case RS_PROVISIONAL: + case RS_SINGLE: + if (lock != NULL) { + // There is a read lock on this block, so the block must not become + // unreferenced. + *counterPtr = PROVISIONAL_REFERENCE_COUNT; + *freeStatusChanged = false; + assignProvisionalReference(lock); + } else { + *counterPtr = EMPTY_REFERENCE_COUNT; + block->allocatedCount--; + refCounts->freeBlocks++; + *freeStatusChanged = true; + } + break; + + default: + // Shared + (*counterPtr)--; + *freeStatusChanged = false; + } + + return VDO_SUCCESS; +} + +/** + * Increment the reference count for a block map page. All block map increments + * should be from provisional to MAXIMUM_REFERENCE_COUNT. Since block map blocks + * never dedupe they should never be adjusted from any other state. The + * adjustment always results in MAXIMUM_REFERENCE_COUNT as this value is used to + * prevent dedupe against block map blocks. + * + * @param [in] refCounts The refCounts responsible for the block + * @param [in] block The reference block which contains the + * block being updated + * @param [in] slabBlockNumber The block to update + * @param [in] oldStatus The reference status of the block + * before this increment + * @param [in] lock The PBNLock associated with this + * increment (may be NULL) + * @param [in] normalOperation Whether we are in normal operation vs. 
+ * recovery or rebuild + * @param [in,out] counterPtr A pointer to the count for the block + * @param [out] freeStatusChanged A pointer which will be set to true if + * this update changed the free status of the + * block + * + * @return VDO_SUCCESS or an error + **/ +static int incrementForBlockMap(RefCounts *refCounts, + ReferenceBlock *block, + SlabBlockNumber slabBlockNumber, + ReferenceStatus oldStatus, + PBNLock *lock, + bool normalOperation, + ReferenceCount *counterPtr, + bool *freeStatusChanged) +{ + switch (oldStatus) { + case RS_FREE: + if (normalOperation) { + return logErrorWithStringError(VDO_REF_COUNT_INVALID, + "Incrementing unallocated block map block" + " (slab %u, offset %" PRIu32 ")", + refCounts->slab->slabNumber, + slabBlockNumber); + } + + *counterPtr = MAXIMUM_REFERENCE_COUNT; + block->allocatedCount++; + refCounts->freeBlocks--; + *freeStatusChanged = true; + return VDO_SUCCESS; + + case RS_PROVISIONAL: + if (!normalOperation) { + return logErrorWithStringError(VDO_REF_COUNT_INVALID, + "Block map block had provisional " + "reference during replay" + " (slab %u, offset %" PRIu32 ")", + refCounts->slab->slabNumber, + slabBlockNumber); + } + + *counterPtr = MAXIMUM_REFERENCE_COUNT; + *freeStatusChanged = false; + if (lock != NULL) { + unassignProvisionalReference(lock); + } + return VDO_SUCCESS; + + default: + return logErrorWithStringError(VDO_REF_COUNT_INVALID, + "Incrementing a block map block which is " + "already referenced %u times (slab %u, " + "offset %" PRIu32 ")", + *counterPtr, + refCounts->slab->slabNumber, + slabBlockNumber); + } +} + +/** + * Update the reference count of a block. + * + * @param [in] refCounts The refCounts responsible for the + * block + * @param [in] block The reference block which contains the + * block being updated + * @param [in] slabBlockNumber The block to update + * @param [in] slabJournalPoint The slab journal point at which this + * update is journaled + * @param [in] operation How to update the count + * @param [in] normalOperation Whether we are in normal operation vs. 
+ * recovery or rebuild + * @param [out] freeStatusChanged A pointer which will be set to true if + * this update changed the free status of + * the block + * @param [out] provisionalDecrementPtr A pointer which will be set to true if + * this update was a decrement of a + * provisional reference + * + * @return VDO_SUCCESS or an error + **/ +static int updateReferenceCount(RefCounts *refCounts, + ReferenceBlock *block, + SlabBlockNumber slabBlockNumber, + const JournalPoint *slabJournalPoint, + ReferenceOperation operation, + bool normalOperation, + bool *freeStatusChanged, + bool *provisionalDecrementPtr) +{ + ReferenceCount *counterPtr = &refCounts->counters[slabBlockNumber]; + ReferenceStatus oldStatus = referenceCountToStatus(*counterPtr); + PBNLock *lock = getReferenceOperationPBNLock(operation); + int result; + + switch (operation.type) { + case DATA_INCREMENT: + result = incrementForData(refCounts, block, slabBlockNumber, oldStatus, + lock, counterPtr, freeStatusChanged); + break; + + case DATA_DECREMENT: + result = decrementForData(refCounts, block, slabBlockNumber, oldStatus, + lock, counterPtr, freeStatusChanged); + if ((result == VDO_SUCCESS) && (oldStatus == RS_PROVISIONAL)) { + if (provisionalDecrementPtr != NULL) { + *provisionalDecrementPtr = true; + } + return VDO_SUCCESS; + } + break; + + case BLOCK_MAP_INCREMENT: + result = incrementForBlockMap(refCounts, block, slabBlockNumber, oldStatus, + lock, normalOperation, counterPtr, + freeStatusChanged); + break; + + default: + logError("Unknown reference count operation: %u", operation.type); + enterRefCountsReadOnlyMode(refCounts, VDO_NOT_IMPLEMENTED); + result = VDO_NOT_IMPLEMENTED; + } + + if (result != VDO_SUCCESS) { + return result; + } + + if (isValidJournalPoint(slabJournalPoint)) { + refCounts->slabJournalPoint = *slabJournalPoint; + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int adjustReferenceCount(RefCounts *refCounts, + ReferenceOperation operation, + const JournalPoint *slabJournalPoint, + bool *freeStatusChanged) +{ + if (!isSlabOpen(refCounts->slab)) { + return VDO_INVALID_ADMIN_STATE; + } + + SlabBlockNumber slabBlockNumber; + int result = slabBlockNumberFromPBN(refCounts->slab, operation.pbn, + &slabBlockNumber); + if (result != VDO_SUCCESS) { + return result; + } + + ReferenceBlock *block = getReferenceBlock(refCounts, slabBlockNumber); + bool provisionalDecrement = false; + result = updateReferenceCount(refCounts, block, slabBlockNumber, + slabJournalPoint, operation, + NORMAL_OPERATION, freeStatusChanged, + &provisionalDecrement); + if ((result != VDO_SUCCESS) || provisionalDecrement) { + return result; + } + + if (block->isDirty && (block->slabJournalLock > 0)) { + /* + * This block is already dirty and a slab journal entry has been made + * for it since the last time it was clean. We must release the per-entry + * slab journal lock for the entry associated with the update we are now + * doing. + */ + result = ASSERT(isValidJournalPoint(slabJournalPoint), + "Reference count adjustments need slab journal points."); + if (result != VDO_SUCCESS) { + return result; + } + + SequenceNumber entryLock = slabJournalPoint->sequenceNumber; + adjustSlabJournalBlockReference(refCounts->slab->journal, entryLock, -1); + return VDO_SUCCESS; + } + + /* + * This may be the first time we are applying an update for which there + * is a slab journal entry to this block since the block was + * cleaned. 
Therefore, we convert the per-entry slab journal lock to an + * uncommitted reference block lock, if there is a per-entry lock. + */ + if (isValidJournalPoint(slabJournalPoint)) { + block->slabJournalLock = slabJournalPoint->sequenceNumber; + } else { + block->slabJournalLock = 0; + } + + dirtyBlock(block); + return VDO_SUCCESS; +} + +/**********************************************************************/ +int adjustReferenceCountForRebuild(RefCounts *refCounts, + PhysicalBlockNumber pbn, + JournalOperation operation) +{ + SlabBlockNumber slabBlockNumber; + int result = slabBlockNumberFromPBN(refCounts->slab, pbn, &slabBlockNumber); + if (result != VDO_SUCCESS) { + return result; + } + + ReferenceBlock *block = getReferenceBlock(refCounts, slabBlockNumber); + bool unusedFreeStatus; + ReferenceOperation physicalOperation = { + .type = operation, + }; + result = updateReferenceCount(refCounts, block, slabBlockNumber, NULL, + physicalOperation, !NORMAL_OPERATION, + &unusedFreeStatus, NULL); + if (result != VDO_SUCCESS) { + return result; + } + + dirtyBlock(block); + return VDO_SUCCESS; +} + +/**********************************************************************/ +int replayReferenceCountChange(RefCounts *refCounts, + const JournalPoint *entryPoint, + SlabJournalEntry entry) +{ + ReferenceBlock *block = getReferenceBlock(refCounts, entry.sbn); + SectorCount sector + = (entry.sbn % COUNTS_PER_BLOCK) / COUNTS_PER_SECTOR; + if (!beforeJournalPoint(&block->commitPoints[sector], entryPoint)) { + // This entry is already reflected in the existing counts, so do nothing. + return VDO_SUCCESS; + } + + // This entry is not yet counted in the reference counts. + bool unusedFreeStatus; + ReferenceOperation operation = { + .type = entry.operation + }; + int result = updateReferenceCount(refCounts, block, entry.sbn, + entryPoint, operation, !NORMAL_OPERATION, + &unusedFreeStatus, NULL); + if (result != VDO_SUCCESS) { + return result; + } + + dirtyBlock(block); + return VDO_SUCCESS; +} + +/**********************************************************************/ +int getReferenceStatus(RefCounts *refCounts, + PhysicalBlockNumber pbn, + ReferenceStatus *statusPtr) +{ + ReferenceCount *counterPtr = NULL; + int result = getReferenceCounter(refCounts, pbn, &counterPtr); + if (result != VDO_SUCCESS) { + return result; + } + + *statusPtr = referenceCountToStatus(*counterPtr); + return VDO_SUCCESS; +} + +/**********************************************************************/ +bool areEquivalentReferenceCounters(RefCounts *counterA, RefCounts *counterB) +{ + if ((counterA->blockCount != counterB->blockCount) + || (counterA->freeBlocks != counterB->freeBlocks) + || (counterA->referenceBlockCount != counterB->referenceBlockCount)) { + return false; + } + + for (size_t i = 0; i < counterA->referenceBlockCount; i++) { + ReferenceBlock *blockA = &counterA->blocks[i]; + ReferenceBlock *blockB = &counterB->blocks[i]; + if (blockA->allocatedCount != blockB->allocatedCount) { + return false; + } + } + + return (memcmp(counterA->counters, counterB->counters, + sizeof(ReferenceCount) * counterA->blockCount) == 0); +} + +/** + * Find the array index of the first zero byte in word-sized range of + * reference counters. The search does no bounds checking; the function relies + * on the array being sufficiently padded. 
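+ *
+ * Worked example (counter values chosen only for illustration): if the eight
+ * counters starting at startIndex hold { 3, 7, 0, 1, ... }, the little-endian
+ * word has a zero in its third byte, so the result is startIndex + 2; if none
+ * of the eight bytes is zero, failIndex is returned instead.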
+ * + * @param wordPtr A pointer to the eight counter bytes to check + * @param startIndex The array index corresponding to wordPtr[0] + * @param failIndex The array index to return if no zero byte is found + + * @return the array index of the first zero byte in the word, or + * the value passed as failIndex if no zero byte was found + **/ +static inline SlabBlockNumber findZeroByteInWord(const byte *wordPtr, + SlabBlockNumber startIndex, + SlabBlockNumber failIndex) +{ + uint64_t word = getUInt64LE(wordPtr); + + // This looks like a loop, but GCC will unroll the eight iterations for us. + for (unsigned int offset = 0; offset < BYTES_PER_WORD; offset++) { + // Assumes little-endian byte order, which we have on X86. + if ((word & 0xFF) == 0) { + return (startIndex + offset); + } + word >>= 8; + } + + return failIndex; +} + +/**********************************************************************/ +bool findFreeBlock(const RefCounts *refCounts, + SlabBlockNumber startIndex, + SlabBlockNumber endIndex, + SlabBlockNumber *indexPtr) +{ + SlabBlockNumber zeroIndex; + SlabBlockNumber nextIndex = startIndex; + byte *nextCounter = &refCounts->counters[nextIndex]; + byte *endCounter = &refCounts->counters[endIndex]; + + // Search every byte of the first unaligned word. (Array is padded so + // reading past end is safe.) + zeroIndex = findZeroByteInWord(nextCounter, nextIndex, endIndex); + if (zeroIndex < endIndex) { + *indexPtr = zeroIndex; + return true; + } + + // On architectures where unaligned word access is expensive, this + // would be a good place to advance to an alignment boundary. + nextIndex += BYTES_PER_WORD; + nextCounter += BYTES_PER_WORD; + + // Now we're word-aligned; check an word at a time until we find a word + // containing a zero. (Array is padded so reading past end is safe.) + while (nextCounter < endCounter) { + /* + * The following code is currently an exact copy of the code preceding the + * loop, but if you try to merge them by using a do loop, it runs slower + * because a jump instruction gets added at the start of the iteration. + */ + zeroIndex = findZeroByteInWord(nextCounter, nextIndex, endIndex); + if (zeroIndex < endIndex) { + *indexPtr = zeroIndex; + return true; + } + + nextIndex += BYTES_PER_WORD; + nextCounter += BYTES_PER_WORD; + } + + return false; +} + +/** + * Search the reference block currently saved in the search cursor for a + * reference count of zero, starting at the saved counter index. + * + * @param [in] refCounts The RefCounts object to search + * @param [out] freeIndexPtr A pointer to receive the array index of the + * zero reference count + * + * @return true if an unreferenced counter was found + **/ +static bool searchCurrentReferenceBlock(const RefCounts *refCounts, + SlabBlockNumber *freeIndexPtr) +{ + // Don't bother searching if the current block is known to be full. + return ((refCounts->searchCursor.block->allocatedCount < COUNTS_PER_BLOCK) + && findFreeBlock(refCounts, refCounts->searchCursor.index, + refCounts->searchCursor.endIndex, freeIndexPtr)); +} + +/** + * Search each reference block for a reference count of zero, starting at the + * reference block and counter index saved in the search cursor and searching + * up to the end of the last reference block. The search does not wrap. 
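+ *
+ * When the cursor reaches the last reference block, advanceSearchCursor()
+ * resets it to the first block and returns false, so a failed search leaves
+ * the cursor positioned to scan the whole slab on the next attempt.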
+ * + * @param [in] refCounts The RefCounts object to search + * @param [out] freeIndexPtr A pointer to receive the array index of the + * zero reference count + * + * @return true if an unreferenced counter was found + **/ +static bool searchReferenceBlocks(RefCounts *refCounts, + SlabBlockNumber *freeIndexPtr) +{ + // Start searching at the saved search position in the current block. + if (searchCurrentReferenceBlock(refCounts, freeIndexPtr)) { + return true; + } + + // Search each reference block up to the end of the slab. + while (advanceSearchCursor(refCounts)) { + if (searchCurrentReferenceBlock(refCounts, freeIndexPtr)) { + return true; + } + } + + return false; +} + +/** + * Do the bookkeeping for making a provisional reference. + * + * @param refCounts The RefCounts + * @param slabBlockNumber The block to reference + **/ +static void makeProvisionalReference(RefCounts *refCounts, + SlabBlockNumber slabBlockNumber) +{ + // Make the initial transition from an unreferenced block to a provisionally + // allocated block. + refCounts->counters[slabBlockNumber] = PROVISIONAL_REFERENCE_COUNT; + + // Account for the allocation. + ReferenceBlock *block = getReferenceBlock(refCounts, slabBlockNumber); + block->allocatedCount++; + refCounts->freeBlocks--; +} + +/**********************************************************************/ +int allocateUnreferencedBlock(RefCounts *refCounts, + PhysicalBlockNumber *allocatedPtr) +{ + if (!isSlabOpen(refCounts->slab)) { + return VDO_INVALID_ADMIN_STATE; + } + + SlabBlockNumber freeIndex; + if (!searchReferenceBlocks(refCounts, &freeIndex)) { + return VDO_NO_SPACE; + } + + ASSERT_LOG_ONLY((refCounts->counters[freeIndex] == EMPTY_REFERENCE_COUNT), + "free block must have refCount of zero"); + makeProvisionalReference(refCounts, freeIndex); + + // Update the search hint so the next search will start at the array + // index just past the free block we just found. + refCounts->searchCursor.index = (freeIndex + 1); + + *allocatedPtr = indexToPBN(refCounts, freeIndex); + return VDO_SUCCESS; +} + +/**********************************************************************/ +int provisionallyReferenceBlock(RefCounts *refCounts, + PhysicalBlockNumber pbn, + PBNLock *lock) +{ + if (!isSlabOpen(refCounts->slab)) { + return VDO_INVALID_ADMIN_STATE; + } + + SlabBlockNumber slabBlockNumber; + int result = slabBlockNumberFromPBN(refCounts->slab, pbn, &slabBlockNumber); + if (result != VDO_SUCCESS) { + return result; + } + + if (refCounts->counters[slabBlockNumber] == EMPTY_REFERENCE_COUNT) { + makeProvisionalReference(refCounts, slabBlockNumber); + if (lock != NULL) { + assignProvisionalReference(lock); + } + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +BlockCount countUnreferencedBlocks(RefCounts *refCounts, + PhysicalBlockNumber startPBN, + PhysicalBlockNumber endPBN) +{ + BlockCount freeBlocks = 0; + SlabBlockNumber startIndex = pbnToIndex(refCounts, startPBN); + SlabBlockNumber endIndex = pbnToIndex(refCounts, endPBN); + for (SlabBlockNumber index = startIndex; index < endIndex; index++) { + if (refCounts->counters[index] == EMPTY_REFERENCE_COUNT) { + freeBlocks++; + } + } + + return freeBlocks; +} + +/** + * Convert a ReferenceBlock's generic wait queue entry back into the + * ReferenceBlock. 
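+ *
+ * The cast below is safe only because the waiter is the first member of
+ * ReferenceBlock; the STATIC_ASSERT in the function body checks this at
+ * compile time.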
+ * + * @param waiter The wait queue entry to convert + * + * @return The wrapping ReferenceBlock + **/ +static inline ReferenceBlock *waiterAsReferenceBlock(Waiter *waiter) +{ + STATIC_ASSERT(offsetof(ReferenceBlock, waiter) == 0); + return (ReferenceBlock *) waiter; +} + +/** + * WaitCallback to clean dirty reference blocks when resetting. + * + * @param blockWaiter The dirty block + * @param context Unused + **/ +static void +clearDirtyReferenceBlocks(Waiter *blockWaiter, + void *context __attribute__((unused))) +{ + waiterAsReferenceBlock(blockWaiter)->isDirty = false; +} + +/**********************************************************************/ +void resetReferenceCounts(RefCounts *refCounts) +{ + // We can just use memset() since each ReferenceCount is exactly one byte. + STATIC_ASSERT(sizeof(ReferenceCount) == 1); + memset(refCounts->counters, 0, refCounts->blockCount); + refCounts->freeBlocks = refCounts->blockCount; + refCounts->slabJournalPoint = (JournalPoint) { + .sequenceNumber = 0, + .entryCount = 0, + }; + + for (size_t i = 0; i < refCounts->referenceBlockCount; i++) { + refCounts->blocks[i].allocatedCount = 0; + } + + notifyAllWaiters(&refCounts->dirtyBlocks, clearDirtyReferenceBlocks, NULL); +} + +/**********************************************************************/ +BlockCount getSavedReferenceCountSize(BlockCount blockCount) +{ + return computeBucketCount(blockCount, COUNTS_PER_BLOCK); +} + +/** + * A waiter callback that resets the writing state of refCounts. + **/ +static void finishSummaryUpdate(Waiter *waiter, void *context) +{ + RefCounts *refCounts = refCountsFromWaiter(waiter); + refCounts->updatingSlabSummary = false; + + int result = *((int *) context); + if ((result == VDO_SUCCESS) || (result == VDO_READ_ONLY)) { + checkIfSlabDrained(refCounts->slab); + return; + } + + logErrorWithStringError(result, "failed to update slab summary"); + enterRefCountsReadOnlyMode(refCounts, result); +} + +/** + * Update slab summary that the RefCounts is clean. + * + * @param refCounts The RefCounts object that is being written + **/ +static void updateSlabSummaryAsClean(RefCounts *refCounts) +{ + SlabSummaryZone *summary = getSlabSummaryZone(refCounts->slab->allocator); + if (summary == NULL) { + return; + } + + // Update the slab summary to indicate this refCounts is clean. + TailBlockOffset offset + = getSummarizedTailBlockOffset(summary, refCounts->slab->slabNumber); + refCounts->updatingSlabSummary = true; + refCounts->slabSummaryWaiter.callback = finishSummaryUpdate; + updateSlabSummaryEntry(summary, &refCounts->slabSummaryWaiter, + refCounts->slab->slabNumber, offset, true, true, + getSlabFreeBlockCount(refCounts->slab)); +} + +/** + * Handle an I/O error reading or writing a reference count block. + * + * @param completion The VIO doing the I/O as a completion + **/ +static void handleIOError(VDOCompletion *completion) +{ + int result = completion->result; + VIOPoolEntry *entry = completion->parent; + RefCounts *refCounts = ((ReferenceBlock *) entry->parent)->refCounts; + returnVIO(refCounts->slab->allocator, entry); + refCounts->activeCount--; + enterRefCountsReadOnlyMode(refCounts, result); +} + +/** + * After a reference block has written, clean it, release its locks, and return + * its VIO to the pool. 
+ * + * @param completion The VIO that just finished writing + **/ +static void finishReferenceBlockWrite(VDOCompletion *completion) +{ + VIOPoolEntry *entry = completion->parent; + ReferenceBlock *block = entry->parent; + RefCounts *refCounts = block->refCounts; + refCounts->activeCount--; + + // Release the slab journal lock. + adjustSlabJournalBlockReference(refCounts->slab->journal, + block->slabJournalLockToRelease, -1); + returnVIO(refCounts->slab->allocator, entry); + + /* + * We can't clear the isWriting flag earlier as releasing the slab journal + * lock may cause us to be dirtied again, but we don't want to double + * enqueue. + */ + block->isWriting = false; + + if (isReadOnly(refCounts->readOnlyNotifier)) { + checkIfSlabDrained(refCounts->slab); + return; + } + + // Re-queue the block if it was re-dirtied while it was writing. + if (block->isDirty) { + enqueueDirtyBlock(block); + if (isSlabDraining(refCounts->slab)) { + // We must be saving, and this block will otherwise not be relaunched. + saveDirtyReferenceBlocks(refCounts); + } + + return; + } + + // Mark the RefCounts as clean in the slab summary if there are no dirty + // or writing blocks and no summary update in progress. + if (!hasActiveIO(refCounts) && !hasWaiters(&refCounts->dirtyBlocks)) { + updateSlabSummaryAsClean(refCounts); + } +} + +/**********************************************************************/ +ReferenceCount *getReferenceCountersForBlock(ReferenceBlock *block) +{ + size_t blockIndex = block - block->refCounts->blocks; + return &block->refCounts->counters[blockIndex * COUNTS_PER_BLOCK]; +} + +/**********************************************************************/ +void packReferenceBlock(ReferenceBlock *block, void *buffer) +{ + PackedJournalPoint commitPoint; + packJournalPoint(&block->refCounts->slabJournalPoint, &commitPoint); + + PackedReferenceBlock *packed = buffer; + ReferenceCount *counters = getReferenceCountersForBlock(block); + for (SectorCount i = 0; i < SECTORS_PER_BLOCK; i++) { + packed->sectors[i].commitPoint = commitPoint; + memcpy(packed->sectors[i].counts, counters + (i * COUNTS_PER_SECTOR), + (sizeof(ReferenceCount) * COUNTS_PER_SECTOR)); + } +} + +/** + * After a dirty block waiter has gotten a VIO from the VIO pool, copy its + * counters and associated data into the VIO, and launch the write. + * + * @param blockWaiter The waiter of the dirty block + * @param vioContext The VIO returned by the pool + **/ +static void writeReferenceBlock(Waiter *blockWaiter, void *vioContext) +{ + VIOPoolEntry *entry = vioContext; + ReferenceBlock *block = waiterAsReferenceBlock(blockWaiter); + packReferenceBlock(block, entry->buffer); + + size_t blockOffset = (block - block->refCounts->blocks); + PhysicalBlockNumber pbn = (block->refCounts->origin + blockOffset); + block->slabJournalLockToRelease = block->slabJournalLock; + entry->parent = block; + + /* + * Mark the block as clean, since we won't be committing any updates that + * happen after this moment. As long as VIO order is preserved, two + * VIOs updating this block at once will not cause complications. + */ + block->isDirty = false; + + // Flush before writing to ensure that the recovery journal and slab journal + // entries which cover this reference update are stable (VDO-2331). 
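+  // (The flush is requested through the trailing boolean arguments of
+  // launchWriteMetadataVIOWithFlush() below, assumed here to mean
+  // flush-before and flush-after respectively; only the first is set.)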
+ relaxedAdd64(&block->refCounts->statistics->blocksWritten, 1); + entry->vio->completion.callbackThreadID + = block->refCounts->slab->allocator->threadID; + launchWriteMetadataVIOWithFlush(entry->vio, pbn, finishReferenceBlockWrite, + handleIOError, true, false); +} + +/** + * Launch the write of a dirty reference block by first acquiring a VIO for it + * from the pool. This can be asynchronous since the writer will have to wait + * if all VIOs in the pool are currently in use. + * + * @param blockWaiter The waiter of the block which is starting to write + * @param context The parent refCounts of the block + **/ +static void launchReferenceBlockWrite(Waiter *blockWaiter, void *context) +{ + RefCounts *refCounts = context; + if (isReadOnly(refCounts->readOnlyNotifier)) { + return; + } + + refCounts->activeCount++; + ReferenceBlock *block = waiterAsReferenceBlock(blockWaiter); + block->isWriting = true; + blockWaiter->callback = writeReferenceBlock; + int result = acquireVIO(refCounts->slab->allocator, blockWaiter); + if (result != VDO_SUCCESS) { + // This should never happen. + refCounts->activeCount--; + enterRefCountsReadOnlyMode(refCounts, result); + } +} + +/**********************************************************************/ +void saveOldestReferenceBlock(RefCounts *refCounts) +{ + notifyNextWaiter(&refCounts->dirtyBlocks, launchReferenceBlockWrite, + refCounts); +} + +/**********************************************************************/ +void saveSeveralReferenceBlocks(RefCounts *refCounts, size_t flushDivisor) +{ + BlockCount dirtyBlockCount = countWaiters(&refCounts->dirtyBlocks); + if (dirtyBlockCount == 0) { + return; + } + + BlockCount blocksToWrite = dirtyBlockCount / flushDivisor; + // Always save at least one block. + if (blocksToWrite == 0) { + blocksToWrite = 1; + } + + for (BlockCount written = 0; written < blocksToWrite; written++) { + saveOldestReferenceBlock(refCounts); + } +} + +/**********************************************************************/ +void saveDirtyReferenceBlocks(RefCounts *refCounts) +{ + notifyAllWaiters(&refCounts->dirtyBlocks, launchReferenceBlockWrite, + refCounts); + checkIfSlabDrained(refCounts->slab); +} + +/**********************************************************************/ +void dirtyAllReferenceBlocks(RefCounts *refCounts) +{ + for (BlockCount i = 0; i < refCounts->referenceBlockCount; i++) { + dirtyBlock(&refCounts->blocks[i]); + } +} + +/** + * Clear the provisional reference counts from a reference block. + * + * @param block The block to clear + **/ +static void clearProvisionalReferences(ReferenceBlock *block) +{ + ReferenceCount *counters = getReferenceCountersForBlock(block); + for (BlockCount j = 0; j < COUNTS_PER_BLOCK; j++) { + if (counters[j] == PROVISIONAL_REFERENCE_COUNT) { + counters[j] = EMPTY_REFERENCE_COUNT; + block->allocatedCount--; + } + } +} + +/** + * Unpack reference counts blocks into the internal memory structure. 
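+ *
+ * Each sector of the block carries its own commit point, so unpacking also
+ * advances the RefCounts' slab journal point to the newest point found and
+ * logs a warning when the sectors disagree, since that indicates a torn
+ * write.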
+ *
+ * @param packed  The written reference block to be unpacked
+ * @param block   The internal reference block to be loaded
+ **/
+static void unpackReferenceBlock(PackedReferenceBlock *packed,
+                                 ReferenceBlock *block)
+{
+  RefCounts *refCounts = block->refCounts;
+  ReferenceCount *counters = getReferenceCountersForBlock(block);
+  for (SectorCount i = 0; i < SECTORS_PER_BLOCK; i++) {
+    PackedReferenceSector *sector = &packed->sectors[i];
+    unpackJournalPoint(&sector->commitPoint, &block->commitPoints[i]);
+    memcpy(counters + (i * COUNTS_PER_SECTOR), sector->counts,
+           (sizeof(ReferenceCount) * COUNTS_PER_SECTOR));
+    // The slabJournalPoint must be the latest point found in any sector.
+    if (beforeJournalPoint(&refCounts->slabJournalPoint,
+                           &block->commitPoints[i])) {
+      refCounts->slabJournalPoint = block->commitPoints[i];
+    }
+
+    if ((i > 0) && !areEquivalentJournalPoints(&block->commitPoints[0],
+                                               &block->commitPoints[i])) {
+      size_t blockIndex = block - block->refCounts->blocks;
+      logWarning("Torn write detected in sector %u of reference block"
+                 " %zu of slab %" PRIu16,
+                 i, blockIndex, block->refCounts->slab->slabNumber);
+    }
+  }
+
+  block->allocatedCount = 0;
+  for (BlockCount i = 0; i < COUNTS_PER_BLOCK; i++) {
+    if (counters[i] != EMPTY_REFERENCE_COUNT) {
+      block->allocatedCount++;
+    }
+  }
+}
+
+/**
+ * After a reference block has been read, unpack it.
+ *
+ * @param completion  The VIO that just finished reading
+ **/
+static void finishReferenceBlockLoad(VDOCompletion *completion)
+{
+  VIOPoolEntry *entry = completion->parent;
+  ReferenceBlock *block = entry->parent;
+  unpackReferenceBlock((PackedReferenceBlock *) entry->buffer, block);
+
+  RefCounts *refCounts = block->refCounts;
+  returnVIO(refCounts->slab->allocator, entry);
+  refCounts->activeCount--;
+  clearProvisionalReferences(block);
+
+  refCounts->freeBlocks -= block->allocatedCount;
+  checkIfSlabDrained(block->refCounts->slab);
+}
+
+/**
+ * After a block waiter has gotten a VIO from the VIO pool, load the block.
+ *
+ * @param blockWaiter  The waiter of the block to load
+ * @param vioContext   The VIO returned by the pool
+ **/
+static void loadReferenceBlock(Waiter *blockWaiter, void *vioContext)
+{
+  VIOPoolEntry *entry = vioContext;
+  ReferenceBlock *block = waiterAsReferenceBlock(blockWaiter);
+  size_t blockOffset = (block - block->refCounts->blocks);
+  PhysicalBlockNumber pbn = (block->refCounts->origin + blockOffset);
+  entry->parent = block;
+
+  entry->vio->completion.callbackThreadID
+    = block->refCounts->slab->allocator->threadID;
+  launchReadMetadataVIO(entry->vio, pbn, finishReferenceBlockLoad,
+                        handleIOError);
+}
+
+/**
+ * Load reference blocks from the underlying storage into a pre-allocated
+ * reference counter.
+ *
+ * @param refCounts  The reference counter to be loaded
+ **/
+static void loadReferenceBlocks(RefCounts *refCounts)
+{
+  refCounts->freeBlocks = refCounts->blockCount;
+  refCounts->activeCount = refCounts->referenceBlockCount;
+  for (BlockCount i = 0; i < refCounts->referenceBlockCount; i++) {
+    Waiter *blockWaiter = &refCounts->blocks[i].waiter;
+    blockWaiter->callback = loadReferenceBlock;
+    int result = acquireVIO(refCounts->slab->allocator, blockWaiter);
+    if (result != VDO_SUCCESS) {
+      // This should never happen.
+ refCounts->activeCount -= (refCounts->referenceBlockCount - i); + enterRefCountsReadOnlyMode(refCounts, result); + return; + } + } +} + +/**********************************************************************/ +void drainRefCounts(RefCounts *refCounts) +{ + Slab *slab = refCounts->slab; + bool save = false; + switch (slab->state.state) { + case ADMIN_STATE_SCRUBBING: + if (mustLoadRefCounts(slab->allocator->summary, slab->slabNumber)) { + loadReferenceBlocks(refCounts); + return; + } + + break; + + case ADMIN_STATE_SAVE_FOR_SCRUBBING: + if (!mustLoadRefCounts(slab->allocator->summary, slab->slabNumber)) { + // These reference counts were never written, so mark them all dirty. + dirtyAllReferenceBlocks(refCounts); + } + save = true; + break; + + case ADMIN_STATE_REBUILDING: + if (shouldSaveFullyBuiltSlab(slab)) { + dirtyAllReferenceBlocks(refCounts); + save = true; + } + break; + + case ADMIN_STATE_SAVING: + save = !isUnrecoveredSlab(slab); + break; + + case ADMIN_STATE_RECOVERING: + case ADMIN_STATE_SUSPENDING: + break; + + default: + notifyRefCountsAreDrained(slab, VDO_SUCCESS); + return; + } + + if (save) { + saveDirtyReferenceBlocks(refCounts); + } +} + +/**********************************************************************/ +void acquireDirtyBlockLocks(RefCounts *refCounts) +{ + dirtyAllReferenceBlocks(refCounts); + for (BlockCount i = 0; i < refCounts->referenceBlockCount; i++) { + refCounts->blocks[i].slabJournalLock = 1; + } + + adjustSlabJournalBlockReference(refCounts->slab->journal, 1, + refCounts->referenceBlockCount); +} + +/**********************************************************************/ +void dumpRefCounts(const RefCounts *refCounts) +{ + // Terse because there are a lot of slabs to dump and syslog is lossy. + logInfo(" refCounts: free=%" PRIu32 "/%" PRIu32 " blocks=%" PRIu32 + " dirty=%zu active=%zu journal@(%llu,%" PRIu16 ")%s", + refCounts->freeBlocks, refCounts->blockCount, + refCounts->referenceBlockCount, + countWaiters(&refCounts->dirtyBlocks), + refCounts->activeCount, + refCounts->slabJournalPoint.sequenceNumber, + refCounts->slabJournalPoint.entryCount, + (refCounts->updatingSlabSummary ? " updating" : "")); +} diff --git a/source/vdo/base/refCounts.h b/source/vdo/base/refCounts.h new file mode 100644 index 0000000..f140c8c --- /dev/null +++ b/source/vdo/base/refCounts.h @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/refCounts.h#7 $ + */ + +#ifndef REF_COUNTS_H +#define REF_COUNTS_H + +#include "completion.h" +#include "journalPoint.h" +#include "slab.h" +#include "types.h" + +/** + * Create a reference counting object. + * + *

A reference counting object can keep a reference count for every physical + * block in the VDO configuration. Since we expect the vast majority of the + * blocks to have 0 or 1 reference counts, the structure is optimized for that + * situation. + * + * @param [in] blockCount The number of physical blocks that can be + * referenced + * @param [in] slab The slab of the ref counts object + * @param [in] origin The layer PBN at which to save RefCounts + * @param [in] readOnlyNotifier The context for tracking read-only mode + * @param [out] refCountsPtr The pointer to hold the new ref counts object + * + * @return a success or error code + **/ +int makeRefCounts(BlockCount blockCount, + Slab *slab, + PhysicalBlockNumber origin, + ReadOnlyNotifier *readOnlyNotifier, + RefCounts **refCountsPtr) + __attribute__((warn_unused_result)); + +/** + * Free a reference counting object and null out the reference to it. + * + * @param refCountsPtr The reference to the reference counting object to free + **/ +void freeRefCounts(RefCounts **refCountsPtr); + +/** + * Check whether a RefCounts is active. + * + * @param refCounts The RefCounts to check + **/ +bool areRefCountsActive(RefCounts *refCounts) + __attribute__((warn_unused_result)); + +/** + * Get the stored count of the number of blocks that are currently free. + * + * @param refCounts The RefCounts object + * + * @return the number of blocks with a reference count of zero + **/ +BlockCount getUnreferencedBlockCount(RefCounts *refCounts) + __attribute__((warn_unused_result)); + +/** + * Determine how many times a reference count can be incremented without + * overflowing. + * + * @param refCounts The RefCounts object + * @param pbn The physical block number + * + * @return the number of increments that can be performed + **/ +uint8_t getAvailableReferences(RefCounts *refCounts, PhysicalBlockNumber pbn) + __attribute__((warn_unused_result)); + +/** + * Adjust the reference count of a block. + * + * @param [in] refCounts The refcounts object + * @param [in] operation The operation to perform + * @param [in] slabJournalPoint The slab journal entry for this adjustment + * @param [out] freeStatusChanged A pointer which will be set to true if the + * free status of the block changed + * + * + * @return A success or error code, specifically: + * VDO_REF_COUNT_INVALID if a decrement would result in a negative + * reference count, or an increment in a + * count greater than MAXIMUM_REFS + * + **/ +int adjustReferenceCount(RefCounts *refCounts, + ReferenceOperation operation, + const JournalPoint *slabJournalPoint, + bool *freeStatusChanged) + __attribute__((warn_unused_result)); + +/** + * Adjust the reference count of a block during rebuild. + * + * @param refCounts The refcounts object + * @param pbn The number of the block to adjust + * @param operation The operation to perform on the count + * + * @return VDO_SUCCESS or an error + **/ +int adjustReferenceCountForRebuild(RefCounts *refCounts, + PhysicalBlockNumber pbn, + JournalOperation operation) + __attribute__((warn_unused_result)); + +/** + * Replay the reference count adjustment from a slab journal entry into the + * reference count for a block. The adjustment will be ignored if it was already + * recorded in the reference count. 
+ * + * @param refCounts The refcounts object + * @param entryPoint The slab journal point for the entry + * @param entry The slab journal entry being replayed + * + * @return VDO_SUCCESS or an error code + **/ +int replayReferenceCountChange(RefCounts *refCounts, + const JournalPoint *entryPoint, + SlabJournalEntry entry) + __attribute__((warn_unused_result)); + +/** + * Check whether two reference counters are equivalent. This method is + * used for unit testing. + * + * @param counterA The first counter to compare + * @param counterB The second counter to compare + * + * @return true if the two counters are equivalent + **/ +bool areEquivalentReferenceCounters(RefCounts *counterA, RefCounts *counterB) + __attribute__((warn_unused_result)); + +/** + * Find a block with a reference count of zero in the range of physical block + * numbers tracked by the reference counter. If a free block is found, that + * block is allocated by marking it as provisionally referenced, and the + * allocated block number is returned. + * + * @param [in] refCounts The reference counters to scan + * @param [out] allocatedPtr A pointer to hold the physical block number of + * the block that was found and allocated + * + * @return VDO_SUCCESS if a free block was found and allocated; + * VDO_NO_SPACE if there are no unreferenced blocks; + * otherwise an error code + **/ +int allocateUnreferencedBlock(RefCounts *refCounts, + PhysicalBlockNumber *allocatedPtr) + __attribute__((warn_unused_result)); + +/** + * Provisionally reference a block if it is unreferenced. + * + * @param refCounts The reference counters + * @param pbn The PBN to reference + * @param lock The PBNLock on the block (may be NULL) + * + * @return VDO_SUCCESS or an error + **/ +int provisionallyReferenceBlock(RefCounts *refCounts, + PhysicalBlockNumber pbn, + PBNLock *lock) + __attribute__((warn_unused_result)); + +/** + * Count all unreferenced blocks in a range [startBlock, endBlock) of physical + * block numbers. + * + * @param refCounts The reference counters to scan + * @param startPBN The physical block number at which to start + * scanning (included in the scan) + * @param endPBN The physical block number at which to stop + * scanning (excluded from the scan) + * + * @return The number of unreferenced blocks + **/ +BlockCount countUnreferencedBlocks(RefCounts *refCounts, + PhysicalBlockNumber startPBN, + PhysicalBlockNumber endPBN) + __attribute__((warn_unused_result)); + +/** + * Get the number of blocks required to save a reference counts state covering + * the specified number of data blocks. + * + * @param blockCount The number of physical data blocks that can be referenced + * + * @return The number of blocks required to save reference counts with the + * given block count + **/ +BlockCount getSavedReferenceCountSize(BlockCount blockCount) + __attribute__((warn_unused_result)); + +/** + * Request a RefCounts save several dirty blocks asynchronously. This function + * currently writes 1 / flushDivisor of the dirty blocks. + * + * @param refCounts The RefCounts object to notify + * @param flushDivisor The inverse fraction of the dirty blocks to write + **/ +void saveSeveralReferenceBlocks(RefCounts *refCounts, size_t flushDivisor); + +/** + * Ask a RefCounts to save all its dirty blocks asynchronously. + * + * @param refCounts The RefCounts object to notify + **/ +void saveDirtyReferenceBlocks(RefCounts *refCounts); + +/** + * Mark all reference count blocks as dirty. 
+ * + * @param refCounts The RefCounts of the reference blocks + **/ +void dirtyAllReferenceBlocks(RefCounts *refCounts); + +/** + * Drain all reference count I/O. Depending upon the type of drain being + * performed (as recorded in the RefCount's Slab), the reference blocks may + * be loaded from disk or dirty reference blocks may be written out. + * + * @param refCounts The reference counts to drain + **/ +void drainRefCounts(RefCounts *refCounts); + +/** + * Mark all reference count blocks dirty and cause them to hold locks on slab + * journal block 1. + * + * @param refCounts The RefCounts of the reference blocks + **/ +void acquireDirtyBlockLocks(RefCounts *refCounts); + +/** + * Dump information about this RefCounts structure. + * + * @param refCounts The RefCounts to dump + **/ +void dumpRefCounts(const RefCounts *refCounts); + +#endif // REF_COUNTS_H diff --git a/source/vdo/base/refCountsInternals.h b/source/vdo/base/refCountsInternals.h new file mode 100644 index 0000000..a1bd1db --- /dev/null +++ b/source/vdo/base/refCountsInternals.h @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/refCountsInternals.h#4 $ + */ + +#ifndef REF_COUNTS_INTERNALS_H +#define REF_COUNTS_INTERNALS_H + +#include "refCounts.h" + +#include "journalPoint.h" +#include "referenceBlock.h" +#include "slab.h" +#include "blockAllocatorInternals.h" +#include "waitQueue.h" + +/** + * Represents the possible status of a block. + **/ +typedef enum referenceStatus { + RS_FREE, // this block is free + RS_SINGLE, // this block is singly-referenced + RS_SHARED, // this block is shared + RS_PROVISIONAL // this block is provisionally allocated +} ReferenceStatus; + +/** + * The SearchCursor represents the saved position of a free block search. + **/ +typedef struct searchCursor { + /** The reference block containing the current search index */ + ReferenceBlock *block; + /** The position at which to start searching for the next free counter */ + SlabBlockNumber index; + /** The position just past the last valid counter in the current block */ + SlabBlockNumber endIndex; + + /** A pointer to the first reference block in the slab */ + ReferenceBlock *firstBlock; + /** A pointer to the last reference block in the slab */ + ReferenceBlock *lastBlock; +} SearchCursor; + +/* + * RefCounts structure + * + * A reference count is maintained for each PhysicalBlockNumber. The vast + * majority of blocks have a very small reference count (usually 0 or 1). + * For references less than or equal to MAXIMUM_REFS (254) the reference count + * is stored in counters[pbn]. 
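+ * For example (an illustrative summary, using the count values defined in
+ * referenceBlock.h): counters[pbn] == EMPTY_REFERENCE_COUNT (0) means the
+ * block is free, values 1 through MAXIMUM_REFERENCE_COUNT (254) are ordinary
+ * reference counts, and PROVISIONAL_REFERENCE_COUNT (255) marks a block that
+ * has been provisionally referenced during allocation but not yet committed.
+ * referenceCountToStatus() below maps these onto RS_FREE, RS_SINGLE,
+ * RS_SHARED, and RS_PROVISIONAL.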
+ * + */ +struct refCounts { + /** The slab of this reference block */ + Slab *slab; + + /** The size of the counters array */ + uint32_t blockCount; + /** The number of free blocks */ + uint32_t freeBlocks; + /** The array of reference counts */ + ReferenceCount *counters; // use ALLOCATE to align data ptr + + /** The saved block pointer and array indexes for the free block search */ + SearchCursor searchCursor; + + /** A list of the dirty blocks waiting to be written out */ + WaitQueue dirtyBlocks; + /** The number of blocks which are currently writing */ + size_t activeCount; + + /** A waiter object for updating the slab summary */ + Waiter slabSummaryWaiter; + /** Whether slab summary update is in progress */ + bool updatingSlabSummary; + + /** The notifier for read-only mode */ + ReadOnlyNotifier *readOnlyNotifier; + /** The refcount statistics, shared by all refcounts in our physical zone */ + AtomicRefCountStatistics *statistics; + /** The layer PBN for the first ReferenceBlock */ + PhysicalBlockNumber origin; + /** The latest slab journal entry this RefCounts has been updated with */ + JournalPoint slabJournalPoint; + + /** The number of reference count blocks */ + uint32_t referenceBlockCount; + /** reference count block array */ + ReferenceBlock blocks[]; +}; + +/** + * Convert a reference count to a reference status. + * + * @param count The count to convert + * + * @return The appropriate reference status + **/ +__attribute__((warn_unused_result)) +ReferenceStatus referenceCountToStatus(ReferenceCount count); + +/** + * Convert a generic VDOCompletion to a RefCounts. + * + * @param completion The completion to convert + * + * @return The completion as a RefCounts + **/ +RefCounts *asRefCounts(VDOCompletion *completion) + __attribute__((warn_unused_result)); + +/** + * Get the reference block that covers the given block index (exposed for + * testing). + * + * @param refCounts The refcounts object + * @param index The block index + **/ +ReferenceBlock *getReferenceBlock(RefCounts *refCounts, SlabBlockNumber index) + __attribute__((warn_unused_result)); + +/** + * Find the reference counters for a given block (exposed for testing). + * + * @param block The ReferenceBlock in question + * + * @return A pointer to the reference counters for this block + **/ +ReferenceCount *getReferenceCountersForBlock(ReferenceBlock *block) + __attribute__((warn_unused_result)); + +/** + * Copy data from a reference block to a buffer ready to be written out + * (exposed for testing). + * + * @param block The block to copy + * @param buffer The char buffer to fill with the packed block + **/ +void packReferenceBlock(ReferenceBlock *block, void *buffer); + +/** + * Get the reference status of a block. Exposed only for unit testing. + * + * @param [in] refCounts The refcounts object + * @param [in] pbn The physical block number + * @param [out] statusPtr Where to put the status of the block + * + * @return A success or error code, specifically: + * VDO_OUT_OF_RANGE if the pbn is out of range. + **/ +int getReferenceStatus(RefCounts *refCounts, + PhysicalBlockNumber pbn, + ReferenceStatus *statusPtr) + __attribute__((warn_unused_result)); + +/** + * Find the first block with a reference count of zero in the specified range + * of reference counter indexes. Exposed for unit testing. 
+ * + * @param [in] refCounts The reference counters to scan + * @param [in] startIndex The array index at which to start scanning + * (included in the scan) + * @param [in] endIndex The array index at which to stop scanning + * (excluded from the scan) + * @param [out] indexPtr A pointer to hold the array index of the free block + * + * @return true if a free block was found in the specified range + **/ +bool findFreeBlock(const RefCounts *refCounts, + SlabBlockNumber startIndex, + SlabBlockNumber endIndex, + SlabBlockNumber *indexPtr) + __attribute__((warn_unused_result)); + +/** + * Request a RefCounts save its oldest dirty block asynchronously. + * + * @param refCounts The RefCounts object to notify + **/ +void saveOldestReferenceBlock(RefCounts *refCounts); + +/** + * Reset all reference counts back to RS_FREE. + * + * @param refCounts The reference counters to reset + **/ +void resetReferenceCounts(RefCounts *refCounts); + +#endif // REF_COUNTS_INTERNALS_H diff --git a/source/vdo/base/referenceBlock.h b/source/vdo/base/referenceBlock.h new file mode 100644 index 0000000..8014c3b --- /dev/null +++ b/source/vdo/base/referenceBlock.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/referenceBlock.h#1 $ + */ + +#ifndef REFERENCE_BLOCK_H +#define REFERENCE_BLOCK_H + +#include "constants.h" +#include "journalPoint.h" +#include "types.h" +#include "waitQueue.h" + +/** + * A type representing a reference count. + **/ +typedef uint8_t ReferenceCount; + +/** + * Special ReferenceCount values. + **/ +enum { + EMPTY_REFERENCE_COUNT = 0, + MAXIMUM_REFERENCE_COUNT = 254, + PROVISIONAL_REFERENCE_COUNT = 255, +}; + +enum { + COUNTS_PER_SECTOR = ((VDO_SECTOR_SIZE - sizeof(PackedJournalPoint)) + / sizeof(ReferenceCount)), + COUNTS_PER_BLOCK = COUNTS_PER_SECTOR * SECTORS_PER_BLOCK, +}; + +/** + * The format of a ReferenceSector on disk. + **/ +typedef struct { + PackedJournalPoint commitPoint; + ReferenceCount counts[COUNTS_PER_SECTOR]; +} __attribute__((packed)) PackedReferenceSector; + +typedef struct { + PackedReferenceSector sectors[SECTORS_PER_BLOCK]; +} PackedReferenceBlock; + +/* + * ReferenceBlock structure + * + * Blocks are used as a proxy, permitting saves of partial refcounts. 
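+ *
+ * Each packed block is divided into SECTORS_PER_BLOCK sectors, and each
+ * sector carries its own commit point so that torn writes can be detected
+ * when the block is reloaded. As a rough illustration (assuming the common
+ * 4KB block / 512-byte sector geometry and an 8-byte PackedJournalPoint,
+ * none of which is defined in this header): COUNTS_PER_SECTOR
+ * = (512 - 8) / 1 = 504, and COUNTS_PER_BLOCK = 504 * 8 = 4032 reference
+ * counts covered by each on-disk block.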
+ **/ +typedef struct { + /** This block waits on the refCounts to tell it to write */ + Waiter waiter; + /** The parent RefCount structure */ + RefCounts *refCounts; + /** The number of references in this block that represent allocations */ + BlockSize allocatedCount; + /** The slab journal block on which this block must hold a lock */ + SequenceNumber slabJournalLock; + /** + * The slab journal block which should be released when this block + * is committed + **/ + SequenceNumber slabJournalLockToRelease; + /** The point up to which each sector is accurate on disk */ + JournalPoint commitPoints[SECTORS_PER_BLOCK]; + /** Whether this block has been modified since it was written to disk */ + bool isDirty; + /** Whether this block is currently writing */ + bool isWriting; +} ReferenceBlock; + +#endif // REFERENCE_BLOCK_H diff --git a/source/vdo/base/referenceCountRebuild.c b/source/vdo/base/referenceCountRebuild.c new file mode 100644 index 0000000..a3d91ac --- /dev/null +++ b/source/vdo/base/referenceCountRebuild.c @@ -0,0 +1,491 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/referenceCountRebuild.c#6 $ + */ + +#include "referenceCountRebuild.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "blockMap.h" +#include "blockMapInternals.h" +#include "blockMapPage.h" +#include "forest.h" +#include "constants.h" +#include "numUtils.h" +#include "refCounts.h" +#include "slabDepot.h" +#include "vdoInternal.h" +#include "vdoPageCache.h" + +/** + * A reference count rebuild completion. + * Note that the page completions kept in this structure are not immediately + * freed, so the corresponding pages will be locked down in the page cache + * until the rebuild frees them. 
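+ *
+ * (For orientation: the pageCompletions[] array at the end of the structure
+ * is sized by makeRebuildCompletion() below to half the configured page
+ * cache size, capped at MAXIMUM_SIMULTANEOUS_BLOCK_MAP_RESTORATION_READS.)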
+ **/ +typedef struct { + /** completion header */ + VDOCompletion completion; + /** the completion for flushing the block map */ + VDOCompletion subTaskCompletion; + /** the thread on which all block map operations must be done */ + ThreadID logicalThreadID; + /** the admin thread */ + ThreadID adminThreadID; + /** the block map */ + BlockMap *blockMap; + /** the slab depot */ + SlabDepot *depot; + /** whether this recovery has been aborted */ + bool aborted; + /** whether we are currently launching the initial round of requests */ + bool launching; + /** The number of logical blocks observed used */ + BlockCount *logicalBlocksUsed; + /** The number of block map data blocks */ + BlockCount *blockMapDataBlocks; + /** the next page to fetch */ + PageCount pageToFetch; + /** the number of leaf pages in the block map */ + PageCount leafPages; + /** the last slot of the block map */ + BlockMapSlot lastSlot; + /** number of pending (non-ready) requests*/ + PageCount outstanding; + /** number of page completions */ + PageCount pageCount; + /** array of requested, potentially ready page completions */ + VDOPageCompletion pageCompletions[]; +} RebuildCompletion; + +/** + * Convert a VDOCompletion to a RebuildCompletion. + * + * @param completion The completion to convert + * + * @return The completion as a RebuildCompletion + **/ +__attribute__((warn_unused_result)) +static inline RebuildCompletion *asRebuildCompletion(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(RebuildCompletion, completion) == 0); + assertCompletionType(completion->type, REFERENCE_COUNT_REBUILD_COMPLETION); + return (RebuildCompletion *) completion; +} + +/** + * Free a RebuildCompletion and null out the reference to it. + * + * @param completionPtr a pointer to the completion to free + **/ +static void freeRebuildCompletion(VDOCompletion **completionPtr) +{ + VDOCompletion *completion = *completionPtr; + if (completion == NULL) { + return; + } + + RebuildCompletion *rebuild = asRebuildCompletion(completion); + destroyEnqueueable(&rebuild->subTaskCompletion); + destroyEnqueueable(completion); + FREE(rebuild); + *completionPtr = NULL; +} + +/** + * Free the RebuildCompletion and notify the parent that the block map + * rebuild is done. This callback is registered in rebuildBlockMap(). + * + * @param completion The RebuildCompletion + **/ +static void finishRebuild(VDOCompletion *completion) +{ + int result = completion->result; + VDOCompletion *parent = completion->parent; + freeRebuildCompletion(&completion); + finishCompletion(parent, result); +} + +/** + * Make a new rebuild completion. 
+ * + * @param [in] vdo The VDO + * @param [in] logicalBlocksUsed A pointer to hold the logical blocks used + * @param [in] blockMapDataBlocks A pointer to hold the number of block map + * data blocks + * @param [in] parent The parent of the rebuild completion + * @param [out] rebuildPtr The new block map rebuild completion + * + * @return a success or error code + **/ +static int makeRebuildCompletion(VDO *vdo, + BlockCount *logicalBlocksUsed, + BlockCount *blockMapDataBlocks, + VDOCompletion *parent, + RebuildCompletion **rebuildPtr) +{ + BlockMap *blockMap = getBlockMap(vdo); + PageCount pageCount + = minPageCount(getConfiguredCacheSize(vdo) >> 1, + MAXIMUM_SIMULTANEOUS_BLOCK_MAP_RESTORATION_READS); + + RebuildCompletion *rebuild; + int result = ALLOCATE_EXTENDED(RebuildCompletion, pageCount, + VDOPageCompletion, __func__, &rebuild); + if (result != UDS_SUCCESS) { + return result; + } + + result = initializeEnqueueableCompletion(&rebuild->completion, + REFERENCE_COUNT_REBUILD_COMPLETION, + vdo->layer); + if (result != VDO_SUCCESS) { + VDOCompletion *completion = &rebuild->completion; + freeRebuildCompletion(&completion); + return result; + } + + result = initializeEnqueueableCompletion(&rebuild->subTaskCompletion, + SUB_TASK_COMPLETION, vdo->layer); + if (result != VDO_SUCCESS) { + VDOCompletion *completion = &rebuild->completion; + freeRebuildCompletion(&completion); + return result; + } + + rebuild->blockMap = blockMap; + rebuild->depot = vdo->depot; + rebuild->logicalBlocksUsed = logicalBlocksUsed; + rebuild->blockMapDataBlocks = blockMapDataBlocks; + rebuild->pageCount = pageCount; + rebuild->leafPages = computeBlockMapPageCount(blockMap->entryCount); + + const ThreadConfig *threadConfig = getThreadConfig(vdo); + rebuild->logicalThreadID = getLogicalZoneThread(threadConfig, 0); + rebuild->adminThreadID = getAdminThread(threadConfig); + + ASSERT_LOG_ONLY((getCallbackThreadID() == rebuild->logicalThreadID), + "%s must be called on logical thread %u (not %u)", __func__, + rebuild->logicalThreadID, getCallbackThreadID()); + prepareCompletion(&rebuild->completion, finishRebuild, finishRebuild, + rebuild->logicalThreadID, parent); + + *rebuildPtr = rebuild; + return VDO_SUCCESS; +} + +/** + * Flush the block map now that all the reference counts are rebuilt. This + * callback is registered in finishIfDone(). + * + * @param completion The sub-task completion + **/ +static void flushBlockMapUpdates(VDOCompletion *completion) +{ + logInfo("Flushing block map changes"); + prepareToFinishParent(completion, completion->parent); + drainBlockMap(asRebuildCompletion(completion->parent)->blockMap, + ADMIN_STATE_RECOVERING, completion); +} + +/** + * Check whether the rebuild is done. If it succeeded, continue by flushing the + * block map. + * + * @param rebuild The rebuild completion + * + * @return true if the rebuild is complete + **/ +static bool finishIfDone(RebuildCompletion *rebuild) +{ + if (rebuild->launching || (rebuild->outstanding > 0)) { + return false; + } + + if (rebuild->aborted) { + completeCompletion(&rebuild->completion); + return true; + } + + if (rebuild->pageToFetch < rebuild->leafPages) { + return false; + } + + prepareCompletion(&rebuild->subTaskCompletion, flushBlockMapUpdates, + finishParentCallback, rebuild->adminThreadID, rebuild); + invokeCallback(&rebuild->subTaskCompletion); + return true; +} + +/** + * Record that there has been an error during the rebuild. 
+ * + * @param rebuild The rebuild completion + * @param result The error result to use, if one is not already saved + **/ +static void abortRebuild(RebuildCompletion *rebuild, int result) +{ + rebuild->aborted = true; + setCompletionResult(&rebuild->completion, result); +} + +/** + * Handle an error loading a page. + * + * @param completion The VDOPageCompletion + **/ +static void handlePageLoadError(VDOCompletion *completion) +{ + RebuildCompletion *rebuild = asRebuildCompletion(completion->parent); + rebuild->outstanding--; + abortRebuild(rebuild, completion->result); + releaseVDOPageCompletion(completion); + finishIfDone(rebuild); +} + +/** + * Rebuild reference counts from a block map page. + * + * @param rebuild The rebuild completion + * @param completion The page completion holding the page + * + * @return VDO_SUCCESS or an error + **/ +static int rebuildReferenceCountsFromPage(RebuildCompletion *rebuild, + VDOCompletion *completion) +{ + BlockMapPage *page = dereferenceWritableVDOPage(completion); + int result = ASSERT(page != NULL, "page available"); + if (result != VDO_SUCCESS) { + return result; + } + + if (!isBlockMapPageInitialized(page)) { + return VDO_SUCCESS; + } + + // Remove any bogus entries which exist beyond the end of the logical space. + if (getBlockMapPagePBN(page) == rebuild->lastSlot.pbn) { + for (SlotNumber slot = rebuild->lastSlot.slot; + slot < BLOCK_MAP_ENTRIES_PER_PAGE; slot++) { + DataLocation mapping = unpackBlockMapEntry(&page->entries[slot]); + if (isMappedLocation(&mapping)) { + page->entries[slot] = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); + requestVDOPageWrite(completion); + } + } + } + + // Inform the slab depot of all entries on this page. + for (SlotNumber slot = 0; slot < BLOCK_MAP_ENTRIES_PER_PAGE; slot++) { + DataLocation mapping = unpackBlockMapEntry(&page->entries[slot]); + if (!isValidLocation(&mapping)) { + // This entry is invalid, so remove it from the page. + page->entries[slot] = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); + requestVDOPageWrite(completion); + continue; + } + + if (!isMappedLocation(&mapping)) { + continue; + } + + (*rebuild->logicalBlocksUsed)++; + if (mapping.pbn == ZERO_BLOCK) { + continue; + } + + if (!isPhysicalDataBlock(rebuild->depot, mapping.pbn)) { + // This is a nonsense mapping. Remove it from the map so we're at least + // consistent and mark the page dirty. + page->entries[slot] = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); + requestVDOPageWrite(completion); + continue; + } + + Slab *slab = getSlab(rebuild->depot, mapping.pbn); + int result = adjustReferenceCountForRebuild(slab->referenceCounts, + mapping.pbn, DATA_INCREMENT); + if (result != VDO_SUCCESS) { + logErrorWithStringError(result, + "Could not adjust reference count for PBN" + " %llu, slot %u mapped to PBN %llu", + getBlockMapPagePBN(page), slot, mapping.pbn); + page->entries[slot] = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); + requestVDOPageWrite(completion); + } + } + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void fetchPage(RebuildCompletion *rebuild, VDOCompletion *completion); + +/** + * Process a page which has just been loaded. This callback is registered by + * fetchPage(). 
+ * + * @param completion The VDOPageCompletion for the fetched page + **/ +static void pageLoaded(VDOCompletion *completion) +{ + RebuildCompletion *rebuild = asRebuildCompletion(completion->parent); + rebuild->outstanding--; + + int result = rebuildReferenceCountsFromPage(rebuild, completion); + if (result != VDO_SUCCESS) { + abortRebuild(rebuild, result); + } + + releaseVDOPageCompletion(completion); + if (finishIfDone(rebuild)) { + return; + } + + // Advance progress to the next page, and fetch the next page we + // haven't yet requested. + fetchPage(rebuild, completion); +} + +/** + * Fetch a page from the block map. + * + * @param rebuild the RebuildCompletion + * @param completion the page completion to use + **/ +static void fetchPage(RebuildCompletion *rebuild, VDOCompletion *completion) +{ + while (rebuild->pageToFetch < rebuild->leafPages) { + PhysicalBlockNumber pbn = findBlockMapPagePBN(rebuild->blockMap, + rebuild->pageToFetch++); + if (pbn == ZERO_BLOCK) { + continue; + } + + if (!isPhysicalDataBlock(rebuild->depot, pbn)) { + abortRebuild(rebuild, VDO_BAD_MAPPING); + if (finishIfDone(rebuild)) { + return; + } + continue; + } + + initVDOPageCompletion(((VDOPageCompletion *) completion), + rebuild->blockMap->zones[0].pageCache, + pbn, true, &rebuild->completion, + pageLoaded, handlePageLoadError); + rebuild->outstanding++; + getVDOPageAsync(completion); + return; + } +} + +/** + * Rebuild reference counts from the leaf block map pages now that reference + * counts have been rebuilt from the interior tree pages (which have been + * loaded in the process). This callback is registered in + * rebuildReferenceCounts(). + * + * @param completion The sub-task completion + **/ +static void rebuildFromLeaves(VDOCompletion *completion) +{ + RebuildCompletion *rebuild = asRebuildCompletion(completion->parent); + *rebuild->logicalBlocksUsed = 0; + + // The PBN calculation doesn't work until the tree pages have been loaded, + // so we can't set this value at the start of rebuild. + rebuild->lastSlot = (BlockMapSlot) { + .slot = rebuild->blockMap->entryCount % BLOCK_MAP_ENTRIES_PER_PAGE, + .pbn = findBlockMapPagePBN(rebuild->blockMap, rebuild->leafPages - 1), + }; + + // Prevent any page from being processed until all pages have been launched. + rebuild->launching = true; + for (PageCount i = 0; i < rebuild->pageCount; i++) { + fetchPage(rebuild, &rebuild->pageCompletions[i].completion); + } + rebuild->launching = false; + finishIfDone(rebuild); +} + +/** + * Process a single entry from the block map tree. + * + *

Implements EntryCallback. + * + * @param pbn A pbn which holds a block map tree page + * @param completion The parent completion of the traversal + * + * @return VDO_SUCCESS or an error + **/ +static int processEntry(PhysicalBlockNumber pbn, VDOCompletion *completion) +{ + RebuildCompletion *rebuild = asRebuildCompletion(completion->parent); + if ((pbn == ZERO_BLOCK) || !isPhysicalDataBlock(rebuild->depot, pbn)) { + return logErrorWithStringError(VDO_BAD_CONFIGURATION, + "PBN %llu out of range", + pbn); + } + + Slab *slab = getSlab(rebuild->depot, pbn); + int result = adjustReferenceCountForRebuild(slab->referenceCounts, pbn, + BLOCK_MAP_INCREMENT); + if (result != VDO_SUCCESS) { + return logErrorWithStringError(result, + "Could not adjust reference count for " + "block map tree PBN %llu", + pbn); + } + + (*rebuild->blockMapDataBlocks)++; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void rebuildReferenceCounts(VDO *vdo, + VDOCompletion *parent, + BlockCount *logicalBlocksUsed, + BlockCount *blockMapDataBlocks) +{ + RebuildCompletion *rebuild; + int result = makeRebuildCompletion(vdo, logicalBlocksUsed, + blockMapDataBlocks, parent, &rebuild); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + // Completion chaining from page cache hits can lead to stack overflow + // during the rebuild, so clear out the cache before this rebuild phase. + result = invalidateVDOPageCache(rebuild->blockMap->zones[0].pageCache); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + // First traverse the block map trees. + *rebuild->blockMapDataBlocks = 0; + VDOCompletion *completion = &rebuild->subTaskCompletion; + prepareCompletion(completion, rebuildFromLeaves, finishParentCallback, + rebuild->logicalThreadID, rebuild); + traverseForest(rebuild->blockMap, processEntry, completion); +} diff --git a/source/vdo/base/referenceCountRebuild.h b/source/vdo/base/referenceCountRebuild.h new file mode 100644 index 0000000..59363ac --- /dev/null +++ b/source/vdo/base/referenceCountRebuild.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/referenceCountRebuild.h#1 $ + */ + +#ifndef REFERENCE_COUNT_REBUILD_H +#define REFERENCE_COUNT_REBUILD_H + +#include "types.h" + +/** + * Rebuild the reference counts from the block map (read-only rebuild). 
+ * + * @param [in] vdo The VDO + * @param [in] parent The completion to notify when the rebuild is + * complete + * @param [out] logicalBlocksUsed A pointer to hold the logical blocks used + * @param [out] blockMapDataBlocks A pointer to hold the number of block map + * data blocks + **/ +void rebuildReferenceCounts(VDO *vdo, + VDOCompletion *parent, + BlockCount *logicalBlocksUsed, + BlockCount *blockMapDataBlocks); + +#endif // REFERENCE_COUNT_REBUILD_H diff --git a/source/vdo/base/referenceOperation.c b/source/vdo/base/referenceOperation.c new file mode 100644 index 0000000..a8ea9a0 --- /dev/null +++ b/source/vdo/base/referenceOperation.c @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/referenceOperation.c#1 $ + */ + +#include "referenceOperation.h" + +#include "physicalZone.h" +#include "types.h" + +/**********************************************************************/ +static PBNLock *returnPBNLock(ReferenceOperation operation) +{ + return (PBNLock *) operation.context; +} + +/**********************************************************************/ +void setUpReferenceOperationWithLock(JournalOperation type, + PhysicalBlockNumber pbn, + BlockMappingState state, + PBNLock *lock, + ReferenceOperation *operation) +{ + *operation = (ReferenceOperation) { + .type = type, + .pbn = pbn, + .state = state, + .lockGetter = returnPBNLock, + .context = lock, + }; +} + +/**********************************************************************/ +static PBNLock *lookUpPBNLock(ReferenceOperation operation) +{ + return ((operation.context == NULL) + ? NULL : getPBNLock(operation.context, operation.pbn)); +} + +/**********************************************************************/ +void setUpReferenceOperationWithZone(JournalOperation type, + PhysicalBlockNumber pbn, + BlockMappingState state, + PhysicalZone *zone, + ReferenceOperation *operation) +{ + *operation = (ReferenceOperation) { + .type = type, + .pbn = pbn, + .state = state, + .lockGetter = lookUpPBNLock, + .context = zone, + }; +} diff --git a/source/vdo/base/referenceOperation.h b/source/vdo/base/referenceOperation.h new file mode 100644 index 0000000..c846ec6 --- /dev/null +++ b/source/vdo/base/referenceOperation.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/referenceOperation.h#1 $ + */ + +#ifndef REFERENCE_OPERATION_H +#define REFERENCE_OPERATION_H + +#include "types.h" + +typedef struct referenceOperation ReferenceOperation; + +/** + * Get the PBNLock associated with a ReferenceOperation. + * + * @param operation The ReferenceOperation + * + * @return The PBNLock on the block of a ReferenceOperation or NULL if there + * isn't one + **/ +typedef PBNLock *PBNLockGetter(ReferenceOperation operation); + +/** + * The current operation on a physical block (from the point of view of the + * DataVIO doing the operation) + **/ +struct referenceOperation { + /** The operation being performed */ + JournalOperation type; + /** The PBN of the block being operated on */ + PhysicalBlockNumber pbn; + /** The mapping state of the block being operated on */ + BlockMappingState state; + /** A function to use to get any PBNLock associated with this operation */ + PBNLockGetter *lockGetter; + /** The context to pass to the PBNLockGetter */ + void *context; +}; + +/** + * Get the PBNLock associated with the current ReferenceOperation. + * + * @param operation The reference operation + * + * @return The PBNLock on the block of the current operation or NULL if there + * isn't one + **/ +__attribute__((warn_unused_result)) +static inline +PBNLock *getReferenceOperationPBNLock(ReferenceOperation operation) +{ + return ((operation.lockGetter == NULL) + ? NULL : operation.lockGetter(operation)); +} + +/** + * Set up a ReferenceOperation for which we already have the lock. + * + * @param type The type of operation + * @param pbn The PBN of the block on which to operate + * @param state The mapping state of the block on which to operate + * @param lock The PBNLock to associate with the operation + * @param operation The ReferenceOperation to set up + **/ +void setUpReferenceOperationWithLock(JournalOperation type, + PhysicalBlockNumber pbn, + BlockMappingState state, + PBNLock *lock, + ReferenceOperation *operation); + +/** + * Set up a ReferenceOperation for which we will need to look up the lock later. + * + * @param type The type of operation + * @param pbn The PBN of the block on which to operate + * @param state The mapping state of the block on which to operate + * @param zone The PhysicalZone from which the PBNLock can be retrieved + * when needed + * @param operation The ReferenceOperation to set up + **/ +void setUpReferenceOperationWithZone(JournalOperation type, + PhysicalBlockNumber pbn, + BlockMappingState state, + PhysicalZone *zone, + ReferenceOperation *operation); + +#endif // REFERENCE_OPERATION_H diff --git a/source/vdo/base/releaseVersions.h b/source/vdo/base/releaseVersions.h new file mode 100644 index 0000000..7620f17 --- /dev/null +++ b/source/vdo/base/releaseVersions.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef RELEASE_VERSIONS_H +#define RELEASE_VERSIONS_H + +enum { + OXYGEN_RELEASE_VERSION_NUMBER = 109583, + FLUORINE_RELEASE_VERSION_NUMBER = 115838, + NEON_RELEASE_VERSION_NUMBER = 120965, + SODIUM_RELEASE_VERSION_NUMBER = 127441, + MAGNESIUM_RELEASE_VERSION_NUMBER = 131337, + ALUMINUM_RELEASE_VERSION_NUMBER = 133524, + HEAD_RELEASE_VERSION_NUMBER = 0, + CURRENT_RELEASE_VERSION_NUMBER = ALUMINUM_RELEASE_VERSION_NUMBER, +}; + +#endif /* not RELEASE_VERSIONS_H */ diff --git a/source/vdo/base/ringNode.h b/source/vdo/base/ringNode.h new file mode 100644 index 0000000..5f389f4 --- /dev/null +++ b/source/vdo/base/ringNode.h @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/ringNode.h#1 $ + */ + +#ifndef RING_NODE_H +#define RING_NODE_H + +#include "types.h" + +/** + * A ring node is a member of a doubly-linked circular list. + * + * Each node is usually embedded within a data structure that contains the + * relevant payload. In addition the ring head is also represented by a + * node where the next field designates the first element of the ring and the + * prev field designates the last. + * + * An empty ring contains next and prev fields that point back to the ring + * head itself. + * + * Typical iteration over a ring, from the front and back: + * + * for (RingNode *n = head->next; n != head; n = n->next) { ... } + * for (RingNode *p = head->prev; p != head; p = p->prev) { ... } + **/ +typedef struct ringNode RingNode; + +struct ringNode { + RingNode *next; + RingNode *prev; +}; + +/** + * Initialize a ring to be empty. + * + * @param head The head of the ring + **/ +static inline void initializeRing(RingNode *head) +{ + head->next = head->prev = head; +} + +/** + * Check whether a ring is empty. + * + * @param head The head of the ring + * + * @return true if the ring is empty + **/ +static inline bool isRingEmpty(const RingNode *head) +{ + return (head->next == head); +} + +/** + * Check whether a ring contains exactly one node. 
+ * + * @param head The head of the ring + * + * @return true if the ring contains exactly one member + **/ +static inline bool isRingSingleton(const RingNode *head) +{ + return (!isRingEmpty(head) && (head->prev == head->next)); +} + +/** + * Unsplice a contiguous chain of at least one node from its ring. + * + * @param first the first entry in the ring to unsplice + * @param last the last entry in the ring to unsplice, + * may be the same as ``first`` + * + * The effect of this is to create two rings, the one designated + * by first through last, and the other consisting of anything remaining. + **/ +static inline void unspliceRingChain(RingNode *first, + RingNode *last) +{ + first->prev->next = last->next; + last->next->prev = first->prev; + first->prev = last; + last->next = first; +} + +/** + * Remove a ring node from its ring. + * + * @param node the ring node + * + * @return the removed node, for convenience + **/ +static inline RingNode *unspliceRingNode(RingNode *node) +{ + unspliceRingChain(node, node); + return node; +} + +/** + * Splice a contiguous chain of at least one node after the specified entry, + * which may be the head of a ring. + * + * @param first the first entry in a contiguous span of nodes + * @param last the last entry in a contiguous span of nodes, + * may be the same as ``first`` + * @param where the entry after which ``first`` through ``last`` + * shall appear + * + * The effect of this is to unsplice first through last (if necessary) and + * insert them after ``where`` so that the previous nodes after ``where`` + * now appear after ``last``. + **/ +static inline void spliceRingChainAfter(RingNode *first, + RingNode *last, + RingNode *where) +{ + if (last->next != first) { + unspliceRingChain(first, last); + } + last->next = where->next; + first->prev = where; + where->next->prev = last; + where->next = first; +} + +/** + * Splice a contiguous chain of at least one node before the specified entry, + * which may be the tail of a list. + * + * @param first the first entry in a contiguous span of nodes + * @param last the last entry in a contiguous span of nodes, + * may be the same as ``first`` + * @param where the entry before which ``first`` through ``last`` + * shall appear + * + * The effect of this is to unsplice first through last (if necessary) and + * insert them before ``where`` so that the previous nodes before ``where`` + * now appear before ``first``. + **/ +static inline void spliceRingChainBefore(RingNode *first, + RingNode *last, + RingNode *where) +{ + if (last->next != first) { + unspliceRingChain(first, last); + } + first->prev = where->prev; + last->next = where; + where->prev->next = first; + where->prev = last; +} + +/** + * Push a single node on the end of a ring. + * + * @param head The ring head + * @param node The node to push + **/ +static inline void pushRingNode(RingNode *head, RingNode *node) +{ + spliceRingChainBefore(node, node, head); +} + +/** + * Pop a single node off the end of a ring. + * + * @param head The ring head + * + * @return NULL if the ring was empty, otherwise the node that was + * removed from the ring (``head->prev``) + **/ +static inline RingNode *popRingNode(RingNode *head) +{ + return (isRingEmpty(head) ? NULL : unspliceRingNode(head->prev)); +} + +/** + * Remove a single node off the front of the list + **/ +static inline RingNode *chopRingNode(RingNode *head) +{ + return (isRingEmpty(head) ? 
NULL : unspliceRingNode(head->next)); +} + +#endif // RING_NODE_H diff --git a/source/vdo/base/slab.c b/source/vdo/base/slab.c new file mode 100644 index 0000000..f2903d6 --- /dev/null +++ b/source/vdo/base/slab.c @@ -0,0 +1,468 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slab.c#9 $ + */ + +#include "slab.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "adminState.h" +#include "blockAllocatorInternals.h" +#include "completion.h" +#include "constants.h" +#include "numUtils.h" +#include "pbnLock.h" +#include "recoveryJournal.h" +#include "refCounts.h" +#include "slabDepot.h" +#include "slabJournal.h" +#include "slabJournalInternals.h" +#include "slabSummary.h" + +/**********************************************************************/ +int configureSlab(BlockCount slabSize, + BlockCount slabJournalBlocks, + SlabConfig *slabConfig) +{ + if (slabJournalBlocks >= slabSize) { + return VDO_BAD_CONFIGURATION; + } + + /* + * This calculation should technically be a recurrence, but the total number + * of metadata blocks is currently less than a single block of refCounts, so + * we'd gain at most one data block in each slab with more iteration. + */ + BlockCount refBlocks + = getSavedReferenceCountSize(slabSize - slabJournalBlocks); + BlockCount metaBlocks = (refBlocks + slabJournalBlocks); + + // Make sure test code hasn't configured slabs to be too small. + if (metaBlocks >= slabSize) { + return VDO_BAD_CONFIGURATION; + } + + /* + * If the slab size is very small, assume this must be a unit test and + * override the number of data blocks to be a power of two (wasting blocks + * in the slab). Many tests need their dataBlocks fields to be the exact + * capacity of the configured volume, and that used to fall out since they + * use a power of two for the number of data blocks, the slab size was a + * power of two, and every block in a slab was a data block. + * + * XXX Try to figure out some way of structuring testParameters and unit + * tests so this hack isn't needed without having to edit several unit tests + * every time the metadata size changes by one block. + */ + BlockCount dataBlocks = slabSize - metaBlocks; + if ((slabSize < 1024) && !isPowerOfTwo(dataBlocks)) { + dataBlocks = ((BlockCount) 1 << logBaseTwo(dataBlocks)); + } + + /* + * Configure the slab journal thresholds. The flush threshold is 168 of 224 + * blocks in production, or 3/4ths, so we use this ratio for all sizes. + */ + BlockCount flushingThreshold = ((slabJournalBlocks * 3) + 3) / 4; + /* + * The blocking threshold should be far enough from the the flushing + * threshold to not produce delays, but far enough from the end of the + * journal to allow multiple successive recovery failures. 
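+   * As an illustrative check using the production figure quoted above, a
+   * 224-block slab journal gives flushingThreshold = ((224 * 3) + 3) / 4
+   * = 168, remaining = 224 - 168 = 56, and blockingThreshold
+   * = 168 + ((56 * 5) / 7) = 208.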
+ */ + BlockCount remaining = slabJournalBlocks - flushingThreshold; + BlockCount blockingThreshold = flushingThreshold + ((remaining * 5) / 7); + /* + * The scrubbing threshold should be at least 2048 entries before the end of + * the journal. + */ + BlockCount minimalExtraSpace + = 1 + (MAXIMUM_USER_VIOS / SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK); + BlockCount scrubbingThreshold = blockingThreshold; + if (slabJournalBlocks > minimalExtraSpace) { + scrubbingThreshold = slabJournalBlocks - minimalExtraSpace; + } + if (blockingThreshold > scrubbingThreshold) { + blockingThreshold = scrubbingThreshold; + } + + *slabConfig = (SlabConfig) { + .slabBlocks = slabSize, + .dataBlocks = dataBlocks, + .referenceCountBlocks = refBlocks, + .slabJournalBlocks = slabJournalBlocks, + .slabJournalFlushingThreshold = flushingThreshold, + .slabJournalBlockingThreshold = blockingThreshold, + .slabJournalScrubbingThreshold = scrubbingThreshold + }; + return VDO_SUCCESS; +} + +/**********************************************************************/ +PhysicalBlockNumber getSlabJournalStartBlock(const SlabConfig *slabConfig, + PhysicalBlockNumber origin) +{ + return origin + slabConfig->dataBlocks + slabConfig->referenceCountBlocks; +} + +/**********************************************************************/ +int makeSlab(PhysicalBlockNumber slabOrigin, + BlockAllocator *allocator, + PhysicalBlockNumber translation, + RecoveryJournal *recoveryJournal, + SlabCount slabNumber, + bool isNew, + Slab **slabPtr) +{ + Slab *slab; + int result = ALLOCATE(1, Slab, __func__, &slab); + if (result != VDO_SUCCESS) { + return result; + } + + const SlabConfig *slabConfig = getSlabConfig(allocator->depot); + + slab->allocator = allocator; + slab->start = slabOrigin; + slab->end = slab->start + slabConfig->slabBlocks; + slab->slabNumber = slabNumber; + initializeRing(&slab->ringNode); + + slab->refCountsOrigin = slabOrigin + slabConfig->dataBlocks + translation; + slab->journalOrigin = (getSlabJournalStartBlock(slabConfig, slabOrigin) + + translation); + + result = makeSlabJournal(allocator, slab, recoveryJournal, &slab->journal); + if (result != VDO_SUCCESS) { + freeSlab(&slab); + return result; + } + + if (isNew) { + slab->state.state = ADMIN_STATE_NEW; + result = allocateRefCountsForSlab(slab); + if (result != VDO_SUCCESS) { + freeSlab(&slab); + return result; + } + } + + *slabPtr = slab; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int allocateRefCountsForSlab(Slab *slab) +{ + BlockAllocator *allocator = slab->allocator; + const SlabConfig *slabConfig = getSlabConfig(allocator->depot); + + int result = ASSERT(slab->referenceCounts == NULL, + "Slab %u doesn't allocate refcounts twice", + slab->slabNumber); + if (result != VDO_SUCCESS) { + return result; + } + + return makeRefCounts(slabConfig->dataBlocks, slab, slab->refCountsOrigin, + allocator->readOnlyNotifier, &slab->referenceCounts); +} + +/**********************************************************************/ +void freeSlab(Slab **slabPtr) +{ + Slab *slab = *slabPtr; + if (slab == NULL) { + return; + } + + unspliceRingNode(&slab->ringNode); + freeSlabJournal(&slab->journal); + freeRefCounts(&slab->referenceCounts); + FREE(slab); + *slabPtr = NULL; +} + +/**********************************************************************/ +ZoneCount getSlabZoneNumber(Slab *slab) +{ + return slab->allocator->zoneNumber; +} + +/**********************************************************************/ +void markSlabReplaying(Slab 
*slab) +{ + if (slab->status == SLAB_REBUILT) { + slab->status = SLAB_REPLAYING; + } +} + +/**********************************************************************/ +void markSlabUnrecovered(Slab *slab) +{ + slab->status = SLAB_REQUIRES_SCRUBBING; +} + +/**********************************************************************/ +BlockCount getSlabFreeBlockCount(const Slab *slab) +{ + return getUnreferencedBlockCount(slab->referenceCounts); +} + +/**********************************************************************/ +int modifySlabReferenceCount(Slab *slab, + const JournalPoint *journalPoint, + ReferenceOperation operation) +{ + if (slab == NULL) { + return VDO_SUCCESS; + } + + /* + * If the slab is unrecovered, preserve the refCount state and let scrubbing + * correct the refCount. Note that the slab journal has already captured all + * refCount updates. + */ + if (isUnrecoveredSlab(slab)) { + SequenceNumber entryLock = journalPoint->sequenceNumber; + adjustSlabJournalBlockReference(slab->journal, entryLock, -1); + return VDO_SUCCESS; + } + + bool freeStatusChanged; + int result = adjustReferenceCount(slab->referenceCounts, operation, + journalPoint, &freeStatusChanged); + if (result != VDO_SUCCESS) { + return result; + } + + if (freeStatusChanged) { + adjustFreeBlockCount(slab, !isIncrementOperation(operation.type)); + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int acquireProvisionalReference(Slab *slab, + PhysicalBlockNumber pbn, + PBNLock *lock) +{ + if (hasProvisionalReference(lock)) { + return VDO_SUCCESS; + } + + int result = provisionallyReferenceBlock(slab->referenceCounts, pbn, lock); + if (result != VDO_SUCCESS) { + return result; + } + + if (hasProvisionalReference(lock)) { + adjustFreeBlockCount(slab, false); + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int slabBlockNumberFromPBN(Slab *slab, + PhysicalBlockNumber physicalBlockNumber, + SlabBlockNumber *slabBlockNumberPtr) +{ + if (physicalBlockNumber < slab->start) { + return VDO_OUT_OF_RANGE; + } + + uint64_t slabBlockNumber = physicalBlockNumber - slab->start; + if (slabBlockNumber >= getSlabConfig(slab->allocator->depot)->dataBlocks) { + return VDO_OUT_OF_RANGE; + } + + *slabBlockNumberPtr = slabBlockNumber; + return VDO_SUCCESS; +} + +/**********************************************************************/ +bool shouldSaveFullyBuiltSlab(const Slab *slab) +{ + // Write out the refCounts if the slab has written them before, or it has + // any non-zero reference counts, or there are any slab journal blocks. + BlockCount dataBlocks = getSlabConfig(slab->allocator->depot)->dataBlocks; + return (mustLoadRefCounts(slab->allocator->summary, slab->slabNumber) + || (getSlabFreeBlockCount(slab) != dataBlocks) + || !isSlabJournalBlank(slab->journal)); +} + +/** + * Initiate a slab action. + * + * Implements AdminInitiator. 
+ **/ +static void initiateSlabAction(AdminState *state) +{ + Slab *slab = container_of(state, Slab, state); + if (isDraining(state)) { + if (state->state == ADMIN_STATE_SCRUBBING) { + slab->status = SLAB_REBUILDING; + } + + drainSlabJournal(slab->journal); + + if (slab->referenceCounts != NULL) { + drainRefCounts(slab->referenceCounts); + } + + checkIfSlabDrained(slab); + return; + } + + if (isLoading(state)) { + decodeSlabJournal(slab->journal); + return; + } + + if (isResuming(state)) { + queueSlab(slab); + finishResuming(state); + return; + } + + finishOperationWithResult(state, VDO_INVALID_ADMIN_STATE); +} + +/**********************************************************************/ +void startSlabAction(Slab *slab, + AdminStateCode operation, + VDOCompletion *parent) +{ + startOperationWithWaiter(&slab->state, operation, parent, + initiateSlabAction); +} + +/**********************************************************************/ +void notifySlabJournalIsLoaded(Slab *slab, int result) +{ + if ((result == VDO_SUCCESS) && isCleanLoad(&slab->state)) { + // Since this is a normal or new load, we don't need the memory to read and + // process the recovery journal, so we can allocate reference counts now. + result = allocateRefCountsForSlab(slab); + } + + finishLoadingWithResult(&slab->state, result); +} + +/**********************************************************************/ +bool isSlabOpen(Slab *slab) +{ + return (!isQuiescing(&slab->state) && !isQuiescent(&slab->state)); +} + +/**********************************************************************/ +bool isSlabDraining(Slab *slab) +{ + return isDraining(&slab->state); +} + +/**********************************************************************/ +void checkIfSlabDrained(Slab *slab) +{ + if (isDraining(&slab->state) + && !isSlabJournalActive(slab->journal) + && ((slab->referenceCounts == NULL) + || !areRefCountsActive(slab->referenceCounts))) { + finishDrainingWithResult(&slab->state, + (isReadOnly(slab->allocator->readOnlyNotifier) + ? VDO_READ_ONLY : VDO_SUCCESS)); + } +} + +/**********************************************************************/ +void notifySlabJournalIsDrained(Slab *slab, int result) +{ + if (slab->referenceCounts == NULL) { + // This can happen when shutting down a VDO that was in read-only mode when + // loaded. 
+ notifyRefCountsAreDrained(slab, result); + return; + } + + setOperationResult(&slab->state, result); + drainRefCounts(slab->referenceCounts); +} + +/**********************************************************************/ +void notifyRefCountsAreDrained(Slab *slab, int result) +{ + finishDrainingWithResult(&slab->state, result); +} + +/**********************************************************************/ +bool isSlabResuming(Slab *slab) +{ + return isResuming(&slab->state); +} + +/**********************************************************************/ +void finishScrubbingSlab(Slab *slab) +{ + slab->status = SLAB_REBUILT; + queueSlab(slab); + reopenSlabJournal(slab->journal); +} + +/**********************************************************************/ +static const char *statusToString(SlabRebuildStatus status) +{ + switch (status) { + case SLAB_REBUILT: + return "REBUILT"; + case SLAB_REQUIRES_SCRUBBING: + return "SCRUBBING"; + case SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING: + return "PRIORITY_SCRUBBING"; + case SLAB_REBUILDING: + return "REBUILDING"; + case SLAB_REPLAYING: + return "REPLAYING"; + default: + return "UNKNOWN"; + } +} + +/**********************************************************************/ +void dumpSlab(const Slab *slab) +{ + if (slab->referenceCounts != NULL) { + // Terse because there are a lot of slabs to dump and syslog is lossy. + logInfo("slab %u: P%u, %llu free", + slab->slabNumber, slab->priority, getSlabFreeBlockCount(slab)); + } else { + logInfo("slab %u: status %s", slab->slabNumber, + statusToString(slab->status)); + } + + dumpSlabJournal(slab->journal); + + if (slab->referenceCounts != NULL) { + dumpRefCounts(slab->referenceCounts); + } else { + logInfo("refCounts is null"); + } +} diff --git a/source/vdo/base/slab.h b/source/vdo/base/slab.h new file mode 100644 index 0000000..c7f204b --- /dev/null +++ b/source/vdo/base/slab.h @@ -0,0 +1,379 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slab.h#8 $ + */ + +#ifndef VDO_SLAB_H +#define VDO_SLAB_H + +#include "permassert.h" + +#include "adminState.h" +#include "fixedLayout.h" +#include "journalPoint.h" +#include "referenceOperation.h" +#include "ringNode.h" +#include "types.h" + +typedef uint32_t SlabBlockNumber; + +typedef enum { + SLAB_REBUILT = 0, + SLAB_REPLAYING, + SLAB_REQUIRES_SCRUBBING, + SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING, + SLAB_REBUILDING, +} SlabRebuildStatus; + +/** + * This is the type declaration for the Slab type. (The struct tag is named + * vdoSlab to avoid a conflict with the linux kernel type). A Slab currently + * consists of a run of 2^23 data blocks, but that will soon change to + * dedicate a small number of those blocks for metadata storage for the + * reference counts and slab journal for the slab. 
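+ *
+ * As an illustrative usage sketch (the ring head and slab pointer here are
+ * hypothetical, not part of this declaration): a slab is queued on an
+ * allocator ring through its embedded ringNode, and the Slab is recovered
+ * from a dequeued node with slabFromRingNode(), which relies on ringNode
+ * being the first member of the struct:
+ *
+ *   RingNode ring;
+ *   initializeRing(&ring);
+ *   pushRingNode(&ring, &slab->ringNode);
+ *   Slab *dequeued = slabFromRingNode(popRingNode(&ring));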
+ **/ +struct vdoSlab { + /** A RingNode to queue this slab in a BlockAllocator ring */ + RingNode ringNode; + + /** The BlockAllocator that owns this slab */ + BlockAllocator *allocator; + + /** The reference counts for the data blocks in this slab */ + RefCounts *referenceCounts; + /** The journal for this slab */ + SlabJournal *journal; + + /** The slab number of this slab */ + SlabCount slabNumber; + /** The offset in the allocator partition of the first block in this slab */ + PhysicalBlockNumber start; + /** The offset of the first block past the end of this slab */ + PhysicalBlockNumber end; + /** The starting translated PBN of the slab journal */ + PhysicalBlockNumber journalOrigin; + /** The starting translated PBN of the reference counts */ + PhysicalBlockNumber refCountsOrigin; + + /** The administrative state of the slab */ + AdminState state; + /** The status of the slab */ + SlabRebuildStatus status; + /** Whether the slab was ever queued for scrubbing */ + bool wasQueuedForScrubbing; + + /** The priority at which this slab has been queued for allocation */ + uint8_t priority; +}; + +/** + * Measure and initialize the configuration to use for each slab. + * + * @param [in] slabSize The number of blocks per slab + * @param [in] slabJournalBlocks The number of blocks for the slab journal + * @param [out] slabConfig The slab configuration to initialize + * + * @return VDO_SUCCESS or an error code + **/ +int configureSlab(BlockCount slabSize, + BlockCount slabJournalBlocks, + SlabConfig *slabConfig) + __attribute__((warn_unused_result)); + +/** + * Convert a Slab's RingNode back to the Slab. + * + * @param ringNode The RingNode to convert + * + * @return The RingNode as a Slab + **/ +static inline Slab *slabFromRingNode(RingNode *ringNode) +{ + STATIC_ASSERT(offsetof(Slab, ringNode) == 0); + return (Slab *) ringNode; +} + +/** + * Get the physical block number of the start of the slab journal + * relative to the start block allocator partition. + * + * @param slabConfig The slab configuration of the VDO + * @param origin The first block of the slab + **/ +__attribute__((warn_unused_result)) +PhysicalBlockNumber getSlabJournalStartBlock(const SlabConfig *slabConfig, + PhysicalBlockNumber origin); + +/** + * Construct a new, empty slab. + * + * @param [in] slabOrigin The physical block number within the block + * allocator partition of the first block in the + * slab + * @param [in] allocator The block allocator to which the slab belongs + * @param [in] translation The translation from the depot's partition to + * the physical storage + * @param [in] recoveryJournal The recovery journal of the VDO + * @param [in] slabNumber The slab number of the slab + * @param [in] isNew true if this slab is being + * allocated as part of a resize + * @param [out] slabPtr A pointer to receive the new slab + * + * @return VDO_SUCCESS or an error code + **/ +int makeSlab(PhysicalBlockNumber slabOrigin, + BlockAllocator *allocator, + PhysicalBlockNumber translation, + RecoveryJournal *recoveryJournal, + SlabCount slabNumber, + bool isNew, + Slab **slabPtr) + __attribute__((warn_unused_result)); + +/** + * Allocate the reference counts for a slab. + * + * @param slab The slab whose reference counts need allocation. + * + * @return VDO_SUCCESS or an error code + **/ +int allocateRefCountsForSlab(Slab *slab) + __attribute__((warn_unused_result)); + +/** + * Destroy a slab and null out the reference to it. 
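+ *
+ * A minimal usage sketch: because the referenced pointer is nulled out and a
+ * NULL slab is ignored, a repeated call is a harmless no-op:
+ *
+ *   freeSlab(&slab);   // frees the slab and sets slab to NULL
+ *   freeSlab(&slab);   // no-op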
+ * + * @param slabPtr The reference to the slab to destroy + **/ +void freeSlab(Slab **slabPtr); + +/** + * Get the physical zone number of a slab. + * + * @param slab The slab + * + * @return The number of the slab's physical zone + **/ +ZoneCount getSlabZoneNumber(Slab *slab) + __attribute__((warn_unused_result)); + +/** + * Check whether a slab is unrecovered. + * + * @param slab The slab to check + * + * @return true if the slab is unrecovered + **/ +static inline bool isUnrecoveredSlab(const Slab *slab) +{ + return (slab->status != SLAB_REBUILT); +} + +/** + * Check whether a slab is being replayed into. + * + * @param slab The slab to check + * + * @return true if the slab is replaying + **/ +static inline bool isReplayingSlab(const Slab *slab) +{ + return (slab->status == SLAB_REPLAYING); +} + +/** + * Check whether a slab is being rebuilt. + * + * @param slab The slab to check + * + * @return true if the slab is being rebuilt + **/ +static inline bool slabIsRebuilding(const Slab *slab) +{ + return (slab->status == SLAB_REBUILDING); +} + +/** + * Mark a slab as replaying, during offline recovery. + * + * @param slab The slab to mark + **/ +void markSlabReplaying(Slab *slab); + +/** + * Mark a slab as unrecovered, for online recovery. + * + * @param slab The slab to mark + **/ +void markSlabUnrecovered(Slab *slab); + +/** + * Get the current number of free blocks in a slab. + * + * @param slab The slab to query + * + * @return the number of free blocks in the slab + **/ +BlockCount getSlabFreeBlockCount(const Slab *slab) + __attribute__((warn_unused_result)); + +/** + * Increment or decrement the reference count of a block in a slab. + * + * @param slab The slab containing the block (may be NULL when + * referencing the zero block) + * @param journalPoint The slab journal entry corresponding to this change + * @param operation The operation to perform on the reference count + * + * @return VDO_SUCCESS or an error + **/ +int modifySlabReferenceCount(Slab *slab, + const JournalPoint *journalPoint, + ReferenceOperation operation) + __attribute__((warn_unused_result)); + +/** + * Acquire a provisional reference on behalf of a PBN lock if the block it + * locks is unreferenced. + * + * @param slab The slab which contains the block + * @param pbn The physical block to reference + * @param lock The lock + * + * @return VDO_SUCCESS or an error + **/ +int acquireProvisionalReference(Slab *slab, + PhysicalBlockNumber pbn, + PBNLock *lock) + __attribute__((warn_unused_result)); + +/** + * Determine the index within the slab of a particular physical block number. + * + * @param [in] slab The slab + * @param [in] physicalBlockNumber The physical block number + * @param [out] slabBlockNumberPtr A pointer to the slab block number + * + * @return VDO_SUCCESS or an error code + **/ +int slabBlockNumberFromPBN(Slab *slab, + PhysicalBlockNumber physicalBlockNumber, + SlabBlockNumber *slabBlockNumberPtr) + __attribute__((warn_unused_result)); + +/** + * Check whether the reference counts for a given rebuilt slab should be saved. + * Implements SlabStatusChecker. + * + * @param slab The slab to check + * + * @return true if the slab should be saved + **/ +bool shouldSaveFullyBuiltSlab(const Slab *slab) + __attribute__((warn_unused_result)); + +/** + * Start an administrative operation on a slab. 
+ * + * @param slab The slab to load + * @param operation The type of load to perform + * @param parent The object to notify when the operation is complete + **/ +void startSlabAction(Slab *slab, + AdminStateCode operation, + VDOCompletion *parent); + +/** + * Inform a slab that its journal has been loaded. + * + * @param slab The slab whose journal has been loaded + * @param result The result of the load operation + **/ +void notifySlabJournalIsLoaded(Slab *slab, int result); + +/** + * Check whether a slab is open, i.e. is neither quiescent nor quiescing. + * + * @param slab The slab to check + * + * @return true if the slab is open + **/ +bool isSlabOpen(Slab *slab) + __attribute__((warn_unused_result)); + +/** + * Check whether a slab is currently draining. + * + * @param slab The slab to check + * + * @return true if the slab is performing a drain operation + **/ +bool isSlabDraining(Slab *slab) + __attribute__((warn_unused_result)); + +/** + * Check whether a slab has drained, and if so, send a notification thereof. + * + * @param slab The slab to check + **/ +void checkIfSlabDrained(Slab *slab); + +/** + * Inform a slab that its journal has finished draining. + * + * @param slab The slab whose journal has been drained + * @param result The result of the drain operation + **/ +void notifySlabJournalIsDrained(Slab *slab, int result); + +/** + * Inform a slab that its RefCounts have finished draining. + * + * @param slab The slab whose RefCounts has been drained + * @param result The result of the drain operation + **/ +void notifyRefCountsAreDrained(Slab *slab, int result); + +/** + * Check whether a slab is currently resuming. + * + * @param slab The slab to check + * + * @return true if the slab is performing a resume operation + **/ +bool isSlabResuming(Slab *slab) + __attribute__((warn_unused_result)); + +/** + * Finish scrubbing a slab now that it has been rebuilt by updating its status, + * queueing it for allocation, and reopening its journal. + * + * @param slab The slab whose reference counts have been rebuilt from its + * journal + **/ +void finishScrubbingSlab(Slab *slab); + +/** + * Dump information about a slab to the log for debugging. + * + * @param slab The slab to dump + **/ +void dumpSlab(const Slab *slab); + +#endif // VDO_SLAB_H diff --git a/source/vdo/base/slabDepot.c b/source/vdo/base/slabDepot.c new file mode 100644 index 0000000..6c10c29 --- /dev/null +++ b/source/vdo/base/slabDepot.c @@ -0,0 +1,1145 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabDepot.c#23 $ + */ + +#include "slabDepot.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "actionManager.h" +#include "adminState.h" +#include "blockAllocatorInternals.h" +#include "constants.h" +#include "header.h" +#include "numUtils.h" +#include "readOnlyNotifier.h" +#include "refCounts.h" +#include "slab.h" +#include "slabDepotInternals.h" +#include "slabJournal.h" +#include "slabIterator.h" +#include "slabSummary.h" +#include "threadConfig.h" +#include "types.h" + +typedef struct { + SlabConfig slabConfig; + PhysicalBlockNumber firstBlock; + PhysicalBlockNumber lastBlock; + ZoneCount zoneCount; +} __attribute__((packed)) SlabDepotState2_0; + +static const Header SLAB_DEPOT_HEADER_2_0 = { + .id = SLAB_DEPOT, + .version = { + .majorVersion = 2, + .minorVersion = 0, + }, + .size = sizeof(SlabDepotState2_0), +}; + +/** + * Compute the number of slabs a depot with given parameters would have. + * + * @param firstBlock PBN of the first data block + * @param lastBlock PBN of the last data block + * @param slabSizeShift Exponent for the number of blocks per slab + * + * @return The number of slabs + **/ +__attribute__((warn_unused_result)) +static SlabCount computeSlabCount(PhysicalBlockNumber firstBlock, + PhysicalBlockNumber lastBlock, + unsigned int slabSizeShift) +{ + BlockCount dataBlocks = lastBlock - firstBlock; + return (SlabCount) (dataBlocks >> slabSizeShift); +} + +/**********************************************************************/ +SlabCount calculateSlabCount(SlabDepot *depot) +{ + return computeSlabCount(depot->firstBlock, depot->lastBlock, + depot->slabSizeShift); +} + +/** + * Get an iterator over all the slabs in the depot. + * + * @param depot The depot + * + * @return An iterator over the depot's slabs + **/ +static SlabIterator getSlabIterator(SlabDepot *depot) +{ + return iterateSlabs(depot->slabs, depot->slabCount - 1, 0, 1); +} + +/** + * Allocate a new slab pointer array. Any existing slab pointers will be + * copied into the new array, and slabs will be allocated as needed. The + * newly allocated slabs will not be distributed for use by the block + * allocators. + * + * @param depot The depot + * @param slabCount The number of slabs the depot should have in the new + * array + * + * @return VDO_SUCCESS or an error code + **/ +static int allocateSlabs(SlabDepot *depot, SlabCount slabCount) +{ + int result = ALLOCATE(slabCount, Slab *, "slab pointer array", + &depot->newSlabs); + if (result != VDO_SUCCESS) { + return result; + } + + bool resizing = false; + if (depot->slabs != NULL) { + memcpy(depot->newSlabs, depot->slabs, depot->slabCount * sizeof(Slab *)); + resizing = true; + } + + BlockCount slabSize = getSlabConfig(depot)->slabBlocks; + PhysicalBlockNumber slabOrigin + = depot->firstBlock + (depot->slabCount * slabSize); + + // The translation between allocator partition PBNs and layer PBNs. + BlockCount translation = depot->origin - depot->firstBlock; + depot->newSlabCount = depot->slabCount; + while (depot->newSlabCount < slabCount) { + BlockAllocator *allocator + = depot->allocators[depot->newSlabCount % depot->zoneCount]; + Slab **slabPtr = &depot->newSlabs[depot->newSlabCount]; + result = makeSlab(slabOrigin, allocator, translation, depot->journal, + depot->newSlabCount, resizing, slabPtr); + if (result != VDO_SUCCESS) { + return result; + } + // Increment here to ensure that abandonNewSlabs will clean up correctly. 
+ depot->newSlabCount++; + + slabOrigin += slabSize; + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +void abandonNewSlabs(SlabDepot *depot) +{ + if (depot->newSlabs == NULL) { + return; + } + for (SlabCount i = depot->slabCount; i < depot->newSlabCount; i++) { + freeSlab(&depot->newSlabs[i]); + } + depot->newSlabCount = 0; + FREE(depot->newSlabs); + depot->newSlabs = NULL; + depot->newSize = 0; +} + +/** + * Get the ID of the thread on which a given allocator operates. + * + *
Implements ZoneThreadGetter. + **/ +static ThreadID getAllocatorThreadID(void *context, ZoneCount zoneNumber) +{ + return getBlockAllocatorForZone(context, zoneNumber)->threadID; +} + +/** + * Prepare to commit oldest tail blocks. + * + *
Implements ActionPreamble. + **/ +static void prepareForTailBlockCommit(void *context, VDOCompletion *parent) +{ + SlabDepot *depot = context; + depot->activeReleaseRequest = depot->newReleaseRequest; + completeCompletion(parent); +} + +/** + * Schedule a tail block commit if necessary. This method should not be called + * directly. Rather, call scheduleDefaultAction() on the depot's action + * manager. + * + *
Implements ActionScheduler, + **/ +static bool scheduleTailBlockCommit(void *context) +{ + SlabDepot *depot = context; + if (depot->newReleaseRequest == depot->activeReleaseRequest) { + return false; + } + + return scheduleAction(depot->actionManager, prepareForTailBlockCommit, + releaseTailBlockLocks, NULL, NULL); +} + +/** + * Allocate those components of the slab depot which are needed only at load + * time, not at format time. + * + * @param depot The depot + * @param nonce The nonce of the VDO + * @param threadConfig The thread config of the VDO + * @param vioPoolSize The size of the VIO pool + * @param layer The physical layer below this depot + * @param summaryPartition The partition which holds the slab summary + * + * @return VDO_SUCCESS or an error + **/ +static int allocateComponents(SlabDepot *depot, + Nonce nonce, + const ThreadConfig *threadConfig, + BlockCount vioPoolSize, + PhysicalLayer *layer, + Partition *summaryPartition) +{ + /* + * If createVIO is NULL, the slab depot is only being used to format + * or audit the VDO. These only require the SuperBlock component, so we can + * just skip allocating all the memory needed for runtime components. + */ + if (layer->createMetadataVIO == NULL) { + return VDO_SUCCESS; + } + + int result = initializeEnqueueableCompletion(&depot->scrubbingCompletion, + SUB_TASK_COMPLETION, layer); + if (result != VDO_SUCCESS) { + return result; + } + + result = makeActionManager(depot->zoneCount, getAllocatorThreadID, + getJournalZoneThread(threadConfig), depot, + scheduleTailBlockCommit, layer, + &depot->actionManager); + if (result != VDO_SUCCESS) { + return result; + } + + depot->origin = depot->firstBlock; + + result = makeSlabSummary(layer, summaryPartition, threadConfig, + depot->slabSizeShift, depot->slabConfig.dataBlocks, + depot->readOnlyNotifier, &depot->slabSummary); + if (result != VDO_SUCCESS) { + return result; + } + + SlabCount slabCount = calculateSlabCount(depot); + if (threadConfig->physicalZoneCount > slabCount) { + return logErrorWithStringError(VDO_BAD_CONFIGURATION, + "%u physical zones exceeds slab count %u", + threadConfig->physicalZoneCount, slabCount); + } + + // Allocate the block allocators. + for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { + ThreadID threadID = getPhysicalZoneThread(threadConfig, zone); + result = makeBlockAllocator(depot, zone, threadID, nonce, vioPoolSize, + layer, depot->readOnlyNotifier, + &depot->allocators[zone]); + if (result != VDO_SUCCESS) { + return result; + } + } + + // Allocate slabs. + result = allocateSlabs(depot, slabCount); + if (result != VDO_SUCCESS) { + return result; + } + + // Use the new slabs. + for (SlabCount i = depot->slabCount; i < depot->newSlabCount; i++) { + Slab *slab = depot->newSlabs[i]; + registerSlabWithAllocator(slab->allocator, slab); + depot->slabCount++; + } + + depot->slabs = depot->newSlabs; + depot->newSlabs = NULL; + depot->newSlabCount = 0; + + return VDO_SUCCESS; +} + +/** + * Allocate a slab depot. 
+ * + * @param [in] state The parameters for the new depot + * @param [in] threadConfig The thread config of the VDO + * @param [in] nonce The nonce of the VDO + * @param [in] vioPoolSize The size of the VIO pool + * @param [in] layer The physical layer below this depot + * @param [in] summaryPartition The partition which holds the slab summary + * (if NULL, the depot is format-only) + * @param [in] readOnlyNotifier The context for entering read-only mode + * @param [in] recoveryJournal The recovery journal of the VDO + * @param [out] depotPtr A pointer to hold the depot + * + * @return A success or error code + **/ +__attribute__((warn_unused_result)) +static int allocateDepot(const SlabDepotState2_0 *state, + const ThreadConfig *threadConfig, + Nonce nonce, + BlockCount vioPoolSize, + PhysicalLayer *layer, + Partition *summaryPartition, + ReadOnlyNotifier *readOnlyNotifier, + RecoveryJournal *recoveryJournal, + SlabDepot **depotPtr) +{ + // Calculate the bit shift for efficiently mapping block numbers to slabs. + // Using a shift requires that the slab size be a power of two. + BlockCount slabSize = state->slabConfig.slabBlocks; + if (!isPowerOfTwo(slabSize)) { + return logErrorWithStringError(UDS_INVALID_ARGUMENT, + "slab size must be a power of two"); + } + unsigned int slabSizeShift = logBaseTwo(slabSize); + + SlabDepot *depot; + int result = ALLOCATE_EXTENDED(SlabDepot, threadConfig->physicalZoneCount, + BlockAllocator *, __func__, &depot); + if (result != VDO_SUCCESS) { + return result; + } + + depot->oldZoneCount = state->zoneCount; + depot->zoneCount = threadConfig->physicalZoneCount; + depot->slabConfig = state->slabConfig; + depot->readOnlyNotifier = readOnlyNotifier; + depot->firstBlock = state->firstBlock; + depot->lastBlock = state->lastBlock; + depot->slabSizeShift = slabSizeShift; + depot->journal = recoveryJournal; + + result = allocateComponents(depot, nonce, threadConfig, vioPoolSize, + layer, summaryPartition); + if (result != VDO_SUCCESS) { + freeSlabDepot(&depot); + return result; + } + + *depotPtr = depot; + return VDO_SUCCESS; +} + +/** + * Configure the SlabDepot for the specified storage capacity, finding the + * number of data blocks that will fit and still leave room for the depot + * metadata, then return the saved state for that configuration. + * + * @param [in] blockCount The number of blocks in the underlying storage + * @param [in] firstBlock The number of the first block that may be allocated + * @param [in] slabConfig The configuration of a single slab + * @param [in] zoneCount The number of zones the depot will use + * @param [out] state The state structure to be configured + * + * @return VDO_SUCCESS or an error code + **/ +static int configureState(BlockCount blockCount, + PhysicalBlockNumber firstBlock, + SlabConfig slabConfig, + ZoneCount zoneCount, + SlabDepotState2_0 *state) +{ + BlockCount slabSize = slabConfig.slabBlocks; + logDebug("slabDepot configureState(blockCount=%" PRIu64 + ", firstBlock=%llu, slabSize=%llu, zoneCount=%u)", + blockCount, firstBlock, slabSize, zoneCount); + + // We do not allow runt slabs, so we waste up to a slab's worth. 
+ size_t slabCount = (blockCount / slabSize); + if (slabCount == 0) { + return VDO_NO_SPACE; + } + + if (slabCount > MAX_SLABS) { + return VDO_TOO_MANY_SLABS; + } + + BlockCount totalSlabBlocks = slabCount * slabConfig.slabBlocks; + BlockCount totalDataBlocks = slabCount * slabConfig.dataBlocks; + PhysicalBlockNumber lastBlock = firstBlock + totalSlabBlocks; + + *state = (SlabDepotState2_0) { + .slabConfig = slabConfig, + .firstBlock = firstBlock, + .lastBlock = lastBlock, + .zoneCount = zoneCount, + }; + + logDebug("slabDepot lastBlock=%llu, totalDataBlocks=%" PRIu64 + ", slabCount=%zu, leftOver=%llu", + lastBlock, totalDataBlocks, slabCount, + blockCount - (lastBlock - firstBlock)); + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeSlabDepot(BlockCount blockCount, + PhysicalBlockNumber firstBlock, + SlabConfig slabConfig, + const ThreadConfig *threadConfig, + Nonce nonce, + BlockCount vioPoolSize, + PhysicalLayer *layer, + Partition *summaryPartition, + ReadOnlyNotifier *readOnlyNotifier, + RecoveryJournal *recoveryJournal, + SlabDepot **depotPtr) +{ + SlabDepotState2_0 state; + int result = configureState(blockCount, firstBlock, slabConfig, 0, &state); + if (result != VDO_SUCCESS) { + return result; + } + + SlabDepot *depot = NULL; + result = allocateDepot(&state, threadConfig, nonce, vioPoolSize, layer, + summaryPartition, readOnlyNotifier, recoveryJournal, + &depot); + if (result != VDO_SUCCESS) { + return result; + } + + *depotPtr = depot; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeSlabDepot(SlabDepot **depotPtr) +{ + SlabDepot *depot = *depotPtr; + if (depot == NULL) { + return; + } + + abandonNewSlabs(depot); + + for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { + freeBlockAllocator(&depot->allocators[zone]); + } + + if (depot->slabs != NULL) { + for (SlabCount i = 0; i < depot->slabCount; i++) { + freeSlab(&depot->slabs[i]); + } + } + + FREE(depot->slabs); + freeActionManager(&depot->actionManager); + freeSlabSummary(&depot->slabSummary); + destroyEnqueueable(&depot->scrubbingCompletion); + FREE(depot); + *depotPtr = NULL; +} + +/**********************************************************************/ +size_t getSlabDepotEncodedSize(void) +{ + return ENCODED_HEADER_SIZE + sizeof(SlabDepotState2_0); +} + +/** + * Decode a slab config from a buffer. 
+ * + * @param buffer A buffer positioned at the start of the encoding + * @param config The config structure to receive the decoded values + * + * @return UDS_SUCCESS or an error code + **/ +static int decodeSlabConfig(Buffer *buffer, SlabConfig *config) +{ + BlockCount count; + int result = getUInt64LEFromBuffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + config->slabBlocks = count; + + result = getUInt64LEFromBuffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + config->dataBlocks = count; + + result = getUInt64LEFromBuffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + config->referenceCountBlocks = count; + + result = getUInt64LEFromBuffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + config->slabJournalBlocks = count; + + result = getUInt64LEFromBuffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + config->slabJournalFlushingThreshold = count; + + result = getUInt64LEFromBuffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + config->slabJournalBlockingThreshold = count; + + result = getUInt64LEFromBuffer(buffer, &count); + if (result != UDS_SUCCESS) { + return result; + } + config->slabJournalScrubbingThreshold = count; + + return UDS_SUCCESS; +} + +/** + * Encode a slab config into a buffer. + * + * @param config The config structure to encode + * @param buffer A buffer positioned at the start of the encoding + * + * @return UDS_SUCCESS or an error code + **/ +static int encodeSlabConfig(const SlabConfig *config, Buffer *buffer) +{ + int result = putUInt64LEIntoBuffer(buffer, config->slabBlocks); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, config->dataBlocks); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, config->referenceCountBlocks); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, config->slabJournalBlocks); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, config->slabJournalFlushingThreshold); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, config->slabJournalBlockingThreshold); + if (result != UDS_SUCCESS) { + return result; + } + + return putUInt64LEIntoBuffer(buffer, config->slabJournalScrubbingThreshold); +} + +/**********************************************************************/ +int encodeSlabDepot(const SlabDepot *depot, Buffer *buffer) +{ + int result = encodeHeader(&SLAB_DEPOT_HEADER_2_0, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + size_t initialLength = contentLength(buffer); + + result = encodeSlabConfig(&depot->slabConfig, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, depot->firstBlock); + if (result != UDS_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, depot->lastBlock); + if (result != UDS_SUCCESS) { + return result; + } + + /* + * If this depot is currently using 0 zones, it must have been + * synchronously loaded by a tool and is now being saved. We + * did not load and combine the slab summary, so we still need + * to do that next time we load with the old zone count rather + * than 0. 
+ */ + ZoneCount zonesToRecord = depot->zoneCount; + if (depot->zoneCount == 0) { + zonesToRecord = depot->oldZoneCount; + } + result = putByte(buffer, zonesToRecord); + if (result != UDS_SUCCESS) { + return result; + } + + size_t encodedSize = contentLength(buffer) - initialLength; + return ASSERT(SLAB_DEPOT_HEADER_2_0.size == encodedSize, + "encoded block map component size must match header size"); +} + +/** + * Decode slab depot component state version 2.0 from a buffer. + * + * @param buffer A buffer positioned at the start of the encoding + * @param state The state structure to receive the decoded values + * + * @return UDS_SUCCESS or an error code + **/ +static int decodeSlabDepotState_2_0(Buffer *buffer, SlabDepotState2_0 *state) +{ + size_t initialLength = contentLength(buffer); + + int result = decodeSlabConfig(buffer, &state->slabConfig); + if (result != UDS_SUCCESS) { + return result; + } + + PhysicalBlockNumber firstBlock; + result = getUInt64LEFromBuffer(buffer, &firstBlock); + if (result != UDS_SUCCESS) { + return result; + } + state->firstBlock = firstBlock; + + PhysicalBlockNumber lastBlock; + result = getUInt64LEFromBuffer(buffer, &lastBlock); + if (result != UDS_SUCCESS) { + return result; + } + state->lastBlock = lastBlock; + + result = getByte(buffer, &state->zoneCount); + if (result != UDS_SUCCESS) { + return result; + } + + size_t decodedSize = initialLength - contentLength(buffer); + return ASSERT(SLAB_DEPOT_HEADER_2_0.size == decodedSize, + "decoded slab depot component size must match header size"); +} + +/**********************************************************************/ +int decodeSlabDepot(Buffer *buffer, + const ThreadConfig *threadConfig, + Nonce nonce, + PhysicalLayer *layer, + Partition *summaryPartition, + ReadOnlyNotifier *readOnlyNotifier, + RecoveryJournal *recoveryJournal, + SlabDepot **depotPtr) +{ + Header header; + int result = decodeHeader(buffer, &header); + if (result != VDO_SUCCESS) { + return result; + } + + result = validateHeader(&SLAB_DEPOT_HEADER_2_0, &header, true, __func__); + if (result != VDO_SUCCESS) { + return result; + } + + SlabDepotState2_0 state; + result = decodeSlabDepotState_2_0(buffer, &state); + if (result != UDS_SUCCESS) { + return result; + } + + return allocateDepot(&state, threadConfig, nonce, VIO_POOL_SIZE, layer, + summaryPartition, readOnlyNotifier, recoveryJournal, + depotPtr); +} + +/**********************************************************************/ +int decodeSodiumSlabDepot(Buffer *buffer, + const ThreadConfig *threadConfig, + Nonce nonce, + PhysicalLayer *layer, + Partition *summaryPartition, + ReadOnlyNotifier *readOnlyNotifier, + RecoveryJournal *recoveryJournal, + SlabDepot **depotPtr) +{ + // Sodium uses version 2.0 of the slab depot state. 
+ return decodeSlabDepot(buffer, threadConfig, nonce, layer, summaryPartition, + readOnlyNotifier, recoveryJournal, depotPtr); +} + +/**********************************************************************/ +int allocateSlabRefCounts(SlabDepot *depot) +{ + SlabIterator iterator = getSlabIterator(depot); + while (hasNextSlab(&iterator)) { + int result = allocateRefCountsForSlab(nextSlab(&iterator)); + if (result != VDO_SUCCESS) { + return result; + } + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +BlockAllocator *getBlockAllocatorForZone(SlabDepot *depot, + ZoneCount zoneNumber) +{ + return depot->allocators[zoneNumber]; +} + +/**********************************************************************/ +int getSlabNumber(const SlabDepot *depot, + PhysicalBlockNumber pbn, + SlabCount *slabNumberPtr) +{ + if (pbn < depot->firstBlock) { + return VDO_OUT_OF_RANGE; + } + + SlabCount slabNumber = (pbn - depot->firstBlock) >> depot->slabSizeShift; + if (slabNumber >= depot->slabCount) { + return VDO_OUT_OF_RANGE; + } + + *slabNumberPtr = slabNumber; + return VDO_SUCCESS; +} + +/**********************************************************************/ +Slab *getSlab(const SlabDepot *depot, PhysicalBlockNumber pbn) +{ + if (pbn == ZERO_BLOCK) { + return NULL; + } + + SlabCount slabNumber; + int result = getSlabNumber(depot, pbn, &slabNumber); + if (result != VDO_SUCCESS) { + enterReadOnlyMode(depot->readOnlyNotifier, result); + return NULL; + } + + return depot->slabs[slabNumber]; + +} + +/**********************************************************************/ +SlabJournal *getSlabJournal(const SlabDepot *depot, PhysicalBlockNumber pbn) +{ + Slab *slab = getSlab(depot, pbn); + return ((slab != NULL) ? slab->journal : NULL); +} + +/**********************************************************************/ +uint8_t getIncrementLimit(SlabDepot *depot, PhysicalBlockNumber pbn) +{ + Slab *slab = getSlab(depot, pbn); + if ((slab == NULL) || isUnrecoveredSlab(slab)) { + return 0; + } + + return getAvailableReferences(slab->referenceCounts, pbn); +} + +/**********************************************************************/ +bool isPhysicalDataBlock(const SlabDepot *depot, PhysicalBlockNumber pbn) +{ + if (pbn == ZERO_BLOCK) { + return true; + } + + SlabCount slabNumber; + if (getSlabNumber(depot, pbn, &slabNumber) != VDO_SUCCESS) { + return false; + } + + SlabBlockNumber sbn; + int result = slabBlockNumberFromPBN(depot->slabs[slabNumber], pbn, &sbn); + return (result == VDO_SUCCESS); +} + +/**********************************************************************/ +BlockCount getDepotAllocatedBlocks(const SlabDepot *depot) +{ + BlockCount total = 0; + for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { + // The allocators are responsible for thread safety. + total += getAllocatedBlocks(depot->allocators[zone]); + } + return total; +} + +/**********************************************************************/ +BlockCount getDepotDataBlocks(const SlabDepot *depot) +{ + // XXX This needs to be thread safe, but resize changes the slab count. It + // does so on the admin thread (our usual caller), so it's usually safe. 
+ return (depot->slabCount * depot->slabConfig.dataBlocks); +} + +/**********************************************************************/ +BlockCount getDepotFreeBlocks(const SlabDepot *depot) +{ + /* + * We can't ever shrink a volume except when resize fails, and we can't + * allocate from the new slabs until after the resize succeeds, so by + * getting the number of allocated blocks first, we ensure the allocated + * count is always less than the capacity. Doing it in the other order on a + * full volume could lose a race with a sucessful resize, resulting in a + * nonsensical negative/underflow result. + */ + BlockCount allocated = getDepotAllocatedBlocks(depot); + memoryFence(); + return (getDepotDataBlocks(depot) - allocated); +} + +/**********************************************************************/ +SlabCount getDepotSlabCount(const SlabDepot *depot) +{ + return depot->slabCount; +} + +/**********************************************************************/ +SlabCount getDepotUnrecoveredSlabCount(const SlabDepot *depot) +{ + SlabCount total = 0; + for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { + // The allocators are responsible for thread safety. + total += getUnrecoveredSlabCount(depot->allocators[zone]); + } + return total; +} + +/** + * The preamble of a load operation which loads the slab summary. + * + *
Implements ActionPreamble. + **/ +static void startDepotLoad(void *context, VDOCompletion *parent) +{ + SlabDepot *depot = context; + loadSlabSummary(depot->slabSummary, + getCurrentManagerOperation(depot->actionManager), + depot->oldZoneCount, parent); +} + +/**********************************************************************/ +void loadSlabDepot(SlabDepot *depot, + AdminStateCode operation, + VDOCompletion *parent, + void *context) +{ + if (assertLoadOperation(operation, parent)) { + scheduleOperationWithContext(depot->actionManager, operation, + startDepotLoad, loadBlockAllocator, NULL, + context, parent); + } +} + +/**********************************************************************/ +void prepareToAllocate(SlabDepot *depot, + SlabDepotLoadType loadType, + VDOCompletion *parent) +{ + depot->loadType = loadType; + atomicStore32(&depot->zonesToScrub, depot->zoneCount); + scheduleAction(depot->actionManager, NULL, prepareAllocatorToAllocate, + NULL, parent); +} + +/**********************************************************************/ +void updateSlabDepotSize(SlabDepot *depot) +{ + depot->lastBlock = depot->newLastBlock; +} + +/**********************************************************************/ +int prepareToGrowSlabDepot(SlabDepot *depot, BlockCount newSize) +{ + if ((newSize >> depot->slabSizeShift) <= depot->slabCount) { + return VDO_INCREMENT_TOO_SMALL; + } + + // Generate the depot configuration for the new block count. + SlabDepotState2_0 newState; + int result = configureState(newSize, depot->firstBlock, depot->slabConfig, + depot->zoneCount, &newState); + if (result != VDO_SUCCESS) { + return result; + } + + SlabCount newSlabCount = computeSlabCount(depot->firstBlock, + newState.lastBlock, + depot->slabSizeShift); + if (newSlabCount <= depot->slabCount) { + return logErrorWithStringError(VDO_INCREMENT_TOO_SMALL, + "Depot can only grow"); + } + if (newSlabCount == depot->newSlabCount) { + // Check it out, we've already got all the new slabs allocated! + return VDO_SUCCESS; + } + + abandonNewSlabs(depot); + result = allocateSlabs(depot, newSlabCount); + if (result != VDO_SUCCESS) { + abandonNewSlabs(depot); + return result; + } + + depot->newSize = newSize; + depot->oldLastBlock = depot->lastBlock; + depot->newLastBlock = newState.lastBlock; + + return VDO_SUCCESS; +} + +/** + * Finish registering new slabs now that all of the allocators have received + * their new slabs. + * + *
Implements ActionConclusion. + **/ +static int finishRegistration(void *context) +{ + SlabDepot *depot = context; + depot->slabCount = depot->newSlabCount; + FREE(depot->slabs); + depot->slabs = depot->newSlabs; + depot->newSlabs = NULL; + depot->newSlabCount = 0; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void useNewSlabs(SlabDepot *depot, VDOCompletion *parent) +{ + ASSERT_LOG_ONLY(depot->newSlabs != NULL, "Must have new slabs to use"); + scheduleOperation(depot->actionManager, ADMIN_STATE_SUSPENDED_OPERATION, + NULL, registerNewSlabsForAllocator, finishRegistration, + parent); +} + +/**********************************************************************/ +void drainSlabDepot(SlabDepot *depot, + AdminStateCode operation, + VDOCompletion *parent) +{ + scheduleOperation(depot->actionManager, operation, NULL, drainBlockAllocator, + NULL, parent); +} + +/**********************************************************************/ +void resumeSlabDepot(SlabDepot *depot, VDOCompletion *parent) +{ + if (isReadOnly(depot->readOnlyNotifier)) { + finishCompletion(parent, VDO_READ_ONLY); + return; + } + + scheduleOperation(depot->actionManager, ADMIN_STATE_RESUMING, NULL, + resumeBlockAllocator, NULL, parent); +} + +/**********************************************************************/ +void commitOldestSlabJournalTailBlocks(SlabDepot *depot, + SequenceNumber recoveryBlockNumber) +{ + if (depot == NULL) { + return; + } + + depot->newReleaseRequest = recoveryBlockNumber; + scheduleDefaultAction(depot->actionManager); +} + +/**********************************************************************/ +const SlabConfig *getSlabConfig(const SlabDepot *depot) +{ + return &depot->slabConfig; +} + +/**********************************************************************/ +SlabSummary *getSlabSummary(const SlabDepot *depot) +{ + return depot->slabSummary; +} + +/**********************************************************************/ +SlabSummaryZone *getSlabSummaryForZone(const SlabDepot *depot, ZoneCount zone) +{ + if (depot->slabSummary == NULL) { + return NULL; + } + return getSummaryForZone(depot->slabSummary, zone); +} + +/**********************************************************************/ +void scrubAllUnrecoveredSlabs(SlabDepot *depot, + void *parent, + VDOAction *callback, + VDOAction *errorHandler, + ThreadID threadID, + VDOCompletion *launchParent) +{ + prepareCompletion(&depot->scrubbingCompletion, callback, errorHandler, + threadID, parent); + scheduleAction(depot->actionManager, NULL, scrubAllUnrecoveredSlabsInZone, + NULL, launchParent); +} + +/**********************************************************************/ +void notifyZoneFinishedScrubbing(VDOCompletion *completion) +{ + SlabDepot *depot = completion->parent; + if (atomicAdd32(&depot->zonesToScrub, -1) == 0) { + // We're the last! + completeCompletion(&depot->scrubbingCompletion); + } +} + +/**********************************************************************/ +bool hasUnrecoveredSlabs(SlabDepot *depot) +{ + return (atomicLoad32(&depot->zonesToScrub) > 0); +} + +/**********************************************************************/ +BlockCount getNewDepotSize(const SlabDepot *depot) +{ + return (depot->newSlabs == NULL) ? 
0 : depot->newSize; +} + +/**********************************************************************/ +bool areEquivalentDepots(SlabDepot *depotA, SlabDepot *depotB) +{ + if ((depotA->firstBlock != depotB->firstBlock) + || (depotA->lastBlock != depotB->lastBlock) + || (depotA->slabCount != depotB->slabCount) + || (depotA->slabSizeShift != depotB->slabSizeShift) + || (getDepotAllocatedBlocks(depotA) + != getDepotAllocatedBlocks(depotB))) { + return false; + } + + for (size_t i = 0; i < depotA->slabCount; i++) { + Slab *slabA = depotA->slabs[i]; + Slab *slabB = depotB->slabs[i]; + if ((slabA->start != slabB->start) + || (slabA->end != slabB->end) + || !areEquivalentReferenceCounters(slabA->referenceCounts, + slabB->referenceCounts)) { + return false; + } + } + + return true; +} + +/**********************************************************************/ +void allocateFromLastSlab(SlabDepot *depot) +{ + for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { + allocateFromAllocatorLastSlab(depot->allocators[zone]); + } +} + +/**********************************************************************/ +BlockAllocatorStatistics +getDepotBlockAllocatorStatistics(const SlabDepot *depot) +{ + BlockAllocatorStatistics totals; + memset(&totals, 0, sizeof(totals)); + + for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { + BlockAllocator *allocator = depot->allocators[zone]; + BlockAllocatorStatistics stats = getBlockAllocatorStatistics(allocator); + totals.slabCount += stats.slabCount; + totals.slabsOpened += stats.slabsOpened; + totals.slabsReopened += stats.slabsReopened; + } + + return totals; +} + +/**********************************************************************/ +RefCountsStatistics getDepotRefCountsStatistics(const SlabDepot *depot) +{ + RefCountsStatistics depotStats; + memset(&depotStats, 0, sizeof(depotStats)); + + for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { + BlockAllocator *allocator = depot->allocators[zone]; + RefCountsStatistics stats = getRefCountsStatistics(allocator); + depotStats.blocksWritten += stats.blocksWritten; + } + + return depotStats; +} + +/**********************************************************************/ +SlabJournalStatistics getDepotSlabJournalStatistics(const SlabDepot *depot) +{ + SlabJournalStatistics depotStats; + memset(&depotStats, 0, sizeof(depotStats)); + + for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { + BlockAllocator *allocator = depot->allocators[zone]; + SlabJournalStatistics stats = getSlabJournalStatistics(allocator); + depotStats.diskFullCount += stats.diskFullCount; + depotStats.flushCount += stats.flushCount; + depotStats.blockedCount += stats.blockedCount; + depotStats.blocksWritten += stats.blocksWritten; + depotStats.tailBusyCount += stats.tailBusyCount; + } + + return depotStats; +} + +/**********************************************************************/ +void dumpSlabDepot(const SlabDepot *depot) +{ + logInfo("Slab Depot"); + logInfo(" zoneCount=%u oldZoneCount=%u slabCount=%" PRIu32 + " activeReleaseRequest=%llu newReleaseRequest=%llu", + (unsigned int) depot->zoneCount, (unsigned int) depot->oldZoneCount, + depot->slabCount, depot->activeReleaseRequest, + depot->newReleaseRequest); +} diff --git a/source/vdo/base/slabDepot.h b/source/vdo/base/slabDepot.h new file mode 100644 index 0000000..b439470 --- /dev/null +++ b/source/vdo/base/slabDepot.h @@ -0,0 +1,515 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabDepot.h#12 $ + */ + +#ifndef SLAB_DEPOT_H +#define SLAB_DEPOT_H + +#include "buffer.h" + +#include "adminState.h" +#include "completion.h" +#include "fixedLayout.h" +#include "journalPoint.h" +#include "statistics.h" +#include "types.h" +#include "waitQueue.h" + +/** + * A SlabDepot is responsible for managing all of the slabs and block + * allocators of a VDO. It has a single array of slabs in order to eliminate + * the need for additional math in order to compute which physical zone a PBN + * is in. It also has a BlockAllocator per zone. + * + * Load operations are required to be performed on a single thread. Normal + * operations are assumed to be performed in the appropriate zone. Allocations + * and reference count updates must be done from the thread of their physical + * zone. Requests to commit slab journal tail blocks from the recovery journal + * must be done on the journal zone thread. Save operations are required to be + * launched from the same thread as the original load operation. + **/ + +typedef enum { + NORMAL_LOAD, + RECOVERY_LOAD, + REBUILD_LOAD +} SlabDepotLoadType; + +/** + * Calculate the number of slabs a depot would have. + * + * @param depot The depot + * + * @return The number of slabs + **/ +SlabCount calculateSlabCount(SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Create a slab depot. + * + * @param [in] blockCount The number of blocks initially available + * @param [in] firstBlock The number of the first block which may be + * allocated + * @param [in] slabConfig The slab configuration + * @param [in] threadConfig The thread configuration of the VDO + * @param [in] nonce The nonce of the VDO + * @param [in] vioPoolSize The size of the VIO pool + * @param [in] layer The physical layer below this depot + * @param [in] summaryPartition The partition which holds the slab summary + * @param [in] readOnlyNotifier The context for entering read-only mode + * @param [in] recoveryJournal The recovery journal of the VDO + * @param [out] depotPtr A pointer to hold the depot + * + * @return A success or error code + **/ +int makeSlabDepot(BlockCount blockCount, + PhysicalBlockNumber firstBlock, + SlabConfig slabConfig, + const ThreadConfig *threadConfig, + Nonce nonce, + BlockCount vioPoolSize, + PhysicalLayer *layer, + Partition *summaryPartition, + ReadOnlyNotifier *readOnlyNotifier, + RecoveryJournal *recoveryJournal, + SlabDepot **depotPtr) + __attribute__((warn_unused_result)); + +/** + * Destroy a slab depot and null out the reference to it. + * + * @param depotPtr The reference to the depot to destroy + **/ +void freeSlabDepot(SlabDepot **depotPtr); + +/** + * Get the size of the encoded state of a slab depot. 
+ * + * @return The encoded size of the depot's state + **/ +size_t getSlabDepotEncodedSize(void) + __attribute__((warn_unused_result)); + +/** + * Encode the state of a slab depot into a buffer. + * + * @param depot The depot to encode + * @param buffer The buffer to encode into + * + * @return UDS_SUCCESS or an error + **/ +int encodeSlabDepot(const SlabDepot *depot, Buffer *buffer) + __attribute__((warn_unused_result)); + +/** + * Decode the state of a slab depot saved in a buffer. + * + * @param [in] buffer The buffer containing the saved state + * @param [in] threadConfig The thread config of the VDO + * @param [in] nonce The nonce of the VDO + * @param [in] layer The physical layer below this depot + * @param [in] summaryPartition The partition which holds the slab summary + * @param [in] readOnlyNotifier The context for entering read-only mode + * @param [in] recoveryJournal The recovery journal of the VDO + * @param [out] depotPtr A pointer to hold the depot + * + * @return A success or error code + **/ +int decodeSodiumSlabDepot(Buffer *buffer, + const ThreadConfig *threadConfig, + Nonce nonce, + PhysicalLayer *layer, + Partition *summaryPartition, + ReadOnlyNotifier *readOnlyNotifier, + RecoveryJournal *recoveryJournal, + SlabDepot **depotPtr) + __attribute__((warn_unused_result)); + +/** + * Decode the state of a slab depot saved in a buffer. + * + * @param [in] buffer The buffer containing the saved state + * @param [in] threadConfig The thread config of the VDO + * @param [in] nonce The nonce of the VDO + * @param [in] layer The physical layer below this depot + * @param [in] summaryPartition The partition which holds the slab summary + * @param [in] readOnlyNotifier The context for entering read-only mode + * @param [in] recoveryJournal The recovery journal of the VDO + * @param [out] depotPtr A pointer to hold the depot + * + * @return A success or error code + **/ +int decodeSlabDepot(Buffer *buffer, + const ThreadConfig *threadConfig, + Nonce nonce, + PhysicalLayer *layer, + Partition *summaryPartition, + ReadOnlyNotifier *readOnlyNotifier, + RecoveryJournal *recoveryJournal, + SlabDepot **depotPtr) + __attribute__((warn_unused_result)); + +/** + * Allocate the RefCounts for all slabs in the depot. This method may be called + * only before entering normal operation from the load thread. + * + * @param depot The depot whose RefCounts need allocation + * + * @return VDO_SUCCESS or an error + **/ +int allocateSlabRefCounts(SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the block allocator for a specified physical zone from a depot. + * + * @param depot The depot + * @param zoneNumber The physical zone + * + * @return The block allocator for the specified zone + **/ +BlockAllocator *getBlockAllocatorForZone(SlabDepot *depot, + ZoneCount zoneNumber) + __attribute__((warn_unused_result)); + +/** + * Get the number of the slab that contains a specified block. + * + * @param depot The slab depot + * @param pbn The physical block number + * @param slabNumberPtr A pointer to hold the slab number + * + * @return VDO_SUCCESS or an error + **/ +int getSlabNumber(const SlabDepot *depot, + PhysicalBlockNumber pbn, + SlabCount *slabNumberPtr) + __attribute__((warn_unused_result)); + +/** + * Get the slab object for the slab that contains a specified block. Will put + * the VDO in read-only mode if the PBN is not a valid data block nor the zero + * block. 
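Together with getSlabDepotEncodedSize(), the encode and decode declarations above form the save/restore round trip for the depot's super block component. A sketch of that sequence; makeBuffer() and freeBuffer() are assumed to be the buffer.h helpers and are not shown in this patch:

    static int roundTripDepotState(const SlabDepot    *depot,
                                   const ThreadConfig *threadConfig,
                                   Nonce               nonce,
                                   PhysicalLayer      *layer,
                                   Partition          *summaryPartition,
                                   ReadOnlyNotifier   *readOnlyNotifier,
                                   RecoveryJournal    *recoveryJournal,
                                   SlabDepot         **newDepotPtr)
    {
      Buffer *buffer = NULL;
      int result = makeBuffer(getSlabDepotEncodedSize(), &buffer);
      if (result != UDS_SUCCESS) {
        return result;
      }

      result = encodeSlabDepot(depot, buffer);
      if (result == UDS_SUCCESS) {
        // Normally the encoded bytes travel through the super block on disk.
        result = decodeSlabDepot(buffer, threadConfig, nonce, layer,
                                 summaryPartition, readOnlyNotifier,
                                 recoveryJournal, newDepotPtr);
      }

      freeBuffer(&buffer);
      return result;
    }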
+ * + * @param depot The slab depot + * @param pbn The physical block number + * + * @return The slab containing the block, or NULL if the block number is the + * zero block or otherwise out of range + **/ +Slab *getSlab(const SlabDepot *depot, PhysicalBlockNumber pbn) + __attribute__((warn_unused_result)); + +/** + * Get the slab journal for the slab that contains a specified block. + * + * @param depot The slab depot + * @param pbn The physical block number within the block depot partition + * of any block in the slab + * + * @return The slab journal of the slab containing the block, or NULL if the + * block number is for the zero block or otherwise out of range + **/ +SlabJournal *getSlabJournal(const SlabDepot *depot, PhysicalBlockNumber pbn) + __attribute__((warn_unused_result)); + +/** + * Determine how many new references a block can acquire. This method must be + * called from the the physical zone thread of the PBN. + * + * @param depot The slab depot + * @param pbn The physical block number that is being queried + * + * @return the number of available references + **/ +uint8_t getIncrementLimit(SlabDepot *depot, PhysicalBlockNumber pbn) + __attribute__((warn_unused_result)); + +/** + * Determine whether the given PBN refers to a data block. + * + * @param depot The depot + * @param pbn The physical block number to ask about + * + * @return True if the PBN corresponds to a data block + **/ +bool isPhysicalDataBlock(const SlabDepot *depot, PhysicalBlockNumber pbn) + __attribute__((warn_unused_result)); + +/** + * Get the total number of data blocks allocated across all the slabs in the + * depot, which is the total number of blocks with a non-zero reference count. + * This may be called from any thread. + * + * @param depot The slab depot + * + * @return The total number of blocks with a non-zero reference count + **/ +BlockCount getDepotAllocatedBlocks(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the total of the statistics from all the block allocators in the depot. + * + * @param depot The slab depot + * + * @return The statistics from all block allocators in the depot + **/ +BlockAllocatorStatistics +getDepotBlockAllocatorStatistics(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the total number of data blocks in all the slabs in the depot. This may + * be called from any thread. + * + * @param depot The slab depot + * + * @return The total number of data blocks in all slabs + **/ +BlockCount getDepotDataBlocks(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the total number of free blocks remaining in all the slabs in the + * depot, which is the total number of blocks that have a zero reference + * count. This may be called from any thread. + * + * @param depot The slab depot + * + * @return The total number of blocks with a zero reference count + **/ +BlockCount getDepotFreeBlocks(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the total number of slabs in the depot + * + * @param depot The slab depot + * + * @return The total number of slabs + **/ +SlabCount getDepotSlabCount(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the total number of unrecovered slabs in the depot, which is the total + * number of unrecovered slabs from all zones. This may be called from any + * thread. 
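A short sketch of how these queries combine on the physical zone thread: validate a PBN, look up its slab, and read the depot-wide space counters. The local names and the specific error code are illustrative, not taken from this patch:

    if (!isPhysicalDataBlock(depot, pbn)) {
      return VDO_OUT_OF_RANGE;  // assumed error code, for illustration
    }

    Slab *slab = getSlab(depot, pbn);  // NULL for the zero block or bad PBNs
    if ((slab != NULL) && (getIncrementLimit(depot, pbn) == 0)) {
      // The block cannot take any more references.
    }

    BlockCount allocated = getDepotAllocatedBlocks(depot);
    BlockCount free      = getDepotFreeBlocks(depot);
    BlockCount data      = getDepotDataBlocks(depot);  // allocated + free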
+ * + * @param depot The slab depot + * + * @return The total number of slabs that are unrecovered + **/ +SlabCount getDepotUnrecoveredSlabCount(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the aggregated slab journal statistics for the depot. + * + * @param depot The slab depot + * + * @return The aggregated statistics for all slab journals in the depot + **/ +SlabJournalStatistics getDepotSlabJournalStatistics(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the cumulative RefCounts statistics for the depot. + * + * @param depot The slab depot + * + * @return The cumulative statistics for all RefCounts in the depot + **/ +RefCountsStatistics getDepotRefCountsStatistics(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Asynchronously load any slab depot state that isn't included in the + * SuperBlock component. This method may be called only before entering normal + * operation from the load thread. + * + * @param depot The depot to load + * @param operation The type of load to perform + * @param parent The completion to finish when the load is complete + * @param context Additional context for the load operation; may be NULL + **/ +void loadSlabDepot(SlabDepot *depot, + AdminStateCode operation, + VDOCompletion *parent, + void *context); + +/** + * Prepare the slab depot to come online and start allocating blocks. This + * method may be called only before entering normal operation from the load + * thread. It must be called before allocation may proceed. + * + * @param depot The depot to prepare + * @param loadType The load type + * @param parent The completion to finish when the operation is complete + **/ +void prepareToAllocate(SlabDepot *depot, + SlabDepotLoadType loadType, + VDOCompletion *parent); + +/** + * Update the slab depot to reflect its new size in memory. This size is saved + * to disk as part of the super block. + * + * @param depot The depot to update + **/ +void updateSlabDepotSize(SlabDepot *depot); + +/** + * Allocate new memory needed for a resize of a slab depot to the given size. + * + * @param depot The depot to prepare to resize + * @param newSize The number of blocks in the new depot + * + * @return VDO_SUCCESS or an error + **/ +int prepareToGrowSlabDepot(SlabDepot *depot, BlockCount newSize) + __attribute__((warn_unused_result)); + +/** + * Use the new slabs allocated for resize. + * + * @param depot The depot + * @param parent The object to notify when complete + **/ +void useNewSlabs(SlabDepot *depot, VDOCompletion *parent); + +/** + * Abandon any new slabs in this depot, freeing them as needed. + * + * @param depot The depot + **/ +void abandonNewSlabs(SlabDepot *depot); + +/** + * Drain all slab depot I/O. If saving, or flushing, all dirty depot metadata + * will be written out. If saving or suspending, the depot will be left in a + * suspended state. + * + * @param depot The depot to drain + * @param operation The drain operation (flush, rebuild, suspend, or save) + * @param parent The completion to finish when the drain is complete + **/ +void drainSlabDepot(SlabDepot *depot, + AdminStateCode operation, + VDOCompletion *parent); + +/** + * Resume a suspended slab depot. + * + * @param depot The depot to resume + * @param parent The completion to finish when the depot has resumed + **/ +void resumeSlabDepot(SlabDepot *depot, VDOCompletion *parent); + +/** + * Commit all dirty tail blocks which are locking a given recovery journal + * block. 
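The resize declarations above pair up as prepare / commit / abort. A sketch of the grow-physical ordering as a caller might drive it; the completion plumbing and the threading rules from the header comment are omitted, and the exact placement of the size update is an assumption:

    // 1. Allocate the larger slab array and the new slabs up front.
    int result = prepareToGrowSlabDepot(depot, newSize);
    if (result != VDO_SUCCESS) {
      return result;
    }

    // 2a. If the grow goes ahead: record the new size and hand the
    //     pre-allocated slabs to the allocators (asynchronous).
    updateSlabDepotSize(depot);
    useNewSlabs(depot, parent);

    // 2b. Or, if the grow is abandoned: free whatever was pre-allocated.
    abandonNewSlabs(depot);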
This method must be called from the journal zone thread. + * + * @param depot The depot + * @param recoveryBlockNumber The sequence number of the recovery journal + * block whose locks should be released + **/ +void commitOldestSlabJournalTailBlocks(SlabDepot *depot, + SequenceNumber recoveryBlockNumber); + +/** + * Get the SlabConfig of a depot. + * + * @param depot The slab depot + * + * @return The slab configuration of the specified depot + **/ +const SlabConfig *getSlabConfig(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the slab summary. + * + * @param depot The slab depot + * + * @return The slab summary + **/ +SlabSummary *getSlabSummary(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Get the portion of the slab summary for a given physical zone. + * + * @param depot The slab depot + * @param zone The zone + * + * @return The portion of the slab summary for the specified zone + **/ +SlabSummaryZone *getSlabSummaryForZone(const SlabDepot *depot, ZoneCount zone) + __attribute__((warn_unused_result)); + +/** + * Scrub all unrecovered slabs. + * + * @param depot The depot to scrub + * @param parent The object to notify when scrubbing is complete + * @param callback The function to call when scrubbing is complete + * @param errorHandler The handler for scrubbing errors + * @param threadID The thread on which to run the callback + * @param launchParent The object to notify when scrubbing has been launched + * for all zones + **/ +void scrubAllUnrecoveredSlabs(SlabDepot *depot, + void *parent, + VDOAction *callback, + VDOAction *errorHandler, + ThreadID threadID, + VDOCompletion *launchParent); + +/** + * Check whether there are outstanding unrecovered slabs. + * + * @param depot The slab depot + * + * @return Whether there are outstanding unrecovered slabs + **/ +bool hasUnrecoveredSlabs(SlabDepot *depot); + +/** + * Get the physical size to which this depot is prepared to grow. + * + * @param depot The slab depot + * + * @return The new number of blocks the depot will be grown to, or 0 if the + * depot is not prepared to grow + **/ +BlockCount getNewDepotSize(const SlabDepot *depot) + __attribute__((warn_unused_result)); + +/** + * Dump the slab depot, in a thread-unsafe fashion. + * + * @param depot The slab depot + **/ +void dumpSlabDepot(const SlabDepot *depot); + +#endif // SLAB_DEPOT_H diff --git a/source/vdo/base/slabDepotInternals.h b/source/vdo/base/slabDepotInternals.h new file mode 100644 index 0000000..7dfe57b --- /dev/null +++ b/source/vdo/base/slabDepotInternals.h @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabDepotInternals.h#13 $ + */ + +#ifndef SLAB_DEPOT_INTERNALS_H +#define SLAB_DEPOT_INTERNALS_H + +#include "slabDepot.h" + +#include "atomic.h" + +#include "actionManager.h" + +struct slabDepot { + ZoneCount zoneCount; + ZoneCount oldZoneCount; + SlabConfig slabConfig; + SlabSummary *slabSummary; + ReadOnlyNotifier *readOnlyNotifier; + ActionManager *actionManager; + + PhysicalBlockNumber firstBlock; + PhysicalBlockNumber lastBlock; + PhysicalBlockNumber origin; + + /** slabSize == (1 << slabSizeShift) */ + unsigned int slabSizeShift; + + /** Determines how slabs should be queued during load */ + SlabDepotLoadType loadType; + + /** The state for notifying slab journals to release recovery journal */ + SequenceNumber activeReleaseRequest; + SequenceNumber newReleaseRequest; + + /** The completion for scrubbing */ + VDOCompletion scrubbingCompletion; + Atomic32 zonesToScrub; + + /** Cached journal pointer for slab creation */ + RecoveryJournal *journal; + + /** Array of pointers to individually allocated slabs */ + Slab **slabs; + /** The number of slabs currently allocated and stored in 'slabs' */ + SlabCount slabCount; + + /** Array of pointers to a larger set of slabs (used during resize) */ + Slab **newSlabs; + /** The number of slabs currently allocated and stored in 'newSlabs' */ + SlabCount newSlabCount; + /** The size that 'newSlabs' was allocated for */ + BlockCount newSize; + + /** The last block before resize, for rollback */ + PhysicalBlockNumber oldLastBlock; + /** The last block after resize, for resize */ + PhysicalBlockNumber newLastBlock; + + /** The block allocators for this depot */ + BlockAllocator *allocators[]; +}; + +/** + * Destroy a slab. + * + * @param slab The slab to destroy + **/ +void destroySlab(Slab *slab); + +/** + * Inform a slab's depot that the slab has been created. + * + * @param slab The slab to register + **/ +void registerSlabWithDepot(Slab *slab); + +/** + * Notify a slab depot that one of its allocators has finished scrubbing slabs. + * This method should only be called if the scrubbing was successful. This + * callback is registered by each block allocator in + * scrubAllUnrecoveredSlabsInZone(). + * + * @param completion A completion whose parent must be a slab depot + **/ +void notifyZoneFinishedScrubbing(VDOCompletion *completion); + +/** + * Check whether two depots are equivalent (i.e. represent the same + * state and have the same reference counter). This method is used for unit + * testing. + * + * @param depotA The first depot to compare + * @param depotB The second depot to compare + * + * @return true if the two depots are equivalent + **/ +bool areEquivalentDepots(SlabDepot *depotA, SlabDepot *depotB) + __attribute__((warn_unused_result)); + +/** + * Start allocating from the highest numbered slab in each zone. + * + * @param depot The depot + **/ +void allocateFromLastSlab(SlabDepot *depot); + +#endif /* SLAB_DEPOT_INTERNALS_H */ diff --git a/source/vdo/base/slabIterator.h b/source/vdo/base/slabIterator.h new file mode 100644 index 0000000..e977c2d --- /dev/null +++ b/source/vdo/base/slabIterator.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
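Because the depot keeps one flat slabs[] array and a power-of-two slab size (slabSize == 1 << slabSizeShift), mapping a physical block number to its slab needs only a subtraction and a shift. A sketch of the arithmetic these fields are laid out for, shown as an illustration rather than a copy of getSlabNumber():

    // Assumes pbn has already been checked against firstBlock and lastBlock.
    SlabCount slabNumber = (pbn - depot->firstBlock) >> depot->slabSizeShift;
    Slab     *slab       = depot->slabs[slabNumber];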
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabIterator.h#1 $ + */ + +#ifndef SLAB_ITERATOR_H +#define SLAB_ITERATOR_H + +#include "slab.h" +#include "types.h" + +/** + * SlabIterator is a structure for iterating over a set of slabs. + **/ +typedef struct { + Slab **slabs; + Slab *next; + SlabCount end; + SlabCount stride; +} SlabIterator; + +/** + * Return a SlabIterator initialized to iterate over an array of slabs + * with a given stride. Iteration always occurs from higher to lower numbered + * slabs. + * + * @param slabs The array of slabs + * @param start The number of the slab to start iterating from + * @param end The number of the last slab which may be returned + * @param stride The difference in slab number between successive slabs + * + * @return an initialized iterator structure + **/ +static inline SlabIterator iterateSlabs(Slab **slabs, + SlabCount start, + SlabCount end, + SlabCount stride) +{ + return (SlabIterator) { + .slabs = slabs, + .next = (((slabs == NULL) || (start < end)) ? NULL : slabs[start]), + .end = end, + .stride = stride, + }; +} + +/** + * Check whether another Slab would be returned by the iterator. + * + * @param iterator The iterator to poll + * + * @return true if the next call to nextSlab + * will return a Slab + **/ +static inline bool hasNextSlab(const SlabIterator *iterator) +{ + return (iterator->next != NULL); +} + +/** + * Get the next Slab, advancing the iterator. + * + * @param iterator The iterator over the Slab chain + * + * @return the next Slab or NULL if the array of slabs is empty + * or if all the appropriate Slabs have been returned + **/ +static inline Slab *nextSlab(SlabIterator *iterator) +{ + Slab *slab = iterator->next; + if ((slab == NULL) + || (slab->slabNumber < iterator->end + iterator->stride)) { + iterator->next = NULL; + } else { + iterator->next = iterator->slabs[slab->slabNumber - iterator->stride]; + } + return slab; +} + +#endif // SLAB_ITERATOR_H diff --git a/source/vdo/base/slabJournal.c b/source/vdo/base/slabJournal.c new file mode 100644 index 0000000..1895f80 --- /dev/null +++ b/source/vdo/base/slabJournal.c @@ -0,0 +1,1321 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
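A usage sketch for the iterator defined above: visit every slab of a depot from the highest-numbered slab down to slab 0 with a stride of 1. An allocator that owns every Nth slab could pass a larger stride instead; the depot fields used here come from slabDepotInternals.h, and the sketch assumes the depot has at least one slab:

    #include "slabIterator.h"

    static void visitAllSlabs(SlabDepot *depot)
    {
      SlabIterator iterator
        = iterateSlabs(depot->slabs, depot->slabCount - 1, 0, 1);
      while (hasNextSlab(&iterator)) {
        Slab *slab = nextSlab(&iterator);
        // ... operate on slab; the highest slab numbers are seen first ...
      }
    }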
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabJournal.c#18 $ + */ + +#include "slabJournalInternals.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "stringUtils.h" + +#include "adminState.h" +#include "blockAllocatorInternals.h" +#include "dataVIO.h" +#include "recoveryJournal.h" +#include "refCounts.h" +#include "slabDepot.h" +#include "slabSummary.h" + +/** + * Return the slab journal from the resource waiter. + * + * @param waiter The waiter + * + * @return The slab journal + **/ +__attribute__((warn_unused_result)) +static inline SlabJournal *slabJournalFromResourceWaiter(Waiter *waiter) +{ + STATIC_ASSERT(offsetof(SlabJournal, resourceWaiter) == 0); + return (SlabJournal *) waiter; +} + +/** + * Return the slab journal from the flush waiter. + * + * @param waiter The waiter + * + * @return The slab journal + **/ +__attribute__((warn_unused_result)) +static inline SlabJournal *slabJournalFromFlushWaiter(Waiter *waiter) +{ + if (waiter == NULL) { + return NULL; + } + return (SlabJournal *) + ((uintptr_t) waiter - offsetof(SlabJournal, flushWaiter)); +} + +/**********************************************************************/ +SlabJournal *slabJournalFromDirtyNode(RingNode *node) +{ + if (node == NULL) { + return NULL; + } + return (SlabJournal *) ((uintptr_t) node - offsetof(SlabJournal, dirtyNode)); +} + +/** + * Return the slab journal from the slab summary waiter. + * + * @param waiter The waiter + * + * @return The slab journal + **/ +__attribute__((warn_unused_result)) +static inline SlabJournal *slabJournalFromSlabSummaryWaiter(Waiter *waiter) +{ + if (waiter == NULL) { + return NULL; + } + return (SlabJournal *) + ((uintptr_t) waiter - offsetof(SlabJournal, slabSummaryWaiter)); +} + +/** + * Get the physical block number for a given sequence number. + * + * @param journal The journal + * @param sequence The sequence number of the desired block + * + * @return the block number corresponding to the sequence number + **/ +__attribute__((warn_unused_result)) +static inline PhysicalBlockNumber getBlockNumber(SlabJournal *journal, + SequenceNumber sequence) +{ + TailBlockOffset offset = getSlabJournalBlockOffset(journal, sequence); + return (journal->slab->journalOrigin + offset); +} + +/** + * Get the lock object for a slab journal block by sequence number. + * + * @param journal Slab journal to retrieve from + * @param sequenceNumber Sequence number of the block + * + * @return the lock object for the given sequence number + **/ +__attribute__((warn_unused_result)) +static inline JournalLock *getLock(SlabJournal *journal, + SequenceNumber sequenceNumber) +{ + TailBlockOffset offset = getSlabJournalBlockOffset(journal, sequenceNumber); + return &journal->locks[offset]; +} + +/** + * Check whether the VDO is in read-only mode. + * + * @param journal The journal whose owning VDO should be checked + * + * @return true if the VDO is in read-only mode + **/ +__attribute__((warn_unused_result)) +static inline bool isVDOReadOnly(SlabJournal *journal) +{ + return isReadOnly(journal->slab->allocator->readOnlyNotifier); +} + +/** + * Check whether there are entry waiters which should delay a flush. 
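The slabJournalFrom*Waiter() helpers above all recover the enclosing SlabJournal from a pointer to one of its embedded Waiter fields, using the same offsetof() trick as the kernel's container_of(). The generic shape, shown for illustration only:

    #include <stddef.h>   // offsetof
    #include <stdint.h>   // uintptr_t

    #define CONTAINER_OF(ptr, type, member) \
      ((type *) ((uintptr_t) (ptr) - offsetof(type, member)))

    // slabJournalFromFlushWaiter(waiter) is effectively
    // CONTAINER_OF(waiter, SlabJournal, flushWaiter) plus a NULL check;
    // slabJournalFromResourceWaiter() can cast directly because the
    // STATIC_ASSERT guarantees resourceWaiter sits at offset zero.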
+ * + * @param journal The journal to check + * + * @return true if there are no entry waiters, or if the slab + * is unrecovered + **/ +__attribute__((warn_unused_result)) +static inline bool mustMakeEntriesToFlush(SlabJournal *journal) +{ + return (!slabIsRebuilding(journal->slab) + && hasWaiters(&journal->entryWaiters)); +} + +/** + * Check whether a reap is currently in progress. + * + * @param journal The journal which may be reaping + * + * @return true if the journal is reaping + **/ +__attribute__((warn_unused_result)) +static inline bool isReaping(SlabJournal *journal) +{ + return (journal->head != journal->unreapable); +} + +/**********************************************************************/ +bool isSlabJournalActive(SlabJournal *journal) +{ + return (mustMakeEntriesToFlush(journal) + || isReaping(journal) + || journal->waitingToCommit + || !isRingEmpty(&journal->uncommittedBlocks) + || journal->updatingSlabSummary); +} + +/** + * Initialize tail block as a new block. + * + * @param journal The journal whose tail block is being initialized + **/ +static void initializeTailBlock(SlabJournal *journal) +{ + SlabJournalBlockHeader *header = &journal->tailHeader; + header->sequenceNumber = journal->tail; + header->entryCount = 0; + header->hasBlockMapIncrements = false; +} + +/** + * Set all journal fields appropriately to start journaling. + * + * @param journal The journal to be reset, based on its tail sequence number + **/ +static void initializeJournalState(SlabJournal *journal) +{ + journal->unreapable = journal->head; + journal->reapLock = getLock(journal, journal->unreapable); + journal->nextCommit = journal->tail; + journal->summarized = journal->lastSummarized = journal->tail; + initializeTailBlock(journal); +} + +/** + * Check whether a journal block is full. + * + * @param journal The slab journal for the block + * + * @return true if the tail block is full + **/ +__attribute__((warn_unused_result)) +static bool blockIsFull(SlabJournal *journal) +{ + JournalEntryCount count = journal->tailHeader.entryCount; + return (journal->tailHeader.hasBlockMapIncrements + ? 
(journal->fullEntriesPerBlock == count) + : (journal->entriesPerBlock == count)); +} + +/**********************************************************************/ +static void addEntries(SlabJournal *journal); +static void updateTailBlockLocation(SlabJournal *journal); +static void releaseJournalLocks(Waiter *waiter, void *context); + +/**********************************************************************/ +int makeSlabJournal(BlockAllocator *allocator, + Slab *slab, + RecoveryJournal *recoveryJournal, + SlabJournal **journalPtr) +{ + SlabJournal *journal; + const SlabConfig *slabConfig = getSlabConfig(allocator->depot); + int result = ALLOCATE_EXTENDED(SlabJournal, slabConfig->slabJournalBlocks, + JournalLock, __func__, &journal); + if (result != VDO_SUCCESS) { + return result; + } + + journal->slab = slab; + journal->size = slabConfig->slabJournalBlocks; + journal->flushingThreshold = slabConfig->slabJournalFlushingThreshold; + journal->blockingThreshold = slabConfig->slabJournalBlockingThreshold; + journal->scrubbingThreshold = slabConfig->slabJournalScrubbingThreshold; + journal->entriesPerBlock = SLAB_JOURNAL_ENTRIES_PER_BLOCK; + journal->fullEntriesPerBlock = SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK; + journal->events = &allocator->slabJournalStatistics; + journal->recoveryJournal = recoveryJournal; + journal->summary = getSlabSummaryZone(allocator); + journal->tail = 1; + journal->head = 1; + + journal->flushingDeadline = journal->flushingThreshold; + // Set there to be some time between the deadline and the blocking threshold, + // so that hopefully all are done before blocking. + if ((journal->blockingThreshold - journal->flushingThreshold) > 5) { + journal->flushingDeadline = journal->blockingThreshold - 5; + } + + journal->slabSummaryWaiter.callback = releaseJournalLocks; + + result = ALLOCATE(VDO_BLOCK_SIZE, char, "PackedSlabJournalBlock", + (char **) &journal->block); + if (result != VDO_SUCCESS) { + freeSlabJournal(&journal); + return result; + } + + initializeRing(&journal->dirtyNode); + initializeRing(&journal->uncommittedBlocks); + + journal->tailHeader.nonce = slab->allocator->nonce; + journal->tailHeader.metadataType = VDO_METADATA_SLAB_JOURNAL; + initializeJournalState(journal); + + *journalPtr = journal; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeSlabJournal(SlabJournal **journalPtr) +{ + SlabJournal *journal = *journalPtr; + if (journal == NULL) { + return; + } + + FREE(journal->block); + FREE(journal); + *journalPtr = NULL; +} + +/**********************************************************************/ +bool isSlabJournalBlank(const SlabJournal *journal) +{ + return ((journal != NULL) + && (journal->tail == 1) + && (journal->tailHeader.entryCount == 0)); +} + +/**********************************************************************/ +bool isSlabJournalDirty(const SlabJournal *journal) +{ + return (journal->recoveryLock != 0); +} + +/** + * Put a slab journal on the dirty ring of its allocator in the correct order. 
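The constructor above arranges the thresholds so that reference-block writeback starts well before entries would block: flushingThreshold <= flushingDeadline <= blockingThreshold, with the deadline pulled back to five blocks before the blocking point whenever there is room. A worked example with hypothetical SlabConfig values (the real values are not part of this hunk):

    // slabJournalFlushingThreshold = 60, slabJournalBlockingThreshold = 70.
    // Since 70 - 60 > 5, the constructor sets flushingDeadline = 70 - 5 = 65.
    //
    // Flushing of reference blocks is requested once the journal holds 60
    // blocks of entries, becomes more aggressive as the length approaches 65,
    // and new entries are blocked once the length reaches 70.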
+ * + * @param journal The journal to be marked dirty + * @param lock The recovery journal lock held by the slab journal + **/ +static void markSlabJournalDirty(SlabJournal *journal, SequenceNumber lock) +{ + ASSERT_LOG_ONLY(!isSlabJournalDirty(journal), "slab journal was clean"); + + journal->recoveryLock = lock; + RingNode *dirtyRing = &journal->slab->allocator->dirtySlabJournals; + RingNode *node = dirtyRing->prev; + while (node != dirtyRing) { + SlabJournal *dirtyJournal = slabJournalFromDirtyNode(node); + if (dirtyJournal->recoveryLock <= journal->recoveryLock) { + break; + } + + node = node->prev; + } + + pushRingNode(node->next, &journal->dirtyNode); +} + +/**********************************************************************/ +static void markSlabJournalClean(SlabJournal *journal) +{ + journal->recoveryLock = 0; + unspliceRingNode(&journal->dirtyNode); +} + +/** + * Implements WaiterCallback. This callback is invoked on all VIOs waiting + * to make slab journal entries after the VDO has gone into read-only mode. + **/ +static void abortWaiter(Waiter *waiter, + void *context __attribute__((unused))) +{ + continueDataVIO(waiterAsDataVIO(waiter), VDO_READ_ONLY); +} + +/**********************************************************************/ +void abortSlabJournalWaiters(SlabJournal *journal) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() + == journal->slab->allocator->threadID), + "abortSlabJournalWaiters() called on correct thread"); + notifyAllWaiters(&journal->entryWaiters, abortWaiter, journal); + checkIfSlabDrained(journal->slab); +} + +/** + * Put the journal in read-only mode. All attempts to add entries after + * this function is called will fail. All VIOs waiting for to make entries + * will be awakened with an error. All flushes will complete as soon as all + * pending IO is done. + * + * @param journal The journal which has failed + * @param errorCode The error result triggering this call + **/ +static void enterJournalReadOnlyMode(SlabJournal *journal, int errorCode) +{ + enterReadOnlyMode(journal->slab->allocator->readOnlyNotifier, errorCode); + abortSlabJournalWaiters(journal); +} + +/** + * Actually advance the head of the journal now that any necessary flushes + * are complete. + * + * @param journal The journal to be reaped + **/ +static void finishReaping(SlabJournal *journal) +{ + journal->head = journal->unreapable; + addEntries(journal); + checkIfSlabDrained(journal->slab); +} + +/**********************************************************************/ +static void reapSlabJournal(SlabJournal *journal); + +/** + * Finish reaping now that we have flushed the lower layer and then try + * reaping again in case we deferred reaping due to an outstanding VIO. + * + * @param completion The flush VIO + **/ +static void completeReaping(VDOCompletion *completion) +{ + VIOPoolEntry *entry = completion->parent; + SlabJournal *journal = entry->parent; + returnVIO(journal->slab->allocator, entry); + finishReaping(journal); + reapSlabJournal(journal); +} + +/** + * Handle an error flushing the lower layer. + * + * @param completion The flush VIO + **/ +static void handleFlushError(VDOCompletion *completion) +{ + SlabJournal *journal = ((VIOPoolEntry *) completion->parent)->parent; + enterJournalReadOnlyMode(journal, completion->result); + completeReaping(completion); +} + +/** + * Waiter callback for getting a VIO with which to flush the lower layer prior + * to reaping. 
+ * + * @param waiter The journal as a flush waiter + * @param vioContext The newly acquired flush VIO + **/ +static void flushForReaping(Waiter *waiter, void *vioContext) +{ + SlabJournal *journal = slabJournalFromFlushWaiter(waiter); + VIOPoolEntry *entry = vioContext; + VIO *vio = entry->vio; + + entry->parent = journal; + vio->completion.callbackThreadID = journal->slab->allocator->threadID; + launchFlush(vio, completeReaping, handleFlushError); +} + +/** + * Conduct a reap on a slab journal to reclaim unreferenced blocks. + * + * @param journal The slab journal + **/ +static void reapSlabJournal(SlabJournal *journal) +{ + if (isReaping(journal)) { + // We already have a reap in progress so wait for it to finish. + return; + } + + if (isUnrecoveredSlab(journal->slab) || !isNormal(&journal->slab->state) + || isVDOReadOnly(journal)) { + // We must not reap in the first two cases, and there's no point in + // read-only mode. + return; + } + + /* + * Start reclaiming blocks only when the journal head has no references. Then + * stop when a block is referenced or reap reaches the most recently written + * block, referenced by the slab summary, which has the sequence number just + * before the tail. + */ + bool reaped = false; + while ((journal->unreapable < journal->tail) + && (journal->reapLock->count == 0)) { + reaped = true; + journal->unreapable++; + journal->reapLock++; + if (journal->reapLock == &journal->locks[journal->size]) { + journal->reapLock = &journal->locks[0]; + } + } + + if (!reaped) { + return; + } + + PhysicalLayer *layer = journal->slab->allocator->completion.layer; + if (layer->getWritePolicy(layer) == WRITE_POLICY_SYNC) { + finishReaping(journal); + return; + } + + /* + * In async mode, it is never safe to reap a slab journal block without first + * issuing a flush, regardless of whether a user flush has been received or + * not. In the absence of the flush, the reference block write which released + * the locks allowing the slab journal to reap may not be persisted. Although + * slab summary writes will eventually issue flushes, multiple slab journal + * block writes can be issued while previous slab summary updates have not + * yet been made. Even though those slab journal block writes will be ignored + * if the slab summary update is not persisted, they may still overwrite the + * to-be-reaped slab journal block resulting in a loss of reference count + * updates (VDO-2912). + * + * In sync mode, it is similarly unsafe. However, we cannot possibly make + * those additional slab journal block writes due to the blocking threshold + * and the recovery journal's flush policy of flushing before every block. + * We may make no more than (number of VIOs) entries in slab journals since + * the last recovery journal flush; thus, due to the size of the slab + * journal blocks, the RJ must have flushed the storage no more than one + * slab journal block ago. So we could only overwrite the to-be-reaped block + * if we wrote and flushed the last block in the journal. But the blocking + * threshold prevents that. + */ + journal->flushWaiter.callback = flushForReaping; + int result = acquireVIO(journal->slab->allocator, &journal->flushWaiter); + if (result != VDO_SUCCESS) { + enterJournalReadOnlyMode(journal, result); + return; + } +} + +/** + * This is the callback invoked after a slab summary update completes. It + * is registered in the constructor on behalf of updateTailBlockLocation(). + * + * Implements WaiterCallback. 
+ * + * @param waiter The slab summary waiter that has just been notified + * @param context The result code of the update + **/ +static void releaseJournalLocks(Waiter *waiter, void *context) +{ + SlabJournal *journal = slabJournalFromSlabSummaryWaiter(waiter); + int result = *((int *) context); + if (result != VDO_SUCCESS) { + if (result != VDO_READ_ONLY) { + // Don't bother logging what might be lots of errors if we are already + // in read-only mode. + logErrorWithStringError(result, "failed slab summary update %llu", + journal->summarized); + } + + journal->updatingSlabSummary = false; + enterJournalReadOnlyMode(journal, result); + return; + } + + if (journal->partialWriteInProgress + && (journal->summarized == journal->tail)) { + journal->partialWriteInProgress = false; + addEntries(journal); + } + + SequenceNumber first = journal->lastSummarized; + journal->lastSummarized = journal->summarized; + for (SequenceNumber i = journal->summarized - 1; i >= first; i--) { + // Release the lock the summarized block held on the recovery journal. + // (During replay, recoveryStart will always be 0.) + if (journal->recoveryJournal != NULL) { + ZoneCount zoneNumber = journal->slab->allocator->zoneNumber; + releaseRecoveryJournalBlockReference(journal->recoveryJournal, + getLock(journal, i)->recoveryStart, + ZONE_TYPE_PHYSICAL, + zoneNumber); + + } + + // Release our own lock against reaping for blocks that are committed. + // (This function will not change locks during replay.) + adjustSlabJournalBlockReference(journal, i, -1); + } + + journal->updatingSlabSummary = false; + + reapSlabJournal(journal); + + // Check if the slab summary needs to be updated again. + updateTailBlockLocation(journal); +} + +/** + * Update the tail block location in the slab summary, if necessary. + * + * @param journal The slab journal that is updating its tail block location + **/ +static void updateTailBlockLocation(SlabJournal *journal) +{ + if (journal->updatingSlabSummary || isVDOReadOnly(journal) + || (journal->lastSummarized >= journal->nextCommit)) { + checkIfSlabDrained(journal->slab); + return; + } + + BlockCount freeBlockCount; + if (isUnrecoveredSlab(journal->slab)) { + freeBlockCount = getSummarizedFreeBlockCount(journal->summary, + journal->slab->slabNumber); + } else { + freeBlockCount = getSlabFreeBlockCount(journal->slab); + } + + journal->summarized = journal->nextCommit; + journal->updatingSlabSummary = true; + + /* + * Update slab summary as dirty. + * Slab journal can only reap past sequence number 1 when all the refCounts + * for this slab have been written to the layer. Therefore, indicate that the + * refCounts must be loaded when the journal head has reaped past sequence + * number 1. + */ + TailBlockOffset blockOffset + = getSlabJournalBlockOffset(journal, journal->summarized); + updateSlabSummaryEntry(journal->summary, &journal->slabSummaryWaiter, + journal->slab->slabNumber, blockOffset, + (journal->head > 1), false, freeBlockCount); +} + +/**********************************************************************/ +void reopenSlabJournal(SlabJournal *journal) +{ + ASSERT_LOG_ONLY(journal->tailHeader.entryCount == 0, + "Slab journal's active block empty before reopening"); + journal->head = journal->tail; + initializeJournalState(journal); + + // Ensure no locks are spuriously held on an empty journal. 
+ for (SequenceNumber block = 1; block <= journal->size; block++) { + ASSERT_LOG_ONLY((getLock(journal, block)->count == 0), + "Scrubbed journal's block %llu is not locked", + block); + } + + addEntries(journal); +} + +/**********************************************************************/ +static SequenceNumber getCommittingSequenceNumber(const VIOPoolEntry *entry) +{ + const PackedSlabJournalBlock *block = entry->buffer; + return getUInt64LE(block->header.fields.sequenceNumber); +} + +/** + * Handle post-commit processing. This is the callback registered by + * writeSlabJournalBlock(). + * + * @param completion The write VIO as a completion + **/ +static void completeWrite(VDOCompletion *completion) +{ + int writeResult = completion->result; + VIOPoolEntry *entry = completion->parent; + SlabJournal *journal = entry->parent; + + SequenceNumber committed = getCommittingSequenceNumber(entry); + unspliceRingNode(&entry->node); + returnVIO(journal->slab->allocator, entry); + + if (writeResult != VDO_SUCCESS) { + logErrorWithStringError(writeResult, + "cannot write slab journal block %llu", + committed); + enterJournalReadOnlyMode(journal, writeResult); + return; + } + + relaxedAdd64(&journal->events->blocksWritten, 1); + + if (isRingEmpty(&journal->uncommittedBlocks)) { + // If no blocks are outstanding, then the commit point is at the tail. + journal->nextCommit = journal->tail; + } else { + // The commit point is always the beginning of the oldest incomplete block. + VIOPoolEntry *oldest = asVIOPoolEntry(journal->uncommittedBlocks.next); + journal->nextCommit = getCommittingSequenceNumber(oldest); + } + + updateTailBlockLocation(journal); +} + +/** + * Callback from acquireVIO() registered in commitSlabJournalTail(). + * + * @param waiter The VIO pool waiter which was just notified + * @param vioContext The VIO pool entry for the write + **/ +static void writeSlabJournalBlock(Waiter *waiter, void *vioContext) +{ + SlabJournal *journal = slabJournalFromResourceWaiter(waiter); + VIOPoolEntry *entry = vioContext; + SlabJournalBlockHeader *header = &journal->tailHeader; + + header->head = journal->head; + pushRingNode(&journal->uncommittedBlocks, &entry->node); + packSlabJournalBlockHeader(header, &journal->block->header); + + // Copy the tail block into the VIO. + memcpy(entry->buffer, journal->block, VDO_BLOCK_SIZE); + + int unusedEntries = journal->entriesPerBlock - header->entryCount; + ASSERT_LOG_ONLY(unusedEntries >= 0, "Slab journal block is not overfull"); + if (unusedEntries > 0) { + // Release the per-entry locks for any unused entries in the block we are + // about to write. + adjustSlabJournalBlockReference(journal, header->sequenceNumber, + -unusedEntries); + journal->partialWriteInProgress = !blockIsFull(journal); + } + + PhysicalBlockNumber blockNumber + = getBlockNumber(journal, header->sequenceNumber); + + entry->parent = journal; + entry->vio->completion.callbackThreadID = journal->slab->allocator->threadID; + /* + * This block won't be read in recovery until the slab summary is updated + * to refer to it. The slab summary update does a flush which is sufficient + * to protect us from VDO-2331. + */ + launchWriteMetadataVIO(entry->vio, blockNumber, completeWrite, + completeWrite); + + // Since the write is submitted, the tail block structure can be reused. 
+ journal->tail++; + initializeTailBlock(journal); + journal->waitingToCommit = false; + if (journal->slab->state.state == ADMIN_STATE_WAITING_FOR_RECOVERY) { + finishOperationWithResult(&journal->slab->state, + (isVDOReadOnly(journal) + ? VDO_READ_ONLY : VDO_SUCCESS)); + return; + } + + addEntries(journal); +} + +/**********************************************************************/ +void commitSlabJournalTail(SlabJournal *journal) +{ + if ((journal->tailHeader.entryCount == 0) + && mustMakeEntriesToFlush(journal)) { + // There are no entries at the moment, but there are some waiters, so defer + // initiating the flush until those entries are ready to write. + return; + } + + if (isVDOReadOnly(journal) + || journal->waitingToCommit + || (journal->tailHeader.entryCount == 0)) { + // There is nothing to do since the tail block is empty, or writing, or + // the journal is in read-only mode. + return; + } + + /* + * Since we are about to commit the tail block, this journal no longer + * needs to be on the ring of journals which the recovery journal might + * ask to commit. + */ + markSlabJournalClean(journal); + + journal->waitingToCommit = true; + + journal->resourceWaiter.callback = writeSlabJournalBlock; + int result = acquireVIO(journal->slab->allocator, &journal->resourceWaiter); + if (result != VDO_SUCCESS) { + journal->waitingToCommit = false; + enterJournalReadOnlyMode(journal, result); + return; + } +} + +/**********************************************************************/ +void encodeSlabJournalEntry(SlabJournalBlockHeader *tailHeader, + SlabJournalPayload *payload, + SlabBlockNumber sbn, + JournalOperation operation) +{ + JournalEntryCount entryNumber = tailHeader->entryCount++; + if (operation == BLOCK_MAP_INCREMENT) { + if (!tailHeader->hasBlockMapIncrements) { + memset(payload->fullEntries.entryTypes, 0, + SLAB_JOURNAL_ENTRY_TYPES_SIZE); + tailHeader->hasBlockMapIncrements = true; + } + + payload->fullEntries.entryTypes[entryNumber / 8] + |= ((byte) 1 << (entryNumber % 8)); + } + + packSlabJournalEntry(&payload->entries[entryNumber], sbn, + isIncrementOperation(operation)); +} + +/**********************************************************************/ +SlabJournalEntry decodeSlabJournalEntry(PackedSlabJournalBlock *block, + JournalEntryCount entryCount) +{ + SlabJournalEntry entry + = unpackSlabJournalEntry(&block->payload.entries[entryCount]); + if (block->header.fields.hasBlockMapIncrements + && ((block->payload.fullEntries.entryTypes[entryCount / 8] + & ((byte) 1 << (entryCount % 8))) != 0)) { + entry.operation = BLOCK_MAP_INCREMENT; + } + return entry; +} + +/** + * Actually add an entry to the slab journal, potentially firing off a write + * if a block becomes full. This function is synchronous. 
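encodeSlabJournalEntry() above records block map increments in a per-entry bitmap: entry n owns bit (n % 8) of entryTypes[n / 8]. A worked example of the indexing, for illustration:

    // Suppose entry number 10 is a BLOCK_MAP_INCREMENT:
    //   10 / 8 == 1  -> byte index 1 of entryTypes
    //   10 % 8 == 2  -> bit 2 within that byte
    // so the encoder sets entryTypes[1] |= 0x04, and decodeSlabJournalEntry()
    // tests the same bit to restore entry.operation = BLOCK_MAP_INCREMENT.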
+ * + * @param journal The slab journal to append to + * @param pbn The pbn being adjusted + * @param operation The type of entry to make + * @param recoveryPoint The recovery journal point for this entry + **/ +static void addEntry(SlabJournal *journal, + PhysicalBlockNumber pbn, + JournalOperation operation, + const JournalPoint *recoveryPoint) +{ + int result = ASSERT(beforeJournalPoint(&journal->tailHeader.recoveryPoint, + recoveryPoint), + "recovery journal point is monotonically increasing, " + "recovery point: %llu.%u, " + "block recovery point: %llu.%u", + recoveryPoint->sequenceNumber, recoveryPoint->entryCount, + journal->tailHeader.recoveryPoint.sequenceNumber, + journal->tailHeader.recoveryPoint.entryCount); + if (result != VDO_SUCCESS) { + enterJournalReadOnlyMode(journal, result); + return; + } + + PackedSlabJournalBlock *block = journal->block; + if (operation == BLOCK_MAP_INCREMENT) { + result = ASSERT_LOG_ONLY((journal->tailHeader.entryCount + < journal->fullEntriesPerBlock), + "block has room for full entries"); + if (result != VDO_SUCCESS) { + enterJournalReadOnlyMode(journal, result); + return; + } + } + + encodeSlabJournalEntry(&journal->tailHeader, &block->payload, + pbn - journal->slab->start, operation); + journal->tailHeader.recoveryPoint = *recoveryPoint; + if (blockIsFull(journal)) { + commitSlabJournalTail(journal); + } +} + +/**********************************************************************/ +bool attemptReplayIntoSlabJournal(SlabJournal *journal, + PhysicalBlockNumber pbn, + JournalOperation operation, + JournalPoint *recoveryPoint, + VDOCompletion *parent) +{ + // Only accept entries after the current recovery point. + if (!beforeJournalPoint(&journal->tailHeader.recoveryPoint, recoveryPoint)) { + return true; + } + + SlabJournalBlockHeader *header = &journal->tailHeader; + if ((header->entryCount >= journal->fullEntriesPerBlock) + && (header->hasBlockMapIncrements || + (operation == BLOCK_MAP_INCREMENT))) { + // The tail block does not have room for the entry we are attempting + // to add so commit the tail block now. + commitSlabJournalTail(journal); + } + + if (journal->waitingToCommit) { + startOperationWithWaiter(&journal->slab->state, + ADMIN_STATE_WAITING_FOR_RECOVERY, parent, NULL); + return false; + } + + if ((journal->tail - journal->head) >= journal->size) { + /* + * We must have reaped the current head before the crash, since + * the blocked threshold keeps us from having more entries than + * fit in a slab journal; hence we can just advance the head + * (and unreapable block), as needed. + */ + journal->head++; + journal->unreapable++; + } + + markSlabReplaying(journal->slab); + addEntry(journal, pbn, operation, recoveryPoint); + return true; +} + +/** + * Check whether the journal should be saving reference blocks out. + * + * @param journal The journal to check + * + * @return true if the journal should be requesting reference block writes + **/ +static bool requiresFlushing(const SlabJournal *journal) +{ + BlockCount journalLength = (journal->tail - journal->head); + return (journalLength >= journal->flushingThreshold); +} + +/** + * Check whether the journal must be reaped before adding new entries. 
+ * + * @param journal The journal to check + * + * @return true if the journal must be reaped + **/ +static bool requiresReaping(const SlabJournal *journal) +{ + BlockCount journalLength = (journal->tail - journal->head); + return (journalLength >= journal->blockingThreshold); +} + +/**********************************************************************/ +bool requiresScrubbing(const SlabJournal *journal) +{ + BlockCount journalLength = (journal->tail - journal->head); + return (journalLength >= journal->scrubbingThreshold); +} + +/** + * Implements WaiterCallback. This callback is invoked by addEntries() once + * it has determined that we are ready to make another entry in the slab + * journal. + * + * @param waiter The VIO which should make an entry now + * @param context The slab journal to make an entry in + **/ +static void addEntryFromWaiter(Waiter *waiter, void *context) +{ + DataVIO *dataVIO = waiterAsDataVIO(waiter); + SlabJournal *journal = (SlabJournal *) context; + SlabJournalBlockHeader *header = &journal->tailHeader; + SequenceNumber recoveryBlock = dataVIO->recoveryJournalPoint.sequenceNumber; + + if (header->entryCount == 0) { + /* + * This is the first entry in the current tail block, so get a lock + * on the recovery journal which we will hold until this tail block is + * committed. + */ + getLock(journal, header->sequenceNumber)->recoveryStart = recoveryBlock; + if (journal->recoveryJournal != NULL) { + ZoneCount zoneNumber = journal->slab->allocator->zoneNumber; + acquireRecoveryJournalBlockReference(journal->recoveryJournal, + recoveryBlock, ZONE_TYPE_PHYSICAL, + zoneNumber); + } + markSlabJournalDirty(journal, recoveryBlock); + + // If the slab journal is over the first threshold, tell the refCounts to + // write some reference blocks, but proceed apace. + if (requiresFlushing(journal)) { + relaxedAdd64(&journal->events->flushCount, 1); + BlockCount journalLength = (journal->tail - journal->head); + BlockCount blocksToDeadline = 0; + if (journalLength <= journal->flushingDeadline) { + blocksToDeadline = journal->flushingDeadline - journalLength; + } + saveSeveralReferenceBlocks(journal->slab->referenceCounts, + blocksToDeadline + 1); + } + } + + JournalPoint slabJournalPoint = { + .sequenceNumber = header->sequenceNumber, + .entryCount = header->entryCount, + }; + + addEntry(journal, dataVIO->operation.pbn, dataVIO->operation.type, + &dataVIO->recoveryJournalPoint); + + // Now that an entry has been made in the slab journal, update the + // reference counts. + int result = modifySlabReferenceCount(journal->slab, &slabJournalPoint, + dataVIO->operation); + continueDataVIO(dataVIO, result); +} + +/** + * Check whether the next entry to be made is a block map increment. + * + * @param journal The journal + * + * @return true if the first entry waiter's operation is a block + * map increment + **/ +static inline bool isNextEntryABlockMapIncrement(SlabJournal *journal) +{ + DataVIO *dataVIO = waiterAsDataVIO(getFirstWaiter(&journal->entryWaiters)); + return (dataVIO->operation.type == BLOCK_MAP_INCREMENT); +} + +/** + * Add as many entries as possible from the queue of VIOs waiting to make + * entries. By processing the queue in order, we ensure that slab journal + * entries are made in the same order as recovery journal entries for the + * same increment or decrement. + * + * @param journal The journal to which entries may be added + **/ +static void addEntries(SlabJournal *journal) +{ + if (journal->addingEntries) { + // Protect against re-entrancy. 
+ return; + } + + journal->addingEntries = true; + while (hasWaiters(&journal->entryWaiters)) { + if (journal->partialWriteInProgress || slabIsRebuilding(journal->slab)) { + // Don't add entries while rebuilding or while a partial write is + // outstanding (VDO-2399). + break; + } + + SlabJournalBlockHeader *header = &journal->tailHeader; + if (journal->waitingToCommit) { + // If we are waiting for resources to write the tail block, and the + // tail block is full, we can't make another entry. + relaxedAdd64(&journal->events->tailBusyCount, 1); + break; + } else if (isNextEntryABlockMapIncrement(journal) + && (header->entryCount >= journal->fullEntriesPerBlock)) { + // The tail block does not have room for a block map increment, so + // commit it now. + commitSlabJournalTail(journal); + if (journal->waitingToCommit) { + relaxedAdd64(&journal->events->tailBusyCount, 1); + break; + } + } + + // If the slab is over the blocking threshold, make the VIO wait. + if (requiresReaping(journal)) { + relaxedAdd64(&journal->events->blockedCount, 1); + saveDirtyReferenceBlocks(journal->slab->referenceCounts); + break; + } + + if (header->entryCount == 0) { + JournalLock *lock = getLock(journal, header->sequenceNumber); + // Check if the on disk slab journal is full. Because of the + // blocking and scrubbing thresholds, this should never happen. + if (lock->count > 0) { + ASSERT_LOG_ONLY((journal->head + journal->size) == journal->tail, + "New block has locks, but journal is not full"); + + /* + * The blocking threshold must let the journal fill up if the new + * block has locks; if the blocking threshold is smaller than the + * journal size, the new block cannot possibly have locks already. + */ + ASSERT_LOG_ONLY((journal->blockingThreshold >= journal->size), + "New block can have locks already iff blocking" + "threshold is at the end of the journal"); + + relaxedAdd64(&journal->events->diskFullCount, 1); + saveDirtyReferenceBlocks(journal->slab->referenceCounts); + break; + } + + /* + * Don't allow the new block to be reaped until all of the reference + * count blocks are written and the journal block has been + * fully committed as well. + */ + lock->count = journal->entriesPerBlock + 1; + + if (header->sequenceNumber == 1) { + /* + * This is the first entry in this slab journal, ever. Dirty all of + * the reference count blocks. Each will acquire a lock on the + * tail block so that the journal won't be reaped until the + * reference counts are initialized. The lock acquisition must + * be done by the RefCounts since here we don't know how many + * reference blocks the RefCounts has. + */ + acquireDirtyBlockLocks(journal->slab->referenceCounts); + } + } + + notifyNextWaiter(&journal->entryWaiters, addEntryFromWaiter, journal); + } + + journal->addingEntries = false; + + // If there are no waiters, and we are flushing or saving, commit the + // tail block. 
+ if (isSlabDraining(journal->slab) && !isSuspending(&journal->slab->state) + && !hasWaiters(&journal->entryWaiters)) { + commitSlabJournalTail(journal); + } +} + +/**********************************************************************/ +void addSlabJournalEntry(SlabJournal *journal, DataVIO *dataVIO) +{ + if (!isSlabOpen(journal->slab)) { + continueDataVIO(dataVIO, VDO_INVALID_ADMIN_STATE); + return; + } + + if (isVDOReadOnly(journal)) { + continueDataVIO(dataVIO, VDO_READ_ONLY); + return; + } + + int result = enqueueDataVIO(&journal->entryWaiters, dataVIO, + THIS_LOCATION("$F($j-$js)")); + if (result != VDO_SUCCESS) { + continueDataVIO(dataVIO, result); + return; + } + + if (isUnrecoveredSlab(journal->slab) && requiresReaping(journal)) { + increaseScrubbingPriority(journal->slab); + } + + addEntries(journal); +} + +/**********************************************************************/ +void adjustSlabJournalBlockReference(SlabJournal *journal, + SequenceNumber sequenceNumber, + int adjustment) +{ + if (sequenceNumber == 0) { + return; + } + + if (isReplayingSlab(journal->slab)) { + // Locks should not be used during offline replay. + return; + } + + ASSERT_LOG_ONLY((adjustment != 0), "adjustment must be non-zero"); + JournalLock *lock = getLock(journal, sequenceNumber); + if (adjustment < 0) { + ASSERT_LOG_ONLY((-adjustment <= lock->count), + "adjustment %d of lock count %u for slab journal block %" + PRIu64 " must not underflow", adjustment, lock->count, + sequenceNumber); + } + + lock->count += adjustment; + if (lock->count == 0) { + reapSlabJournal(journal); + } +} + +/**********************************************************************/ +bool releaseRecoveryJournalLock(SlabJournal *journal, + SequenceNumber recoveryLock) +{ + if (recoveryLock > journal->recoveryLock) { + ASSERT_LOG_ONLY((recoveryLock < journal->recoveryLock), + "slab journal recovery lock is not older than the recovery" + " journal head"); + return false; + } + + if ((recoveryLock < journal->recoveryLock) || isVDOReadOnly(journal)) { + return false; + } + + // All locks are held by the block which is in progress; write it. + commitSlabJournalTail(journal); + return true; +} + +/**********************************************************************/ +void drainSlabJournal(SlabJournal *journal) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() + == journal->slab->allocator->threadID), + "drainSlabJournal() called on correct thread"); + if (isQuiescing(&journal->slab->state)) { + // XXX: we should revisit this assertion since it is no longer clear what + // it is for. + ASSERT_LOG_ONLY((!(slabIsRebuilding(journal->slab) + && hasWaiters(&journal->entryWaiters))), + "slab is recovered or has no waiters"); + } + + switch (journal->slab->state.state) { + case ADMIN_STATE_REBUILDING: + case ADMIN_STATE_SUSPENDING: + case ADMIN_STATE_SAVE_FOR_SCRUBBING: + break; + + default: + commitSlabJournalTail(journal); + } +} + +/** + * Finish the decode process by returning the VIO and notifying the slab that + * we're done. + * + * @param completion The VIO as a completion + **/ +static void finishDecodingJournal(VDOCompletion *completion) +{ + int result = completion->result; + VIOPoolEntry *entry = completion->parent; + SlabJournal *journal = entry->parent; + returnVIO(journal->slab->allocator, entry); + notifySlabJournalIsLoaded(journal->slab, result); +} + +/** + * Set up the in-memory journal state to the state which was written to disk. + * This is the callback registered in readSlabJournalTail(). 
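adjustSlabJournalBlockReference() above is the release half of the per-block lock accounting used throughout this file. As a reading aid, a sketch of one block's lock lifetime pieced together from addEntries(), writeSlabJournalBlock(), and releaseJournalLocks(); the reference-count side lives in refCounts.c and is inferred here rather than shown:

    // First entry placed in the block:    lock->count = entriesPerBlock + 1
    // Block written with u unused slots:  adjustSlabJournalBlockReference(j, seq, -u)
    // Slab summary records the commit:    adjustSlabJournalBlockReference(j, seq, -1)
    // Each dirty RefCounts block written: releases the locks for its entries
    // lock->count reaches zero:           reapSlabJournal() may advance past seq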
+ * + * @param completion The VIO which was used to read the journal tail + **/ +static void setDecodedState(VDOCompletion *completion) +{ + VIOPoolEntry *entry = completion->parent; + SlabJournal *journal = entry->parent; + PackedSlabJournalBlock *block = entry->buffer; + + SlabJournalBlockHeader header; + unpackSlabJournalBlockHeader(&block->header, &header); + + if ((header.metadataType != VDO_METADATA_SLAB_JOURNAL) + || (header.nonce != journal->slab->allocator->nonce)) { + finishDecodingJournal(completion); + return; + } + + journal->tail = header.sequenceNumber + 1; + + // If the slab is clean, this implies the slab journal is empty, so advance + // the head appropriately. + if (getSummarizedCleanliness(journal->summary, journal->slab->slabNumber)) { + journal->head = journal->tail; + } else { + journal->head = header.head; + } + + journal->tailHeader = header; + initializeJournalState(journal); + finishDecodingJournal(completion); +} + +/** + * This reads the slab journal tail block by using a VIO acquired from the VIO + * pool. This is the success callback from acquireVIOFromPool() when decoding + * the slab journal. + * + * @param waiter The VIO pool waiter which has just been notified + * @param vioContext The VIO pool entry given to the waiter + **/ +static void readSlabJournalTail(Waiter *waiter, void *vioContext) +{ + SlabJournal *journal = slabJournalFromResourceWaiter(waiter); + Slab *slab = journal->slab; + VIOPoolEntry *entry = vioContext; + TailBlockOffset lastCommitPoint + = getSummarizedTailBlockOffset(journal->summary, slab->slabNumber); + entry->parent = journal; + + + // Slab summary keeps the commit point offset, so the tail block is the + // block before that. Calculation supports small journals in unit tests. + TailBlockOffset tailBlock = ((lastCommitPoint == 0) + ? (TailBlockOffset) (journal->size - 1) + : (lastCommitPoint - 1)); + entry->vio->completion.callbackThreadID = slab->allocator->threadID; + launchReadMetadataVIO(entry->vio, slab->journalOrigin + tailBlock, + setDecodedState, finishDecodingJournal); +} + +/**********************************************************************/ +void decodeSlabJournal(SlabJournal *journal) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() + == journal->slab->allocator->threadID), + "decodeSlabJournal() called on correct thread"); + Slab *slab = journal->slab; + TailBlockOffset lastCommitPoint + = getSummarizedTailBlockOffset(journal->summary, slab->slabNumber); + if ((lastCommitPoint == 0) + && !mustLoadRefCounts(journal->summary, slab->slabNumber)) { + /* + * This slab claims that it has a tail block at (journal->size - 1), but + * a head of 1. This is impossible, due to the scrubbing threshold, on + * a real system, so don't bother reading the (bogus) data off disk. 
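+     * In this case the journal is treated as empty: the slab is notified
+     * that its journal is loaded without any read being issued.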
+     */
+    ASSERT_LOG_ONLY(((journal->size < 16)
+                     || (journal->scrubbingThreshold < (journal->size - 1))),
+                    "Scrubbing threshold protects against reads of unwritten"
+                    " slab journal blocks");
+    notifySlabJournalIsLoaded(slab, VDO_SUCCESS);
+    return;
+  }
+
+  journal->resourceWaiter.callback = readSlabJournalTail;
+  int result = acquireVIO(slab->allocator, &journal->resourceWaiter);
+  if (result != VDO_SUCCESS) {
+    notifySlabJournalIsLoaded(slab, result);
+  }
+}
+
+/**********************************************************************/
+void dumpSlabJournal(const SlabJournal *journal)
+{
+  logInfo(" slab journal: entryWaiters=%zu waitingToCommit=%s"
+          " updatingSlabSummary=%s head=%llu unreapable=%" PRIu64
+          " tail=%llu nextCommit=%llu summarized=%" PRIu64
+          " lastSummarized=%llu recoveryJournalLock=%" PRIu64
+          " dirty=%s", countWaiters(&journal->entryWaiters),
+          boolToString(journal->waitingToCommit),
+          boolToString(journal->updatingSlabSummary),
+          journal->head, journal->unreapable, journal->tail,
+          journal->nextCommit, journal->summarized, journal->lastSummarized,
+          journal->recoveryLock,
+          boolToString(isSlabJournalDirty(journal)));
+  // Given the frequency with which the locks are just a tiny bit off, it
+  // might be worth dumping all the locks, but that might be too much logging.
+}
diff --git a/source/vdo/base/slabJournal.h b/source/vdo/base/slabJournal.h
new file mode 100644
index 0000000..a411711
--- /dev/null
+++ b/source/vdo/base/slabJournal.h
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabJournal.h#8 $
+ */
+
+#ifndef SLAB_JOURNAL_H
+#define SLAB_JOURNAL_H
+
+#include "completion.h"
+#include "journalPoint.h"
+#include "ringNode.h"
+#include "types.h"
+
+/**
+ * Convert a completion to a SlabJournal.
+ *
+ * @param completion The completion to convert
+ *
+ * @return The completion as a SlabJournal
+ **/
+SlabJournal *asSlabJournal(VDOCompletion *completion)
+  __attribute__((warn_unused_result));
+
+/**
+ * Calculate the number of slab journal entries per block.
+ *
+ * @return The number of slab journal entries per block
+ **/
+size_t getSlabJournalEntriesPerBlock(void)
+  __attribute__((warn_unused_result));
+
+/**
+ * Obtain a pointer to a SlabJournal structure from a pointer to the
+ * dirtyRingNode field within it.
+ *
+ * @param node The RingNode to convert
+ *
+ * @return The RingNode as a SlabJournal
+ **/
+SlabJournal *slabJournalFromDirtyNode(RingNode *node)
+  __attribute__((warn_unused_result));
+
+/**
+ * Create a slab journal.
+ * + * @param [in] allocator The block allocator which owns this journal + * @param [in] slab The parent slab of the journal + * @param [in] recoveryJournal The recovery journal of the VDO + * @param [out] journalPtr The pointer to hold the new slab journal + * + * @return VDO_SUCCESS or error code + **/ +int makeSlabJournal(BlockAllocator *allocator, + Slab *slab, + RecoveryJournal *recoveryJournal, + SlabJournal **journalPtr) + __attribute__((warn_unused_result)); + +/** + * Free a slab journal and null out the reference to it. + * + * @param journalPtr The reference to the slab journal to free + **/ +void freeSlabJournal(SlabJournal **journalPtr); + +/** + * Check whether a slab journal is blank, meaning it has never had any entries + * recorded in it. + * + * @param journal The journal to query + * + * @return true if the slab journal has never been modified + **/ +bool isSlabJournalBlank(const SlabJournal *journal) + __attribute__((warn_unused_result)); + +/** + * Check whether the slab journal is on the block allocator's ring of dirty + * journals. + * + * @param journal The journal to query + * + * @return true if the journal has been added to the dirty ring + **/ +bool isSlabJournalDirty(const SlabJournal *journal) + __attribute__((warn_unused_result)); + +/** + * Check whether a slab journal is active. + * + * @param journal The slab journal to check + * + * @return true if the journal is active + **/ +bool isSlabJournalActive(SlabJournal *journal) + __attribute__((warn_unused_result)); + +/** + * Abort any VIOs waiting to make slab journal entries. + * + * @param journal The journal to abort + **/ +void abortSlabJournalWaiters(SlabJournal *journal); + +/** + * Reopen a slab journal by emptying it and then adding any pending entries. + * + * @param journal The journal to reopen + **/ +void reopenSlabJournal(SlabJournal *journal); + +/** + * Attempt to replay a recovery journal entry into a slab journal. + * + * @param journal The slab journal to use + * @param pbn The PBN for the entry + * @param operation The type of entry to add + * @param recoveryPoint The recovery journal point corresponding to this entry + * @param parent The completion to notify when there is space to add + * the entry if the entry could not be added immediately + * + * @return true if the entry was added immediately + **/ +bool attemptReplayIntoSlabJournal(SlabJournal *journal, + PhysicalBlockNumber pbn, + JournalOperation operation, + JournalPoint *recoveryPoint, + VDOCompletion *parent) + __attribute__((warn_unused_result)); + +/** + * Add an entry to a slab journal. + * + * @param journal The slab journal to use + * @param dataVIO The DataVIO for which to add the entry + **/ +void addSlabJournalEntry(SlabJournal *journal, DataVIO *dataVIO); + +/** + * Adjust the reference count for a slab journal block. Note that when the + * adjustment is negative, the slab journal will be reaped. + * + * @param journal The slab journal + * @param sequenceNumber The journal sequence number of the referenced block + * @param adjustment Amount to adjust the reference counter + **/ +void adjustSlabJournalBlockReference(SlabJournal *journal, + SequenceNumber sequenceNumber, + int adjustment); + +/** + * Request the slab journal to release the recovery journal lock it may hold on + * a specified recovery journal block. 
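+ * If the journal does hold such a lock, it is released by committing the
+ * journal's tail block.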
+ * + * @param journal The slab journal + * @param recoveryLock The sequence number of the recovery journal block + * whose locks should be released + * + * @return true if the journal does hold a lock on the specified + * block (which it will release) + **/ +bool releaseRecoveryJournalLock(SlabJournal *journal, + SequenceNumber recoveryLock) + __attribute__((warn_unused_result)); + +/** + * Commit the tail block of a slab journal. + * + * @param journal The journal whose tail block should be committed + **/ +void commitSlabJournalTail(SlabJournal *journal); + +/** + * Drain slab journal I/O. Depending upon the type of drain (as recorded in + * the journal's slab), any dirty journal blocks may be written out. + * + * @param journal The journal to drain + **/ +void drainSlabJournal(SlabJournal *journal); + +/** + * Decode the slab journal by reading its tail. + * + * @param journal The journal to decode + **/ +void decodeSlabJournal(SlabJournal *journal); + +/** + * Check to see if the journal should be scrubbed. + * + * @param journal The slab journal + * + * @return true if the journal requires scrubbing + **/ +bool requiresScrubbing(const SlabJournal *journal) + __attribute__((warn_unused_result)); + +/** + * Dump the slab journal. + * + * @param journal The slab journal to dump + **/ +void dumpSlabJournal(const SlabJournal *journal); + +#endif // SLAB_JOURNAL_H diff --git a/source/vdo/base/slabJournalEraser.c b/source/vdo/base/slabJournalEraser.c new file mode 100644 index 0000000..7cd6a81 --- /dev/null +++ b/source/vdo/base/slabJournalEraser.c @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabJournalEraser.c#1 $ + */ + +#include "slabJournalEraser.h" + +#include "memoryAlloc.h" + +#include "completion.h" +#include "constants.h" +#include "extent.h" +#include "slab.h" +#include "slabDepot.h" + +typedef struct { + VDOCompletion *parent; + VDOExtent *extent; + char *zeroBuffer; + SlabIterator slabs; +} SlabJournalEraser; + +/** + * Free the eraser and finish the parent. + * + * @param eraser The eraser that is done + * @param result The result to return to the parent + **/ +static void finishErasing(SlabJournalEraser *eraser, int result) +{ + VDOCompletion *parent = eraser->parent; + freeExtent(&eraser->extent); + FREE(eraser->zeroBuffer); + FREE(eraser); + finishCompletion(parent, result); +} + +/** + * Finish erasing slab journals with an error. + * + * @param completion A completion whose parent is the eraser + **/ +static void handleErasingError(VDOCompletion *completion) +{ + SlabJournalEraser *eraser = completion->parent; + finishErasing(eraser, eraser->extent->completion.result); +} + +/** + * Erase the next slab journal. 
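+ * If there are no more slabs, erasing finishes and the parent is notified;
+ * otherwise the shared zero buffer is written over the next slab's journal
+ * blocks.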
+ * + * @param extentCompletion A completion whose parent is the eraser + **/ +static void eraseNextSlabJournal(VDOCompletion *extentCompletion) +{ + SlabJournalEraser *eraser = extentCompletion->parent; + + if (!hasNextSlab(&eraser->slabs)) { + finishErasing(eraser, VDO_SUCCESS); + return; + } + + Slab *slab = nextSlab(&eraser->slabs); + writeMetadataExtent(eraser->extent, slab->journalOrigin); +} + +/**********************************************************************/ +void eraseSlabJournals(SlabDepot *depot, + SlabIterator slabs, + VDOCompletion *parent) +{ + SlabJournalEraser *eraser; + int result = ALLOCATE(1, SlabJournalEraser, __func__, &eraser); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + eraser->parent = parent; + eraser->slabs = slabs; + + BlockCount journalSize = getSlabConfig(depot)->slabJournalBlocks; + result = ALLOCATE(journalSize * VDO_BLOCK_SIZE, char, __func__, + &eraser->zeroBuffer); + if (result != VDO_SUCCESS) { + finishErasing(eraser, result); + return; + } + + result = createExtent(parent->layer, VIO_TYPE_SLAB_JOURNAL, + VIO_PRIORITY_METADATA, journalSize, eraser->zeroBuffer, + &eraser->extent); + if (result != VDO_SUCCESS) { + finishErasing(eraser, result); + return; + } + + VDOCompletion *extentCompletion = &eraser->extent->completion; + prepareCompletion(extentCompletion, eraseNextSlabJournal, + handleErasingError, getCallbackThreadID(), eraser); + eraseNextSlabJournal(extentCompletion); +} diff --git a/source/vdo/base/slabJournalEraser.h b/source/vdo/base/slabJournalEraser.h new file mode 100644 index 0000000..215d86f --- /dev/null +++ b/source/vdo/base/slabJournalEraser.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabJournalEraser.h#1 $ + */ + +#ifndef SLAB_JOURNAL_ERASER_H +#define SLAB_JOURNAL_ERASER_H + +#include "slabIterator.h" +#include "types.h" + +/** + * Begin erasing slab journals, one at a time. + * + * @param depot The depot from which to erase + * @param slabs The slabs whose journals need erasing + * @param parent The object to notify when complete + **/ +void eraseSlabJournals(SlabDepot *depot, + SlabIterator slabs, + VDOCompletion *parent); + +#endif // SLAB_JOURNAL_ERASER_H diff --git a/source/vdo/base/slabJournalInternals.h b/source/vdo/base/slabJournalInternals.h new file mode 100644 index 0000000..ce7eafb --- /dev/null +++ b/source/vdo/base/slabJournalInternals.h @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabJournalInternals.h#8 $ + */ + +#ifndef SLAB_JOURNAL_INTERNALS_H +#define SLAB_JOURNAL_INTERNALS_H + +#include "slabJournal.h" + +#include "numeric.h" + +#include "blockAllocatorInternals.h" +#include "blockMapEntry.h" +#include "journalPoint.h" +#include "slab.h" +#include "slabSummary.h" +#include "statistics.h" +#include "waitQueue.h" + +/** + * Slab journal blocks may have one of two formats, depending upon whether or + * not any of the entries in the block are block map increments. Since the + * steady state for a VDO is that all of the necessary block map pages will + * be allocated, most slab journal blocks will have only data entries. Such + * blocks can hold more entries, hence the two formats. + **/ + +/** A single slab journal entry */ +struct slabJournalEntry { + SlabBlockNumber sbn; + JournalOperation operation; +}; + +/** A single slab journal entry in its on-disk form */ +typedef union { + struct __attribute__((packed)) { + uint8_t offsetLow8; + uint8_t offsetMid8; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + unsigned offsetHigh7 : 7; + unsigned increment : 1; +#else + unsigned increment : 1; + unsigned offsetHigh7 : 7; +#endif + } fields; + + // A raw view of the packed encoding. + uint8_t raw[3]; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + // This view is only valid on little-endian machines and is only present for + // ease of directly examining packed entries in GDB. + struct __attribute__((packed)) { + unsigned offset : 23; + unsigned increment : 1; + } littleEndian; +#endif +} __attribute__((packed)) PackedSlabJournalEntry; + +/** The unpacked representation of the header of a slab journal block */ +typedef struct { + /** Sequence number for head of journal */ + SequenceNumber head; + /** Sequence number for this block */ + SequenceNumber sequenceNumber; + /** The nonce for a given VDO instance */ + Nonce nonce; + /** Recovery journal point for last entry */ + JournalPoint recoveryPoint; + /** Metadata type */ + VDOMetadataType metadataType; + /** Whether this block contains block map increments */ + bool hasBlockMapIncrements; + /** The number of entries in the block */ + JournalEntryCount entryCount; +} SlabJournalBlockHeader; + +/** + * The packed, on-disk representation of a slab journal block header. + * All fields are kept in little-endian byte order. 
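+ * Note that the nonce and recovery point are stored in the opposite order
+ * from the unpacked SlabJournalBlockHeader above.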
+ **/ +typedef union __attribute__((packed)) { + struct __attribute__((packed)) { + /** 64-bit sequence number for head of journal */ + byte head[8]; + /** 64-bit sequence number for this block */ + byte sequenceNumber[8]; + /** Recovery journal point for last entry, packed into 64 bits */ + PackedJournalPoint recoveryPoint; + /** The 64-bit nonce for a given VDO instance */ + byte nonce[8]; + /** 8-bit metadata type (should always be two, for the slab journal) */ + uint8_t metadataType; + /** Whether this block contains block map increments */ + bool hasBlockMapIncrements; + /** 16-bit count of the entries encoded in the block */ + byte entryCount[2]; + } fields; + + // A raw view of the packed encoding. + uint8_t raw[8 + 8 + 8 + 8 + 1 + 1 + 2]; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + // This view is only valid on little-endian machines and is only present for + // ease of directly examining packed entries in GDB. + struct __attribute__((packed)) { + SequenceNumber head; + SequenceNumber sequenceNumber; + PackedJournalPoint recoveryPoint; + Nonce nonce; + VDOMetadataType metadataType; + bool hasBlockMapIncrements; + JournalEntryCount entryCount; + } littleEndian; +#endif +} PackedSlabJournalBlockHeader; + +enum { + SLAB_JOURNAL_PAYLOAD_SIZE + = VDO_BLOCK_SIZE - sizeof(PackedSlabJournalBlockHeader), + SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK = (SLAB_JOURNAL_PAYLOAD_SIZE * 8) / 25, + SLAB_JOURNAL_ENTRY_TYPES_SIZE = ((SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK - 1) + / 8) + 1, + SLAB_JOURNAL_ENTRIES_PER_BLOCK = (SLAB_JOURNAL_PAYLOAD_SIZE + / sizeof(PackedSlabJournalEntry)), +}; + +/** The payload of a slab journal block which has block map increments */ +typedef struct { + /* The entries themselves */ + PackedSlabJournalEntry entries[SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK]; + /* The bit map indicating which entries are block map increments */ + byte entryTypes[SLAB_JOURNAL_ENTRY_TYPES_SIZE]; +} __attribute__((packed)) FullSlabJournalEntries; + +typedef union { + /* Entries which include block map increments */ + FullSlabJournalEntries fullEntries; + /* Entries which are only data updates */ + PackedSlabJournalEntry entries[SLAB_JOURNAL_ENTRIES_PER_BLOCK]; + /* Ensure the payload fills to the end of the block */ + byte space[SLAB_JOURNAL_PAYLOAD_SIZE]; +} __attribute__((packed)) SlabJournalPayload; + +typedef struct { + PackedSlabJournalBlockHeader header; + SlabJournalPayload payload; +} __attribute__((packed)) PackedSlabJournalBlock; + +typedef struct { + uint16_t count; + SequenceNumber recoveryStart; +} JournalLock; + +struct slabJournal { + /** A waiter object for getting a VIO pool entry */ + Waiter resourceWaiter; + /** A waiter object for updating the slab summary */ + Waiter slabSummaryWaiter; + /** A waiter object for getting an extent with which to flush */ + Waiter flushWaiter; + /** The queue of VIOs waiting to make an entry */ + WaitQueue entryWaiters; + /** The parent slab reference of this journal */ + Slab *slab; + + /** Whether a tail block commit is pending */ + bool waitingToCommit; + /** Whether the journal is updating the slab summary */ + bool updatingSlabSummary; + /** Whether the journal is adding entries from the entryWaiters queue */ + bool addingEntries; + /** Whether a partial write is in progress */ + bool partialWriteInProgress; + + /** The oldest block in the journal on disk */ + SequenceNumber head; + /** The oldest block in the journal which may not be reaped */ + SequenceNumber unreapable; + /** The end of the half-open interval of the active journal */ + 
SequenceNumber tail; + /** The next journal block to be committed */ + SequenceNumber nextCommit; + /** The tail sequence number that is written in the slab summary */ + SequenceNumber summarized; + /** The tail sequence number that was last summarized in slab summary */ + SequenceNumber lastSummarized; + + /** The sequence number of the recovery journal lock */ + SequenceNumber recoveryLock; + + /** + * The number of entries which fit in a single block. Can't use the constant + * because unit tests change this number. + **/ + JournalEntryCount entriesPerBlock; + /** + * The number of full entries which fit in a single block. Can't use the + * constant because unit tests change this number. + **/ + JournalEntryCount fullEntriesPerBlock; + + /** The recovery journal of the VDO (slab journal holds locks on it) */ + RecoveryJournal *recoveryJournal; + + /** The slab summary to update tail block location */ + SlabSummaryZone *summary; + /** The statistics shared by all slab journals in our physical zone */ + AtomicSlabJournalStatistics *events; + /** A ring of the VIO pool entries for outstanding journal block writes */ + RingNode uncommittedBlocks; + + /** + * The current tail block header state. This will be packed into + * the block just before it is written. + **/ + SlabJournalBlockHeader tailHeader; + /** A pointer to a block-sized buffer holding the packed block data */ + PackedSlabJournalBlock *block; + + /** The number of blocks in the on-disk journal */ + BlockCount size; + /** The number of blocks at which to start pushing reference blocks */ + BlockCount flushingThreshold; + /** The number of blocks at which all reference blocks should be writing */ + BlockCount flushingDeadline; + /** The number of blocks at which to wait for reference blocks to write */ + BlockCount blockingThreshold; + /** The number of blocks at which to scrub the slab before coming online */ + BlockCount scrubbingThreshold; + + /** This node is for BlockAllocator to keep a queue of dirty journals */ + RingNode dirtyNode; + + /** The lock for the oldest unreaped block of the journal */ + JournalLock *reapLock; + /** The locks for each on disk block */ + JournalLock locks[]; +}; + +/** + * Get the slab journal block offset of the given sequence number. + * + * @param journal The slab journal + * @param sequence The sequence number + * + * @return the offset corresponding to the sequence number + **/ +__attribute__((warn_unused_result)) +static inline TailBlockOffset +getSlabJournalBlockOffset(SlabJournal *journal, SequenceNumber sequence) +{ + return (sequence % journal->size); +} + +/** + * Encode a slab journal entry (exposed for unit tests). + * + * @param tailHeader The unpacked header for the block + * @param payload The journal block payload to hold the entry + * @param sbn The slab block number of the entry to encode + * @param operation The type of the entry + **/ +void encodeSlabJournalEntry(SlabJournalBlockHeader *tailHeader, + SlabJournalPayload *payload, + SlabBlockNumber sbn, + JournalOperation operation); + +/** + * Decode a slab journal entry. + * + * @param block The journal block holding the entry + * @param entryCount The number of the entry + * + * @return The decoded entry + **/ +SlabJournalEntry decodeSlabJournalEntry(PackedSlabJournalBlock *block, + JournalEntryCount entryCount) + __attribute__((warn_unused_result)); + +/** + * Generate the packed encoding of a slab journal entry. 
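+ * The slab block number is split across the low, mid, and high offset
+ * fields, with the remaining bit holding the increment flag. For example,
+ * an sbn of 0x123456 packs as offsetLow8 = 0x56, offsetMid8 = 0x34, and
+ * offsetHigh7 = 0x12.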
+ * + * @param packed The entry into which to pack the values + * @param sbn The slab block number of the entry to encode + * @param isIncrement The increment flag + **/ +static inline void packSlabJournalEntry(PackedSlabJournalEntry *packed, + SlabBlockNumber sbn, + bool isIncrement) +{ + packed->fields.offsetLow8 = (sbn & 0x0000FF); + packed->fields.offsetMid8 = (sbn & 0x00FF00) >> 8; + packed->fields.offsetHigh7 = (sbn & 0x7F0000) >> 16; + packed->fields.increment = isIncrement ? 1 : 0; +} + +/** + * Decode the packed representation of a slab journal entry. + * + * @param packed The packed entry to decode + * + * @return The decoded slab journal entry + **/ +__attribute__((warn_unused_result)) +static inline +SlabJournalEntry unpackSlabJournalEntry(const PackedSlabJournalEntry *packed) +{ + SlabJournalEntry entry; + entry.sbn = packed->fields.offsetHigh7; + entry.sbn <<= 8; + entry.sbn |= packed->fields.offsetMid8; + entry.sbn <<= 8; + entry.sbn |= packed->fields.offsetLow8; + entry.operation + = (packed->fields.increment ? DATA_INCREMENT : DATA_DECREMENT); + return entry; +} + +/** + * Generate the packed representation of a slab block header. + * + * @param header The header containing the values to encode + * @param packed The header into which to pack the values + **/ +static inline +void packSlabJournalBlockHeader(const SlabJournalBlockHeader *header, + PackedSlabJournalBlockHeader *packed) +{ + storeUInt64LE(packed->fields.head, header->head); + storeUInt64LE(packed->fields.sequenceNumber, header->sequenceNumber); + storeUInt64LE(packed->fields.nonce, header->nonce); + storeUInt16LE(packed->fields.entryCount, header->entryCount); + + packed->fields.metadataType = header->metadataType; + packed->fields.hasBlockMapIncrements = header->hasBlockMapIncrements; + + packJournalPoint(&header->recoveryPoint, &packed->fields.recoveryPoint); +} + +/** + * Decode the packed representation of a slab block header. + * + * @param packed The packed header to decode + * @param header The header into which to unpack the values + **/ +static inline +void unpackSlabJournalBlockHeader(const PackedSlabJournalBlockHeader *packed, + SlabJournalBlockHeader *header) +{ + *header = (SlabJournalBlockHeader) { + .head = getUInt64LE(packed->fields.head), + .sequenceNumber = getUInt64LE(packed->fields.sequenceNumber), + .nonce = getUInt64LE(packed->fields.nonce), + .entryCount = getUInt16LE(packed->fields.entryCount), + .metadataType = packed->fields.metadataType, + .hasBlockMapIncrements = packed->fields.hasBlockMapIncrements, + }; + unpackJournalPoint(&packed->fields.recoveryPoint, &header->recoveryPoint); +} + +#endif // SLAB_JOURNAL_INTERNALS_H diff --git a/source/vdo/base/slabScrubber.c b/source/vdo/base/slabScrubber.c new file mode 100644 index 0000000..e37e9c8 --- /dev/null +++ b/source/vdo/base/slabScrubber.c @@ -0,0 +1,516 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabScrubber.c#6 $ + */ + +#include "slabScrubberInternals.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "adminState.h" +#include "blockAllocator.h" +#include "constants.h" +#include "readOnlyNotifier.h" +#include "recoveryJournal.h" +#include "refCounts.h" +#include "refCountsInternals.h" +#include "slab.h" +#include "slabJournalInternals.h" + +/** + * Allocate the buffer and extent used for reading the slab journal when + * scrubbing a slab. + * + * @param scrubber The slab scrubber for which to allocate + * @param layer The physical layer on which the scrubber resides + * @param slabJournalSize The size of a slab journal + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int allocateExtentAndBuffer(SlabScrubber *scrubber, + PhysicalLayer *layer, + BlockCount slabJournalSize) +{ + size_t bufferSize = VDO_BLOCK_SIZE * slabJournalSize; + int result = ALLOCATE(bufferSize, char, __func__, &scrubber->journalData); + if (result != VDO_SUCCESS) { + return result; + } + + return createExtent(layer, VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA, + slabJournalSize, scrubber->journalData, + &scrubber->extent); +} + +/**********************************************************************/ +int makeSlabScrubber(PhysicalLayer *layer, + BlockCount slabJournalSize, + ReadOnlyNotifier *readOnlyNotifier, + SlabScrubber **scrubberPtr) +{ + SlabScrubber *scrubber; + int result = ALLOCATE(1, SlabScrubber, __func__, &scrubber); + if (result != VDO_SUCCESS) { + return result; + } + + result = allocateExtentAndBuffer(scrubber, layer, slabJournalSize); + if (result != VDO_SUCCESS) { + freeSlabScrubber(&scrubber); + return result; + } + + initializeCompletion(&scrubber->completion, SLAB_SCRUBBER_COMPLETION, layer); + initializeRing(&scrubber->highPrioritySlabs); + initializeRing(&scrubber->slabs); + scrubber->readOnlyNotifier = readOnlyNotifier; + scrubber->adminState.state = ADMIN_STATE_SUSPENDED; + *scrubberPtr = scrubber; + return VDO_SUCCESS; +} + +/** + * Free the extent and buffer used for reading slab journals. + * + * @param scrubber The scrubber + **/ +static void freeExtentAndBuffer(SlabScrubber *scrubber) +{ + freeExtent(&scrubber->extent); + if (scrubber->journalData != NULL) { + FREE(scrubber->journalData); + scrubber->journalData = NULL; + } +} + +/**********************************************************************/ +void freeSlabScrubber(SlabScrubber **scrubberPtr) +{ + if (*scrubberPtr == NULL) { + return; + } + + SlabScrubber *scrubber = *scrubberPtr; + freeExtentAndBuffer(scrubber); + FREE(scrubber); + *scrubberPtr = NULL; +} + +/** + * Get the next slab to scrub. 
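+ * Slabs registered as high priority are always returned before any others.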
+ * + * @param scrubber The slab scrubber + * + * @return The next slab to scrub or NULL if there are none + **/ +static Slab *getNextSlab(SlabScrubber *scrubber) +{ + if (!isRingEmpty(&scrubber->highPrioritySlabs)) { + return slabFromRingNode(scrubber->highPrioritySlabs.next); + } + + if (!isRingEmpty(&scrubber->slabs)) { + return slabFromRingNode(scrubber->slabs.next); + } + + return NULL; +} + +/**********************************************************************/ +bool hasSlabsToScrub(SlabScrubber *scrubber) +{ + return (getNextSlab(scrubber) != NULL); +} + +/**********************************************************************/ +SlabCount getScrubberSlabCount(const SlabScrubber *scrubber) +{ + return relaxedLoad64(&scrubber->slabCount); +} + +/**********************************************************************/ +void registerSlabForScrubbing(SlabScrubber *scrubber, + Slab *slab, + bool highPriority) +{ + ASSERT_LOG_ONLY((slab->status != SLAB_REBUILT), + "slab to be scrubbed is unrecovered"); + + if (slab->status != SLAB_REQUIRES_SCRUBBING) { + return; + } + + unspliceRingNode(&slab->ringNode); + if (!slab->wasQueuedForScrubbing) { + relaxedAdd64(&scrubber->slabCount, 1); + slab->wasQueuedForScrubbing = true; + } + + if (highPriority) { + slab->status = SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING; + pushRingNode(&scrubber->highPrioritySlabs, &slab->ringNode); + return; + } + + pushRingNode(&scrubber->slabs, &slab->ringNode); +} + +/** + * Stop scrubbing, either because there are no more slabs to scrub or because + * there's been an error. + * + * @param scrubber The scrubber + **/ +static void finishScrubbing(SlabScrubber *scrubber) +{ + if (!hasSlabsToScrub(scrubber)) { + freeExtentAndBuffer(scrubber); + } + + // Inform whoever is waiting that scrubbing has completed. + completeCompletion(&scrubber->completion); + + bool notify = hasWaiters(&scrubber->waiters); + + // Note that the scrubber has stopped, and inform anyone who might be waiting + // for that to happen. + if (!finishDraining(&scrubber->adminState)) { + scrubber->adminState.state = ADMIN_STATE_SUSPENDED; + } + + /* + * We can't notify waiters until after we've finished draining or they'll + * just requeue. Fortunately if there were waiters, we can't have been freed + * yet. + */ + if (notify) { + notifyAllWaiters(&scrubber->waiters, NULL, NULL); + } +} + +/**********************************************************************/ +static void scrubNextSlab(SlabScrubber *scrubber); + +/** + * Notify the scrubber that a slab has been scrubbed. This callback is + * registered in applyJournalEntries(). + * + * @param completion The slab rebuild completion + **/ +static void slabScrubbed(VDOCompletion *completion) +{ + SlabScrubber *scrubber = completion->parent; + finishScrubbingSlab(scrubber->slab); + relaxedAdd64(&scrubber->slabCount, -1); + scrubNextSlab(scrubber); +} + +/** + * Abort scrubbing due to an error. + * + * @param scrubber The slab scrubber + * @param result The error + **/ +static void abortScrubbing(SlabScrubber *scrubber, int result) +{ + enterReadOnlyMode(scrubber->readOnlyNotifier, result); + setCompletionResult(&scrubber->completion, result); + scrubNextSlab(scrubber); +} + +/** + * Handle errors while rebuilding a slab. + * + * @param completion The slab rebuild completion + **/ +static void handleScrubberError(VDOCompletion *completion) +{ + abortScrubbing(completion->parent, completion->result); +} + +/** + * Apply all the entries in a block to the reference counts. 
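+ * Each entry is decoded and replayed into the slab's reference counts in
+ * order; an entry whose offset lies outside the slab is treated as journal
+ * corruption.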
+ * + * @param block A block with entries to apply + * @param entryCount The number of entries to apply + * @param blockNumber The sequence number of the block + * @param slab The slab to apply the entries to + * + * @return VDO_SUCCESS or an error code + **/ +static int applyBlockEntries(PackedSlabJournalBlock *block, + JournalEntryCount entryCount, + SequenceNumber blockNumber, + Slab *slab) +{ + JournalPoint entryPoint = { + .sequenceNumber = blockNumber, + .entryCount = 0, + }; + + SlabBlockNumber maxSBN = slab->end - slab->start; + while (entryPoint.entryCount < entryCount) { + SlabJournalEntry entry = decodeSlabJournalEntry(block, + entryPoint.entryCount); + if (entry.sbn > maxSBN) { + // This entry is out of bounds. + return logErrorWithStringError(VDO_CORRUPT_JOURNAL, "Slab journal entry" + " (%llu, %u) had invalid offset" + " %u in slab (size %u blocks)", + blockNumber, entryPoint.entryCount, + entry.sbn, maxSBN); + } + + int result = replayReferenceCountChange(slab->referenceCounts, &entryPoint, + entry); + if (result != VDO_SUCCESS) { + logErrorWithStringError(result, "Slab journal entry (%llu, %u)" + " (%s of offset %" PRIu32 ") could not be" + " applied in slab %u", + blockNumber, entryPoint.entryCount, + getJournalOperationName(entry.operation), + entry.sbn, slab->slabNumber); + return result; + } + entryPoint.entryCount++; + } + + return VDO_SUCCESS; +} + +/** + * Find the relevant extent of the slab journal and apply all valid entries. + * This is a callback registered in startScrubbing(). + * + * @param completion The metadata read extent completion + **/ +static void applyJournalEntries(VDOCompletion *completion) +{ + SlabScrubber *scrubber = completion->parent; + Slab *slab = scrubber->slab; + SlabJournal *journal = slab->journal; + RefCounts *referenceCounts = slab->referenceCounts; + + // Find the boundaries of the useful part of the journal. + SequenceNumber tail = journal->tail; + TailBlockOffset endIndex = getSlabJournalBlockOffset(journal, tail - 1); + char *endData = scrubber->journalData + (endIndex * VDO_BLOCK_SIZE); + PackedSlabJournalBlock *endBlock = (PackedSlabJournalBlock *) endData; + + SequenceNumber head = getUInt64LE(endBlock->header.fields.head); + TailBlockOffset headIndex = getSlabJournalBlockOffset(journal, head); + BlockCount index = headIndex; + + JournalPoint refCountsPoint = referenceCounts->slabJournalPoint; + JournalPoint lastEntryApplied = refCountsPoint; + for (SequenceNumber sequence = head; sequence < tail; sequence++) { + char *blockData = scrubber->journalData + (index * VDO_BLOCK_SIZE); + PackedSlabJournalBlock *block = (PackedSlabJournalBlock *) blockData; + SlabJournalBlockHeader header; + unpackSlabJournalBlockHeader(&block->header, &header); + + if ((header.nonce != slab->allocator->nonce) + || (header.metadataType != VDO_METADATA_SLAB_JOURNAL) + || (header.sequenceNumber != sequence) + || (header.entryCount > journal->entriesPerBlock) + || (header.hasBlockMapIncrements + && (header.entryCount > journal->fullEntriesPerBlock))) { + // The block is not what we expect it to be. 
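+      // Either the nonce, metadata type, sequence number, or entry count is
+      // wrong for this slab, so treat the whole journal as corrupt.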
+ logError("Slab journal block for slab %u was invalid", + slab->slabNumber); + abortScrubbing(scrubber, VDO_CORRUPT_JOURNAL); + return; + } + + int result = applyBlockEntries(block, header.entryCount, sequence, slab); + if (result != VDO_SUCCESS) { + abortScrubbing(scrubber, result); + return; + } + + lastEntryApplied.sequenceNumber = sequence; + lastEntryApplied.entryCount = header.entryCount - 1; + index++; + if (index == journal->size) { + index = 0; + } + } + + // At the end of rebuild, the refCounts should be accurate to the end + // of the journal we just applied. + int result = ASSERT(!beforeJournalPoint(&lastEntryApplied, &refCountsPoint), + "Refcounts are not more accurate than the slab journal"); + if (result != VDO_SUCCESS) { + abortScrubbing(scrubber, result); + return; + } + + // Save out the rebuilt reference blocks. + prepareCompletion(completion, slabScrubbed, handleScrubberError, + completion->callbackThreadID, scrubber); + startSlabAction(slab, ADMIN_STATE_SAVE_FOR_SCRUBBING, completion); +} + +/** + * Read the current slab's journal from disk now that it has been flushed. + * This callback is registered in scrubNextSlab(). + * + * @param completion The scrubber's extent completion + **/ +static void startScrubbing(VDOCompletion *completion) +{ + SlabScrubber *scrubber = completion->parent; + Slab *slab = scrubber->slab; + if (getSummarizedCleanliness(slab->allocator->summary, slab->slabNumber)) { + slabScrubbed(completion); + return; + } + + prepareCompletion(&scrubber->extent->completion, applyJournalEntries, + handleScrubberError, completion->callbackThreadID, + completion->parent); + readMetadataExtent(scrubber->extent, slab->journalOrigin); +} + +/** + * Scrub the next slab if there is one. + * + * @param scrubber The scrubber + **/ +static void scrubNextSlab(SlabScrubber *scrubber) +{ + // Note: this notify call is always safe only because scrubbing can only + // be started when the VDO is quiescent. 
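+  // The waiters here are those queued by enqueueCleanSlabWaiter() while
+  // waiting for a clean slab.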
+ notifyAllWaiters(&scrubber->waiters, NULL, NULL); + if (isReadOnly(scrubber->readOnlyNotifier)) { + setCompletionResult(&scrubber->completion, VDO_READ_ONLY); + finishScrubbing(scrubber); + return; + } + + Slab *slab = getNextSlab(scrubber); + if ((slab == NULL) + || (scrubber->highPriorityOnly + && isRingEmpty(&scrubber->highPrioritySlabs))) { + scrubber->highPriorityOnly = false; + finishScrubbing(scrubber); + return; + } + + if (finishDraining(&scrubber->adminState)) { + return; + } + + unspliceRingNode(&slab->ringNode); + scrubber->slab = slab; + VDOCompletion *completion = extentAsCompletion(scrubber->extent); + prepareCompletion(completion, startScrubbing, + handleScrubberError, scrubber->completion.callbackThreadID, + scrubber); + startSlabAction(slab, ADMIN_STATE_SCRUBBING, completion); +} + +/**********************************************************************/ +void scrubSlabs(SlabScrubber *scrubber, + void *parent, + VDOAction *callback, + VDOAction *errorHandler) +{ + resumeIfQuiescent(&scrubber->adminState); + ThreadID threadID = getCallbackThreadID(); + prepareCompletion(&scrubber->completion, callback, errorHandler, threadID, + parent); + if (!hasSlabsToScrub(scrubber)) { + finishScrubbing(scrubber); + return; + } + + scrubNextSlab(scrubber); +} + +/**********************************************************************/ +void scrubHighPrioritySlabs(SlabScrubber *scrubber, + bool scrubAtLeastOne, + VDOCompletion *parent, + VDOAction *callback, + VDOAction *errorHandler) +{ + if (scrubAtLeastOne && isRingEmpty(&scrubber->highPrioritySlabs)) { + Slab *slab = getNextSlab(scrubber); + if (slab != NULL) { + registerSlabForScrubbing(scrubber, slab, true); + } + } + scrubber->highPriorityOnly = true; + scrubSlabs(scrubber, parent, callback, errorHandler); +} + +/**********************************************************************/ +void stopScrubbing(SlabScrubber *scrubber, VDOCompletion *parent) +{ + if (isQuiescent(&scrubber->adminState)) { + completeCompletion(parent); + } else { + startDraining(&scrubber->adminState, ADMIN_STATE_SUSPENDING, parent, NULL); + } +} + +/**********************************************************************/ +void resumeScrubbing(SlabScrubber *scrubber, VDOCompletion *parent) +{ + if (!hasSlabsToScrub(scrubber)) { + completeCompletion(parent); + return; + } + + int result = resumeIfQuiescent(&scrubber->adminState); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + scrubNextSlab(scrubber); + completeCompletion(parent); +} + +/**********************************************************************/ +int enqueueCleanSlabWaiter(SlabScrubber *scrubber, Waiter *waiter) +{ + if (isReadOnly(scrubber->readOnlyNotifier)) { + return VDO_READ_ONLY; + } + + if (isQuiescent(&scrubber->adminState)) { + return VDO_NO_SPACE; + } + + return enqueueWaiter(&scrubber->waiters, waiter); +} + +/**********************************************************************/ +void dumpSlabScrubber(const SlabScrubber *scrubber) +{ + logInfo("slabScrubber slabCount %u waiters %zu %s%s", + getScrubberSlabCount(scrubber), + countWaiters(&scrubber->waiters), + getAdminStateName(&scrubber->adminState), + scrubber->highPriorityOnly ? ", highPriorityOnly " : ""); +} diff --git a/source/vdo/base/slabScrubber.h b/source/vdo/base/slabScrubber.h new file mode 100644 index 0000000..ca13e63 --- /dev/null +++ b/source/vdo/base/slabScrubber.h @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabScrubber.h#4 $ + */ + +#ifndef SLAB_SCRUBBER_H +#define SLAB_SCRUBBER_H + +#include "completion.h" +#include "types.h" +#include "waitQueue.h" + +/** + * Create a slab scrubber + * + * @param layer The physical layer of the VDO + * @param slabJournalSize The size of a slab journal in blocks + * @param readOnlyNotifier The context for entering read-only mode + * @param scrubberPtr A pointer to hold the scrubber + * + * @return VDO_SUCCESS or an error + **/ +int makeSlabScrubber(PhysicalLayer *layer, + BlockCount slabJournalSize, + ReadOnlyNotifier *readOnlyNotifier, + SlabScrubber **scrubberPtr) + __attribute__((warn_unused_result)); + +/** + * Free a slab scrubber and null out the reference to it. + * + * @param scrubberPtr A pointer to the scrubber to destroy + **/ +void freeSlabScrubber(SlabScrubber **scrubberPtr); + +/** + * Check whether a scrubber has slabs to scrub. + * + * @param scrubber The scrubber to check + * + * @return true if the scrubber has slabs to scrub + **/ +bool hasSlabsToScrub(SlabScrubber *scrubber) + __attribute__((warn_unused_result)); + +/** + * Register a slab with a scrubber. + * + * @param scrubber The scrubber + * @param slab The slab to scrub + * @param highPriority true if the slab should be put on the + * high-priority queue + **/ +void registerSlabForScrubbing(SlabScrubber *scrubber, + Slab *slab, + bool highPriority); + +/** + * Scrub all the slabs which have been registered with a slab scrubber. + * + * @param scrubber The scrubber + * @param parent The object to notify when scrubbing is complete + * @param callback The function to run when scrubbing is complete + * @param errorHandler The handler for scrubbing errors + **/ +void scrubSlabs(SlabScrubber *scrubber, + void *parent, + VDOAction *callback, + VDOAction *errorHandler); + +/** + * Scrub any slabs which have been registered at high priority with a slab + * scrubber. + * + * @param scrubber The scrubber + * @param scrubAtLeastOne true if one slab should always be + * scrubbed, even if there are no high-priority slabs + * (and there is at least one low priority slab) + * @param parent The completion to notify when scrubbing is complete + * @param callback The function to run when scrubbing is complete + * @param errorHandler The handler for scrubbing errors + **/ +void scrubHighPrioritySlabs(SlabScrubber *scrubber, + bool scrubAtLeastOne, + VDOCompletion *parent, + VDOAction *callback, + VDOAction *errorHandler); + +/** + * Tell the scrubber to stop scrubbing after it finishes the slab it is + * currently working on. 
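+ * If the scrubber is already quiescent, the parent is completed immediately.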
+ * + * @param scrubber The scrubber to stop + * @param parent The completion to notify when scrubbing has stopped + **/ +void stopScrubbing(SlabScrubber *scrubber, VDOCompletion *parent); + +/** + * Tell the scrubber to resume scrubbing if it has been stopped. + * + * @param scrubber The scrubber to resume + * @param parent The object to notify once scrubbing has resumed + **/ +void resumeScrubbing(SlabScrubber *scrubber, VDOCompletion *parent); + +/** + * Wait for a clean slab. + * + * @param scrubber The scrubber on which to wait + * @param waiter The waiter + * + * @return VDO_SUCCESS if the waiter was queued, VDO_NO_SPACE if there are no + * slabs to scrub, and some other error otherwise + **/ +int enqueueCleanSlabWaiter(SlabScrubber *scrubber, Waiter *waiter); + +/** + * Get the number of slabs that are unrecovered or being scrubbed. + * + * @param scrubber The scrubber to query + * + * @return the number of slabs that are unrecovered or being scrubbed + **/ +SlabCount getScrubberSlabCount(const SlabScrubber *scrubber) + __attribute__((warn_unused_result)); + +/** + * Dump information about a slab scrubber to the log for debugging. + * + * @param scrubber The scrubber to dump + **/ +void dumpSlabScrubber(const SlabScrubber *scrubber); + +#endif /* SLAB_SCRUBBER_H */ diff --git a/source/vdo/base/slabScrubberInternals.h b/source/vdo/base/slabScrubberInternals.h new file mode 100644 index 0000000..3d3e8cd --- /dev/null +++ b/source/vdo/base/slabScrubberInternals.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabScrubberInternals.h#5 $ + */ + +#ifndef SLAB_SCRUBBER_INTERNALS_H +#define SLAB_SCRUBBER_INTERNALS_H + +#include "slabScrubber.h" + +#include "adminState.h" +#include "atomic.h" +#include "extent.h" +#include "ringNode.h" + +struct slabScrubber { + VDOCompletion completion; + /** The queue of slabs to scrub first */ + RingNode highPrioritySlabs; + /** The queue of slabs to scrub once there are no highPrioritySlabs */ + RingNode slabs; + /** The queue of VIOs waiting for a slab to be scrubbed */ + WaitQueue waiters; + + // The number of slabs that are unrecovered or being scrubbed. This field is + // modified by the physical zone thread, but is queried by other threads. 
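+  // It is therefore kept as an atomic and accessed with relaxed operations.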
+ Atomic64 slabCount; + + /** The administrative state of the scrubber */ + AdminState adminState; + /** Whether to only scrub high-priority slabs */ + bool highPriorityOnly; + /** The context for entering read-only mode */ + ReadOnlyNotifier *readOnlyNotifier; + /** The slab currently being scrubbed */ + Slab *slab; + /** The extent for loading slab journal blocks */ + VDOExtent *extent; + /** A buffer to store the slab journal blocks */ + char *journalData; +}; + +#endif // SLAB_SCRUBBER_INTERNALS_H diff --git a/source/vdo/base/slabSummary.c b/source/vdo/base/slabSummary.c new file mode 100644 index 0000000..7021c67 --- /dev/null +++ b/source/vdo/base/slabSummary.c @@ -0,0 +1,651 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabSummary.c#7 $ + */ + +#include "slabSummary.h" + +#include "memoryAlloc.h" + +#include "adminState.h" +#include "constants.h" +#include "extent.h" +#include "readOnlyNotifier.h" +#include "slabSummaryInternals.h" +#include "threadConfig.h" +#include "types.h" + +// SIZING + +/**********************************************************************/ +static BlockCount getSlabSummaryZoneSize(BlockSize blockSize) +{ + SlabCount entriesPerBlock = blockSize / sizeof(SlabSummaryEntry); + BlockCount blocksNeeded = MAX_SLABS / entriesPerBlock; + return blocksNeeded; +} + +/**********************************************************************/ +BlockCount getSlabSummarySize(BlockSize blockSize) +{ + return getSlabSummaryZoneSize(blockSize) * MAX_PHYSICAL_ZONES; +} + +// FULLNESS HINT COMPUTATION + +/** + * Translate a slab's free block count into a 'fullness hint' that can be + * stored in a SlabSummaryEntry's 7 bits that are dedicated to its free count. + * + * Note: the number of free blocks must be strictly less than 2^23 blocks, + * even though theoretically slabs could contain precisely 2^23 blocks; there + * is an assumption that at least one block is used by metadata. This + * assumption is necessary; otherwise, the fullness hint might overflow. + * The fullness hint formula is roughly (fullness >> 16) & 0x7f, but + * ((1 << 23) >> 16) & 0x7f is the same as (0 >> 16) & 0x7f, namely 0, which + * is clearly a bad hint if it could indicate both 2^23 free blocks or 0 free + * blocks. + * + * @param summary The summary which is being updated + * @param freeBlocks The number of free blocks + * + * @return A fullness hint, which can be stored in 7 bits. + **/ +__attribute__((warn_unused_result)) +static uint8_t computeFullnessHint(SlabSummary *summary, BlockCount freeBlocks) +{ + ASSERT_LOG_ONLY((freeBlocks < (1 << 23)), + "free blocks must be less than 2^23"); + + if (freeBlocks == 0) { + return 0; + } + + BlockCount hint = freeBlocks >> summary->hintShift; + return ((hint == 0) ? 
1 : hint);
+}
+
+/**
+ * Translate a slab's free block hint into an approximate count, such that
+ * computeFullnessHint() is the inverse function of getApproximateFreeBlocks()
+ * (i.e. computeFullnessHint(getApproximateFreeBlocks(x)) == x).
+ *
+ * @param summary The summary from which the hint was obtained
+ * @param freeBlockHint The hint read from the summary
+ *
+ * @return An approximation to the free block count
+ **/
+__attribute__((warn_unused_result))
+static BlockCount getApproximateFreeBlocks(SlabSummary *summary,
+                                           uint8_t freeBlockHint)
+{
+  return ((BlockCount) freeBlockHint) << summary->hintShift;
+}
+
+// MAKE/FREE FUNCTIONS
+
+/**********************************************************************/
+static void launchWrite(SlabSummaryBlock *summaryBlock);
+
+/**
+ * Initialize a SlabSummaryBlock.
+ *
+ * @param layer The backing layer
+ * @param summaryZone The parent SlabSummaryZone
+ * @param threadID The ID of this block's physical zone thread
+ * @param entries The entries this block manages
+ * @param index The index of this block in its zone's summary
+ * @param slabSummaryBlock The block to initialize
+ *
+ * @return VDO_SUCCESS or an error
+ **/
+static int initializeSlabSummaryBlock(PhysicalLayer *layer,
+                                      SlabSummaryZone *summaryZone,
+                                      ThreadID threadID,
+                                      SlabSummaryEntry *entries,
+                                      BlockCount index,
+                                      SlabSummaryBlock *slabSummaryBlock)
+{
+  int result = ALLOCATE(VDO_BLOCK_SIZE, char, __func__,
+                        &slabSummaryBlock->outgoingEntries);
+  if (result != VDO_SUCCESS) {
+    return result;
+  }
+
+  result = createVIO(layer, VIO_TYPE_SLAB_SUMMARY, VIO_PRIORITY_METADATA,
+                     slabSummaryBlock, slabSummaryBlock->outgoingEntries,
+                     &slabSummaryBlock->vio);
+  if (result != VDO_SUCCESS) {
+    return result;
+  }
+
+  slabSummaryBlock->vio->completion.callbackThreadID = threadID;
+  slabSummaryBlock->zone = summaryZone;
+  slabSummaryBlock->entries = entries;
+  slabSummaryBlock->index = index;
+  return VDO_SUCCESS;
+}
+
+/**
+ * Create a new, empty SlabSummaryZone object.
+ *
+ * @param summary The summary to which the new zone will belong
+ * @param layer The layer
+ * @param zoneNumber The number of this zone
+ * @param threadID The ID of the thread for this zone
+ * @param entries The buffer to hold the entries in this zone
+ *
+ * @return VDO_SUCCESS or an error
+ **/
+static int makeSlabSummaryZone(SlabSummary *summary,
+                               PhysicalLayer *layer,
+                               ZoneCount zoneNumber,
+                               ThreadID threadID,
+                               SlabSummaryEntry *entries)
+{
+  int result = ALLOCATE_EXTENDED(SlabSummaryZone, summary->blocksPerZone,
+                                 SlabSummaryBlock, __func__,
+                                 &summary->zones[zoneNumber]);
+  if (result != VDO_SUCCESS) {
+    return result;
+  }
+
+  SlabSummaryZone *summaryZone = summary->zones[zoneNumber];
+  summaryZone->summary = summary;
+  summaryZone->zoneNumber = zoneNumber;
+  summaryZone->entries = entries;
+
+  if (layer->createMetadataVIO == NULL) {
+    // Blocks are only used for writing, and without a createVIO() call,
+    // we'll never be writing anything.
+    return VDO_SUCCESS;
+  }
+
+  // Initialize each block.
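+  // Each block gets its own metadata VIO and a slice of the zone's entry
+  // array.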
+ for (BlockCount i = 0; i < summary->blocksPerZone; i++) { + result = initializeSlabSummaryBlock(layer, summaryZone, threadID, entries, + i, &summaryZone->summaryBlocks[i]); + if (result != VDO_SUCCESS) { + return result; + } + entries += summary->entriesPerBlock; + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeSlabSummary(PhysicalLayer *layer, + Partition *partition, + const ThreadConfig *threadConfig, + unsigned int slabSizeShift, + BlockCount maximumFreeBlocksPerSlab, + ReadOnlyNotifier *readOnlyNotifier, + SlabSummary **slabSummaryPtr) +{ + BlockCount blocksPerZone = getSlabSummaryZoneSize(VDO_BLOCK_SIZE); + SlabCount entriesPerBlock = MAX_SLABS / blocksPerZone; + int result = ASSERT((entriesPerBlock * blocksPerZone) == MAX_SLABS, + "block size must be a multiple of entry size"); + if (result != VDO_SUCCESS) { + return result; + } + + if (partition == NULL) { + // Don't make a slab summary for the formatter since it doesn't need it. + return VDO_SUCCESS; + } + + SlabSummary *summary; + result = ALLOCATE_EXTENDED(SlabSummary, threadConfig->physicalZoneCount, + SlabSummaryZone *, __func__, &summary); + if (result != VDO_SUCCESS) { + return result; + } + + summary->zoneCount = threadConfig->physicalZoneCount; + summary->readOnlyNotifier = readOnlyNotifier; + summary->hintShift = (slabSizeShift > 6) ? (slabSizeShift - 6) : 0; + summary->blocksPerZone = blocksPerZone; + summary->entriesPerBlock = entriesPerBlock; + + size_t totalEntries = MAX_SLABS * MAX_PHYSICAL_ZONES; + size_t entryBytes = totalEntries * sizeof(SlabSummaryEntry); + result = layer->allocateIOBuffer(layer, entryBytes, "summary entries", + (char **) &summary->entries); + if (result != VDO_SUCCESS) { + freeSlabSummary(&summary); + return result; + } + + // Initialize all the entries. + uint8_t hint = computeFullnessHint(summary, maximumFreeBlocksPerSlab); + for (size_t i = 0; i < totalEntries; i++) { + // This default tail block offset must be reflected in + // slabJournal.c::readSlabJournalTail(). + summary->entries[i] = (SlabSummaryEntry) { + .tailBlockOffset = 0, + .fullnessHint = hint, + .loadRefCounts = false, + .isDirty = false, + }; + } + + setSlabSummaryOrigin(summary, partition); + for (ZoneCount zone = 0; zone < summary->zoneCount; zone++) { + result = makeSlabSummaryZone(summary, layer, zone, + getPhysicalZoneThread(threadConfig, zone), + summary->entries + (MAX_SLABS * zone)); + if (result != VDO_SUCCESS) { + freeSlabSummary(&summary); + return result; + } + } + + *slabSummaryPtr = summary; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeSlabSummary(SlabSummary **slabSummaryPtr) +{ + if (*slabSummaryPtr == NULL) { + return; + } + + SlabSummary *summary = *slabSummaryPtr; + for (ZoneCount zone = 0; zone < summary->zoneCount; zone++) { + SlabSummaryZone *summaryZone = summary->zones[zone]; + if (summaryZone != NULL) { + for (BlockCount i = 0; i < summary->blocksPerZone; i++) { + freeVIO(&summaryZone->summaryBlocks[i].vio); + FREE(summaryZone->summaryBlocks[i].outgoingEntries); + } + FREE(summaryZone); + } + } + FREE(summary->entries); + FREE(summary); + *slabSummaryPtr = NULL; +} + +/**********************************************************************/ +SlabSummaryZone *getSummaryForZone(SlabSummary *summary, ZoneCount zone) +{ + return summary->zones[zone]; +} + +// WRITING FUNCTIONALITY + +/** + * Check whether a summary zone has finished draining. 
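+ * A drain only completes once every outstanding summary block write has
+ * finished, i.e. once the zone's writeCount has dropped to zero.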
+ * + * @param summaryZone The zone to check + **/ +static void checkForDrainComplete(SlabSummaryZone *summaryZone) +{ + if (!isDraining(&summaryZone->state) || (summaryZone->writeCount > 0)) { + return; + } + + finishOperationWithResult(&summaryZone->state, + (isReadOnly(summaryZone->summary->readOnlyNotifier) + ? VDO_READ_ONLY : VDO_SUCCESS)); +} + +/** + * Wake all the waiters in a given queue. If the VDO is in read-only mode they + * will be given a VDO_READ_ONLY error code as their context, otherwise they + * will be given VDO_SUCCESS. + * + * @param summaryZone The slab summary which owns the queue + * @param queue The queue to notify + **/ +static void notifyWaiters(SlabSummaryZone *summaryZone, WaitQueue *queue) +{ + int result = (isReadOnly(summaryZone->summary->readOnlyNotifier) + ? VDO_READ_ONLY : VDO_SUCCESS); + notifyAllWaiters(queue, NULL, &result); +} + +/** + * Finish processing a block which attempted to write, whether or not the + * attempt succeeded. + * + * @param block The block + **/ +static void finishUpdatingSlabSummaryBlock(SlabSummaryBlock *block) +{ + notifyWaiters(block->zone, &block->currentUpdateWaiters); + block->writing = false; + block->zone->writeCount--; + if (hasWaiters(&block->nextUpdateWaiters)) { + launchWrite(block); + } else { + checkForDrainComplete(block->zone); + } +} + +/** + * This is the callback for a successful block write. + * + * @param completion The write VIO + **/ +static void finishUpdate(VDOCompletion *completion) +{ + SlabSummaryBlock *block = completion->parent; + atomicAdd64(&block->zone->summary->statistics.blocksWritten, 1); + finishUpdatingSlabSummaryBlock(block); +} + +/** + * Handle an error writing a slab summary block. + * + * @param completion The write VIO + **/ +static void handleWriteError(VDOCompletion *completion) +{ + SlabSummaryBlock *block = completion->parent; + enterReadOnlyMode(block->zone->summary->readOnlyNotifier, + completion->result); + finishUpdatingSlabSummaryBlock(block); +} + +/** + * Write a slab summary block unless it is currently out for writing. + * + * @param [in] block The block that needs to be committed + **/ +static void launchWrite(SlabSummaryBlock *block) +{ + if (block->writing) { + return; + } + + SlabSummaryZone *zone = block->zone; + zone->writeCount++; + transferAllWaiters(&block->nextUpdateWaiters, &block->currentUpdateWaiters); + block->writing = true; + + SlabSummary *summary = zone->summary; + if (isReadOnly(summary->readOnlyNotifier)) { + finishUpdatingSlabSummaryBlock(block); + return; + } + + memcpy(block->outgoingEntries, block->entries, + sizeof(SlabSummaryEntry) * summary->entriesPerBlock); + + // Flush before writing to ensure that the slab journal tail blocks and + // reference updates covered by this summary update are stable (VDO-2332). + PhysicalBlockNumber pbn = (summary->origin + + (summary->blocksPerZone * zone->zoneNumber) + + block->index); + launchWriteMetadataVIOWithFlush(block->vio, pbn, finishUpdate, + handleWriteError, true, false); +} + +/** + * Initiate a drain. + * + * Implements AdminInitiator. 
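+ * (Registered with startDraining() by drainSlabSummaryZone() below, so a
+ * drain with no summary block writes outstanding can finish immediately.)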
+ **/ +static void initiateDrain(AdminState *state) +{ + checkForDrainComplete(container_of(state, SlabSummaryZone, state)); +} + +/**********************************************************************/ +void drainSlabSummaryZone(SlabSummaryZone *summaryZone, + AdminStateCode operation, + VDOCompletion *parent) +{ + startDraining(&summaryZone->state, operation, parent, initiateDrain); +} + +/**********************************************************************/ +void resumeSlabSummaryZone(SlabSummaryZone *summaryZone, VDOCompletion *parent) +{ + finishCompletion(parent, resumeIfQuiescent(&summaryZone->state)); +} + +// READ/UPDATE FUNCTIONS + +/** + * Get the summary block, and offset into it, for storing the summary for a + * slab. + * + * @param summaryZone The SlabSummaryZone being queried + * @param slabNumber The slab whose summary location is sought + * + * @return A pointer to the SlabSummaryEntryBlock containing this + * SlabSummaryEntry + **/ +static SlabSummaryBlock *getSummaryBlockForSlab(SlabSummaryZone *summaryZone, + SlabCount slabNumber) +{ + SlabCount entriesPerBlock = summaryZone->summary->entriesPerBlock; + return &summaryZone->summaryBlocks[slabNumber / entriesPerBlock]; +} + +/**********************************************************************/ +void updateSlabSummaryEntry(SlabSummaryZone *summaryZone, + Waiter *waiter, + SlabCount slabNumber, + TailBlockOffset tailBlockOffset, + bool loadRefCounts, + bool isClean, + BlockCount freeBlocks) +{ + SlabSummaryBlock *block = getSummaryBlockForSlab(summaryZone, slabNumber); + int result; + if (isReadOnly(summaryZone->summary->readOnlyNotifier)) { + result = VDO_READ_ONLY; + } else if (isDraining(&summaryZone->state) + || isQuiescent(&summaryZone->state)) { + result = VDO_INVALID_ADMIN_STATE; + } else { + uint8_t hint = computeFullnessHint(summaryZone->summary, freeBlocks); + SlabSummaryEntry *entry = &summaryZone->entries[slabNumber]; + *entry = (SlabSummaryEntry) { + .tailBlockOffset = tailBlockOffset, + .loadRefCounts = (entry->loadRefCounts || loadRefCounts), + .isDirty = !isClean, + .fullnessHint = hint, + }; + result = enqueueWaiter(&block->nextUpdateWaiters, waiter); + } + + if (result != VDO_SUCCESS) { + waiter->callback(waiter, &result); + return; + } + + launchWrite(block); +} + +/**********************************************************************/ +TailBlockOffset getSummarizedTailBlockOffset(SlabSummaryZone *summaryZone, + SlabCount slabNumber) +{ + return summaryZone->entries[slabNumber].tailBlockOffset; +} + +/**********************************************************************/ +bool mustLoadRefCounts(SlabSummaryZone *summaryZone, SlabCount slabNumber) +{ + return summaryZone->entries[slabNumber].loadRefCounts; +} + +/**********************************************************************/ +bool getSummarizedCleanliness(SlabSummaryZone *summaryZone, + SlabCount slabNumber) +{ + return !summaryZone->entries[slabNumber].isDirty; +} + +/**********************************************************************/ +BlockCount getSummarizedFreeBlockCount(SlabSummaryZone *summaryZone, + SlabCount slabNumber) +{ + SlabSummaryEntry *entry = &summaryZone->entries[slabNumber]; + return getApproximateFreeBlocks(summaryZone->summary, entry->fullnessHint); +} + +/**********************************************************************/ +void getSummarizedRefCountsState(SlabSummaryZone *summaryZone, + SlabCount slabNumber, + size_t *freeBlockHint, + bool *isClean) +{ + SlabSummaryEntry *entry = 
&summaryZone->entries[slabNumber]; + *freeBlockHint = entry->fullnessHint; + *isClean = !entry->isDirty; +} + +/**********************************************************************/ +void getSummarizedSlabStatuses(SlabSummaryZone *summaryZone, + SlabCount slabCount, + SlabStatus *statuses) +{ + for (SlabCount i = 0; i < slabCount; i++) { + statuses[i] = (SlabStatus) { + .slabNumber = i, + .isClean = !summaryZone->entries[i].isDirty, + .emptiness = summaryZone->entries[i].fullnessHint + }; + } +} + +// RESIZE FUNCTIONS + +/**********************************************************************/ +void setSlabSummaryOrigin(SlabSummary *summary, Partition *partition) +{ + summary->origin = getFixedLayoutPartitionOffset(partition); +} + +// COMBINING FUNCTIONS (LOAD) + +/** + * Clean up after saving out the combined slab summary. This callback is + * registered in finishLoadingSummary() and loadSlabSummary(). + * + * @param completion The extent which was used to write the summary data + **/ +static void finishCombiningZones(VDOCompletion *completion) +{ + SlabSummary *summary = completion->parent; + int result = completion->result; + VDOExtent *extent = asVDOExtent(completion); + freeExtent(&extent); + finishLoadingWithResult(&summary->zones[0]->state, result); +} + +/**********************************************************************/ +void combineZones(SlabSummary *summary) +{ + // Combine all the old summary data into the portion of the buffer + // corresponding to the first zone. + ZoneCount zone = 0; + if (summary->zonesToCombine > 1) { + for (SlabCount entryNumber = 0; entryNumber < MAX_SLABS; entryNumber++) { + if (zone != 0) { + memcpy(summary->entries + entryNumber, + summary->entries + (zone * MAX_SLABS) + entryNumber, + sizeof(SlabSummaryEntry)); + } + zone++; + if (zone == summary->zonesToCombine) { + zone = 0; + } + } + } + + // Copy the combined data to each zones's region of the buffer. + for (zone = 1; zone < MAX_PHYSICAL_ZONES; zone++) { + memcpy(summary->entries + (zone * MAX_SLABS), summary->entries, + MAX_SLABS * sizeof(SlabSummaryEntry)); + } +} + +/** + * Combine the slab summary data from all the previously written zones + * and copy the combined summary to each partition's data region. Then write + * the combined summary back out to disk. This callback is registered in + * loadSlabSummary(). + * + * @param completion The extent which was used to read the summary data + **/ +static void finishLoadingSummary(VDOCompletion *completion) +{ + SlabSummary *summary = completion->parent; + VDOExtent *extent = asVDOExtent(completion); + + // Combine the zones so each zone is correct for all slabs. + combineZones(summary); + + // Write the combined summary back out. 
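+  // (The read extent is reused for the write, so every zone's on-disk region
+  // ends up holding the combined entries, whatever zone count the next load
+  // uses.)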
+ extent->completion.callback = finishCombiningZones; + writeMetadataExtent(extent, summary->origin); +} + +/**********************************************************************/ +void loadSlabSummary(SlabSummary *summary, + AdminStateCode operation, + ZoneCount zonesToCombine, + VDOCompletion *parent) +{ + SlabSummaryZone *zone = summary->zones[0]; + if (!startLoading(&zone->state, operation, parent, NULL)) { + return; + } + + VDOExtent *extent; + BlockCount blocks = summary->blocksPerZone * MAX_PHYSICAL_ZONES; + int result = createExtent(parent->layer, VIO_TYPE_SLAB_SUMMARY, + VIO_PRIORITY_METADATA, blocks, + (char *) summary->entries, &extent); + if (result != VDO_SUCCESS) { + finishLoadingWithResult(&zone->state, result); + return; + } + + if ((operation == ADMIN_STATE_FORMATTING) + || (operation == ADMIN_STATE_LOADING_FOR_REBUILD)) { + prepareCompletion(&extent->completion, finishCombiningZones, + finishCombiningZones, 0, summary); + writeMetadataExtent(extent, summary->origin); + return; + } + + summary->zonesToCombine = zonesToCombine; + prepareCompletion(&extent->completion, finishLoadingSummary, + finishCombiningZones, 0, summary); + readMetadataExtent(extent, summary->origin); +} + +/**********************************************************************/ +SlabSummaryStatistics getSlabSummaryStatistics(const SlabSummary *summary) +{ + const AtomicSlabSummaryStatistics *atoms = &summary->statistics; + return (SlabSummaryStatistics) { + .blocksWritten = atomicLoad64(&atoms->blocksWritten), + }; +} diff --git a/source/vdo/base/slabSummary.h b/source/vdo/base/slabSummary.h new file mode 100644 index 0000000..4ce32cb --- /dev/null +++ b/source/vdo/base/slabSummary.h @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabSummary.h#5 $ + */ + +#ifndef SLAB_SUMMARY_H +#define SLAB_SUMMARY_H + +#include "completion.h" +#include "fixedLayout.h" +#include "slab.h" +#include "statistics.h" +#include "types.h" +#include "waitQueue.h" + +/** + * The SlabSummary provides hints during load and recovery about the state + * of the slabs in order to avoid the need to read the slab journals in their + * entirety before a VDO can come online. + * + * The information in the summary for each slab includes the rough number of + * free blocks (which is used to prioritize scrubbing), the cleanliness of a + * slab (so that clean slabs containing free space will be used on restart), + * and the location of the tail block of the slab's journal. + * + * The SlabSummary has its own partition at the end of the volume which is + * sized to allow for a complete copy of the summary for each of up to 16 + * physical zones. 
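+ * As a rough sizing sketch (illustrative only, assuming the packed two-byte
+ * SlabSummaryEntry and a 4 KB VDO block): each summary block then holds 2048
+ * entries, a zone covering MAX_SLABS slabs needs MAX_SLABS / 2048 blocks,
+ * and getSlabSummarySize() multiplies that per-zone size by the 16 possible
+ * zones.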
+ * + * During resize, the SlabSummary moves its backing partition and is saved once + * moved; the SlabSummary is not permitted to overwrite the previous recovery + * journal space. + * + * The SlabSummary does not have its own version information, but relies on the + * master version number. + **/ + +/** + * The offset of a slab journal tail block. + **/ +typedef uint8_t TailBlockOffset; + +/** + * A slab status is a very small structure for use in determining the ordering + * of slabs in the scrubbing process. + **/ +typedef struct slabStatus { + SlabCount slabNumber; + bool isClean; + uint8_t emptiness; +} SlabStatus; + +/** + * Returns the size on disk of the SlabSummary structure. + * + * @param blockSize The block size of the physical layer + * + * @return the blocks required to store the SlabSummary on disk + **/ +BlockCount getSlabSummarySize(BlockSize blockSize) +__attribute__((warn_unused_result)); + +/** + * Create a slab summary. + * + * @param [in] layer The layer + * @param [in] partition The partition to hold the summary + * @param [in] threadConfig The thread config of the VDO + * @param [in] slabSizeShift The number of bits in the slab size + * @param [in] maximumFreeBlocksPerSlab The maximum number of free blocks a + * slab can have + * @param [in] readOnlyNotifier The context for entering read-only + * mode + * @param [out] slabSummaryPtr A pointer to hold the summary + * + * @return VDO_SUCCESS or an error + **/ +int makeSlabSummary(PhysicalLayer *layer, + Partition *partition, + const ThreadConfig *threadConfig, + unsigned int slabSizeShift, + BlockCount maximumFreeBlocksPerSlab, + ReadOnlyNotifier *readOnlyNotifier, + SlabSummary **slabSummaryPtr) + __attribute__((warn_unused_result)); + +/** + * Destroy a SlabSummary and NULL out the reference to it. + * + * @param [in,out] slabSummaryPtr A pointer to the SlabSummary to free + **/ +void freeSlabSummary(SlabSummary **slabSummaryPtr); + +/** + * Get the portion of the slab summary for a specified zone. + * + * @param summary The slab summary + * @param zone The zone + * + * @return The portion of the slab summary for the specified zone + **/ +SlabSummaryZone *getSummaryForZone(SlabSummary *summary, ZoneCount zone) + __attribute__((warn_unused_result)); + +/** + * Drain a zone of the slab summary. + * + * @param summaryZone The zone to drain + * @param operation The type of drain to perform + * @param parent The object to notify when the suspend is complete + **/ +void drainSlabSummaryZone(SlabSummaryZone *summaryZone, + AdminStateCode operation, + VDOCompletion *parent); + +/** + * Resume a zone of the slab summary. + * + * @param summaryZone The zone to resume + * @param parent The object to notify when the zone is resumed + **/ +void resumeSlabSummaryZone(SlabSummaryZone *summaryZone, + VDOCompletion *parent); + +/** + * Update the entry for a slab. 
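+ * The update is asynchronous: on success the in-memory entry is modified at
+ * once, the waiter is queued on the containing summary block, and its
+ * callback is invoked when the block write completes; on failure the
+ * callback is invoked immediately with an error code. A minimal caller
+ * sketch, assuming a hypothetical slabSummaryUpdated() callback:
+ *
+ *   Waiter waiter = { .callback = slabSummaryUpdated };
+ *   updateSlabSummaryEntry(summaryZone, &waiter, slabNumber, tailBlockOffset,
+ *                          false, true, freeBlocks);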
+ * + * @param summaryZone The SlabSummaryZone for the zone of the slab + * @param waiter The waiter that is updating the summary + * @param slabNumber The slab number to update + * @param tailBlockOffset The offset of slab journal's tail block + * @param loadRefCounts Whether the refCounts must be loaded from the layer + * on the next load + * @param isClean Whether the slab is clean + * @param freeBlocks The number of free blocks + **/ +void updateSlabSummaryEntry(SlabSummaryZone *summaryZone, + Waiter *waiter, + SlabCount slabNumber, + TailBlockOffset tailBlockOffset, + bool loadRefCounts, + bool isClean, + BlockCount freeBlocks); + +/** + * Get the stored tail block offset for a slab. + * + * @param summaryZone The SlabSummaryZone to use + * @param slabNumber The slab number to get the offset for + * + * @return The tail block offset for the slab + **/ +TailBlockOffset getSummarizedTailBlockOffset(SlabSummaryZone *summaryZone, + SlabCount slabNumber) + __attribute__((warn_unused_result)); + +/** + * Whether refCounts must be loaded from the layer. + * + * @param summaryZone The SlabSummaryZone to use + * @param slabNumber The slab number to get information for + * + * @return Whether refCounts must be loaded + **/ +bool mustLoadRefCounts(SlabSummaryZone *summaryZone, SlabCount slabNumber) + __attribute__((warn_unused_result)); + +/** + * Get the stored cleanliness information for a single slab. + * + * @param summaryZone The SlabSummaryZone to use + * @param slabNumber The slab number to get information for + * + * @return Whether the slab is clean + **/ +bool getSummarizedCleanliness(SlabSummaryZone *summaryZone, + SlabCount slabNumber) + __attribute__((warn_unused_result)); + +/** + * Get the stored emptiness information for a single slab. + * + * @param summaryZone The SlabSummaryZone to use + * @param slabNumber The slab number to get information for + * + * @return An approximation to the free blocks in the slab + **/ +BlockCount getSummarizedFreeBlockCount(SlabSummaryZone *summaryZone, + SlabCount slabNumber) + __attribute__((warn_unused_result)); + +/** + * Get the stored RefCounts state information for a single slab. Used + * in testing only. + * + * @param [in] summaryZone The SlabSummaryZone to use + * @param [in] slabNumber The slab number to get information for + * @param [out] freeBlockHint The approximate number of free blocks + * @param [out] isClean Whether the slab is clean + **/ +void getSummarizedRefCountsState(SlabSummaryZone *summaryZone, + SlabCount slabNumber, + size_t *freeBlockHint, + bool *isClean); + +/** + * Get the stored slab statuses for all slabs in a zone. + * + * @param [in] summaryZone The SlabSummaryZone to use + * @param [in] slabCount The number of slabs to fetch + * @param [in,out] statuses An array of SlabStatuses to populate + **/ +void getSummarizedSlabStatuses(SlabSummaryZone *summaryZone, + SlabCount slabCount, + SlabStatus *statuses); + +/** + * Set the origin of the slab summary relative to the physical layer. + * + * @param summary The SlabSummary to update + * @param partition The slab summary partition + **/ +void setSlabSummaryOrigin(SlabSummary *summary, Partition *partition); + +/** + * Read in all the slab summary data from the slab summary partition, + * combine all the previously used zones into a single zone, and then + * write the combined summary back out to each possible zones' summary + * region. 
+ * + * @param summary The summary to load + * @param operation The type of load to perform + * @param zonesToCombine The number of zones to be combined; if set to 0, + * all of the summary will be initialized as new. + * @param parent The parent of this operation + **/ +void loadSlabSummary(SlabSummary *summary, + AdminStateCode operation, + ZoneCount zonesToCombine, + VDOCompletion *parent); + +/** + * Fetch the cumulative statistics for all slab summary zones in a summary. + * + * @param summary The summary in question + * + * @return the cumulative slab summary statistics for the summary + **/ +SlabSummaryStatistics getSlabSummaryStatistics(const SlabSummary *summary) + __attribute__((warn_unused_result)); + +#endif // SLAB_SUMMARY_H diff --git a/source/vdo/base/slabSummaryInternals.h b/source/vdo/base/slabSummaryInternals.h new file mode 100644 index 0000000..8ac071c --- /dev/null +++ b/source/vdo/base/slabSummaryInternals.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabSummaryInternals.h#7 $ + */ + +#ifndef SLAB_SUMMARY_INTERNALS_H +#define SLAB_SUMMARY_INTERNALS_H + +#include "slabSummary.h" + +#include "adminState.h" +#include "atomic.h" + +typedef struct slabSummaryEntry { + /** Bits 7..0: The offset of the tail block within the slab journal */ + TailBlockOffset tailBlockOffset; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + /** Bits 13..8: A hint about the fullness of the slab */ + unsigned int fullnessHint : 6; + /** Bit 14: Whether the refCounts must be loaded from the layer */ + unsigned int loadRefCounts : 1; + /** Bit 15: The believed cleanliness of this slab */ + unsigned int isDirty : 1; +#else + /** Bit 15: The believed cleanliness of this slab */ + unsigned int isDirty : 1; + /** Bit 14: Whether the refCounts must be loaded from the layer */ + unsigned int loadRefCounts : 1; + /** Bits 13..8: A hint about the fullness of the slab */ + unsigned int fullnessHint : 6; +#endif +} __attribute__((packed)) SlabSummaryEntry; + +typedef struct slabSummaryBlock { + /** The zone to which this block belongs */ + SlabSummaryZone *zone; + /** The index of this block in its zone's summary */ + BlockCount index; + /** Whether this block has a write outstanding */ + bool writing; + /** Ring of updates waiting on the outstanding write */ + WaitQueue currentUpdateWaiters; + /** Ring of updates waiting on the next write */ + WaitQueue nextUpdateWaiters; + /** The active SlabSummaryEntry array for this block */ + SlabSummaryEntry *entries; + /** The VIO used to write this block */ + VIO *vio; + /** The packed entries, one block long, backing the VIO */ + char *outgoingEntries; +} SlabSummaryBlock; + +/** + * The statistics for all the slab summary zones owned by this slab summary. 
+ * These fields are all mutated only by their physical zone threads, but are + * read by other threads when gathering statistics for the entire depot. + **/ +typedef struct atomicSlabSummaryStatistics { + /** Number of blocks written */ + Atomic64 blocksWritten; +} AtomicSlabSummaryStatistics; + +struct slabSummaryZone { + /** The summary of which this is a zone */ + SlabSummary *summary; + /** The number of this zone */ + ZoneCount zoneNumber; + /** Count of the number of blocks currently out for writing */ + BlockCount writeCount; + /** The state of this zone */ + AdminState state; + /** The array (owned by the blocks) of all entries */ + SlabSummaryEntry *entries; + /** The array of SlabSummaryEntryBlocks */ + SlabSummaryBlock summaryBlocks[]; +}; + +struct slabSummary { + /** The context for entering read-only mode */ + ReadOnlyNotifier *readOnlyNotifier; + /** The statistics for this slab summary */ + AtomicSlabSummaryStatistics statistics; + /** The start of the slab summary partition relative to the layer */ + PhysicalBlockNumber origin; + /** The number of bits to shift to get a 7-bit fullness hint */ + unsigned int hintShift; + /** The number of blocks (calculated based on MAX_SLABS) */ + BlockCount blocksPerZone; + /** The number of slabs per block (calculated from block size) */ + SlabCount entriesPerBlock; + /** The entries for all of the zones the partition can hold */ + SlabSummaryEntry *entries; + /** The number of zones which were active at the time of the last update */ + ZoneCount zonesToCombine; + /** The current number of active zones */ + ZoneCount zoneCount; + /** The currently active zones */ + SlabSummaryZone *zones[]; +}; + +/** + * Treating the current entries buffer as the on-disk value of all zones, + * update every zone to the correct values for every slab. + * + * @param summary The summary whose entries should be combined + **/ +void combineZones(SlabSummary *summary); + +#endif // SLAB_SUMMARY_INTERNALS_H diff --git a/source/vdo/base/statistics.h b/source/vdo/base/statistics.h new file mode 100644 index 0000000..2511076 --- /dev/null +++ b/source/vdo/base/statistics.h @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef STATISTICS_H +#define STATISTICS_H + +#include "header.h" +#include "types.h" + +enum { + STATISTICS_VERSION = 31, +}; + +typedef struct { + /** The total number of slabs from which blocks may be allocated */ + uint64_t slabCount; + /** The total number of slabs from which blocks have ever been allocated */ + uint64_t slabsOpened; + /** The number of times since loading that a slab has been re-opened */ + uint64_t slabsReopened; +} BlockAllocatorStatistics; + +/** + * Counters for tracking the number of items written (blocks, requests, etc.) 
+ * that keep track of totals at steps in the write pipeline. Three counters + * allow the number of buffered, in-memory items and the number of in-flight, + * unacknowledged writes to be derived, while still tracking totals for + * reporting purposes + **/ +typedef struct { + /** The total number of items on which processing has started */ + uint64_t started; + /** The total number of items for which a write operation has been issued */ + uint64_t written; + /** The total number of items for which a write operation has completed */ + uint64_t committed; +} CommitStatistics; + +/** Counters for events in the recovery journal */ +typedef struct { + /** Number of times the on-disk journal was full */ + uint64_t diskFull; + /** Number of times the recovery journal requested slab journal commits. */ + uint64_t slabJournalCommitsRequested; + /** Write/Commit totals for individual journal entries */ + CommitStatistics entries; + /** Write/Commit totals for journal blocks */ + CommitStatistics blocks; +} RecoveryJournalStatistics; + +/** The statistics for the compressed block packer. */ +typedef struct { + /** Number of compressed data items written since startup */ + uint64_t compressedFragmentsWritten; + /** Number of blocks containing compressed items written since startup */ + uint64_t compressedBlocksWritten; + /** Number of VIOs that are pending in the packer */ + uint64_t compressedFragmentsInPacker; +} PackerStatistics; + +/** The statistics for the slab journals. */ +typedef struct { + /** Number of times the on-disk journal was full */ + uint64_t diskFullCount; + /** Number of times an entry was added over the flush threshold */ + uint64_t flushCount; + /** Number of times an entry was added over the block threshold */ + uint64_t blockedCount; + /** Number of times a tail block was written */ + uint64_t blocksWritten; + /** Number of times we had to wait for the tail to write */ + uint64_t tailBusyCount; +} SlabJournalStatistics; + +/** The statistics for the slab summary. */ +typedef struct { + /** Number of blocks written */ + uint64_t blocksWritten; +} SlabSummaryStatistics; + +/** The statistics for the reference counts. */ +typedef struct { + /** Number of reference blocks written */ + uint64_t blocksWritten; +} RefCountsStatistics; + +/** The statistics for the block map. 
 */
+typedef struct {
+  /** number of dirty (resident) pages */
+  uint32_t dirtyPages;
+  /** number of clean (resident) pages */
+  uint32_t cleanPages;
+  /** number of free pages */
+  uint32_t freePages;
+  /** number of pages in failed state */
+  uint32_t failedPages;
+  /** number of pages incoming */
+  uint32_t incomingPages;
+  /** number of pages outgoing */
+  uint32_t outgoingPages;
+  /** number of times a free page was not available */
+  uint32_t cachePressure;
+  /** number of getVDOPageAsync() calls for read */
+  uint64_t readCount;
+  /** number of getVDOPageAsync() calls for write */
+  uint64_t writeCount;
+  /** number of times pages failed to read */
+  uint64_t failedReads;
+  /** number of times pages failed to write */
+  uint64_t failedWrites;
+  /** number of gets that are reclaimed */
+  uint64_t reclaimed;
+  /** number of gets for outgoing pages */
+  uint64_t readOutgoing;
+  /** number of gets that were already there */
+  uint64_t foundInCache;
+  /** number of gets requiring discard */
+  uint64_t discardRequired;
+  /** number of gets enqueued for their page */
+  uint64_t waitForPage;
+  /** number of gets that have to fetch */
+  uint64_t fetchRequired;
+  /** number of page fetches */
+  uint64_t pagesLoaded;
+  /** number of page saves */
+  uint64_t pagesSaved;
+  /** the number of flushes issued */
+  uint64_t flushCount;
+} BlockMapStatistics;
+
+/** The dedupe statistics from hash locks */
+typedef struct {
+  /** Number of times the UDS advice proved correct */
+  uint64_t dedupeAdviceValid;
+  /** Number of times the UDS advice proved incorrect */
+  uint64_t dedupeAdviceStale;
+  /** Number of writes with the same data as another in-flight write */
+  uint64_t concurrentDataMatches;
+  /** Number of writes whose hash collided with an in-flight write */
+  uint64_t concurrentHashCollisions;
+} HashLockStatistics;
+
+/** Counts of error conditions in VDO. */
+typedef struct {
+  /** number of times VDO got an invalid dedupe advice PBN from UDS */
+  uint64_t invalidAdvicePBNCount;
+  /** number of times a VIO completed with a VDO_NO_SPACE error */
+  uint64_t noSpaceErrorCount;
+  /** number of times a VIO completed with a VDO_READ_ONLY error */
+  uint64_t readOnlyErrorCount;
+} ErrorStatistics;
+
+/** The statistics of the VDO service.
*/ +struct vdoStatistics { + uint32_t version; + uint32_t releaseVersion; + /** Number of blocks used for data */ + uint64_t dataBlocksUsed; + /** Number of blocks used for VDO metadata */ + uint64_t overheadBlocksUsed; + /** Number of logical blocks that are currently mapped to physical blocks */ + uint64_t logicalBlocksUsed; + /** number of physical blocks */ + BlockCount physicalBlocks; + /** number of logical blocks */ + BlockCount logicalBlocks; + /** Size of the block map page cache, in bytes */ + uint64_t blockMapCacheSize; + /** String describing the active write policy of the VDO */ + char writePolicy[15]; + /** The physical block size */ + uint64_t blockSize; + /** Number of times the VDO has successfully recovered */ + uint64_t completeRecoveries; + /** Number of times the VDO has recovered from read-only mode */ + uint64_t readOnlyRecoveries; + /** String describing the operating mode of the VDO */ + char mode[15]; + /** Whether the VDO is in recovery mode */ + bool inRecoveryMode; + /** What percentage of recovery mode work has been completed */ + uint8_t recoveryPercentage; + /** The statistics for the compressed block packer */ + PackerStatistics packer; + /** Counters for events in the block allocator */ + BlockAllocatorStatistics allocator; + /** Counters for events in the recovery journal */ + RecoveryJournalStatistics journal; + /** The statistics for the slab journals */ + SlabJournalStatistics slabJournal; + /** The statistics for the slab summary */ + SlabSummaryStatistics slabSummary; + /** The statistics for the reference counts */ + RefCountsStatistics refCounts; + /** The statistics for the block map */ + BlockMapStatistics blockMap; + /** The dedupe statistics from hash locks */ + HashLockStatistics hashLock; + /** Counts of error conditions */ + ErrorStatistics errors; +}; + +/** + * Get the proc file path for reading VDOStatistics. + * + * @return The proc file path + **/ +static inline const char *getVDOStatisticsProcFile(void) { + return "dedupe_stats"; +} + +#endif /* not STATISTICS_H */ diff --git a/source/vdo/base/statusCodes.c b/source/vdo/base/statusCodes.c new file mode 100644 index 0000000..40be3fd --- /dev/null +++ b/source/vdo/base/statusCodes.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/statusCodes.c#3 $ + */ + +#include "statusCodes.h" + +#include "errors.h" +#include "permassert.h" +#include "threadOnce.h" + +const struct errorInfo vdoStatusList[] = { + { "VDO_NOT_IMPLEMENTED", "Not implemented" }, + { "VDO_OUT_OF_RANGE", "Out of range" }, + { "VDO_REF_COUNT_INVALID", "Reference count would become invalid" }, + { "VDO_NO_SPACE", "Out of space" }, + { "VDO_UNEXPECTED_EOF", "Unexpected EOF on block read" }, + { "VDO_BAD_CONFIGURATION", "Bad configuration option" }, + { "VDO_SOCKET_ERROR", "Socket error" }, + { "VDO_BAD_ALIGNMENT", "Mis-aligned block reference" }, + { "VDO_COMPONENT_BUSY", "Prior operation still in progress" }, + { "VDO_BAD_PAGE", "Corrupt or incorrect page" }, + { "VDO_UNSUPPORTED_VERSION", "Unsupported component version" }, + { "VDO_INCORRECT_COMPONENT", "Component id mismatch in decoder" }, + { "VDO_PARAMETER_MISMATCH", "Parameters have conflicting values" }, + { "VDO_BLOCK_SIZE_TOO_SMALL", "The block size is too small" }, + { "VDO_UNKNOWN_PARTITION", "No partition exists with a given id" }, + { "VDO_PARTITION_EXISTS", "A partition already exists with a given id"}, + { "VDO_NOT_READ_ONLY", "The device is not in read-only mode" }, + { "VDO_INCREMENT_TOO_SMALL", "Physical block growth of too few blocks" }, + { "VDO_CHECKSUM_MISMATCH", "Incorrect checksum" }, + { "VDO_RECOVERY_JOURNAL_FULL", "The recovery journal is full" }, + { "VDO_LOCK_ERROR", "A lock is held incorrectly" }, + { "VDO_READ_ONLY", "The device is in read-only mode" }, + { "VDO_SHUTTING_DOWN", "The device is shutting down" }, + { "VDO_CORRUPT_JOURNAL", "Recovery journal entries corrupted" }, + { "VDO_TOO_MANY_SLABS", "Exceeds maximum number of slabs supported" }, + { "VDO_INVALID_FRAGMENT", "Compressed block fragment is invalid" }, + { "VDO_RETRY_AFTER_REBUILD", "Retry operation after rebuilding finishes" }, + { "VDO_UNKNOWN_COMMAND", "The extended command is not known" }, + { "VDO_COMMAND_ERROR", "Bad extended command parameters" }, + { "VDO_CANNOT_DETERMINE_SIZE", "Cannot determine config sizes to fit" }, + { "VDO_BAD_MAPPING", "Invalid page mapping" }, + { "VDO_READ_CACHE_BUSY", "Read cache has no free slots" }, + { "VDO_BIO_CREATION_FAILED", "Bio creation failed" }, + { "VDO_BAD_MAGIC", "Bad magic number" }, + { "VDO_BAD_NONCE", "Bad nonce" }, + { "VDO_JOURNAL_OVERFLOW", "Journal sequence number overflow" }, + { "VDO_INVALID_ADMIN_STATE", "Invalid operation for current state" }, +}; + +#ifndef __KERNEL__ +static OnceState vdoStatusCodesRegistered = ONCE_STATE_INITIALIZER; +static int statusCodeRegistrationResult; + +/**********************************************************************/ +static void doStatusCodeRegistration(void) +{ + STATIC_ASSERT((VDO_STATUS_CODE_LAST - VDO_STATUS_CODE_BASE) + == COUNT_OF(vdoStatusList)); + + int result = registerErrorBlock("VDO Status", + VDO_STATUS_CODE_BASE, + VDO_STATUS_CODE_BLOCK_END, + vdoStatusList, + sizeof(vdoStatusList)); + /* + * The following test handles cases where libvdo is statically linked + * against both the test modules and the test driver (because multiple + * instances of this module call their own copy of this function + * once each, resulting in multiple calls to registerErrorBlock which + * is shared in libuds). + */ + if (result == UDS_DUPLICATE_NAME) { + result = UDS_SUCCESS; + } + + statusCodeRegistrationResult + = (result == UDS_SUCCESS) ? 
VDO_SUCCESS : result; +} +#endif + +/**********************************************************************/ +int registerStatusCodes(void) +{ +#ifdef __KERNEL__ + return VDO_SUCCESS; +#else + performOnce(&vdoStatusCodesRegistered, doStatusCodeRegistration); + return statusCodeRegistrationResult; +#endif +} diff --git a/source/vdo/base/statusCodes.h b/source/vdo/base/statusCodes.h new file mode 100644 index 0000000..dd3a3ff --- /dev/null +++ b/source/vdo/base/statusCodes.h @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/statusCodes.h#2 $ + */ + +#ifndef STATUS_CODES_H +#define STATUS_CODES_H + +#include "errors.h" + +enum { + UDS_BLOCK_SIZE = UDS_ERROR_CODE_BLOCK_END - UDS_ERROR_CODE_BASE, + VDO_BLOCK_START = UDS_ERROR_CODE_BLOCK_END, + VDO_BLOCK_END = VDO_BLOCK_START + UDS_BLOCK_SIZE, + PRP_BLOCK_START = VDO_BLOCK_END, + PRP_BLOCK_END = PRP_BLOCK_START + UDS_BLOCK_SIZE, +}; + +/** + * VDO-specific status codes. + **/ +enum vdoStatusCodes { + /** successful result */ + VDO_SUCCESS = 0, + /** base of all VDO errors */ + VDO_STATUS_CODE_BASE = VDO_BLOCK_START, + /** we haven't written this yet */ + VDO_NOT_IMPLEMENTED = VDO_STATUS_CODE_BASE, + /** input out of range */ + VDO_OUT_OF_RANGE, + /** an invalid reference count would result */ + VDO_REF_COUNT_INVALID, + /** a free block could not be allocated */ + VDO_NO_SPACE, + /** unexpected EOF on block read */ + VDO_UNEXPECTED_EOF, + /** improper or missing configuration option */ + VDO_BAD_CONFIGURATION, + /** socket opening or binding problem */ + VDO_SOCKET_ERROR, + /** read or write on non-aligned offset */ + VDO_BAD_ALIGNMENT, + /** prior operation still in progress */ + VDO_COMPONENT_BUSY, + /** page contents incorrect or corrupt data */ + VDO_BAD_PAGE, + /** unsupported version of some component */ + VDO_UNSUPPORTED_VERSION, + /** component id mismatch in decoder */ + VDO_INCORRECT_COMPONENT, + /** parameters have conflicting values */ + VDO_PARAMETER_MISMATCH, + /** the block size is too small */ + VDO_BLOCK_SIZE_TOO_SMALL, + /** no partition exists with a given id */ + VDO_UNKNOWN_PARTITION, + /** a partition already exists with a given id */ + VDO_PARTITION_EXISTS, + /** the VDO is not in read-only mode */ + VDO_NOT_READ_ONLY, + /** physical block growth of too few blocks */ + VDO_INCREMENT_TOO_SMALL, + /** incorrect checksum */ + VDO_CHECKSUM_MISMATCH, + /** the recovery journal is full */ + VDO_RECOVERY_JOURNAL_FULL, + /** a lock is held incorrectly */ + VDO_LOCK_ERROR, + /** the VDO is in read-only mode */ + VDO_READ_ONLY, + /** the VDO is shutting down */ + VDO_SHUTTING_DOWN, + /** the recovery journal has corrupt entries */ + VDO_CORRUPT_JOURNAL, + /** exceeds maximum number of slabs supported */ + VDO_TOO_MANY_SLABS, + /** a compressed 
block fragment is invalid */ + VDO_INVALID_FRAGMENT, + /** action is unsupported while rebuilding */ + VDO_RETRY_AFTER_REBUILD, + /** the extended command is not known */ + VDO_UNKNOWN_COMMAND, + /** bad extended command parameters */ + VDO_COMMAND_ERROR, + /** cannot determine sizes to fit */ + VDO_CANNOT_DETERMINE_SIZE, + /** a block map entry is invalid */ + VDO_BAD_MAPPING, + /** read cache has no free slots */ + VDO_READ_CACHE_BUSY, + /** bio_add_page failed */ + VDO_BIO_CREATION_FAILED, + /** bad magic number */ + VDO_BAD_MAGIC, + /** bad nonce */ + VDO_BAD_NONCE, + /** sequence number overflow */ + VDO_JOURNAL_OVERFLOW, + /** the VDO is not in a state to perform an admin operation */ + VDO_INVALID_ADMIN_STATE, + /** one more than last error code */ + VDO_STATUS_CODE_LAST, + VDO_STATUS_CODE_BLOCK_END = VDO_BLOCK_END +}; + +extern const struct errorInfo vdoStatusList[]; + +/** + * Register the VDO status codes if needed. + * + * @return a success or error code + **/ +int registerStatusCodes(void); + +#endif // STATUS_CODES_H diff --git a/source/vdo/base/superBlock.c b/source/vdo/base/superBlock.c new file mode 100644 index 0000000..a7376e9 --- /dev/null +++ b/source/vdo/base/superBlock.c @@ -0,0 +1,441 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/superBlock.c#5 $ + */ + +#include "superBlock.h" + +#include "buffer.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +#include "completion.h" +#include "constants.h" +#include "header.h" +#include "releaseVersions.h" +#include "statusCodes.h" +#include "types.h" +#include "vio.h" + +struct superBlock { + /** The parent for asynchronous load and save operations */ + VDOCompletion *parent; + /** The VIO for reading and writing the super block to disk */ + VIO *vio; + /** The buffer for encoding and decoding component data */ + Buffer *componentBuffer; + /** + * A sector-sized buffer wrapping the first sector of encodedSuperBlock, for + * encoding and decoding the entire super block. + **/ + Buffer *blockBuffer; + /** A 1-block buffer holding the encoded on-disk super block */ + byte *encodedSuperBlock; + /** The release version number loaded from the volume */ + ReleaseVersionNumber loadedReleaseVersion; + /** Whether this super block may not be written */ + bool unwriteable; +}; + +enum { + SUPER_BLOCK_FIXED_SIZE + = ENCODED_HEADER_SIZE + sizeof(ReleaseVersionNumber) + CHECKSUM_SIZE, + MAX_COMPONENT_DATA_SIZE = VDO_SECTOR_SIZE - SUPER_BLOCK_FIXED_SIZE, +}; + +static const Header SUPER_BLOCK_HEADER_12_0 = { + .id = SUPER_BLOCK, + .version = { + .majorVersion = 12, + .minorVersion = 0, + }, + + // This is the minimum size, if the super block contains no components. 
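+  // (That is, just the release version and checksum: the encoded header's own
+  // bytes are not counted, and encodeSuperBlock() adds in the component data
+  // size at encoding time.)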
+ .size = SUPER_BLOCK_FIXED_SIZE - ENCODED_HEADER_SIZE, +}; + +/** + * Allocate a super block. Callers must free the allocated super block even + * on error. + * + * @param layer The physical layer which holds the super block on disk + * @param superBlockPtr A pointer to hold the new super block + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int allocateSuperBlock(PhysicalLayer *layer, SuperBlock **superBlockPtr) +{ + int result = ALLOCATE(1, SuperBlock, __func__, superBlockPtr); + if (result != UDS_SUCCESS) { + return result; + } + + SuperBlock *superBlock = *superBlockPtr; + result = makeBuffer(MAX_COMPONENT_DATA_SIZE, &superBlock->componentBuffer); + if (result != UDS_SUCCESS) { + return result; + } + + result = layer->allocateIOBuffer(layer, VDO_BLOCK_SIZE, + "encoded super block", + (char **) &superBlock->encodedSuperBlock); + if (result != UDS_SUCCESS) { + return result; + } + + // Even though the buffer is a full block, to avoid the potential corruption + // from a torn write, the entire encoding must fit in the first sector. + result = wrapBuffer(superBlock->encodedSuperBlock, VDO_SECTOR_SIZE, 0, + &superBlock->blockBuffer); + if (result != UDS_SUCCESS) { + return result; + } + + if (layer->createMetadataVIO == NULL) { + return VDO_SUCCESS; + } + + return createVIO(layer, VIO_TYPE_SUPER_BLOCK, VIO_PRIORITY_METADATA, + superBlock, (char *) superBlock->encodedSuperBlock, + &superBlock->vio); +} + +/**********************************************************************/ +int makeSuperBlock(PhysicalLayer *layer, SuperBlock **superBlockPtr) +{ + SuperBlock *superBlock; + int result = allocateSuperBlock(layer, &superBlock); + if (result != VDO_SUCCESS) { + freeSuperBlock(&superBlock); + return result; + } + + // For a new super block, use the current release. + superBlock->loadedReleaseVersion = CURRENT_RELEASE_VERSION_NUMBER; + *superBlockPtr = superBlock; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeSuperBlock(SuperBlock **superBlockPtr) +{ + if (*superBlockPtr == NULL) { + return; + } + + SuperBlock *superBlock = *superBlockPtr; + freeBuffer(&superBlock->blockBuffer); + freeBuffer(&superBlock->componentBuffer); + freeVIO(&superBlock->vio); + FREE(superBlock->encodedSuperBlock); + FREE(superBlock); + *superBlockPtr = NULL; +} + +/** + * Encode a super block into its on-disk representation. + * + * @param layer The physical layer which implements the checksum + * @param superBlock The super block to encode + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int encodeSuperBlock(PhysicalLayer *layer, SuperBlock *superBlock) +{ + Buffer *buffer = superBlock->blockBuffer; + int result = resetBufferEnd(buffer, 0); + if (result != VDO_SUCCESS) { + return result; + } + + size_t componentDataSize = contentLength(superBlock->componentBuffer); + + // Encode the header. + Header header = SUPER_BLOCK_HEADER_12_0; + header.size += componentDataSize; + result = encodeHeader(&header, buffer); + if (result != UDS_SUCCESS) { + return result; + } + + // Encode the loaded release version. + result = putUInt32LEIntoBuffer(buffer, superBlock->loadedReleaseVersion); + if (result != UDS_SUCCESS) { + return result; + } + + // Copy the already-encoded component data. + result = putBytes(buffer, componentDataSize, + getBufferContents(superBlock->componentBuffer)); + if (result != UDS_SUCCESS) { + return result; + } + + // Compute and encode the checksum. 
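+  // (The checksum covers everything encoded so far: the header, the release
+  // version, and the component data. It is stored as the final field, and the
+  // whole encoding must still fit within the first sector.)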
+ CRC32Checksum checksum = layer->updateCRC32(INITIAL_CHECKSUM, + superBlock->encodedSuperBlock, + contentLength(buffer)); + result = putUInt32LEIntoBuffer(buffer, checksum); + if (result != UDS_SUCCESS) { + return result; + } + + return UDS_SUCCESS; +} + +/**********************************************************************/ +int saveSuperBlock(PhysicalLayer *layer, + SuperBlock *superBlock, + PhysicalBlockNumber superBlockOffset) +{ + int result = encodeSuperBlock(layer, superBlock); + if (result != VDO_SUCCESS) { + return result; + } + + return layer->writer(layer, superBlockOffset, 1, + (char *) superBlock->encodedSuperBlock, NULL); +} + +/** + * Finish the parent of a super block load or save operation. This + * callback is registered in saveSuperBlockAsync() and loadSuperBlockAsync. + * + * @param completion The super block VIO + **/ +static void finishSuperBlockParent(VDOCompletion *completion) +{ + SuperBlock *superBlock = completion->parent; + VDOCompletion *parent = superBlock->parent; + superBlock->parent = NULL; + finishCompletion(parent, completion->result); +} + +/** + * Log a super block save error. This error handler is registered in + * saveSuperBlockAsync(). + * + * @param completion The super block VIO + **/ +static void handleSaveError(VDOCompletion *completion) +{ + logErrorWithStringError(completion->result, "super block save failed"); + /* + * Mark the super block as unwritable so that we won't attempt to write it + * again. This avoids the case where a growth attempt fails writing the + * super block with the new size, but the subsequent attempt to write out + * the read-only state succeeds. In this case, writes which happened just + * before the suspend would not be visible if the VDO is restarted without + * rebuilding, but, after a read-only rebuild, the effects of those writes + * would reappear. + */ + ((SuperBlock *) completion->parent)->unwriteable = true; + completion->callback(completion); +} + +/**********************************************************************/ +void saveSuperBlockAsync(SuperBlock *superBlock, + PhysicalBlockNumber superBlockOffset, + VDOCompletion *parent) +{ + if (superBlock->unwriteable) { + finishCompletion(parent, VDO_READ_ONLY); + return; + } + + if (superBlock->parent != NULL) { + finishCompletion(parent, VDO_COMPONENT_BUSY); + return; + } + + PhysicalLayer *layer = parent->layer; + int result = encodeSuperBlock(layer, superBlock); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + superBlock->parent = parent; + superBlock->vio->completion.callbackThreadID = parent->callbackThreadID; + launchWriteMetadataVIOWithFlush(superBlock->vio, superBlockOffset, + finishSuperBlockParent, handleSaveError, + true, true); +} + +/** + * Decode a super block from its on-disk representation. + * + * @param layer The physical layer which implements the checksum + * @param superBlock The super block to decode + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int decodeSuperBlock(PhysicalLayer *layer, SuperBlock *superBlock) +{ + // Reset the block buffer to start decoding the entire first sector. + Buffer *buffer = superBlock->blockBuffer; + clearBuffer(buffer); + + // Decode and validate the header. 
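+  // (The expected on-disk layout mirrors encodeSuperBlock(): header, release
+  // version, component data, then a trailing CRC-32 checksum, all within the
+  // first sector.)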
+ Header header; + int result = decodeHeader(buffer, &header); + if (result != VDO_SUCCESS) { + return result; + } + + result = validateHeader(&SUPER_BLOCK_HEADER_12_0, &header, false, __func__); + if (result != VDO_SUCCESS) { + return result; + } + + if (header.size > contentLength(buffer)) { + // We can't check release version or checksum until we know the content + // size, so we have to assume a version mismatch on unexpected values. + return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, + "super block contents too large: %zu", + header.size); + } + + // Restrict the buffer to the actual payload bytes that remain. + result = resetBufferEnd(buffer, uncompactedAmount(buffer) + header.size); + if (result != VDO_SUCCESS) { + return result; + } + + // Decode and store the release version number. It will be checked when the + // VDO master version is decoded and validated. + result = getUInt32LEFromBuffer(buffer, &superBlock->loadedReleaseVersion); + if (result != VDO_SUCCESS) { + return result; + } + + // The component data is all the rest, except for the checksum. + size_t componentDataSize = contentLength(buffer) - sizeof(CRC32Checksum); + result = putBuffer(superBlock->componentBuffer, buffer, componentDataSize); + if (result != VDO_SUCCESS) { + return result; + } + + // Checksum everything up to but not including the saved checksum itself. + CRC32Checksum checksum = layer->updateCRC32(INITIAL_CHECKSUM, + superBlock->encodedSuperBlock, + uncompactedAmount(buffer)); + + // Decode and verify the saved checksum. + CRC32Checksum savedChecksum; + result = getUInt32LEFromBuffer(buffer, &savedChecksum); + if (result != VDO_SUCCESS) { + return result; + } + + result = ASSERT(contentLength(buffer) == 0, + "must have decoded entire superblock payload"); + if (result != VDO_SUCCESS) { + return result; + } + + return ((checksum != savedChecksum) ? VDO_CHECKSUM_MISMATCH : VDO_SUCCESS); +} + +/**********************************************************************/ +int loadSuperBlock(PhysicalLayer *layer, + PhysicalBlockNumber superBlockOffset, + SuperBlock **superBlockPtr) +{ + SuperBlock *superBlock = NULL; + int result = allocateSuperBlock(layer, &superBlock); + if (result != VDO_SUCCESS) { + freeSuperBlock(&superBlock); + return result; + } + + result = layer->reader(layer, superBlockOffset, 1, + (char *) superBlock->encodedSuperBlock, NULL); + if (result != VDO_SUCCESS) { + freeSuperBlock(&superBlock); + return result; + } + + result = decodeSuperBlock(layer, superBlock); + if (result != VDO_SUCCESS) { + freeSuperBlock(&superBlock); + return result; + } + + *superBlockPtr = superBlock; + return result; +} + +/** + * Continue after loading the super block. This callback is registered + * in loadSuperBlockAsync(). 
+ * + * @param completion The super block VIO + **/ +static void finishReadingSuperBlock(VDOCompletion *completion) +{ + SuperBlock *superBlock = completion->parent; + VDOCompletion *parent = superBlock->parent; + superBlock->parent = NULL; + finishCompletion(parent, decodeSuperBlock(completion->layer, superBlock)); +} + +/**********************************************************************/ +void loadSuperBlockAsync(VDOCompletion *parent, + PhysicalBlockNumber superBlockOffset, + SuperBlock **superBlockPtr) +{ + PhysicalLayer *layer = parent->layer; + SuperBlock *superBlock = NULL; + int result = allocateSuperBlock(layer, &superBlock); + if (result != VDO_SUCCESS) { + freeSuperBlock(&superBlock); + finishCompletion(parent, result); + return; + } + + *superBlockPtr = superBlock; + + superBlock->parent = parent; + superBlock->vio->completion.callbackThreadID = parent->callbackThreadID; + launchReadMetadataVIO(superBlock->vio, superBlockOffset, + finishReadingSuperBlock, finishSuperBlockParent); +} + +/**********************************************************************/ +Buffer *getComponentBuffer(SuperBlock *superBlock) +{ + return superBlock->componentBuffer; +} + +/**********************************************************************/ +ReleaseVersionNumber getLoadedReleaseVersion(const SuperBlock *superBlock) +{ + return superBlock->loadedReleaseVersion; +} + +/**********************************************************************/ +size_t getFixedSuperBlockSize(void) +{ + return SUPER_BLOCK_FIXED_SIZE; +} diff --git a/source/vdo/base/superBlock.h b/source/vdo/base/superBlock.h new file mode 100644 index 0000000..bfed7c6 --- /dev/null +++ b/source/vdo/base/superBlock.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/superBlock.h#2 $ + */ + +#ifndef SUPER_BLOCK_H +#define SUPER_BLOCK_H + +#include "buffer.h" + +#include "completion.h" +#include "types.h" + +typedef struct superBlock SuperBlock; + +/** + * Make a new super block. + * + * @param [in] layer The layer on which to write this super block + * @param [out] superBlockPtr A pointer to hold the new super block + * + * @return VDO_SUCCESS or an error + **/ +int makeSuperBlock(PhysicalLayer *layer, SuperBlock **superBlockPtr) + __attribute__((warn_unused_result)); + +/** + * Free a super block and null out the reference to it. + * + * @param superBlockPtr the reference to the super block to free + **/ +void freeSuperBlock(SuperBlock **superBlockPtr); + +/** + * Save a super block. 
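+ * The super block is re-encoded and written synchronously through the given
+ * layer. A minimal usage sketch (hypothetical offset variable, error handling
+ * elided):
+ *
+ *   SuperBlock *superBlock;
+ *   int result = loadSuperBlock(layer, superBlockOffset, &superBlock);
+ *   // ... modify the contents of getComponentBuffer(superBlock) ...
+ *   result = saveSuperBlock(layer, superBlock, superBlockOffset);
+ *   freeSuperBlock(&superBlock);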
+ * + * @param layer The physical layer on which to save the super block + * @param superBlock The super block to save + * @param superBlockOffset The location of the super block + * + * @return VDO_SUCCESS or an error + **/ +int saveSuperBlock(PhysicalLayer *layer, + SuperBlock *superBlock, + PhysicalBlockNumber superBlockOffset) + __attribute__((warn_unused_result)); + +/** + * Save a super block asynchronously. + * + * @param superBlock The super block to save + * @param superBlockOffset The location at which to write the super block + * @param parent The object to notify when the save is complete + **/ +void saveSuperBlockAsync(SuperBlock *superBlock, + PhysicalBlockNumber superBlockOffset, + VDOCompletion *parent); + +/** + * Allocate a super block and read its contents from storage. + * + * @param [in] layer The layer from which to load the super block + * @param [in] superBlockOffset The location from which to read the super + * block + * @param [out] superBlockPtr A pointer to hold the loaded super block + * + * @return VDO_SUCCESS or an error + **/ +int loadSuperBlock(PhysicalLayer *layer, + PhysicalBlockNumber superBlockOffset, + SuperBlock **superBlockPtr) + __attribute__((warn_unused_result)); + +/** + * Allocate a super block and read its contents from storage asynchronously. If + * a load error occurs before the super block's own completion can be allocated, + * the parent will be finished with the error. + * + * @param [in] parent The completion to finish after loading the + * super block + * @param [in] superBlockOffset The location from which to read the super + * block + * @param [out] superBlockPtr A pointer to hold the super block + **/ +void loadSuperBlockAsync(VDOCompletion *parent, + PhysicalBlockNumber superBlockOffset, + SuperBlock **superBlockPtr); + +/** + * Get a buffer which contains the component data from a super block. + * + * @param superBlock The super block from which to get the component data + * + * @return the component data in a buffer + **/ +Buffer *getComponentBuffer(SuperBlock *superBlock) + __attribute__((warn_unused_result)); + +/** + * Get the release version number that was loaded from the volume when the + * SuperBlock was decoded. + * + * @param superBlock The super block to query + * + * @return the release version number that was decoded from the volume + **/ +ReleaseVersionNumber getLoadedReleaseVersion(const SuperBlock *superBlock) + __attribute__((warn_unused_result)); + +/** + * Get the encoded size of the fixed (non-component data) portion of a super + * block (this is for unit testing). + * + * @return The encoded size of the fixed portion of the super block + **/ +size_t getFixedSuperBlockSize(void) + __attribute__((warn_unused_result)); + +#endif /* SUPER_BLOCK_H */ diff --git a/source/vdo/base/threadConfig.c b/source/vdo/base/threadConfig.c new file mode 100644 index 0000000..b671b73 --- /dev/null +++ b/source/vdo/base/threadConfig.c @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/threadConfig.c#2 $ + */ + +#include "threadConfig.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "constants.h" +#include "types.h" + +/**********************************************************************/ +static int allocateThreadConfig(ZoneCount logicalZoneCount, + ZoneCount physicalZoneCount, + ZoneCount hashZoneCount, + ZoneCount baseThreadCount, + ThreadConfig **configPtr) +{ + ThreadConfig *config; + int result = ALLOCATE(1, ThreadConfig, "thread config", &config); + if (result != VDO_SUCCESS) { + return result; + } + + result = ALLOCATE(logicalZoneCount, ThreadID, "logical thread array", + &config->logicalThreads); + if (result != VDO_SUCCESS) { + freeThreadConfig(&config); + return result; + } + + result = ALLOCATE(physicalZoneCount, ThreadID, "physical thread array", + &config->physicalThreads); + if (result != VDO_SUCCESS) { + freeThreadConfig(&config); + return result; + } + + result = ALLOCATE(hashZoneCount, ThreadID, "hash thread array", + &config->hashZoneThreads); + if (result != VDO_SUCCESS) { + freeThreadConfig(&config); + return result; + } + + config->logicalZoneCount = logicalZoneCount; + config->physicalZoneCount = physicalZoneCount; + config->hashZoneCount = hashZoneCount; + config->baseThreadCount = baseThreadCount; + + *configPtr = config; + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void assignThreadIDs(ThreadID threadIDs[], + ZoneCount count, + ThreadID *idPtr) +{ + for (ZoneCount zone = 0; zone < count; zone++) { + threadIDs[zone] = (*idPtr)++; + } +} + +/**********************************************************************/ +int makeThreadConfig(ZoneCount logicalZoneCount, + ZoneCount physicalZoneCount, + ZoneCount hashZoneCount, + ThreadConfig **configPtr) +{ + if ((logicalZoneCount == 0) + && (physicalZoneCount == 0) + && (hashZoneCount == 0)) { + return makeOneThreadConfig(configPtr); + } + + if (physicalZoneCount > MAX_PHYSICAL_ZONES) { + return logErrorWithStringError(VDO_BAD_CONFIGURATION, + "Physical zone count %u exceeds maximum " + "(%u)", + physicalZoneCount, MAX_PHYSICAL_ZONES); + } + + if (logicalZoneCount > MAX_LOGICAL_ZONES) { + return logErrorWithStringError(VDO_BAD_CONFIGURATION, + "Logical zone count %u exceeds maximum " + "(%u)", + logicalZoneCount, MAX_LOGICAL_ZONES); + } + + ThreadConfig *config; + ThreadCount total = logicalZoneCount + physicalZoneCount + hashZoneCount + 2; + int result = allocateThreadConfig(logicalZoneCount, physicalZoneCount, + hashZoneCount, total, &config); + if (result != VDO_SUCCESS) { + return result; + } + + ThreadID id = 0; + config->adminThread = id; + config->journalThread = id++; + config->packerThread = id++; + assignThreadIDs(config->logicalThreads, logicalZoneCount, &id); + assignThreadIDs(config->physicalThreads, physicalZoneCount, &id); + assignThreadIDs(config->hashZoneThreads, hashZoneCount, &id); + + ASSERT_LOG_ONLY(id == total, "correct number of thread IDs assigned"); + + *configPtr = config; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeZeroThreadConfig(ThreadConfig **configPtr) +{ + ThreadConfig *config; + int result = ALLOCATE(1, ThreadConfig, __func__, &config); + if (result 
!= VDO_SUCCESS) { + return result; + } + + config->logicalZoneCount = 0; + config->physicalZoneCount = 0; + config->hashZoneCount = 0; + config->baseThreadCount = 0; + *configPtr = config; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeOneThreadConfig(ThreadConfig **configPtr) +{ + ThreadConfig *config; + int result = allocateThreadConfig(1, 1, 1, 1, &config); + if (result != VDO_SUCCESS) { + return result; + } + + config->logicalThreads[0] = 0; + config->physicalThreads[0] = 0; + config->hashZoneThreads[0] = 0; + *configPtr = config; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int copyThreadConfig(const ThreadConfig *oldConfig, ThreadConfig **configPtr) +{ + ThreadConfig *config; + int result = allocateThreadConfig(oldConfig->logicalZoneCount, + oldConfig->physicalZoneCount, + oldConfig->hashZoneCount, + oldConfig->baseThreadCount, + &config); + if (result != VDO_SUCCESS) { + return result; + } + + config->adminThread = oldConfig->adminThread; + config->journalThread = oldConfig->journalThread; + config->packerThread = oldConfig->packerThread; + for (ZoneCount i = 0; i < config->logicalZoneCount; i++) { + config->logicalThreads[i] = oldConfig->logicalThreads[i]; + } + for (ZoneCount i = 0; i < config->physicalZoneCount; i++) { + config->physicalThreads[i] = oldConfig->physicalThreads[i]; + } + for (ZoneCount i = 0; i < config->hashZoneCount; i++) { + config->hashZoneThreads[i] = oldConfig->hashZoneThreads[i]; + } + + *configPtr = config; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeThreadConfig(ThreadConfig **configPtr) +{ + if (*configPtr == NULL) { + return; + } + + ThreadConfig *config = *configPtr; + *configPtr = NULL; + + FREE(config->logicalThreads); + FREE(config->physicalThreads); + FREE(config->hashZoneThreads); + FREE(config); +} + +/**********************************************************************/ +static bool getZoneThreadName(const ThreadID threadIDs[], + ZoneCount count, + ThreadID id, + const char *prefix, + char *buffer, + size_t bufferLength) +{ + if (id >= threadIDs[0]) { + ThreadID index = id - threadIDs[0]; + if (index < count) { + snprintf(buffer, bufferLength, "%s%d", prefix, index); + return true; + } + } + return false; +} + +/**********************************************************************/ +void getVDOThreadName(const ThreadConfig *threadConfig, + ThreadID threadID, + char *buffer, + size_t bufferLength) +{ + if (threadConfig->baseThreadCount == 1) { + // Historically this was the "request queue" thread. + snprintf(buffer, bufferLength, "reqQ"); + return; + } + if (threadID == threadConfig->journalThread) { + snprintf(buffer, bufferLength, "journalQ"); + return; + } else if (threadID == threadConfig->adminThread) { + // Theoretically this could be different from the journal thread. 
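+    // In the configurations made by makeThreadConfig(), the admin thread
+    // shares an ID with the journal thread, so the check above has already
+    // produced "journalQ" and this branch is not reached.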
+ snprintf(buffer, bufferLength, "adminQ"); + return; + } else if (threadID == threadConfig->packerThread) { + snprintf(buffer, bufferLength, "packerQ"); + return; + } + if (getZoneThreadName(threadConfig->logicalThreads, + threadConfig->logicalZoneCount, + threadID, "logQ", buffer, bufferLength)) { + return; + } + if (getZoneThreadName(threadConfig->physicalThreads, + threadConfig->physicalZoneCount, + threadID, "physQ", buffer, bufferLength)) { + return; + } + if (getZoneThreadName(threadConfig->hashZoneThreads, + threadConfig->hashZoneCount, + threadID, "hashQ", buffer, bufferLength)) { + return; + } + + // Some sort of misconfiguration? + snprintf(buffer, bufferLength, "reqQ%d", threadID); +} diff --git a/source/vdo/base/threadConfig.h b/source/vdo/base/threadConfig.h new file mode 100644 index 0000000..6401651 --- /dev/null +++ b/source/vdo/base/threadConfig.h @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/threadConfig.h#1 $ + */ + +#ifndef THREAD_CONFIG_H +#define THREAD_CONFIG_H + +#include "permassert.h" + +#include "types.h" + +struct threadConfig { + ZoneCount logicalZoneCount; + ZoneCount physicalZoneCount; + ZoneCount hashZoneCount; + ThreadCount baseThreadCount; + ThreadID adminThread; + ThreadID journalThread; + ThreadID packerThread; + ThreadID *logicalThreads; + ThreadID *physicalThreads; + ThreadID *hashZoneThreads; +}; + +/** + * Make a thread configuration. If both the logical zone count and the + * physical zone count are set to 0, a one thread configuration will be + * made. + * + * @param [in] logicalZoneCount The number of logical zones + * @param [in] physicalZoneCount The number of physical zones + * @param [in] hashZoneCount The number of hash zones + * @param [out] configPtr A pointer to hold the new thread + * configuration + * + * @return VDO_SUCCESS or an error + **/ +int makeThreadConfig(ZoneCount logicalZoneCount, + ZoneCount physicalZoneCount, + ZoneCount hashZoneCount, + ThreadConfig **configPtr) + __attribute__((warn_unused_result)); + +/** + * Make a thread configuration that uses no threads. This is the configuration + * for VDOs which are constructed from user mode that have only a synchronous + * layer. + * + * @param [out] configPtr A pointer to hold the new thread configuration + * + * @return VDO_SUCCESS or an error + **/ +int makeZeroThreadConfig(ThreadConfig **configPtr); + +/** + * Make a thread configuration that uses only one thread. + * + * @param [out] configPtr A pointer to hold the new thread configuration + * + * @return VDO_SUCCESS or an error + **/ +int makeOneThreadConfig(ThreadConfig **configPtr) + __attribute__((warn_unused_result)); + +/** + * Make a new thread config which is a copy of an existing one. 
+ * + * @param [in] oldConfig The thread configuration to copy + * @param [out] configPtr A pointer to hold the new thread configuration + * + * @return VDO_SUCCESS or an error + **/ +int copyThreadConfig(const ThreadConfig *oldConfig, ThreadConfig **configPtr) + __attribute__((warn_unused_result)); + +/** + * Destroy a thread configuration and null out the reference to it. + * + * @param configPtr The reference to the thread configuration to destroy + **/ +void freeThreadConfig(ThreadConfig **configPtr); + +/** + * Get the thread id for a given logical zone. + * + * @param threadConfig the thread config + * @param logicalZone the number of the logical zone + * + * @return the thread id for the given zone + **/ +__attribute__((warn_unused_result)) +static inline ThreadID getLogicalZoneThread(const ThreadConfig *threadConfig, + ZoneCount logicalZone) +{ + ASSERT_LOG_ONLY((logicalZone <= threadConfig->logicalZoneCount), + "logical zone valid"); + return threadConfig->logicalThreads[logicalZone]; +} + +/** + * Get the thread id for a given physical zone. + * + * @param threadConfig the thread config + * @param physicalZone the number of the physical zone + * + * @return the thread id for the given zone + **/ +__attribute__((warn_unused_result)) +static inline ThreadID getPhysicalZoneThread(const ThreadConfig *threadConfig, + ZoneCount physicalZone) +{ + ASSERT_LOG_ONLY((physicalZone <= threadConfig->physicalZoneCount), + "physical zone valid"); + return threadConfig->physicalThreads[physicalZone]; +} + +/** + * Get the thread id for a given hash zone. + * + * @param threadConfig the thread config + * @param hashZone the number of the hash zone + * + * @return the thread id for the given zone + **/ +__attribute__((warn_unused_result)) +static inline ThreadID getHashZoneThread(const ThreadConfig *threadConfig, + ZoneCount hashZone) +{ + ASSERT_LOG_ONLY((hashZone <= threadConfig->hashZoneCount), + "hash zone valid"); + return threadConfig->hashZoneThreads[hashZone]; +} + +/** + * Get the thread id for the journal zone. + * + * @param threadConfig the thread config + * + * @return the thread id for the journal zone + **/ +__attribute__((warn_unused_result)) +static inline ThreadID getJournalZoneThread(const ThreadConfig *threadConfig) +{ + return threadConfig->journalThread; +} + +/** + * Get the thread id for the packer zone. + * + * @param threadConfig the thread config + * + * @return the thread id for the packer zone + **/ +__attribute__((warn_unused_result)) +static inline ThreadID getPackerZoneThread(const ThreadConfig *threadConfig) +{ + return threadConfig->packerThread; +} + +/** + * Get the thread ID for admin requests. + * + * @param threadConfig The thread config + * + * @return the thread id to use for admin requests + **/ +__attribute__((warn_unused_result)) +static inline ThreadID getAdminThread(const ThreadConfig *threadConfig) +{ + return threadConfig->adminThread; +} + +/** + * Format the name of the worker thread desired to support a given + * work queue. The physical layer may add a prefix identifying the + * product; the output from this function should just identify the + * thread. 
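+ * Typical names are "journalQ", "packerQ", "logQ0", "physQ1", "hashQ0", or
+ * "reqQ" for a single-threaded configuration.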
+ * + * @param threadConfig The thread configuration + * @param threadID The thread id + * @param buffer Where to put the formatted name + * @param bufferLength Size of the output buffer + **/ +void getVDOThreadName(const ThreadConfig *threadConfig, + ThreadID threadID, + char *buffer, + size_t bufferLength); + +#endif /* THREAD_CONFIG_H */ diff --git a/source/vdo/base/trace.c b/source/vdo/base/trace.c new file mode 100644 index 0000000..7b4e33f --- /dev/null +++ b/source/vdo/base/trace.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/trace.c#1 $ + */ + +#include "trace.h" + +#include "logger.h" +#include "stringUtils.h" +#include "timeUtils.h" + +TRACE_LOCATION_SECTION TraceLocationRecord baseTraceLocation[] = { + { + .function = "", + .line = 0, + }, +}; + +/**********************************************************************/ +void addTraceRecord(Trace *trace, TraceLocation location) +{ + if (trace->used < NUM_TRACE_RECORDS) { + TraceRecord *record = &trace->records[trace->used]; + trace->used++; + + record->when = nowUsec(); + record->tid = getThreadId(); + record->location = location - baseTraceLocation; + } +} + +/* + * The record display format used is a comma-separated list, each item + * containing: optional function name; "@" + timestamp with seconds + * and microseconds for the first record; if not the first record, "+" + * and offset in microseconds from previous timestamp. + * + * If the buffer's too small, it'll end with an ellipsis. 
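+ *
+ * For example, a trace of three records might format (with hypothetical
+ * function names) as:
+ *
+ *   Trace[launchRead@1523.004731,enqueue+142,acknowledge+9021]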
+ */ +void formatTrace(Trace *trace, + char *buffer, + size_t bufferLength, + size_t *msgLen) +{ + if (trace == NULL) { + return; + } + memset(buffer, 0, bufferLength); + char *buf = buffer; + char *bufferEnd = buffer + bufferLength - 1; + if (trace->used > 0) { + TraceRecord *record = &trace->records[0]; + TraceLocationRecord *location = baseTraceLocation + record->location; + snprintf(buf, bufferEnd - buf, "Trace[%s@%llu.%06llu", + location->function, record->when / 1000000, + record->when % 1000000); + buf += strlen(buf); + + for (unsigned int i = 1; i < trace->used; i++) { + TraceRecord *prev = record; + record++; + + snprintf(buf, bufferEnd - buf, ","); + buf += strlen(buf); + + location = baseTraceLocation + record->location; + unsigned long timeDiff = record->when - prev->when; + snprintf(buf, bufferEnd - buf, "%s+%lu", + location->function, timeDiff); + buf += strlen(buf); + } + if (bufferLength > 7) { + if (buffer[bufferLength-5] != '\0') { + // too long + strcpy(buffer+bufferLength-5, "...]"); + } else { + strcpy(buf, "]"); + } + } + } + *msgLen = (buf - buffer); +} diff --git a/source/vdo/base/trace.h b/source/vdo/base/trace.h new file mode 100644 index 0000000..59dabf9 --- /dev/null +++ b/source/vdo/base/trace.h @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/trace.h#1 $ + */ + +#ifndef TRACE_H +#define TRACE_H + +#ifndef __KERNEL__ +#include "cpu.h" +#endif + +#include "threads.h" + +/* + * We need these records to be glued together with no intervening + * bytes. That makes it rather sensitive to how the compiler, + * assembler, and linker may add padding. Force extra alignment to + * make it more reliable. + * + * Trace point descriptor language: + * + * The descriptor string provided at a trace point can have one or + * more components, separated by ";". The first (or only) component is + * a string to be formatted and shown in the flowchart graph. The + * remaining components must be of the form "var=string", and assign + * string values to "variables" that last through the processing of + * the remainder of the current trace being read. + * + * The string displayed has variable substitutions done for any + * occurrences of "$var" in the string. + * + * So, the descriptor sequence: + * kvdoWriteVIO;io=writeData;j=normal + * submitBio($io) + * writeJournalBlock($j) + * would cause the graph generator to show the strings: + * kvdoWriteVIO + * submitBio(writeData) + * writeJournalBlock(normal) + * + * Substitutions are done in the variable assignment strings when + * they're processed, so "foo=x($bar)" sets "foo" using the current + * value of "bar"; it doesn't cause "bar" to be looked up when "$foo" + * is seen later. 
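+ * For instance, given the (hypothetical) sequence "io=writeData",
+ * "op=submit($io)", "io=readData", a later "$op" still expands to
+ * "submit(writeData)", because the substitution happened when "op" was
+ * assigned.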
+ * + * The variable named "F" is automatically updated with the name of + * the function associated with the descriptor, so you don't have to + * explicitly repeat the name of the function if you just want to + * augment it with more information. This may be desirable if a trace + * point is expected to be reached more than once at different stages + * of processing, or in a static function with a generic-sounding name + * that needs disambiguation for graphing. + * + * If no descriptor string is provided, the + * function:lineNumber:threadName string reported via systemtap will + * be used in the graph. + * + * Current variable names used: + * cb=(various) random info to log when enqueueing VIO callback + * dup=post,update deduplication operation + * io=(various) kind of I/O and data it's being done on + * j=normal,dedupe kind of journal update being done + * js=mapWrite,writeZero,unmap which step of journaling we're doing + */ +typedef const struct __attribute__((aligned(16))) traceLocationRecord { + const char *function; + int line; + const char *description; +} TraceLocationRecord; + +/* + * With well under 100 locations defined at the moment, even with no + * idea where &baseTraceLocation will fall relative to the others, we + * only need to support a range of -100..+100. + */ +typedef int32_t TraceLocationNumber; + +/* The type to pass around */ +typedef TraceLocationRecord *TraceLocation; + +/* + * N.B.: This code uses GCC extensions to create static, initialized + * objects inline, describing the current function and line number. + * The objects are collected into a table we can index with small + * signed integers relative to &baseTraceLocation. + * + * We need baseTraceLocation because there's no standard way to get + * the address of the start of this array we're defining. And because + * we're not playing any (additional) special linker tricks to ensure + * ordering of the object files, the offsets may be signed, and we + * don't know the range beyond the fact that we don't have hundreds of + * these records lying around. + * + * By specifying a name that starts with neither .data nor .rodata, we + * leave it to the toolchain to pick a location for us, based on + * things like whether the section needs write access, which it does + * for a PIC library but not for a kernel module. + */ + +#define TRACE_LOCATION_SECTION \ + __attribute__((section(".kvdo_trace_locations"))) + +extern TRACE_LOCATION_SECTION TraceLocationRecord baseTraceLocation[]; + +#define TRACE_JOIN2(a,b) a##b +#define TRACE_JOIN(a,b) TRACE_JOIN2(a,b) +#define THIS_LOCATION(DESCRIPTION) \ + __extension__ \ + ({ \ + static TRACE_LOCATION_SECTION \ + TraceLocationRecord TRACE_JOIN(loc,__LINE__) = { \ + .function = __func__, \ + .line = __LINE__, \ + .description = DESCRIPTION, \ + }; \ + &TRACE_JOIN(loc,__LINE__); \ + }) + +typedef struct traceRecord { + uint64_t when; // counted in usec + pid_t tid; + TraceLocationNumber location; +} TraceRecord; + +enum { NUM_TRACE_RECORDS = 71 }; + +typedef struct trace { + unsigned int used; + TraceRecord records[NUM_TRACE_RECORDS]; +} Trace; + +/** + * Store a new record in the trace data. + * + * @param trace The trace data to be updated + * @param location The source-location descriptor to be recorded + **/ +void addTraceRecord(Trace *trace, TraceLocation location); + +/** + * Format trace data into a string for logging. 
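+ * The output format is described in the comment above formatTrace() in
+ * trace.c; if the buffer is too small, the string ends with an ellipsis.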
+ * + * @param [in] trace The trace data to be logged + * @param [in] buffer The buffer in which to store the string + * @param [in] bufferLength Length of the buffer + * @param [out] msgLen Length of the formatted string + **/ +void formatTrace(Trace *trace, + char *buffer, + size_t bufferLength, + size_t *msgLen); + +#endif /* TRACE_H */ diff --git a/source/vdo/base/types.h b/source/vdo/base/types.h new file mode 100644 index 0000000..d820da6 --- /dev/null +++ b/source/vdo/base/types.h @@ -0,0 +1,445 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/types.h#14 $ + */ + +#ifndef TYPES_H +#define TYPES_H + +#include "blockMappingState.h" +#include "common.h" +#include "statusCodes.h" + +/** + * A size type in blocks. + **/ +typedef uint64_t BlockCount; + +/** + * The size of a block. + **/ +typedef uint16_t BlockSize; + +/** + * A count of compressed fragments + **/ +typedef uint8_t CompressedFragmentCount; + +/** + * A CRC-32 checksum + **/ +typedef uint32_t CRC32Checksum; + +/** + * A height within a tree. + **/ +typedef uint8_t Height; + +/** + * The logical block number as used by the consumer. + **/ +typedef uint64_t LogicalBlockNumber; + +/** + * The type of the nonce used to identify instances of VDO. + **/ +typedef uint64_t Nonce; + +/** + * A size in pages. + **/ +typedef uint32_t PageCount; + +/** + * A page number. + **/ +typedef uint32_t PageNumber; + +/** + * The size of a page. Must be evenly divisible by block size. + **/ +typedef uint32_t PageSize; + +/** + * The physical (well, less logical) block number at which the block is found + * on the underlying device. + **/ +typedef uint64_t PhysicalBlockNumber; + +/** + * A release version number. These numbers are used to make the numbering + * space for component versions independent across release branches. + * + * Really an enum, but we have to specify the size for encoding; see + * releaseVersions.h for the enumeration values. + **/ +typedef uint32_t ReleaseVersionNumber; + +/** + * A count of tree roots. + **/ +typedef uint8_t RootCount; + +/** + * A number of sectors. + **/ +typedef uint8_t SectorCount; + +/** + * A sequence number. + **/ +typedef uint64_t SequenceNumber; + +/** + * A size type in slabs. + **/ +typedef uint16_t SlabCount; + +/** + * A slot in a bin or block map page. + **/ +typedef uint16_t SlotNumber; + +/** + * A number of VIOs. + **/ +typedef uint16_t VIOCount; + +/** + * A VDO thread configuration. + **/ +typedef struct threadConfig ThreadConfig; + +/** + * A thread counter + **/ +typedef uint8_t ThreadCount; + +/** + * A thread ID + * + * Base-code threads are numbered sequentially starting from 0. 
+ **/ +typedef uint8_t ThreadID; + +/** + * The thread ID returned when the current base code thread ID cannot be found + * or is otherwise undefined. + **/ +static const ThreadID INVALID_THREAD_ID = (ThreadID) -1; + +/** + * A zone counter + **/ +typedef uint8_t ZoneCount; + +/** + * The type of request a VIO is performing + **/ +typedef enum __attribute__((packed)) vioOperation { + VIO_UNSPECIFIED_OPERATION = 0, + VIO_READ = 1, + VIO_WRITE = 2, + VIO_READ_MODIFY_WRITE = VIO_READ | VIO_WRITE, + VIO_READ_WRITE_MASK = VIO_READ_MODIFY_WRITE, + VIO_FLUSH_BEFORE = 4, + VIO_FLUSH_AFTER = 8, +} VIOOperation; + +/** + * VIO types for statistics and instrumentation. + **/ +typedef enum __attribute__((packed)) { + VIO_TYPE_UNINITIALIZED = 0, + VIO_TYPE_DATA, + VIO_TYPE_BLOCK_ALLOCATOR, + VIO_TYPE_BLOCK_MAP, + VIO_TYPE_BLOCK_MAP_INTERIOR, + VIO_TYPE_COMPRESSED_BLOCK, + VIO_TYPE_PARTITION_COPY, + VIO_TYPE_RECOVERY_JOURNAL, + VIO_TYPE_SLAB_JOURNAL, + VIO_TYPE_SLAB_SUMMARY, + VIO_TYPE_SUPER_BLOCK, + VIO_TYPE_TEST, +} VIOType; + +/** + * The current operation on a physical block (from the point of view of the + * recovery journal, slab journals, and reference counts. + **/ +typedef enum __attribute__((packed)) { + DATA_DECREMENT = 0, + DATA_INCREMENT = 1, + BLOCK_MAP_DECREMENT = 2, + BLOCK_MAP_INCREMENT = 3, +} JournalOperation; + +/** + * Partition IDs are encoded in the volume layout in the super block. + **/ +typedef enum __attribute__((packed)) { + BLOCK_MAP_PARTITION = 0, + BLOCK_ALLOCATOR_PARTITION = 1, + RECOVERY_JOURNAL_PARTITION = 2, + SLAB_SUMMARY_PARTITION = 3, +} PartitionID; + +/** + * Check whether a VIOType is for servicing an external data request. + * + * @param vioType The VIOType to check + **/ +static inline bool isDataVIOType(VIOType vioType) +{ + return (vioType == VIO_TYPE_DATA); +} + +/** + * Check whether a VIOType is for compressed block writes + * + * @param vioType The VIOType to check + **/ +static inline bool isCompressedWriteVIOType(VIOType vioType) +{ + return (vioType == VIO_TYPE_COMPRESSED_BLOCK); +} + +/** + * Check whether a VIOType is for metadata + * + * @param vioType The VIOType to check + **/ +static inline bool isMetadataVIOType(VIOType vioType) +{ + return ((vioType != VIO_TYPE_UNINITIALIZED) + && !isDataVIOType(vioType) + && !isCompressedWriteVIOType(vioType)); +} + +/** + * Priority levels for asynchronous I/O operations performed on a VIO. + **/ +typedef enum __attribute__((packed)) vioPriority { + VIO_PRIORITY_LOW = 0, + VIO_PRIORITY_DATA = VIO_PRIORITY_LOW, + VIO_PRIORITY_COMPRESSED_DATA = VIO_PRIORITY_DATA, + VIO_PRIORITY_METADATA, + VIO_PRIORITY_HIGH, +} VIOPriority; + +/** + * Metadata types for the VDO. + **/ +typedef enum __attribute__((packed)) { + VDO_METADATA_RECOVERY_JOURNAL = 1, + VDO_METADATA_SLAB_JOURNAL, +} VDOMetadataType; + +/** + * The possible write policy values. + **/ +typedef enum { + WRITE_POLICY_SYNC, ///< All writes are synchronous, i. e., they + ///< are acknowledged only when the data is + ///< written to stable storage. + WRITE_POLICY_ASYNC, ///< Writes are acknowledged when the data is + ///< cached for writing to stable storage, subject + ///< to resiliency guarantees specified elsewhere. + ///< After a crash, the data will be either old or + ///< new value for unflushed writes, never garbage. + WRITE_POLICY_ASYNC_UNSAFE, ///< Writes are acknowledged when the data is + ///< cached for writing to stable storage, subject + ///< to resiliency guarantees specified elsewhere. 
+ WRITE_POLICY_AUTO, ///< The appropriate policy is chosen based on the + ///< underlying device +} WritePolicy; + +typedef enum { + ZONE_TYPE_ADMIN, + ZONE_TYPE_JOURNAL, + ZONE_TYPE_LOGICAL, + ZONE_TYPE_PHYSICAL, +} ZoneType; + +/** + * A position in the block map where a block map entry is stored. + **/ +typedef struct { + PhysicalBlockNumber pbn; + SlotNumber slot; +} BlockMapSlot; + +/** + * A position in the arboreal block map at a specific level. + **/ +typedef struct { + PageNumber pageIndex; + BlockMapSlot blockMapSlot; +} BlockMapTreeSlot; + +/** + * The configuration of a single slab derived from the configured block size + * and slab size. + **/ +typedef struct slabConfig { + BlockCount slabBlocks; ///< total number of blocks in the slab + BlockCount dataBlocks; ///< number of blocks available for data + BlockCount referenceCountBlocks; ///< number of blocks for refCounts + BlockCount slabJournalBlocks; ///< number of blocks for the slab journal + /** + * Number of blocks after which the slab journal starts pushing out a + * ReferenceBlock for each new entry it receives. + **/ + BlockCount slabJournalFlushingThreshold; + /** + * Number of blocks after which the slab journal pushes out all + * ReferenceBlocks and makes all VIOs wait. + **/ + BlockCount slabJournalBlockingThreshold; + /** + * Number of blocks after which the slab must be scrubbed before coming + * online. + **/ + BlockCount slabJournalScrubbingThreshold; +} __attribute__((packed)) SlabConfig; + +/** + * The configuration of the VDO service. + **/ +typedef struct vdoConfig { + BlockCount logicalBlocks; ///< number of logical blocks + BlockCount physicalBlocks; ///< number of physical blocks + BlockCount slabSize; ///< number of blocks in a slab + BlockCount recoveryJournalSize; ///< number of recovery journal blocks + BlockCount slabJournalBlocks; ///< number of slab journal blocks +} __attribute__((packed)) VDOConfig; + +/** + * The configuration parameters of the VDO service specified at load time. 
+ **/ +typedef struct vdoLoadConfig { + /** the offset on the physical layer where the VDO begins */ + PhysicalBlockNumber firstBlockOffset; + /** the expected release version number of the VDO */ + ReleaseVersionNumber releaseVersion; + /** the expected nonce of the VDO */ + Nonce nonce; + /** the thread configuration of the VDO */ + ThreadConfig *threadConfig; + /** the page cache size, in pages */ + PageCount cacheSize; + /** whether writes are synchronous */ + WritePolicy writePolicy; + /** the maximum age of a dirty block map page in recovery journal blocks */ + BlockCount maximumAge; +} VDOLoadConfig; + +/** + * Forward declarations of abstract types + **/ +typedef struct actionManager ActionManager; +typedef struct allocatingVIO AllocatingVIO; +typedef struct allocationSelector AllocationSelector; +typedef struct blockAllocator BlockAllocator; +typedef struct blockMap BlockMap; +typedef struct blockMapTreeZone BlockMapTreeZone; +typedef struct blockMapZone BlockMapZone; +typedef struct dataVIO DataVIO; +typedef struct flusher Flusher; +typedef struct forest Forest; +typedef struct hashLock HashLock; +typedef struct hashZone HashZone; +typedef struct indexConfig IndexConfig; +typedef struct inputBin InputBin; +typedef struct lbnLock LBNLock; +typedef struct lockCounter LockCounter; +typedef struct logicalZone LogicalZone; +typedef struct logicalZones LogicalZones; +typedef struct pbnLock PBNLock; +typedef struct physicalLayer PhysicalLayer; +typedef struct physicalZone PhysicalZone; +typedef struct recoveryJournal RecoveryJournal; +typedef struct readOnlyNotifier ReadOnlyNotifier; +typedef struct refCounts RefCounts; +typedef struct vdoSlab Slab; +typedef struct slabDepot SlabDepot; +typedef struct slabJournal SlabJournal; +typedef struct slabJournalEntry SlabJournalEntry; +typedef struct slabScrubber SlabScrubber; +typedef struct slabSummary SlabSummary; +typedef struct slabSummaryZone SlabSummaryZone; +typedef struct vdo VDO; +typedef struct vdoCompletion VDOCompletion; +typedef struct vdoExtent VDOExtent; +typedef struct vdoFlush VDOFlush; +typedef struct vdoLayout VDOLayout; +typedef struct vdoStatistics VDOStatistics; +typedef struct vio VIO; +typedef struct vioPool VIOPool; + +typedef struct { + PhysicalBlockNumber pbn; + BlockMappingState state; +} DataLocation; + +typedef struct { + PhysicalBlockNumber pbn; + BlockMappingState state; + PhysicalZone *zone; +} ZonedPBN; + +/** + * Callback which will be called by the VDO when all of the VIOs in the + * extent have been processed. + * + * @param extent The extent which is complete + **/ +typedef void VDOExtentCallback(VDOExtent *extent); + +/** + * An asynchronous operation. + * + * @param vio The VIO on which to operate + **/ +typedef void AsyncOperation(VIO *vio); + +/** + * An asynchronous compressed write operation. + * + * @param allocatingVIO The AllocatingVIO to write + **/ +typedef void CompressedWriter(AllocatingVIO *allocatingVIO); + +/** + * An asynchronous data operation. + * + * @param dataVIO The DataVIO on which to operate + **/ +typedef void AsyncDataOperation(DataVIO *dataVIO); + +/** + * A reference to a completion which (the reference) can be enqueued + * for completion on a specified thread. + **/ +typedef struct enqueueable { + VDOCompletion *completion; +} Enqueueable; + +#endif // TYPES_H diff --git a/source/vdo/base/upgrade.c b/source/vdo/base/upgrade.c new file mode 100644 index 0000000..4d58d6f --- /dev/null +++ b/source/vdo/base/upgrade.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/upgrade.c#6 $ + */ + +#include "upgrade.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +#include "blockMap.h" +#include "readOnlyNotifier.h" +#include "recoveryJournal.h" +#include "releaseVersions.h" +#include "slabDepot.h" +#include "statusCodes.h" +#include "superBlock.h" +#include "vdoInternal.h" +#include "volumeGeometry.h" + +/* The latest supported Sodium version */ +/* Commented out because not currently used. + * static const VersionNumber SODIUM_MASTER_VERSION_67_0 = { + * .majorVersion = 67, + * .minorVersion = 0, + * }; + */ + +/* The component data version for current Sodium */ +static const VersionNumber SODIUM_COMPONENT_DATA_41_0 = { + .majorVersion = 41, + .minorVersion = 0, +}; + +/** + * Current Sodium's configuration of the VDO component. + **/ +typedef struct { + VDOState state; + uint64_t completeRecoveries; + uint64_t readOnlyRecoveries; + VDOConfig config; + Nonce nonce; +} __attribute__((packed)) SodiumComponent41_0; + +/** + * Checks whether the release version loaded in the superblock is the + * current VDO version. + * + * @param vdo The VDO to validate + * + * @return true if the release version number is the current version + **/ +static bool isCurrentReleaseVersion(VDO *vdo) +{ + ReleaseVersionNumber loadedVersion + = getLoadedReleaseVersion(vdo->superBlock); + + return (loadedVersion == CURRENT_RELEASE_VERSION_NUMBER); +} + +/** + * Loads the VDO master version into the VDO and checks that the version + * can be understood by VDO. + * + * @param vdo The VDO to validate + * + * @return VDO_SUCCESS or an error if the loaded version is not supported + **/ +static int validateSodiumVersion(VDO *vdo) +{ + int result = decodeVDOVersion(vdo); + if (result != VDO_SUCCESS) { + return result; + } + + if (isCurrentReleaseVersion(vdo)) { + return VDO_SUCCESS; + } + + ReleaseVersionNumber loadedVersion + = getLoadedReleaseVersion(vdo->superBlock); + return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, + "Release version %d, load version %d.%d" + " cannot be upgraded", loadedVersion, + vdo->loadVersion.majorVersion, + vdo->loadVersion.minorVersion); +} + +/** + * Decode a SodiumComponent41_0. + * + * @param buffer The component data buffer + * @param component The component structure to decode into + * + * @return VDO_SUCCESS or an error code + **/ +static int decodeSodium41_0Component(Buffer *buffer, + SodiumComponent41_0 *component) +{ + return getBytesFromBuffer(buffer, sizeof(*component), component); +} + +/** + * Decode the component data for the VDO itself from the component data + * buffer in the super block. 
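+ * Only component data written in the Sodium format (version 41.0) is
+ * accepted; any other version is rejected as VDO_UNSUPPORTED_VERSION.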
+ * + * @param vdo The VDO to decode + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int decodeSodiumComponent(VDO *vdo) +{ + Buffer *buffer = getComponentBuffer(vdo->superBlock); + VersionNumber version; + int result = decodeVersionNumber(buffer, &version); + if (result != VDO_SUCCESS) { + return result; + } + + SodiumComponent41_0 component; + if (areSameVersion(SODIUM_COMPONENT_DATA_41_0, version)) { + result = decodeSodium41_0Component(buffer, &component); + } else { + return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, + "VDO component data version mismatch," + " expected 41.0, got %d.%d", + version.majorVersion, + version.minorVersion); + } + if (result != VDO_SUCCESS) { + return result; + } + + // Copy the decoded component into the VDO structure. + vdo->state = component.state; + vdo->loadState = component.state; + vdo->completeRecoveries = component.completeRecoveries; + vdo->readOnlyRecoveries = component.readOnlyRecoveries; + vdo->config = component.config; + vdo->nonce = component.nonce; + + logInfo("Converted VDO component data version %d.%d", + version.majorVersion, version.minorVersion); + return VDO_SUCCESS; +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int finishSodiumDecode(VDO *vdo) +{ + Buffer *buffer = getComponentBuffer(vdo->superBlock); + const ThreadConfig *threadConfig = getThreadConfig(vdo); + int result = makeRecoveryJournal(vdo->nonce, vdo->layer, + getVDOPartition(vdo->layout, + RECOVERY_JOURNAL_PARTITION), + vdo->completeRecoveries, + vdo->config.recoveryJournalSize, + RECOVERY_JOURNAL_TAIL_BUFFER_SIZE, + vdo->readOnlyNotifier, threadConfig, + &vdo->recoveryJournal); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeSodiumRecoveryJournal(vdo->recoveryJournal, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeSodiumSlabDepot(buffer, threadConfig, vdo->nonce, vdo->layer, + getVDOPartition(vdo->layout, + SLAB_SUMMARY_PARTITION), + vdo->readOnlyNotifier, vdo->recoveryJournal, + &vdo->depot); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeSodiumBlockMap(buffer, vdo->config.logicalBlocks, + threadConfig, &vdo->blockMap); + if (result != VDO_SUCCESS) { + return result; + } + + ASSERT_LOG_ONLY((contentLength(buffer) == 0), + "All decoded component data was used"); + return VDO_SUCCESS; +} + +/**********************************************************************/ +int upgradePriorVDO(PhysicalLayer *layer) +{ + VolumeGeometry geometry; + int result = loadVolumeGeometry(layer, &geometry); + if (result != VDO_SUCCESS) { + return result; + } + + VDO *vdo; + result = makeVDO(layer, &vdo); + if (result != VDO_SUCCESS) { + return result; + } + + result = loadSuperBlock(vdo->layer, getDataRegionOffset(geometry), + &vdo->superBlock); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return logErrorWithStringError(result, "Could not load VDO super block"); + } + + // Load the necessary pieces to save again. + result = validateSodiumVersion(vdo); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + if (isCurrentReleaseVersion(vdo)) { + logInfo("VDO already up-to-date"); + freeVDO(&vdo); + return VDO_SUCCESS; + } + + result = decodeSodiumComponent(vdo); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + if (requiresRebuild(vdo)) { + // Do not attempt to upgrade a dirty prior version. 
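+    // Refuse the upgrade rather than convert a VDO which still requires
+    // recovery or a rebuild.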
+ freeVDO(&vdo); + return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, + "Cannot upgrade a dirty VDO."); + } + + result = decodeVDOLayout(getComponentBuffer(vdo->superBlock), &vdo->layout); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + const ThreadConfig *threadConfig = getThreadConfig(vdo); + result = makeReadOnlyNotifier(inReadOnlyMode(vdo), threadConfig, vdo->layer, + &vdo->readOnlyNotifier); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + result = finishSodiumDecode(vdo); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + // Saving will automatically change the release version to current. + result = saveVDOComponents(vdo); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + logInfo("Successfully saved upgraded VDO"); + freeVDO(&vdo); + + return result; +} diff --git a/source/vdo/base/upgrade.h b/source/vdo/base/upgrade.h new file mode 100644 index 0000000..be2bd05 --- /dev/null +++ b/source/vdo/base/upgrade.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/upgrade.h#1 $ + */ + +#ifndef UPGRADE_H +#define UPGRADE_H + +#include "types.h" + +/** + * Reconfigure the superblock of a prior VDO, preparing it for upgrading. + * + * @param layer The layer with a VDO to prepare + * + * @return VDO_SUCCESS or an error + **/ +int upgradePriorVDO(PhysicalLayer *layer) + __attribute__((warn_unused_result)); + +#endif /* UPGRADE_H */ diff --git a/source/vdo/base/vdo.c b/source/vdo/base/vdo.c new file mode 100644 index 0000000..b4b9a41 --- /dev/null +++ b/source/vdo/base/vdo.c @@ -0,0 +1,1154 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdo.c#21 $ + */ + +/* + * This file contains the main entry points for normal operations on a VDO as + * well as functions for constructing and destroying VDO instances (in memory). 
+ */ + +#include "vdoInternal.h" + +#include "buffer.h" +#include "logger.h" +#include "memoryAlloc.h" + +#include "adminCompletion.h" +#include "blockMap.h" +#include "extent.h" +#include "hashZone.h" +#include "header.h" +#include "logicalZone.h" +#include "numUtils.h" +#include "packer.h" +#include "physicalZone.h" +#include "readOnlyNotifier.h" +#include "recoveryJournal.h" +#include "releaseVersions.h" +#include "slabDepot.h" +#include "slabSummary.h" +#include "statistics.h" +#include "statusCodes.h" +#include "threadConfig.h" +#include "vdoLayout.h" +#include "vioWrite.h" +#include "volumeGeometry.h" + +/** + * The master version of the on-disk format of a VDO. This should be + * incremented any time the on-disk representation of any VDO structure + * changes. Changes which require only online upgrade steps should increment + * the minor version. Changes which require an offline upgrade or which can not + * be upgraded to at all should increment the major version and set the minor + * version to 0. + **/ +static const VersionNumber VDO_MASTER_VERSION_67_0 = { + .majorVersion = 67, + .minorVersion = 0, +}; + +/** + * The current version for the data encoded in the super block. This must + * be changed any time there is a change to encoding of the component data + * of any VDO component. + **/ +static const VersionNumber VDO_COMPONENT_DATA_41_0 = { + .majorVersion = 41, + .minorVersion = 0, +}; + +/** + * This is the structure that captures the VDO fields saved as a SuperBlock + * component. + **/ +typedef struct { + VDOState state; + uint64_t completeRecoveries; + uint64_t readOnlyRecoveries; + VDOConfig config; + Nonce nonce; +} __attribute__((packed)) VDOComponent41_0; + +/**********************************************************************/ +int allocateVDO(PhysicalLayer *layer, VDO **vdoPtr) +{ + int result = registerStatusCodes(); + if (result != VDO_SUCCESS) { + return result; + } + + VDO *vdo; + result = ALLOCATE(1, VDO, __func__, &vdo); + if (result != UDS_SUCCESS) { + return result; + } + + vdo->layer = layer; + if (layer->createEnqueueable != NULL) { + result = initializeAdminCompletion(vdo, &vdo->adminCompletion); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + } + + *vdoPtr = vdo; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeVDO(PhysicalLayer *layer, VDO **vdoPtr) +{ + VDO *vdo; + int result = allocateVDO(layer, &vdo); + if (result != VDO_SUCCESS) { + return result; + } + + result = makeZeroThreadConfig(&vdo->loadConfig.threadConfig); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + *vdoPtr = vdo; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void destroyVDO(VDO *vdo) +{ + freeFlusher(&vdo->flusher); + freePacker(&vdo->packer); + freeRecoveryJournal(&vdo->recoveryJournal); + freeSlabDepot(&vdo->depot); + freeVDOLayout(&vdo->layout); + freeSuperBlock(&vdo->superBlock); + freeBlockMap(&vdo->blockMap); + + const ThreadConfig *threadConfig = getThreadConfig(vdo); + if (vdo->hashZones != NULL) { + for (ZoneCount zone = 0; zone < threadConfig->hashZoneCount; zone++) { + freeHashZone(&vdo->hashZones[zone]); + } + } + FREE(vdo->hashZones); + vdo->hashZones = NULL; + + freeLogicalZones(&vdo->logicalZones); + + if (vdo->physicalZones != NULL) { + for (ZoneCount zone = 0; zone < threadConfig->physicalZoneCount; zone++) { + freePhysicalZone(&vdo->physicalZones[zone]); + } + } + FREE(vdo->physicalZones); + 
vdo->physicalZones = NULL; + + uninitializeAdminCompletion(&vdo->adminCompletion); + freeReadOnlyNotifier(&vdo->readOnlyNotifier); + freeThreadConfig(&vdo->loadConfig.threadConfig); +} + +/**********************************************************************/ +void freeVDO(VDO **vdoPtr) +{ + if (*vdoPtr == NULL) { + return; + } + + destroyVDO(*vdoPtr); + FREE(*vdoPtr); + *vdoPtr = NULL; +} + +/**********************************************************************/ +size_t getComponentDataSize(VDO *vdo) +{ + return (sizeof(VersionNumber) + + sizeof(VersionNumber) + + sizeof(VDOComponent41_0) + + getVDOLayoutEncodedSize(vdo->layout) + + getRecoveryJournalEncodedSize() + + getSlabDepotEncodedSize() + + getBlockMapEncodedSize()); +} + +/** + * Encode the VDO master version. + * + * @param buffer The buffer in which to encode the version + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int encodeMasterVersion(Buffer *buffer) +{ + return encodeVersionNumber(VDO_MASTER_VERSION_67_0, buffer); +} + +/** + * Encode a VDOConfig structure into a buffer. + * + * @param config The config structure to encode + * @param buffer A buffer positioned at the start of the encoding + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int encodeVDOConfig(const VDOConfig *config, Buffer *buffer) +{ + int result = putUInt64LEIntoBuffer(buffer, config->logicalBlocks); + if (result != VDO_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, config->physicalBlocks); + if (result != VDO_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, config->slabSize); + if (result != VDO_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, config->recoveryJournalSize); + if (result != VDO_SUCCESS) { + return result; + } + + return putUInt64LEIntoBuffer(buffer, config->slabJournalBlocks); +} + +/** + * Encode the component data for the VDO itself. 
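+ * The encoding is the component data version (41.0) followed by the fields
+ * of VDOComponent41_0, each written in little-endian byte order.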
+ * + * @param vdo The vdo to encode + * @param buffer The buffer in which to encode the VDO + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int encodeVDOComponent(const VDO *vdo, Buffer *buffer) +{ + int result = encodeVersionNumber(VDO_COMPONENT_DATA_41_0, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + size_t initialLength = contentLength(buffer); + + result = putUInt32LEIntoBuffer(buffer, vdo->state); + if (result != VDO_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, vdo->completeRecoveries); + if (result != VDO_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, vdo->readOnlyRecoveries); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeVDOConfig(&vdo->config, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, vdo->nonce); + if (result != VDO_SUCCESS) { + return result; + } + + size_t encodedSize = contentLength(buffer) - initialLength; + return ASSERT(encodedSize == sizeof(VDOComponent41_0), + "encoded VDO component size must match structure size"); +} + +/**********************************************************************/ +static int encodeVDO(VDO *vdo) +{ + Buffer *buffer = getComponentBuffer(vdo->superBlock); + int result = resetBufferEnd(buffer, 0); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeMasterVersion(buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeVDOComponent(vdo, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeVDOLayout(vdo->layout, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeRecoveryJournal(vdo->recoveryJournal, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeSlabDepot(vdo->depot, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeBlockMap(vdo->blockMap, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + ASSERT_LOG_ONLY((contentLength(buffer) == getComponentDataSize(vdo)), + "All super block component data was encoded"); + return VDO_SUCCESS; +} + +/**********************************************************************/ +int saveVDOComponents(VDO *vdo) +{ + int result = encodeVDO(vdo); + if (result != VDO_SUCCESS) { + return result; + } + + return saveSuperBlock(vdo->layer, vdo->superBlock, getFirstBlockOffset(vdo)); +} + +/**********************************************************************/ +void saveVDOComponentsAsync(VDO *vdo, VDOCompletion *parent) +{ + int result = encodeVDO(vdo); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + saveSuperBlockAsync(vdo->superBlock, getFirstBlockOffset(vdo), parent); +} + +/**********************************************************************/ +int saveReconfiguredVDO(VDO *vdo) +{ + Buffer *buffer = getComponentBuffer(vdo->superBlock); + size_t componentsSize = contentLength(buffer); + + byte *components; + int result = copyBytes(buffer, componentsSize, &components); + if (result != VDO_SUCCESS) { + return result; + } + + result = resetBufferEnd(buffer, 0); + if (result != VDO_SUCCESS) { + FREE(components); + return result; + } + + result = encodeMasterVersion(buffer); + if (result != VDO_SUCCESS) { + FREE(components); + return result; + } + + result = encodeVDOComponent(vdo, buffer); + if (result != VDO_SUCCESS) { + FREE(components); + return result; + } + + result = putBytes(buffer, componentsSize, 
components); + FREE(components); + if (result != VDO_SUCCESS) { + return result; + } + + return saveSuperBlock(vdo->layer, vdo->superBlock, getFirstBlockOffset(vdo)); +} + +/**********************************************************************/ +int decodeVDOVersion(VDO *vdo) +{ + return decodeVersionNumber(getComponentBuffer(vdo->superBlock), + &vdo->loadVersion); +} + +/**********************************************************************/ +int validateVDOVersion(VDO *vdo) +{ + int result = decodeVDOVersion(vdo); + if (result != VDO_SUCCESS) { + return result; + } + + ReleaseVersionNumber loadedReleaseVersion + = getLoadedReleaseVersion(vdo->superBlock); + if (vdo->loadConfig.releaseVersion != loadedReleaseVersion) { + return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, + "Geometry release version %" PRIu32 " does " + "not match super block release version %" + PRIu32, + vdo->loadConfig.releaseVersion, + loadedReleaseVersion); + } + + return validateVersion(VDO_MASTER_VERSION_67_0, vdo->loadVersion, "master"); +} + +/** + * Decode a VDOConfig structure from a buffer. + * + * @param buffer A buffer positioned at the start of the encoding + * @param config The config structure to receive the decoded values + * + * @return UDS_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int decodeVDOConfig(Buffer *buffer, VDOConfig *config) +{ + BlockCount logicalBlocks; + int result = getUInt64LEFromBuffer(buffer, &logicalBlocks); + if (result != VDO_SUCCESS) { + return result; + } + + BlockCount physicalBlocks; + result = getUInt64LEFromBuffer(buffer, &physicalBlocks); + if (result != VDO_SUCCESS) { + return result; + } + + BlockCount slabSize; + result = getUInt64LEFromBuffer(buffer, &slabSize); + if (result != VDO_SUCCESS) { + return result; + } + + BlockCount recoveryJournalSize; + result = getUInt64LEFromBuffer(buffer, &recoveryJournalSize); + if (result != VDO_SUCCESS) { + return result; + } + + BlockCount slabJournalBlocks; + result = getUInt64LEFromBuffer(buffer, &slabJournalBlocks); + if (result != VDO_SUCCESS) { + return result; + } + + *config = (VDOConfig) { + .logicalBlocks = logicalBlocks, + .physicalBlocks = physicalBlocks, + .slabSize = slabSize, + .recoveryJournalSize = recoveryJournalSize, + .slabJournalBlocks = slabJournalBlocks, + }; + return VDO_SUCCESS; +} + +/** + * Decode the version 41.0 component state for the VDO itself from a buffer. 
+ * + * @param buffer A buffer positioned at the start of the encoding + * @param state The state structure to receive the decoded values + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) + static int decodeVDOComponent_41_0(Buffer *buffer, VDOComponent41_0 *state) +{ + size_t initialLength = contentLength(buffer); + + VDOState vdoState; + int result = getUInt32LEFromBuffer(buffer, &vdoState); + if (result != VDO_SUCCESS) { + return result; + } + + uint64_t completeRecoveries; + result = getUInt64LEFromBuffer(buffer, &completeRecoveries); + if (result != VDO_SUCCESS) { + return result; + } + + uint64_t readOnlyRecoveries; + result = getUInt64LEFromBuffer(buffer, &readOnlyRecoveries); + if (result != VDO_SUCCESS) { + return result; + } + + VDOConfig config; + result = decodeVDOConfig(buffer, &config); + if (result != VDO_SUCCESS) { + return result; + } + + Nonce nonce; + result = getUInt64LEFromBuffer(buffer, &nonce); + if (result != VDO_SUCCESS) { + return result; + } + + *state = (VDOComponent41_0) { + .state = vdoState, + .completeRecoveries = completeRecoveries, + .readOnlyRecoveries = readOnlyRecoveries, + .config = config, + .nonce = nonce, + }; + + size_t decodedSize = initialLength - contentLength(buffer); + return ASSERT(decodedSize == sizeof(VDOComponent41_0), + "decoded VDO component size must match structure size"); +} + +/**********************************************************************/ +int decodeVDOComponent(VDO *vdo) +{ + Buffer *buffer = getComponentBuffer(vdo->superBlock); + + VersionNumber version; + int result = decodeVersionNumber(buffer, &version); + if (result != VDO_SUCCESS) { + return result; + } + + result = validateVersion(version, VDO_COMPONENT_DATA_41_0, + "VDO component data"); + if (result != VDO_SUCCESS) { + return result; + } + + VDOComponent41_0 component; + result = decodeVDOComponent_41_0(buffer, &component); + if (result != VDO_SUCCESS) { + return result; + } + + // Copy the decoded component into the VDO structure. 
+ vdo->state = component.state; + vdo->loadState = component.state; + vdo->completeRecoveries = component.completeRecoveries; + vdo->readOnlyRecoveries = component.readOnlyRecoveries; + vdo->config = component.config; + vdo->nonce = component.nonce; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int validateVDOConfig(const VDOConfig *config, + BlockCount blockCount, + bool requireLogical) +{ + int result = ASSERT(config->slabSize > 0, "slab size unspecified"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(isPowerOfTwo(config->slabSize), + "slab size must be a power of two"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(config->slabSize <= (1 << MAX_SLAB_BITS), + "slab size must be less than or equal to 2^%d", + MAX_SLAB_BITS); + if (result != VDO_SUCCESS) { + return result; + } + + result = ASSERT(config->slabJournalBlocks >= MINIMUM_SLAB_JOURNAL_BLOCKS, + "slab journal size meets minimum size"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(config->slabJournalBlocks <= config->slabSize, + "slab journal size is within expected bound"); + if (result != UDS_SUCCESS) { + return result; + } + + SlabConfig slabConfig; + result = configureSlab(config->slabSize, config->slabJournalBlocks, + &slabConfig); + if (result != VDO_SUCCESS) { + return result; + } + + result = ASSERT((slabConfig.dataBlocks >= 1), + "slab must be able to hold at least one block"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(config->physicalBlocks > 0, "physical blocks unspecified"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(config->physicalBlocks <= MAXIMUM_PHYSICAL_BLOCKS, + "physical block count %llu exceeds maximum %llu", + config->physicalBlocks, MAXIMUM_PHYSICAL_BLOCKS); + if (result != UDS_SUCCESS) { + return VDO_OUT_OF_RANGE; + } + + // This can't check equality because FileLayer et al can only known about + // the storage size, which may not match the super block size. + if (blockCount < config->physicalBlocks) { + logError("A physical size of %llu blocks was specified," + " but that is smaller than the %llu blocks" + " configured in the VDO super block", + blockCount, config->physicalBlocks); + return VDO_PARAMETER_MISMATCH; + } + + result = ASSERT(!requireLogical || (config->logicalBlocks > 0), + "logical blocks unspecified"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(config->logicalBlocks <= MAXIMUM_LOGICAL_BLOCKS, + "logical blocks too large"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(config->recoveryJournalSize > 0, + "recovery journal size unspecified"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(isPowerOfTwo(config->recoveryJournalSize), + "recovery journal size must be a power of two"); + if (result != UDS_SUCCESS) { + return result; + } + + return result; +} + +/** + * Notify a VDO that it is going read-only. This will save the read-only state + * to the super block. + * + *
Implements ReadOnlyNotification. + * + * @param listener The VDO + * @param parent The completion to notify in order to acknowledge the + * notification + **/ +static void notifyVDOOfReadOnlyMode(void *listener, VDOCompletion *parent) +{ + VDO *vdo = listener; + if (inReadOnlyMode(vdo)) { + completeCompletion(parent); + } + + vdo->state = VDO_READ_ONLY_MODE; + saveVDOComponentsAsync(vdo, parent); +} + +/**********************************************************************/ +int enableReadOnlyEntry(VDO *vdo) +{ + return registerReadOnlyListener(vdo->readOnlyNotifier, vdo, + notifyVDOOfReadOnlyMode, + getAdminThread(getThreadConfig(vdo))); +} + +/**********************************************************************/ +bool inReadOnlyMode(const VDO *vdo) +{ + return (vdo->state == VDO_READ_ONLY_MODE); +} + +/**********************************************************************/ +bool isClean(const VDO *vdo) +{ + return ((vdo->state == VDO_CLEAN) || (vdo->state == VDO_NEW)); +} + +/**********************************************************************/ +bool wasClean(const VDO *vdo) +{ + return ((vdo->loadState == VDO_CLEAN) || (vdo->loadState == VDO_NEW)); +} + +/**********************************************************************/ +bool wasNew(const VDO *vdo) +{ + return (vdo->loadState == VDO_NEW); +} + +/**********************************************************************/ +bool requiresReadOnlyRebuild(const VDO *vdo) +{ + return ((vdo->loadState == VDO_FORCE_REBUILD) + || (vdo->loadState == VDO_REBUILD_FOR_UPGRADE)); +} + +/**********************************************************************/ +bool requiresRebuild(const VDO *vdo) +{ + return ((vdo->state == VDO_DIRTY) + || (vdo->state == VDO_FORCE_REBUILD) + || (vdo->state == VDO_REPLAYING) + || (vdo->state == VDO_REBUILD_FOR_UPGRADE)); +} + +/**********************************************************************/ +bool requiresRecovery(const VDO *vdo) +{ + return ((vdo->loadState == VDO_DIRTY) || (vdo->loadState == VDO_REPLAYING) + || (vdo->loadState == VDO_RECOVERING)); +} + +/**********************************************************************/ +bool isReplaying(const VDO *vdo) +{ + return (vdo->state == VDO_REPLAYING); +} + +/**********************************************************************/ +bool inRecoveryMode(const VDO *vdo) +{ + return (vdo->state == VDO_RECOVERING); +} + +/**********************************************************************/ +void enterRecoveryMode(VDO *vdo) +{ + assertOnAdminThread(vdo, __func__); + + if (inReadOnlyMode(vdo)) { + return; + } + + logInfo("Entering recovery mode"); + vdo->state = VDO_RECOVERING; +} + +/**********************************************************************/ +void leaveRecoveryMode(VDO *vdo) +{ + assertOnAdminThread(vdo, __func__); + + /* + * Since scrubbing can be stopped by vdoClose during recovery mode, + * do not change the VDO state if there are outstanding unrecovered slabs. 
+ */ + if (inReadOnlyMode(vdo)) { + return; + } + + ASSERT_LOG_ONLY(inRecoveryMode(vdo), "VDO is in recovery mode"); + logInfo("Exiting recovery mode"); + vdo->state = VDO_DIRTY; +} + +/**********************************************************************/ +void makeVDOReadOnly(VDO *vdo, int errorCode) +{ + enterReadOnlyMode(vdo->readOnlyNotifier, errorCode); +} + +/**********************************************************************/ +bool setVDOCompressing(VDO *vdo, bool enableCompression) +{ + bool stateChanged = compareAndSwapBool(&vdo->compressing, !enableCompression, + enableCompression); + if (stateChanged && !enableCompression) { + // Flushing the packer is asynchronous, but we don't care when it + // finishes. + flushPacker(vdo->packer); + } + + logInfo("compression is %s", (enableCompression ? "enabled" : "disabled")); + return (stateChanged ? !enableCompression : enableCompression); +} + +/**********************************************************************/ +bool getVDOCompressing(VDO *vdo) +{ + return atomicLoadBool(&vdo->compressing); +} + +/**********************************************************************/ +static size_t getBlockMapCacheSize(const VDO *vdo) +{ + return ((size_t) vdo->loadConfig.cacheSize) * VDO_BLOCK_SIZE; +} + +/** + * Tally the hash lock statistics from all the hash zones. + * + * @param vdo The vdo to query + * + * @return The sum of the hash lock statistics from all hash zones + **/ +static HashLockStatistics getHashLockStatistics(const VDO *vdo) +{ + HashLockStatistics totals; + memset(&totals, 0, sizeof(totals)); + + const ThreadConfig *threadConfig = getThreadConfig(vdo); + for (ZoneCount zone = 0; zone < threadConfig->hashZoneCount; zone++) { + HashLockStatistics stats = getHashZoneStatistics(vdo->hashZones[zone]); + totals.dedupeAdviceValid += stats.dedupeAdviceValid; + totals.dedupeAdviceStale += stats.dedupeAdviceStale; + totals.concurrentDataMatches += stats.concurrentDataMatches; + totals.concurrentHashCollisions += stats.concurrentHashCollisions; + } + + return totals; +} + +/** + * Get the current error statistics from VDO. + * + * @param vdo The vdo to query + * + * @return a copy of the current VDO error counters + **/ +static ErrorStatistics getVDOErrorStatistics(const VDO *vdo) +{ + /* + * The error counts can be incremented from arbitrary threads and so must be + * incremented atomically, but they are just statistics with no semantics + * that could rely on memory order, so unfenced reads are sufficient. + */ + const AtomicErrorStatistics *atoms = &vdo->errorStats; + return (ErrorStatistics) { + .invalidAdvicePBNCount = relaxedLoad64(&atoms->invalidAdvicePBNCount), + .noSpaceErrorCount = relaxedLoad64(&atoms->noSpaceErrorCount), + .readOnlyErrorCount = relaxedLoad64(&atoms->readOnlyErrorCount), + }; +} + +/**********************************************************************/ +static const char *describeWritePolicy(WritePolicy policy) +{ + switch (policy) { + case WRITE_POLICY_ASYNC: + return "async"; + case WRITE_POLICY_ASYNC_UNSAFE: + return "async-unsafe"; + case WRITE_POLICY_SYNC: + return "sync"; + default: + return "unknown"; + } +} + +/**********************************************************************/ +void getVDOStatistics(const VDO *vdo, VDOStatistics *stats) +{ + // These are immutable properties of the VDO object, so it is safe to + // query them from any thread. 
+ RecoveryJournal *journal = vdo->recoveryJournal; + SlabDepot *depot = vdo->depot; + // XXX config.physicalBlocks is actually mutated during resize and is in a + // packed structure, but resize runs on the admin thread so we're usually OK. + stats->version = STATISTICS_VERSION; + stats->releaseVersion = CURRENT_RELEASE_VERSION_NUMBER; + stats->logicalBlocks = vdo->config.logicalBlocks; + stats->physicalBlocks = vdo->config.physicalBlocks; + stats->blockSize = VDO_BLOCK_SIZE; + stats->completeRecoveries = vdo->completeRecoveries; + stats->readOnlyRecoveries = vdo->readOnlyRecoveries; + stats->blockMapCacheSize = getBlockMapCacheSize(vdo); + snprintf(stats->writePolicy, sizeof(stats->writePolicy), "%s", + describeWritePolicy(getWritePolicy(vdo))); + + // The callees are responsible for thread-safety. + stats->dataBlocksUsed = getPhysicalBlocksAllocated(vdo); + stats->overheadBlocksUsed = getPhysicalBlocksOverhead(vdo); + stats->logicalBlocksUsed = getJournalLogicalBlocksUsed(journal); + stats->allocator = getDepotBlockAllocatorStatistics(depot); + stats->journal = getRecoveryJournalStatistics(journal); + stats->packer = getPackerStatistics(vdo->packer); + stats->slabJournal = getDepotSlabJournalStatistics(depot); + stats->slabSummary = getSlabSummaryStatistics(getSlabSummary(depot)); + stats->refCounts = getDepotRefCountsStatistics(depot); + stats->blockMap = getBlockMapStatistics(vdo->blockMap); + stats->hashLock = getHashLockStatistics(vdo); + stats->errors = getVDOErrorStatistics(vdo); + SlabCount slabTotal = getDepotSlabCount(depot); + stats->recoveryPercentage + = (slabTotal - getDepotUnrecoveredSlabCount(depot)) * 100 / slabTotal; + + // The "state" field is mutable, but we just need a unfenced atomic read. + VDOState state = *((const volatile VDOState *) &vdo->state); + stats->inRecoveryMode = (state == VDO_RECOVERING); + snprintf(stats->mode, sizeof(stats->mode), "%s", describeVDOState(state)); +} + +/**********************************************************************/ +BlockCount getPhysicalBlocksAllocated(const VDO *vdo) +{ + return (getDepotAllocatedBlocks(vdo->depot) + - getJournalBlockMapDataBlocksUsed(vdo->recoveryJournal)); +} + +/**********************************************************************/ +BlockCount getPhysicalBlocksFree(const VDO *vdo) +{ + return getDepotFreeBlocks(vdo->depot); +} + +/**********************************************************************/ +BlockCount getPhysicalBlocksOverhead(const VDO *vdo) +{ + // XXX config.physicalBlocks is actually mutated during resize and is in a + // packed structure, but resize runs on admin thread so we're usually OK. 
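+  //
+  // Worked example of the formula below, with purely illustrative numbers:
+  // if config.physicalBlocks is 1000000, the depot exposes 980000 data
+  // blocks, and 150 block map pages have been allocated through the recovery
+  // journal, then the overhead reported here is
+  // 1000000 - 980000 + 150 = 20150 blocks.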
+ return (vdo->config.physicalBlocks + - getDepotDataBlocks(vdo->depot) + + getJournalBlockMapDataBlocksUsed(vdo->recoveryJournal)); +} + +/**********************************************************************/ +BlockCount getTotalBlockMapBlocks(const VDO *vdo) +{ + return (getNumberOfFixedBlockMapPages(vdo->blockMap) + + getJournalBlockMapDataBlocksUsed(vdo->recoveryJournal)); +} + +/**********************************************************************/ +WritePolicy getWritePolicy(const VDO *vdo) +{ + return vdo->loadConfig.writePolicy; +} + +/**********************************************************************/ +void setWritePolicy(VDO *vdo, WritePolicy new) +{ + vdo->loadConfig.writePolicy = new; +} + +/**********************************************************************/ +const VDOLoadConfig *getVDOLoadConfig(const VDO *vdo) +{ + return &vdo->loadConfig; +} + +/**********************************************************************/ +const ThreadConfig *getThreadConfig(const VDO *vdo) +{ + return vdo->loadConfig.threadConfig; +} + +/**********************************************************************/ +BlockCount getConfiguredBlockMapMaximumAge(const VDO *vdo) +{ + return vdo->loadConfig.maximumAge; +} + +/**********************************************************************/ +PageCount getConfiguredCacheSize(const VDO *vdo) +{ + return vdo->loadConfig.cacheSize; +} + +/**********************************************************************/ +PhysicalBlockNumber getFirstBlockOffset(const VDO *vdo) +{ + return vdo->loadConfig.firstBlockOffset; +} + +/**********************************************************************/ +BlockMap *getBlockMap(const VDO *vdo) +{ + return vdo->blockMap; +} + +/**********************************************************************/ +SlabDepot *getSlabDepot(VDO *vdo) +{ + return vdo->depot; +} + +/**********************************************************************/ +RecoveryJournal *getRecoveryJournal(VDO *vdo) +{ + return vdo->recoveryJournal; +} + +/**********************************************************************/ +void dumpVDOStatus(const VDO *vdo) +{ + dumpFlusher(vdo->flusher); + dumpRecoveryJournalStatistics(vdo->recoveryJournal); + dumpPacker(vdo->packer); + dumpSlabDepot(vdo->depot); + + const ThreadConfig *threadConfig = getThreadConfig(vdo); + for (ZoneCount zone = 0; zone < threadConfig->logicalZoneCount; zone++) { + dumpLogicalZone(getLogicalZone(vdo->logicalZones, zone)); + } + + for (ZoneCount zone = 0; zone < threadConfig->physicalZoneCount; zone++) { + dumpPhysicalZone(vdo->physicalZones[zone]); + } + + for (ZoneCount zone = 0; zone < threadConfig->hashZoneCount; zone++) { + dumpHashZone(vdo->hashZones[zone]); + } +} + +/**********************************************************************/ +void setVDOTracingFlags(VDO *vdo, bool vioTracing) +{ + vdo->vioTraceRecording = vioTracing; +} + +/**********************************************************************/ +bool vdoVIOTracingEnabled(const VDO *vdo) +{ + return ((vdo != NULL) && vdo->vioTraceRecording); +} + +/**********************************************************************/ +void assertOnAdminThread(VDO *vdo, const char *name) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() + == getAdminThread(getThreadConfig(vdo))), + "%s called on admin thread", name); +} + +/**********************************************************************/ +void assertOnLogicalZoneThread(const VDO *vdo, + ZoneCount logicalZone, + const char *name) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() + 
== getLogicalZoneThread(getThreadConfig(vdo), logicalZone)), + "%s called on logical thread", name); +} + +/**********************************************************************/ +void assertOnPhysicalZoneThread(const VDO *vdo, + ZoneCount physicalZone, + const char *name) +{ + ASSERT_LOG_ONLY((getCallbackThreadID() + == getPhysicalZoneThread(getThreadConfig(vdo), + physicalZone)), + "%s called on physical thread", name); +} + +/**********************************************************************/ +HashZone *selectHashZone(const VDO *vdo, const UdsChunkName *name) +{ + /* + * Use a fragment of the chunk name as a hash code. To ensure uniform + * distributions, it must not overlap with fragments used elsewhere. Eight + * bits of hash should suffice since the number of hash zones is small. + */ + // XXX Make a central repository for these offsets ala hashUtils. + // XXX Verify that the first byte is independent enough. + uint32_t hash = name->name[0]; + + /* + * Scale the 8-bit hash fragment to a zone index by treating it as a binary + * fraction and multiplying that by the zone count. If the hash is uniformly + * distributed over [0 .. 2^8-1], then (hash * count / 2^8) should be + * uniformly distributed over [0 .. count-1]. The multiply and shift is much + * faster than a divide (modulus) on X86 CPUs. + */ + return vdo->hashZones[(hash * getThreadConfig(vdo)->hashZoneCount) >> 8]; +} + +/**********************************************************************/ +int getPhysicalZone(const VDO *vdo, + PhysicalBlockNumber pbn, + PhysicalZone **zonePtr) +{ + if (pbn == ZERO_BLOCK) { + *zonePtr = NULL; + return VDO_SUCCESS; + } + + // Used because it does a more restrictive bounds check than getSlab(), and + // done first because it won't trigger read-only mode on an invalid PBN. + if (!isPhysicalDataBlock(vdo->depot, pbn)) { + return VDO_OUT_OF_RANGE; + } + + // With the PBN already checked, we should always succeed in finding a slab. + Slab *slab = getSlab(vdo->depot, pbn); + int result = ASSERT(slab != NULL, "getSlab must succeed on all valid PBNs"); + if (result != VDO_SUCCESS) { + return result; + } + + *zonePtr = vdo->physicalZones[getSlabZoneNumber(slab)]; + return VDO_SUCCESS; +} + +/**********************************************************************/ +ZonedPBN validateDedupeAdvice(VDO *vdo, + const DataLocation *advice, + LogicalBlockNumber lbn) +{ + ZonedPBN noAdvice = { .pbn = ZERO_BLOCK }; + if (advice == NULL) { + return noAdvice; + } + + // Don't use advice that's clearly meaningless. + if ((advice->state == MAPPING_STATE_UNMAPPED) + || (advice->pbn == ZERO_BLOCK)) { + logDebug("Invalid advice from deduplication server: pbn %llu, " + "state %u. Giving up on deduplication of logical block %llu", + advice->pbn, advice->state, lbn); + atomicAdd64(&vdo->errorStats.invalidAdvicePBNCount, 1); + return noAdvice; + } + + PhysicalZone *zone; + int result = getPhysicalZone(vdo, advice->pbn, &zone); + if ((result != VDO_SUCCESS) || (zone == NULL)) { + logDebug("Invalid physical block number from deduplication server: %" + PRIu64 ", giving up on deduplication of logical block %llu", + advice->pbn, lbn); + atomicAdd64(&vdo->errorStats.invalidAdvicePBNCount, 1); + return noAdvice; + } + + return (ZonedPBN) { + .pbn = advice->pbn, + .state = advice->state, + .zone = zone, + }; +} diff --git a/source/vdo/base/vdo.h b/source/vdo/base/vdo.h new file mode 100644 index 0000000..5741112 --- /dev/null +++ b/source/vdo/base/vdo.h @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdo.h#3 $ + */ + +#ifndef VDO_H +#define VDO_H + +#include "types.h" + +/** + * Allocate a VDO and associate it with its physical layer. + * + * @param [in] layer The physical layer the VDO sits on + * @param [out] vdoPtr A pointer to hold the allocated VDO + * + * @return VDO_SUCCESS or an error + **/ +int allocateVDO(PhysicalLayer *layer, VDO **vdoPtr) + __attribute__((warn_unused_result)); + +/** + * Construct a VDO for use in user space with a synchronous layer. + * + * @param [in] layer The physical layer the VDO sits on + * @param [out] vdoPtr A pointer to hold the allocated VDO + * + * @return VDO_SUCCESS or an error + **/ +int makeVDO(PhysicalLayer *layer, VDO **vdoPtr) + __attribute__((warn_unused_result)); + +/** + * Destroy a VDO instance. + * + * @param vdo The VDO to destroy + **/ +void destroyVDO(VDO *vdo); + +/** + * Destroy a VDO instance, free it, and null out the reference to it. + * + * @param vdoPtr A reference to the VDO to free + **/ +void freeVDO(VDO **vdoPtr); + +/** + * Put a VDO into read-only mode and save the read-only state in the super + * block. + * + * @param vdo The VDO to put into read-only mode + * @param errorCode The error which caused the VDO to enter read-only + * mode + **/ +void makeVDOReadOnly(VDO *vdo, int errorCode); + +/** + * Set whether compression is enabled in VDO. + * + * @param vdo The VDO + * @param enableCompression Whether to enable compression in VDO + * + * @return State of compression before new value is set + **/ +bool setVDOCompressing(VDO *vdo, bool enableCompression); + +/** + * Get whether compression is enabled in VDO. + * + * @param vdo The VDO + * + * @return State of compression + **/ +bool getVDOCompressing(VDO *vdo); + +/** + * Get the VDO statistics. + * + * @param [in] vdo The VDO + * @param [out] stats The VDO statistics are returned here + **/ +void getVDOStatistics(const VDO *vdo, VDOStatistics *stats); + +/** + * Get the number of physical blocks in use by user data. + * + * @param vdo The VDO + * + * @return The number of blocks allocated for user data + **/ +BlockCount getPhysicalBlocksAllocated(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the number of unallocated physical blocks. + * + * @param vdo The VDO + * + * @return The number of free blocks + **/ +BlockCount getPhysicalBlocksFree(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the number of physical blocks used by VDO metadata. + * + * @param vdo The VDO + * + * @return The number of overhead blocks + **/ +BlockCount getPhysicalBlocksOverhead(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the total number of blocks used for the block map. 
+ * + * @param vdo The VDO + * + * @return The number of block map blocks + **/ +BlockCount getTotalBlockMapBlocks(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the VDO write policy. + * + * @param vdo The VDO + * + * @return The write policy + **/ +WritePolicy getWritePolicy(const VDO *vdo); + +/** + * Set the VDO write policy. + * + * @param vdo The VDO + * @param new The new write policy + **/ +void setWritePolicy(VDO *vdo, WritePolicy new); + +/** + * Get a copy of the load-time configuration of the VDO. + * + * @param vdo The VDO + * + * @return The load-time configuration of the VDO + **/ +const VDOLoadConfig *getVDOLoadConfig(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the thread config of the VDO. + * + * @param vdo The VDO + * + * @return The thread config + **/ +const ThreadConfig *getThreadConfig(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the configured maximum age of a dirty block map page. + * + * @param vdo The VDO + * + * @return The block map era length + **/ +BlockCount getConfiguredBlockMapMaximumAge(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the configured page cache size of the VDO. + * + * @param vdo The VDO + * + * @return The number of pages for the page cache + **/ +PageCount getConfiguredCacheSize(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the location of the first block of the VDO. + * + * @param vdo The VDO + * + * @return The location of the first block managed by the VDO + **/ +PhysicalBlockNumber getFirstBlockOffset(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether the VDO was new when it was loaded. + * + * @param vdo The VDO to query + * + * @return true if the VDO was new + **/ +bool wasNew(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether a DataLocation containing potential dedupe advice is + * well-formed and addresses a data block in one of the configured physical + * zones of the VDO. If it is, return the location and zone as a ZonedPBN; + * otherwise increment statistics tracking invalid advice and return an + * unmapped ZonedPBN. + * + * @param vdo The VDO + * @param advice The advice to validate (NULL indicates no advice) + * @param lbn The logical block number of the write that requested advice, + * which is only used for debug-level logging of invalid advice + * + * @return The ZonedPBN representing the advice, if valid, otherwise an + * unmapped ZonedPBN if the advice was invalid or NULL + **/ +ZonedPBN validateDedupeAdvice(VDO *vdo, + const DataLocation *advice, + LogicalBlockNumber lbn) + __attribute__((warn_unused_result)); + +// TEST SUPPORT ONLY BEYOND THIS POINT + +/** + * Dump status information about VDO to the log for debugging. + * + * @param vdo The vdo to dump + **/ +void dumpVDOStatus(const VDO *vdo); + +/** + * Set the VIO tracing flag. + * + * @param vdo The VDO + * @param vioTracing Whether VIO tracing is enabled for this device + **/ +void setVDOTracingFlags(VDO *vdo, bool vioTracing); + +/** + * Indicate whether VIO tracing is enabled. + * + * @param vdo The VDO + * + * @return Whether VIO tracing is enabled + **/ +bool vdoVIOTracingEnabled(const VDO *vdo); + +/** + * Indicate whether extent tracing is enabled. 
+ * + * @param vdo The VDO + * + * @return Whether extent tracing is enabled + **/ +bool vdoExtentTracingEnabled(const VDO *vdo); + +#endif /* VDO_H */ diff --git a/source/vdo/base/vdoDebug.c b/source/vdo/base/vdoDebug.c new file mode 100644 index 0000000..6c03ece --- /dev/null +++ b/source/vdo/base/vdoDebug.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoDebug.c#1 $ + */ + +#include "vdoDebug.h" + +#include "logger.h" +#include "stringUtils.h" +#include "vdoInternal.h" + +static const char xLogDebugMessage[] = "x-log-debug-message"; + +/**********************************************************************/ +int initializeVDOCommandCompletion(VDOCommandCompletion *command, + VDO *vdo, + int argc, + char **argv) +{ + *command = (VDOCommandCompletion) { + .vdo = vdo, + .argc = argc, + .argv = argv, + }; + initializeCompletion(&command->completion, VDO_COMMAND_COMPLETION, + vdo->layer); + return initializeEnqueueableCompletion(&command->subCompletion, + VDO_COMMAND_SUB_COMPLETION, + vdo->layer); +} + +/**********************************************************************/ +int destroyVDOCommandCompletion(VDOCommandCompletion *command) +{ + if (command == NULL) { + return VDO_SUCCESS; + } + + destroyEnqueueable(&command->subCompletion); + return command->completion.result; +} + +/**********************************************************************/ +static inline VDOCommandCompletion * +asVDOCommandCompletion(VDOCompletion *completion) +{ + if (completion->type == VDO_COMMAND_COMPLETION) { + return (VDOCommandCompletion *) + ((uintptr_t) completion - offsetof(VDOCommandCompletion, completion)); + } else if (completion->type == VDO_COMMAND_SUB_COMPLETION) { + return (VDOCommandCompletion *) + ((uintptr_t) completion - offsetof(VDOCommandCompletion, subCompletion)); + } else { + ASSERT_LOG_ONLY(((completion->type == VDO_COMMAND_COMPLETION) || + (completion->type == VDO_COMMAND_SUB_COMPLETION)), + "completion type is %s instead of " + "VDO_COMMAND_COMPLETION or VDO_COMMAND_SUB_COMPLETION", + getCompletionTypeName(completion->type)); + return NULL; + } +} + +/**********************************************************************/ +static void logDebugMessage(VDOCommandCompletion *cmd) +{ + static char buffer[256]; + + char *buf = buffer; + char *end = buffer + sizeof(buffer); + + for (int i = 1; i < cmd->argc; ++i) { + buf = appendToBuffer(buf, end, " %s", cmd->argv[i]); + } + if (buf == end) { + strcpy(buf - 4, "..."); + } + logInfo("debug message:%s", buffer); + finishCompletion(&cmd->completion, VDO_SUCCESS); +} + +/**********************************************************************/ +void executeVDOExtendedCommand(VDOCompletion *completion) +{ + VDOCommandCompletion *cmd = 
asVDOCommandCompletion(completion); + + if ((cmd->vdo == NULL) || (cmd->argc == 0)) { + finishCompletion(&cmd->completion, VDO_COMMAND_ERROR); + return; + } + if (strcmp(cmd->argv[0], xLogDebugMessage) == 0) { + logDebugMessage(cmd); + } else { + finishCompletion(&cmd->completion, VDO_UNKNOWN_COMMAND); + } +} diff --git a/source/vdo/base/vdoDebug.h b/source/vdo/base/vdoDebug.h new file mode 100644 index 0000000..c626533 --- /dev/null +++ b/source/vdo/base/vdoDebug.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoDebug.h#1 $ + */ + +#ifndef VDO_DEBUG_H +#define VDO_DEBUG_H + +#include "completion.h" +#include "vdo.h" + +/** + * A completion used to pass information to a potentially asynchronous + * (because it must run in a different zone) extended command. + * + * These commands are dispatched according to argv[0], which is of the form + * "x-some-command-name", and intentionally open ended for debugging. + * + * The command "x-log-debug-message" is currently defined to echo the + * remainder of the arguments into the kernel log via the vdo logger at + * info level. + **/ +typedef struct vdoCommandCompletion { + VDOCompletion completion; + VDOCompletion subCompletion; + VDO *vdo; + int argc; + char **argv; +} VDOCommandCompletion; + +/** + * Initialize a VDO command completion. + * + * @param command The command completion to initialize. + * @param vdo The VDO. + * @param argc An argument count. + * @param argv An argument vector of length argc. + * + * @return VDO_SUCCESS or an error code + **/ +int initializeVDOCommandCompletion(VDOCommandCompletion *command, + VDO *vdo, + int argc, + char **argv); + +/** + * Destroy a VDO command completion. + * + * @param command The command completion. + * + * @return the completion result + **/ +int destroyVDOCommandCompletion(VDOCommandCompletion *command); + +/** + * Perform an asynchronous extended command (usually debugging related). + * + * @param completion The completion embedded in VDOCommandCompletion. + **/ +void executeVDOExtendedCommand(VDOCompletion *completion); + +#endif // VDO_DEBUG_H diff --git a/source/vdo/base/vdoInternal.h b/source/vdo/base/vdoInternal.h new file mode 100644 index 0000000..1337e73 --- /dev/null +++ b/source/vdo/base/vdoInternal.h @@ -0,0 +1,410 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoInternal.h#11 $ + */ + +#ifndef VDO_INTERNAL_H +#define VDO_INTERNAL_H + +#include "vdo.h" + +#include "adminCompletion.h" +#include "adminState.h" +#include "atomic.h" +#include "header.h" +#include "packer.h" +#include "statistics.h" +#include "superBlock.h" +#include "readOnlyNotifier.h" +#include "types.h" +#include "uds.h" +#include "vdoLayout.h" +#include "vdoState.h" + +/** + * Error counters are atomic since updates can arrive concurrently from + * arbitrary threads. + **/ +typedef struct atomicErrorStatistics { + // Dedupe path error stats + Atomic64 invalidAdvicePBNCount; + Atomic64 noSpaceErrorCount; + Atomic64 readOnlyErrorCount; +} AtomicErrorStatistics; + +struct vdo { + /* The state of this VDO */ + VDOState state; + /* The read-only notifier */ + ReadOnlyNotifier *readOnlyNotifier; + /* The number of times this VDO has recovered from a dirty state */ + uint64_t completeRecoveries; + /* The number of times this VDO has recovered from a read-only state */ + uint64_t readOnlyRecoveries; + /* The format-time configuration of this VDO */ + VDOConfig config; + /* The load-time configuration of this VDO */ + VDOLoadConfig loadConfig; + /* The nonce for this VDO */ + Nonce nonce; + + /* The super block */ + SuperBlock *superBlock; + + /* The physical storage below us */ + PhysicalLayer *layer; + + /* Our partitioning of the physical layer's storage */ + VDOLayout *layout; + + /* The block map */ + BlockMap *blockMap; + + /* The journal for block map recovery */ + RecoveryJournal *recoveryJournal; + + /* The slab depot */ + SlabDepot *depot; + + /* The compressed-block packer */ + Packer *packer; + /* Whether incoming data should be compressed */ + AtomicBool compressing; + + /* The handler for flush requests */ + Flusher *flusher; + + /* The master version of the VDO when loaded (for upgrading) */ + VersionNumber loadVersion; + /* The state the VDO was in when loaded (primarily for unit tests) */ + VDOState loadState; + /* Whether VIO tracing is enabled */ + bool vioTraceRecording; + + /* The logical zones of this VDO */ + LogicalZones *logicalZones; + + /* The physical zones of this VDO */ + PhysicalZone **physicalZones; + + /* The hash lock zones of this VDO */ + HashZone **hashZones; + + /* The completion for administrative operations */ + AdminCompletion adminCompletion; + + /* The administrative state of the VDO */ + AdminState adminState; + + /* Whether a close is required */ + bool closeRequired; + + /* Atomic global counts of error events */ + AtomicErrorStatistics errorStats; +}; + +/** + * Get the component data size of a VDO. + * + * @param vdo The VDO whose component data size is desired + * + * @return the component data size of the VDO + **/ +size_t getComponentDataSize(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Encode the VDO and save the super block synchronously. 
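+ *
+ * A minimal usage sketch (assumes vdo is a fully loaded VDO and the caller
+ * is allowed to block on the synchronous write):
+ *
+ *   int result = saveVDOComponents(vdo);
+ *   if (result != VDO_SUCCESS) {
+ *     // The components could not be encoded, or the super block write failed.
+ *   }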
+ * + * @param vdo The VDO whose state is being saved + * + * @return VDO_SUCCESS or an error + **/ +int saveVDOComponents(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Encode the VDO and save the super block asynchronously. All non-user mode + * super block savers should use this bottle neck instead of calling + * saveSuperBlockAsync() directly. + * + * @param vdo The VDO whose state is being saved + * @param parent The completion to notify when the save is complete + **/ +void saveVDOComponentsAsync(VDO *vdo, VDOCompletion *parent); + +/** + * Re-encode the VDO component after a reconfiguration and save the super + * block synchronously. This function avoids the need to decode and re-encode + * the other components by simply copying their previous encoding. + * + * @param vdo The VDO which was reconfigured + * + * @return VDO_SUCCESS or an error code + **/ +int saveReconfiguredVDO(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Decode the VDO master version from the component data buffer in the super + * block and store it in the VDO's loadVersion field. + **/ +int decodeVDOVersion(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Loads the VDO master version into the VDO and checks that the version + * can be understood by VDO. + * + * @param vdo The VDO to validate + * + * @return VDO_SUCCESS or an error if the loaded version is not supported + **/ +int validateVDOVersion(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Decode the component data for the VDO itself from the component data buffer + * in the super block. + * + * @param vdo The VDO to decode + * + * @return VDO_SUCCESS or an error + **/ +int decodeVDOComponent(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Validate constraints on VDO config. + * + * @param config The VDO config + * @param blockCount The block count of the VDO + * @param requireLogical Set to true if the number logical blocks + * must be configured (otherwise, it may be zero) + * + * @return a success or error code + **/ +int validateVDOConfig(const VDOConfig *config, + BlockCount blockCount, + bool requireLogical) + __attribute__((warn_unused_result)); + +/** + * Enable a VDO to enter read-only mode on errors. + * + * @param vdo The VDO to enable + * + * @return VDO_SUCCESS or an error + **/ +int enableReadOnlyEntry(VDO *vdo); + +/** + * Get the block map. + * + * @param vdo The VDO whose block map is desired + * + * @return the block map from the VDO + **/ +BlockMap *getBlockMap(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the slab depot from a VDO. + * + * @param vdo The VDO whose slab depot is desired + * + * @return the slab depot from the VDO + **/ +SlabDepot *getSlabDepot(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Get the recovery journal from a VDO. + * + * @param vdo The VDO whose recovery journal is desired + * + * @return the recovery journal from the VDO + **/ +RecoveryJournal *getRecoveryJournal(VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether a VDO is in read-only mode. + * + * @param vdo The VDO to query + * + * @return true if the VDO is in read-only mode + **/ +bool inReadOnlyMode(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether the VDO is in a clean state. + * + * @param vdo The VDO to query + * + * @return true if the VDO is clean + **/ +bool isClean(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether the VDO was in a clean state when it was loaded. 
+ * + * @param vdo The VDO to query + * + * @return true if the VDO was clean + **/ +bool wasClean(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether the VDO requires a read-only mode rebuild. + * + * @param vdo The VDO to query + * + * @return true if the VDO requires a read-only rebuild + **/ +bool requiresReadOnlyRebuild(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether a VDO requires rebuilding. + * + * @param vdo The VDO to query + * + * @return true if the VDO must be rebuilt + **/ +bool requiresRebuild(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether a VDO should enter recovery mode. + * + * @param vdo The VDO to query + * + * @return true if the VDO requires recovery + **/ +bool requiresRecovery(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether a VDO was replaying the recovery journal into the block map + * when it crashed. + * + * @param vdo The VDO to query + * + * @return true if the VDO crashed while reconstructing the + * block map + **/ +bool isReplaying(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Check whether the VDO is in recovery mode. + * + * @param vdo The VDO to query + * + * @return true if the VDO is in recovery mode + **/ +bool inRecoveryMode(const VDO *vdo) + __attribute__((warn_unused_result)); + +/** + * Put the VDO into recovery mode + * + * @param vdo The VDO + **/ +void enterRecoveryMode(VDO *vdo); + +/** + * Leave recovery mode if slab scrubbing has actually finished. + * + * @param vdo The VDO + **/ +void leaveRecoveryMode(VDO *vdo); + +/** + * Assert that we are running on the admin thread. + * + * @param vdo The VDO + * @param name The name of the function which should be running on the admin + * thread (for logging). + **/ +void assertOnAdminThread(VDO *vdo, const char *name); + +/** + * Assert that this function was called on the specified logical zone thread. + * + * @param vdo The VDO + * @param logicalZone The number of the logical zone + * @param name The name of the calling function + **/ +void assertOnLogicalZoneThread(const VDO *vdo, + ZoneCount logicalZone, + const char *name); + +/** + * Assert that this function was called on the specified physical zone thread. + * + * @param vdo The VDO + * @param physicalZone The number of the physical zone + * @param name The name of the calling function + **/ +void assertOnPhysicalZoneThread(const VDO *vdo, + ZoneCount physicalZone, + const char *name); + +/** + * Select the hash zone responsible for locking a given chunk name. + * + * @param vdo The VDO containing the hash zones + * @param name The chunk name + * + * @return The hash zone responsible for the chunk name + **/ +HashZone *selectHashZone(const VDO *vdo, const UdsChunkName *name) + __attribute__((warn_unused_result)); + +/** + * Get the physical zone responsible for a given physical block number of a + * data block in this VDO instance, or of the zero block (for which a NULL + * zone is returned). For any other block number that is not in the range of + * valid data block numbers in any slab, an error will be returned. This + * function is safe to call on invalid block numbers; it will not put the VDO + * into read-only mode. 
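+ *
+ * A minimal usage sketch (vdo and pbn are assumed to be supplied by the
+ * caller, for example from dedupe advice):
+ *
+ *   PhysicalZone *zone;
+ *   int result = getPhysicalZone(vdo, pbn, &zone);
+ *   if (result != VDO_SUCCESS) {
+ *     // pbn does not address a valid data block in any slab.
+ *   } else if (zone == NULL) {
+ *     // pbn is the zero block, which belongs to no physical zone.
+ *   }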
+ * + * @param [in] vdo The VDO containing the physical zones + * @param [in] pbn The PBN of the data block + * @param [out] zonePtr A pointer to return the physical zone + * + * @return VDO_SUCCESS or VDO_OUT_OF_RANGE if the block number is invalid + * or an error code for any other failure + **/ +int getPhysicalZone(const VDO *vdo, + PhysicalBlockNumber pbn, + PhysicalZone **zonePtr) + __attribute__((warn_unused_result)); + +/**********************************************************************/ +// Asynchronous callback to share a duplicate block. This is only public so +// test code may compare it against the current callback in the completion. +void shareBlock(VDOCompletion *completion); + +#endif /* VDO_INTERNAL_H */ diff --git a/source/vdo/base/vdoLayout.c b/source/vdo/base/vdoLayout.c new file mode 100644 index 0000000..3dfce96 --- /dev/null +++ b/source/vdo/base/vdoLayout.c @@ -0,0 +1,423 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoLayout.c#2 $ + */ + +#include "vdoLayout.h" +#include "vdoLayoutInternals.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "blockMap.h" +#include "partitionCopy.h" +#include "slab.h" +#include "slabSummary.h" +#include "types.h" +#include "vdoInternal.h" + +#include "statusCodes.h" + +static const PartitionID REQUIRED_PARTITIONS[] = { + BLOCK_MAP_PARTITION, + BLOCK_ALLOCATOR_PARTITION, + RECOVERY_JOURNAL_PARTITION, + SLAB_SUMMARY_PARTITION, +}; + +static const uint8_t REQUIRED_PARTITION_COUNT = 4; + +/** + * Make a fixed layout for a VDO. 
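+ *
+ * The resulting arrangement, from the starting offset to the end of the
+ * volume, is roughly as sketched below (not to scale; the block allocator
+ * partition absorbs every block not claimed by the other three):
+ *
+ *   | block map | block allocator | recovery journal | slab summary |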
+ * + * @param [in] physicalBlocks The number of physical blocks in the VDO + * @param [in] startingOffset The starting offset of the layout + * @param [in] blockMapBlocks The size of the block map partition + * @param [in] journalBlocks The size of the journal partition + * @param [in] summaryBlocks The size of the slab summary partition + * @param [out] layoutPtr A pointer to hold the new FixedLayout + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int makeVDOFixedLayout(BlockCount physicalBlocks, + PhysicalBlockNumber startingOffset, + BlockCount blockMapBlocks, + BlockCount journalBlocks, + BlockCount summaryBlocks, + FixedLayout **layoutPtr) +{ + BlockCount necessarySize + = (startingOffset + blockMapBlocks + journalBlocks + summaryBlocks); + if (necessarySize > physicalBlocks) { + return logErrorWithStringError(VDO_NO_SPACE, "Not enough space to" + " make a VDO"); + } + + FixedLayout *layout; + int result = makeFixedLayout(physicalBlocks - startingOffset, + startingOffset, &layout); + if (result != VDO_SUCCESS) { + return result; + } + + result = makeFixedLayoutPartition(layout, BLOCK_MAP_PARTITION, + blockMapBlocks, FROM_BEGINNING, 0); + if (result != VDO_SUCCESS) { + freeFixedLayout(&layout); + return result; + } + + result = makeFixedLayoutPartition(layout, SLAB_SUMMARY_PARTITION, + summaryBlocks, FROM_END, 0); + if (result != VDO_SUCCESS) { + freeFixedLayout(&layout); + return result; + } + + result = makeFixedLayoutPartition(layout, RECOVERY_JOURNAL_PARTITION, + journalBlocks, FROM_END, 0); + if (result != VDO_SUCCESS) { + freeFixedLayout(&layout); + return result; + } + + /* + * The block allocator no longer traffics in relative PBNs so the offset + * doesn't matter. We need to keep this partition around both for upgraded + * systems, and because we decided that all of the usable space in the + * volume, other than the super block, should be part of some partition. + */ + result = makeFixedLayoutPartition(layout, BLOCK_ALLOCATOR_PARTITION, + ALL_FREE_BLOCKS, FROM_BEGINNING, + blockMapBlocks); + if (result != VDO_SUCCESS) { + freeFixedLayout(&layout); + return result; + } + + *layoutPtr = layout; + return VDO_SUCCESS; +} + +/** + * Get the offset of a given partition. 
+ * + * @param layout The layout containing the partition + * @param partitionID The ID of the partition whose offset is desired + * + * @return The offset of the partition (in blocks) + **/ +__attribute__((warn_unused_result)) +static BlockCount getPartitionOffset(VDOLayout *layout, + PartitionID partitionID) +{ + return getFixedLayoutPartitionOffset(getVDOPartition(layout, partitionID)); +} + +/**********************************************************************/ +int makeVDOLayout(BlockCount physicalBlocks, + PhysicalBlockNumber startingOffset, + BlockCount blockMapBlocks, + BlockCount journalBlocks, + BlockCount summaryBlocks, + VDOLayout **vdoLayoutPtr) +{ + VDOLayout *vdoLayout; + int result = ALLOCATE(1, VDOLayout, __func__, &vdoLayout); + if (result != VDO_SUCCESS) { + return result; + } + + result = makeVDOFixedLayout(physicalBlocks, startingOffset, blockMapBlocks, + journalBlocks, summaryBlocks, &vdoLayout->layout); + if (result != VDO_SUCCESS) { + freeVDOLayout(&vdoLayout); + return result; + } + + vdoLayout->startingOffset = startingOffset; + + *vdoLayoutPtr = vdoLayout; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int decodeVDOLayout(Buffer *buffer, VDOLayout **vdoLayoutPtr) +{ + VDOLayout *vdoLayout; + int result = ALLOCATE(1, VDOLayout, __func__, &vdoLayout); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeFixedLayout(buffer, &vdoLayout->layout); + if (result != VDO_SUCCESS) { + freeVDOLayout(&vdoLayout); + return result; + } + + // Check that all the expected partitions exist + Partition *partition; + for (uint8_t i = 0; i < REQUIRED_PARTITION_COUNT; i++) { + result = getPartition(vdoLayout->layout, REQUIRED_PARTITIONS[i], + &partition); + if (result != VDO_SUCCESS) { + freeVDOLayout(&vdoLayout); + return logErrorWithStringError(result, + "VDO layout is missing required partition" + " %u", REQUIRED_PARTITIONS[i]); + } + } + + // XXX Assert this is the same as where we loaded the super block. + vdoLayout->startingOffset + = getPartitionOffset(vdoLayout, BLOCK_MAP_PARTITION); + + *vdoLayoutPtr = vdoLayout; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeVDOLayout(VDOLayout **vdoLayoutPtr) +{ + VDOLayout *vdoLayout = *vdoLayoutPtr; + if (vdoLayout == NULL) { + return; + } + + freeCopyCompletion(&vdoLayout->copyCompletion); + freeFixedLayout(&vdoLayout->nextLayout); + freeFixedLayout(&vdoLayout->layout); + freeFixedLayout(&vdoLayout->previousLayout); + FREE(vdoLayout); + *vdoLayoutPtr = NULL; +} + +/** + * Get a partition from a FixedLayout in conditions where we expect that it can + * not fail. + * + * @param layout The FixedLayout from which to get the partition + * @param id The ID of the partition to retrieve + * + * @return The desired partition + **/ +__attribute__((warn_unused_result)) +static Partition *retrievePartition(FixedLayout *layout, PartitionID id) +{ + Partition *partition; + int result = getPartition(layout, id, &partition); + ASSERT_LOG_ONLY(result == VDO_SUCCESS, "VDOLayout has expected partition"); + return partition; +} + +/**********************************************************************/ +Partition *getVDOPartition(VDOLayout *vdoLayout, PartitionID id) +{ + return retrievePartition(vdoLayout->layout, id); +} + +/** + * Get a partition from a VDOLayout's next FixedLayout. This method should + * only be called when the VDOLayout is prepared to grow. 
+ * + * @param vdoLayout The VDOLayout from which to get the partition + * @param id The ID of the desired partition + * + * @return The requested partition + **/ +__attribute__((warn_unused_result)) +static Partition *getPartitionFromNextLayout(VDOLayout *vdoLayout, + PartitionID id) +{ + ASSERT_LOG_ONLY(vdoLayout->nextLayout != NULL, + "VDOLayout is prepared to grow"); + return retrievePartition(vdoLayout->nextLayout, id); +} + +/** + * Get the size of a given partition. + * + * @param layout The layout containing the partition + * @param partitionID The partition ID whose size to find + * + * @return The size of the partition (in blocks) + **/ +__attribute__((warn_unused_result)) +static BlockCount getPartitionSize(VDOLayout *layout, PartitionID partitionID) +{ + return getFixedLayoutPartitionSize(getVDOPartition(layout, partitionID)); +} + +/**********************************************************************/ +int prepareToGrowVDOLayout(VDOLayout *vdoLayout, + BlockCount oldPhysicalBlocks, + BlockCount newPhysicalBlocks, + PhysicalLayer *layer) +{ + if (getNextVDOLayoutSize(vdoLayout) == newPhysicalBlocks) { + // We are already prepared to grow to the new size, so we're done. + return VDO_SUCCESS; + } + + // Make a copy completion if there isn't one + if (vdoLayout->copyCompletion == NULL) { + int result = makeCopyCompletion(layer, &vdoLayout->copyCompletion); + if (result != VDO_SUCCESS) { + return result; + } + } + + // Free any unused preparation. + freeFixedLayout(&vdoLayout->nextLayout); + + // Make a new layout with the existing partition sizes for everything but the + // block allocator partition. + int result = makeVDOFixedLayout(newPhysicalBlocks, + vdoLayout->startingOffset, + getPartitionSize(vdoLayout, + BLOCK_MAP_PARTITION), + getPartitionSize(vdoLayout, + RECOVERY_JOURNAL_PARTITION), + getPartitionSize(vdoLayout, + SLAB_SUMMARY_PARTITION), + &vdoLayout->nextLayout); + if (result != VDO_SUCCESS) { + freeCopyCompletion(&vdoLayout->copyCompletion); + return result; + } + + // Ensure the new journal and summary are entirely within the added blocks. + Partition *slabSummaryPartition + = getPartitionFromNextLayout(vdoLayout, SLAB_SUMMARY_PARTITION); + Partition *recoveryJournalPartition + = getPartitionFromNextLayout(vdoLayout, RECOVERY_JOURNAL_PARTITION); + BlockCount minNewSize + = (oldPhysicalBlocks + + getFixedLayoutPartitionSize(slabSummaryPartition) + + getFixedLayoutPartitionSize(recoveryJournalPartition)); + if (minNewSize > newPhysicalBlocks) { + // Copying the journal and summary would destroy some old metadata. + freeFixedLayout(&vdoLayout->nextLayout); + freeCopyCompletion(&vdoLayout->copyCompletion); + return VDO_INCREMENT_TOO_SMALL; + } + + return VDO_SUCCESS; +} + +/** + * Get the size of a VDO from the specified FixedLayout and the + * starting offset thereof. + * + * @param layout The fixed layout whose size to use + * @param startingOffset The starting offset of the layout + * + * @return The total size of a VDO (in blocks) with the given layout + **/ +__attribute__((warn_unused_result)) +static BlockCount getVDOSize(FixedLayout *layout, BlockCount startingOffset) +{ + // The FixedLayout does not include the super block or any earlier + // metadata; all that is captured in the VDOLayout's starting offset + return getTotalFixedLayoutSize(layout) + startingOffset; +} + +/**********************************************************************/ +BlockCount getNextVDOLayoutSize(VDOLayout *vdoLayout) +{ + return ((vdoLayout->nextLayout == NULL) + ? 
0 : getVDOSize(vdoLayout->nextLayout, vdoLayout->startingOffset)); +} + +/**********************************************************************/ +BlockCount getNextBlockAllocatorPartitionSize(VDOLayout *vdoLayout) +{ + if (vdoLayout->nextLayout == NULL) { + return 0; + } + + Partition *partition = getPartitionFromNextLayout(vdoLayout, + BLOCK_ALLOCATOR_PARTITION); + return getFixedLayoutPartitionSize(partition); +} + +/**********************************************************************/ +BlockCount growVDOLayout(VDOLayout *vdoLayout) +{ + ASSERT_LOG_ONLY(vdoLayout->nextLayout != NULL, + "VDO prepared to grow physical"); + vdoLayout->previousLayout = vdoLayout->layout; + vdoLayout->layout = vdoLayout->nextLayout; + vdoLayout->nextLayout = NULL; + + return getVDOSize(vdoLayout->layout, vdoLayout->startingOffset); +} + +/**********************************************************************/ +BlockCount revertVDOLayout(VDOLayout *vdoLayout) +{ + if ((vdoLayout->previousLayout != NULL) + && (vdoLayout->previousLayout != vdoLayout->layout)) { + // Only revert if there's something to revert to. + freeFixedLayout(&vdoLayout->layout); + vdoLayout->layout = vdoLayout->previousLayout; + vdoLayout->previousLayout = NULL; + } + + return getVDOSize(vdoLayout->layout, vdoLayout->startingOffset); +} + +/**********************************************************************/ +void finishVDOLayoutGrowth(VDOLayout *vdoLayout) +{ + if (vdoLayout->layout != vdoLayout->previousLayout) { + freeFixedLayout(&vdoLayout->previousLayout); + } + + if (vdoLayout->layout != vdoLayout->nextLayout) { + freeFixedLayout(&vdoLayout->nextLayout); + } + + freeCopyCompletion(&vdoLayout->copyCompletion); +} + +/**********************************************************************/ +void copyPartition(VDOLayout *layout, + PartitionID partitionID, + VDOCompletion *parent) +{ + copyPartitionAsync(layout->copyCompletion, + getVDOPartition(layout, partitionID), + getPartitionFromNextLayout(layout, partitionID), parent); +} + +/**********************************************************************/ +size_t getVDOLayoutEncodedSize(const VDOLayout *vdoLayout) +{ + return getFixedLayoutEncodedSize(vdoLayout->layout); +} + +/**********************************************************************/ +int encodeVDOLayout(const VDOLayout *vdoLayout, Buffer *buffer) +{ + return encodeFixedLayout(vdoLayout->layout, buffer); +} + diff --git a/source/vdo/base/vdoLayout.h b/source/vdo/base/vdoLayout.h new file mode 100644 index 0000000..3de24ae --- /dev/null +++ b/source/vdo/base/vdoLayout.h @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoLayout.h#2 $ + */ + +/** + * VDOLayout is an object which manages the layout of a VDO. 
It wraps + * FixedLayout, but includes the knowledge of exactly which partitions a VDO is + * expected to have. Because of this knowledge, the VDOLayout validates the + * FixedLayout encoded in the super block at load time, obviating the need for + * subsequent error checking when other modules need to get partitions from the + * layout. + * + * The VDOLayout also manages the preparation and growth of the layout for grow + * physical operations. + **/ + +#ifndef VDO_LAYOUT_H +#define VDO_LAYOUT_H + +#include "fixedLayout.h" +#include "types.h" + +/** + * Make a VDO layout with the specified parameters. + * + * @param [in] physicalBlocks The number of physical blocks in the VDO + * @param [in] startingOffset The starting offset of the layout + * @param [in] blockMapBlocks The size of the block map partition + * @param [in] journalBlocks The size of the journal partition + * @param [in] summaryBlocks The size of the slab summary partition + * @param [out] vdoLayoutPtr A pointer to hold the new VDOLayout + * + * @return VDO_SUCCESS or an error + **/ +int makeVDOLayout(BlockCount physicalBlocks, + PhysicalBlockNumber startingOffset, + BlockCount blockMapBlocks, + BlockCount journalBlocks, + BlockCount summaryBlocks, + VDOLayout **vdoLayoutPtr) + __attribute__((warn_unused_result)); + +/** + * Decode a VDOLayout from a buffer. + * + * @param [in] buffer The buffer from which to decode + * @param [out] vdoLayoutPtr A pointer to hold the VDOLayout + * + * @return VDO_SUCCESS or an error + **/ +int decodeVDOLayout(Buffer *buffer, VDOLayout **vdoLayoutPtr) + __attribute__((warn_unused_result)); + +/** + * Free a VDOLayout and NULL out the reference to it. + * + * @param vdoLayoutPtr The pointer to a VDOLayout to free + **/ +void freeVDOLayout(VDOLayout **vdoLayoutPtr); + +/** + * Get a partition from a VDOLayout. Because the layout's FixedLayout has + * already been validated, this can not fail. + * + * @param vdoLayout The VDOLayout from which to get the partition + * @param id The ID of the desired partition + * + * @return The requested partition + **/ +Partition *getVDOPartition(VDOLayout *vdoLayout, PartitionID id) + __attribute__((warn_unused_result)); + +/** + * Prepare the layout to be grown. + * + * @param vdoLayout The layout to grow + * @param oldPhysicalBlocks The current size of the VDO + * @param newPhysicalBlocks The size to which the VDO will be grown + * @param layer The layer being grown + * + * @return VDO_SUCCESS or an error code + **/ +int prepareToGrowVDOLayout(VDOLayout *vdoLayout, + BlockCount oldPhysicalBlocks, + BlockCount newPhysicalBlocks, + PhysicalLayer *layer) + __attribute__((warn_unused_result)); + +/** + * Get the size of the next layout. + * + * @param vdoLayout The layout to check + * + * @return The size which was specified when the layout was prepared for growth + * or 0 if the layout is not prepared to grow + **/ +BlockCount getNextVDOLayoutSize(VDOLayout *vdoLayout) + __attribute__((warn_unused_result)); + +/** + * Get the size of the next block allocator partition. + * + * @param vdoLayout The VDOLayout which has been prepared to grow + * + * @return The size of the block allocator partition in the next layout or 0 + * if the layout is not prepared to grow + **/ +BlockCount getNextBlockAllocatorPartitionSize(VDOLayout *vdoLayout) + __attribute__((warn_unused_result)); + +/** + * Grow the layout by swapping in the prepared layout. 
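+ *
+ * For illustration only (an editor's sketch assembled from the declarations
+ * in this header, not taken from the original source), a grow-physical
+ * sequence is expected to look roughly like:
+ *
+ *   int result = prepareToGrowVDOLayout(vdoLayout, oldPhysicalBlocks,
+ *                                       newPhysicalBlocks, layer);
+ *   if (result != VDO_SUCCESS) {
+ *     return result;
+ *   }
+ *   // copy the relocated metadata partitions with copyPartition(), then
+ *   // either growVDOLayout() to commit or revertVDOLayout() to back out,
+ *   // and finally finishVDOLayoutGrowth() to release the unused layout.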
+ * + * @param vdoLayout The layout to grow + * + * @return The new size of the VDO + **/ +BlockCount growVDOLayout(VDOLayout *vdoLayout) + __attribute__((warn_unused_result)); + +/** + * Revert the last growth attempt. + * + * @param vdoLayout The layout to revert + * + * @return The reverted size (in blocks) of the VDO + **/ +BlockCount revertVDOLayout(VDOLayout *vdoLayout) + __attribute__((warn_unused_result)); + +/** + * Clean up any unused resources once an attempt to grow has completed. + * + * @param vdoLayout The layout + **/ +void finishVDOLayoutGrowth(VDOLayout *vdoLayout); + +/** + * Copy a partition from the location specified in the current layout to that in + * the next layout. + * + * @param layout The VDOLayout which is prepared to grow + * @param partitionID The ID of the partition to copy + * @param parent The completion to notify when the copy is complete + **/ +void copyPartition(VDOLayout *layout, + PartitionID partitionID, + VDOCompletion *parent); + +/** + * Get the size of an encoded VDOLayout. + * + * @param vdoLayout The VDOLayout + * + * @return The encoded size of the VDOLayout + **/ +size_t getVDOLayoutEncodedSize(const VDOLayout *vdoLayout) + __attribute__((warn_unused_result)); + +/** + * Encode a VDOLayout into a buffer. + * + * @param vdoLayout The VDOLayout to encode + * @param buffer The buffer to encode into + * + * @return UDS_SUCCESS or an error + **/ +int encodeVDOLayout(const VDOLayout *vdoLayout, Buffer *buffer) + __attribute__((warn_unused_result)); + +#endif // VDO_LAYOUT_H diff --git a/source/vdo/base/vdoLayoutInternals.h b/source/vdo/base/vdoLayoutInternals.h new file mode 100644 index 0000000..5f038fe --- /dev/null +++ b/source/vdo/base/vdoLayoutInternals.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoLayoutInternals.h#2 $ + */ + +#ifndef VDO_LAYOUT_INTERNALS_H +#define VDO_LAYOUT_INTERNALS_H + +#include "fixedLayout.h" +#include "types.h" + +struct vdoLayout { + // The current layout of the VDO + FixedLayout *layout; + // The next layout of the VDO + FixedLayout *nextLayout; + // The previous layout of the VDO + FixedLayout *previousLayout; + // The first block in the layouts + PhysicalBlockNumber startingOffset; + // A pointer to the copy completion (if there is one) + VDOCompletion *copyCompletion; +}; + +#endif // VDO_LAYOUT_INTERNALS_H diff --git a/source/vdo/base/vdoLoad.c b/source/vdo/base/vdoLoad.c new file mode 100644 index 0000000..c72f39e --- /dev/null +++ b/source/vdo/base/vdoLoad.c @@ -0,0 +1,560 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoLoad.c#17 $ + */ + +#include "vdoLoad.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "adminCompletion.h" +#include "blockMap.h" +#include "completion.h" +#include "constants.h" +#include "hashZone.h" +#include "header.h" +#include "logicalZone.h" +#include "physicalZone.h" +#include "readOnlyRebuild.h" +#include "recoveryJournal.h" +#include "releaseVersions.h" +#include "slabDepot.h" +#include "slabSummary.h" +#include "threadConfig.h" +#include "types.h" +#include "vdoInternal.h" +#include "vdoRecovery.h" +#include "volumeGeometry.h" + +/** + * Extract the VDO from an AdminCompletion, checking that the current operation + * is a load. + * + * @param completion The AdminCompletion's sub-task completion + * + * @return The VDO + **/ +static inline VDO *vdoFromLoadSubTask(VDOCompletion *completion) +{ + return vdoFromAdminSubTask(completion, ADMIN_OPERATION_LOAD); +} + +/** + * Finish aborting a load now that any entry to read-only mode is complete. + * This callback is registered in abortLoad(). + * + * @param completion The sub-task completion + **/ +static void finishAborting(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + vdo->closeRequired = false; + finishParentCallback(completion); +} + +/** + * Make sure the recovery journal is closed when aborting a load. + * + * @param completion The sub-task completion + **/ +static void closeRecoveryJournalForAbort(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + prepareAdminSubTask(vdo, finishAborting, finishAborting); + drainRecoveryJournal(vdo->recoveryJournal, ADMIN_STATE_SAVING, completion); +} + +/** + * Clean up after an error loading a VDO. This error handler is set in + * loadCallback() and loadVDOComponents(). + * + * @param completion The sub-task completion + **/ +static void abortLoad(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + logErrorWithStringError(completion->result, "aborting load"); + if (vdo->readOnlyNotifier == NULL) { + // There are no threads, so we're done + finishParentCallback(completion); + return; + } + + // Preserve the error. + setCompletionResult(completion->parent, completion->result); + if (vdo->recoveryJournal == NULL) { + prepareAdminSubTask(vdo, finishAborting, finishAborting); + } else { + prepareAdminSubTaskOnThread(vdo, closeRecoveryJournalForAbort, + closeRecoveryJournalForAbort, + getJournalZoneThread(getThreadConfig(vdo))); + } + + waitUntilNotEnteringReadOnlyMode(vdo->readOnlyNotifier, completion); +} + +/** + * Wait for the VDO to be in read-only mode. 
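+ *
+ * Editorial note (describing the body below, not from the original source):
+ * the sub-task result is preset to VDO_READ_ONLY so that the parent
+ * completion finishes with that status once the read-only notifier has
+ * quiesced.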
+ * + * @param completion The sub-task completion + **/ +static void waitForReadOnlyMode(VDOCompletion *completion) +{ + prepareToFinishParent(completion, completion->parent); + setCompletionResult(completion, VDO_READ_ONLY); + VDO *vdo = vdoFromLoadSubTask(completion); + waitUntilNotEnteringReadOnlyMode(vdo->readOnlyNotifier, completion); +} + +/** + * Finish loading the VDO after an error, but leave it in read-only + * mode. This error handler is set in makeDirty(), scrubSlabs(), and + * loadVDOComponents(). + * + * @param completion The sub-task completion + **/ +static void continueLoadReadOnly(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + logErrorWithStringError(completion->result, + "Entering read-only mode due to load error"); + enterReadOnlyMode(vdo->readOnlyNotifier, completion->result); + waitForReadOnlyMode(completion); +} + +/** + * Exit recovery mode if necessary now that online slab scrubbing or loading + * is complete. This callback is registrered in scrubSlabs(). + * + * @param completion The slab scrubber completion + **/ +static void finishScrubbingSlabs(VDOCompletion *completion) +{ + VDO *vdo = completion->parent; + assertOnAdminThread(vdo, __func__); + if (inRecoveryMode(vdo)) { + leaveRecoveryMode(vdo); + } else { + logInfo("VDO commencing normal operation"); + } +} + +/** + * Handle an error scrubbing or loading all slabs after the VDO has come + * online. This error handler is registered in scrubSlabs(). + * + * @param completion The slab scrubber completion + **/ +static void handleScrubAllError(VDOCompletion *completion) +{ + VDO *vdo = completion->parent; + enterReadOnlyMode(vdo->readOnlyNotifier, completion->result); +} + +/** + * Initiate slab scrubbing if necessary. This callback is registered in + * prepareToComeOnline(). + * + * @param completion The sub-task completion + **/ +static void scrubSlabs(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + if (!hasUnrecoveredSlabs(vdo->depot)) { + finishParentCallback(completion); + return; + } + + if (requiresRecovery(vdo)) { + enterRecoveryMode(vdo); + } + + prepareAdminSubTask(vdo, finishParentCallback, continueLoadReadOnly); + scrubAllUnrecoveredSlabs(vdo->depot, vdo, finishScrubbingSlabs, + handleScrubAllError, 0, completion); +} + +/** + * This is the error handler for slab scrubbing. It is registered in + * prepareToComeOnline(). + * + * @param completion The sub-task completion + **/ +static void handleScrubbingError(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + enterReadOnlyMode(vdo->readOnlyNotifier, completion->result); + waitForReadOnlyMode(completion); +} + +/** + * This is the callback after the super block is written. It prepares the block + * allocator to come online and start allocating. It is registered in + * makeDirty(). + * + * @param completion The sub-task completion + **/ +static void prepareToComeOnline(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + SlabDepotLoadType loadType = NORMAL_LOAD; + if (requiresReadOnlyRebuild(vdo)) { + loadType = REBUILD_LOAD; + } else if (requiresRecovery(vdo)) { + loadType = RECOVERY_LOAD; + } + + initializeBlockMapFromJournal(vdo->blockMap, vdo->recoveryJournal); + + prepareAdminSubTask(vdo, scrubSlabs, handleScrubbingError); + prepareToAllocate(vdo->depot, loadType, completion); +} + +/** + * Mark the super block as dirty now that everything has been loaded or + * rebuilt. 
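+ *
+ * Editorial note (summarizing the callbacks registered here, not from the
+ * original source): after the super block is saved, the load continues in
+ * prepareToComeOnline(), which readies the slab depot for allocation, and
+ * then in scrubSlabs(), which starts any required slab scrubbing before
+ * normal operation begins.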
+ * + * @param completion The sub-task completion + **/ +static void makeDirty(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + if (isReadOnly(vdo->readOnlyNotifier)) { + finishCompletion(completion->parent, VDO_READ_ONLY); + return; + } + + vdo->state = VDO_DIRTY; + prepareAdminSubTask(vdo, prepareToComeOnline, continueLoadReadOnly); + saveVDOComponentsAsync(vdo, completion); +} + +/** + * Callback to do the destructive parts of a load now that the new VDO device + * is being resumed. + * + * @param completion The sub-task completion + **/ +static void loadCallback(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + assertOnAdminThread(vdo, __func__); + + // Prepare the recovery journal for new entries. + openRecoveryJournal(vdo->recoveryJournal, vdo->depot, vdo->blockMap); + vdo->closeRequired = true; + if (isReadOnly(vdo->readOnlyNotifier)) { + // In read-only mode we don't use the allocator and it may not + // even be readable, so use the default structure. + finishCompletion(completion->parent, VDO_READ_ONLY); + return; + } + + if (requiresReadOnlyRebuild(vdo)) { + prepareAdminSubTask(vdo, makeDirty, abortLoad); + launchRebuild(vdo, completion); + return; + } + + if (requiresRebuild(vdo)) { + prepareAdminSubTask(vdo, makeDirty, continueLoadReadOnly); + launchRecovery(vdo, completion); + return; + } + + prepareAdminSubTask(vdo, makeDirty, continueLoadReadOnly); + loadSlabDepot(vdo->depot, + (wasNew(vdo) ? ADMIN_STATE_FORMATTING : ADMIN_STATE_LOADING), + completion, NULL); +} + +/**********************************************************************/ +int performVDOLoad(VDO *vdo) +{ + return performAdminOperation(vdo, ADMIN_OPERATION_LOAD, NULL, loadCallback, + loadCallback); +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int startVDODecode(VDO *vdo, bool validateConfig) +{ + int result = validateVDOVersion(vdo); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeVDOComponent(vdo); + if (result != VDO_SUCCESS) { + return result; + } + + if (!validateConfig) { + return VDO_SUCCESS; + } + + if (vdo->loadConfig.nonce != vdo->nonce) { + return logErrorWithStringError(VDO_BAD_NONCE, "Geometry nonce %" PRIu64 + " does not match superblock nonce %llu", + vdo->loadConfig.nonce, vdo->nonce); + } + + BlockCount blockCount = vdo->layer->getBlockCount(vdo->layer); + return validateVDOConfig(&vdo->config, blockCount, true); +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int finishVDODecode(VDO *vdo) +{ + Buffer *buffer = getComponentBuffer(vdo->superBlock); + const ThreadConfig *threadConfig = getThreadConfig(vdo); + int result = makeRecoveryJournal(vdo->nonce, vdo->layer, + getVDOPartition(vdo->layout, + RECOVERY_JOURNAL_PARTITION), + vdo->completeRecoveries, + vdo->config.recoveryJournalSize, + RECOVERY_JOURNAL_TAIL_BUFFER_SIZE, + vdo->readOnlyNotifier, threadConfig, + &vdo->recoveryJournal); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeRecoveryJournal(vdo->recoveryJournal, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeSlabDepot(buffer, threadConfig, vdo->nonce, vdo->layer, + getVDOPartition(vdo->layout, + SLAB_SUMMARY_PARTITION), + vdo->readOnlyNotifier, vdo->recoveryJournal, + &vdo->depot); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeBlockMap(buffer, vdo->config.logicalBlocks, 
threadConfig, + &vdo->blockMap); + if (result != VDO_SUCCESS) { + return result; + } + + ASSERT_LOG_ONLY((contentLength(buffer) == 0), + "All decoded component data was used"); + return VDO_SUCCESS; +} + +/** + * Decode the component data portion of a super block and fill in the + * corresponding portions of the VDO being loaded. This will also allocate the + * recovery journal and slab depot. If this method is called with an + * asynchronous layer (i.e. a thread config which specifies at least one base + * thread), the block map and packer will be constructed as well. + * + * @param vdo The VDO being loaded + * @param validateConfig Whether to validate the config + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int decodeVDO(VDO *vdo, bool validateConfig) +{ + int result = startVDODecode(vdo, validateConfig); + if (result != VDO_SUCCESS) { + return result; + } + + const ThreadConfig *threadConfig = getThreadConfig(vdo); + result = makeReadOnlyNotifier(inReadOnlyMode(vdo), threadConfig, vdo->layer, + &vdo->readOnlyNotifier); + if (result != VDO_SUCCESS) { + return result; + } + + result = enableReadOnlyEntry(vdo); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeVDOLayout(getComponentBuffer(vdo->superBlock), &vdo->layout); + if (result != VDO_SUCCESS) { + return result; + } + + result = finishVDODecode(vdo); + if (result != VDO_SUCCESS) { + return result; + } + + result = makeFlusher(vdo); + if (result != VDO_SUCCESS) { + return result; + } + + BlockCount maximumAge = getConfiguredBlockMapMaximumAge(vdo); + BlockCount journalLength + = getRecoveryJournalLength(vdo->config.recoveryJournalSize); + if ((maximumAge > (journalLength / 2)) || (maximumAge < 1)) { + return VDO_BAD_CONFIGURATION; + } + result = makeBlockMapCaches(vdo->blockMap, vdo->layer, + vdo->readOnlyNotifier, vdo->recoveryJournal, + vdo->nonce, getConfiguredCacheSize(vdo), + maximumAge); + if (result != VDO_SUCCESS) { + return result; + } + + result = ALLOCATE(threadConfig->hashZoneCount, HashZone *, __func__, + &vdo->hashZones); + if (result != VDO_SUCCESS) { + return result; + } + + for (ZoneCount zone = 0; zone < threadConfig->hashZoneCount; zone++) { + result = makeHashZone(vdo, zone, &vdo->hashZones[zone]); + if (result != VDO_SUCCESS) { + return result; + } + } + + result = makeLogicalZones(vdo, &vdo->logicalZones); + if (result != VDO_SUCCESS) { + return result; + } + + result = ALLOCATE(threadConfig->physicalZoneCount, PhysicalZone *, __func__, + &vdo->physicalZones); + if (result != VDO_SUCCESS) { + return result; + } + + for (ZoneCount zone = 0; zone < threadConfig->physicalZoneCount; zone++) { + result = makePhysicalZone(vdo, zone, &vdo->physicalZones[zone]); + if (result != VDO_SUCCESS) { + return result; + } + } + + return makePacker(vdo->layer, DEFAULT_PACKER_INPUT_BINS, + DEFAULT_PACKER_OUTPUT_BINS, threadConfig, &vdo->packer); +} + +/** + * Load the components of a VDO. This is the super block load callback + * set by loadCallback(). + * + * @param completion The sub-task completion + **/ +static void loadVDOComponents(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + + prepareCompletion(completion, finishParentCallback, abortLoad, + completion->callbackThreadID, completion->parent); + finishCompletion(completion, decodeVDO(vdo, true)); +} + +/** + * Callback to initiate a pre-load, registered in prepareToLoadVDO(). 
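+ *
+ * Editorial note (summarizing the surrounding code, not from the original
+ * source): the pre-load only reads the super block asynchronously; the
+ * in-memory components are built afterwards by loadVDOComponents(), which
+ * invokes decodeVDO() above.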
+ * + * @param completion The sub-task completion + **/ +static void preLoadCallback(VDOCompletion *completion) +{ + VDO *vdo = vdoFromLoadSubTask(completion); + assertOnAdminThread(vdo, __func__); + prepareAdminSubTask(vdo, loadVDOComponents, abortLoad); + loadSuperBlockAsync(completion, getFirstBlockOffset(vdo), &vdo->superBlock); +} + +/**********************************************************************/ +int prepareToLoadVDO(VDO *vdo, const VDOLoadConfig *loadConfig) +{ + vdo->loadConfig = *loadConfig; + return performAdminOperation(vdo, ADMIN_OPERATION_LOAD, NULL, + preLoadCallback, preLoadCallback); +} + +/**********************************************************************/ +__attribute__((warn_unused_result)) +static int decodeSynchronousVDO(VDO *vdo, bool validateConfig) +{ + int result = startVDODecode(vdo, validateConfig); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeVDOLayout(getComponentBuffer(vdo->superBlock), &vdo->layout); + if (result != VDO_SUCCESS) { + return result; + } + + return finishVDODecode(vdo); +} + +/**********************************************************************/ +int loadVDOSuperblock(PhysicalLayer *layer, + VolumeGeometry *geometry, + bool validateConfig, + VDODecoder *decoder, + VDO **vdoPtr) +{ + VDO *vdo; + int result = makeVDO(layer, &vdo); + if (result != VDO_SUCCESS) { + return result; + } + + setLoadConfigFromGeometry(geometry, &vdo->loadConfig); + result = loadSuperBlock(layer, getFirstBlockOffset(vdo), &vdo->superBlock); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + result = ((decoder == NULL) + ? decodeSynchronousVDO(vdo, validateConfig) + : decoder(vdo, validateConfig)); + if (result != VDO_SUCCESS) { + freeVDO(&vdo); + return result; + } + + *vdoPtr = vdo; + return VDO_SUCCESS; + +} +/**********************************************************************/ +int loadVDO(PhysicalLayer *layer, + bool validateConfig, + VDODecoder *decoder, + VDO **vdoPtr) +{ + VolumeGeometry geometry; + int result = loadVolumeGeometry(layer, &geometry); + if (result != VDO_SUCCESS) { + return result; + } + + return loadVDOSuperblock(layer, &geometry, validateConfig, decoder, vdoPtr); +} diff --git a/source/vdo/base/vdoLoad.h b/source/vdo/base/vdoLoad.h new file mode 100644 index 0000000..893d6e4 --- /dev/null +++ b/source/vdo/base/vdoLoad.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoLoad.h#3 $ + */ + +#ifndef VDO_LOAD_H +#define VDO_LOAD_H + +#include "volumeGeometry.h" +#include "types.h" + +/** + * A function which decodes a VDO from a super block. 
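+ *
+ * For illustration only (an editor's sketch based on the declarations below,
+ * not part of the original source), a user-space tool might load a volume
+ * with the default decoder like this, assuming an already-opened
+ * PhysicalLayer named layer:
+ *
+ *   VDO *vdo;
+ *   int result = loadVDO(layer, true, NULL, &vdo);
+ *   if (result != VDO_SUCCESS) {
+ *     return result;
+ *   }
+ *   // ... examine the loaded VDO ...
+ *   freeVDO(&vdo);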
+ *
+ * @param vdo             The VDO to be decoded (its super block must already
+ *                        be loaded)
+ * @param validateConfig  If true, the VDO's configuration will
+ *                        be validated before the decode is attempted
+ *
+ * @return VDO_SUCCESS or an error
+ **/
+typedef int VDODecoder(VDO *vdo, bool validateConfig);
+
+/**
+ * Load a VDO for normal operation. This method must not be called from a base
+ * thread.
+ *
+ * @param vdo  The VDO to load
+ *
+ * @return VDO_SUCCESS or an error
+ **/
+int performVDOLoad(VDO *vdo)
+  __attribute__((warn_unused_result));
+
+/**
+ * Prepare a VDO for loading by reading structures off disk. This method does
+ * not alter the on-disk state. It should be called from the VDO constructor,
+ * whereas performVDOLoad() will be called during pre-resume if the VDO has
+ * not been resumed before.
+ *
+ * @param vdo         The VDO to prepare
+ * @param loadConfig  The load configuration to use
+ *
+ * @return VDO_SUCCESS or an error
+ **/
+int prepareToLoadVDO(VDO *vdo, const VDOLoadConfig *loadConfig)
+  __attribute__((warn_unused_result));
+
+/**
+ * Synchronously load a VDO from a specified super block location for use by
+ * user-space tools.
+ *
+ * @param [in]  layer           The physical layer the VDO sits on
+ * @param [in]  geometry        A pointer to the geometry for the volume
+ * @param [in]  validateConfig  Whether to validate the VDO against the layer
+ * @param [in]  decoder         The VDO decoder to use; if NULL, the default
+ *                              decoder will be used
+ * @param [out] vdoPtr          A pointer to hold the decoded VDO
+ *
+ * @return VDO_SUCCESS or an error
+ **/
+int loadVDOSuperblock(PhysicalLayer  *layer,
+                      VolumeGeometry *geometry,
+                      bool            validateConfig,
+                      VDODecoder     *decoder,
+                      VDO           **vdoPtr)
+  __attribute__((warn_unused_result));
+
+/**
+ * Synchronously load a VDO volume for use by user-space tools.
+ *
+ * @param [in]  layer           The physical layer the VDO sits on
+ * @param [in]  validateConfig  Whether to validate the VDO against the layer
+ * @param [in]  decoder         The VDO decoder to use; if NULL, the default
+ *                              decoder will be used
+ * @param [out] vdoPtr          A pointer to hold the decoded VDO
+ *
+ * @return VDO_SUCCESS or an error
+ **/
+int loadVDO(PhysicalLayer *layer,
+            bool           validateConfig,
+            VDODecoder    *decoder,
+            VDO          **vdoPtr)
+  __attribute__((warn_unused_result));
+
+#endif /* VDO_LOAD_H */
diff --git a/source/vdo/base/vdoPageCache.c b/source/vdo/base/vdoPageCache.c
new file mode 100644
index 0000000..c8f4585
--- /dev/null
+++ b/source/vdo/base/vdoPageCache.c
@@ -0,0 +1,1369 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoPageCache.c#11 $ + */ + +#include "vdoPageCacheInternals.h" + +#if __KERNEL__ +#include +#endif + +#include "errors.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +#include "adminState.h" +#include "constants.h" +#include "numUtils.h" +#include "readOnlyNotifier.h" +#include "statusCodes.h" +#include "types.h" +#include "vio.h" + +enum { + LOG_INTERVAL = 4000, + DISPLAY_INTERVAL = 100000, +}; + +/**********************************************************************/ +static char *getPageBuffer(PageInfo *info) +{ + VDOPageCache *cache = info->cache; + return &cache->pages[(info - cache->infos) * VDO_BLOCK_SIZE]; +} + +/** + * Allocate components of the cache which require their own allocation. The + * caller is responsible for all clean up on errors. + * + * @param cache The cache being constructed + * + * @return VDO_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int allocateCacheComponents(VDOPageCache *cache) +{ + int result = ALLOCATE(cache->pageCount, PageInfo, "page infos", + &cache->infos); + if (result != UDS_SUCCESS) { + return result; + } + + uint64_t size = cache->pageCount * (uint64_t) VDO_BLOCK_SIZE; + result = allocateMemory(size, VDO_BLOCK_SIZE, "cache pages", &cache->pages); + if (result != UDS_SUCCESS) { + return result; + } + + return makeIntMap(cache->pageCount, 0, &cache->pageMap); +} + +/** + * Initialize all page info structures and put them on the free list. + * + * @param cache The cache to initialize + * + * @return VDO_SUCCESS or an error + **/ +static int initializeInfo(VDOPageCache *cache) +{ + initializeRing(&cache->freeList); + PageInfo *info; + for (info = cache->infos; info < cache->infos + cache->pageCount; ++info) { + info->cache = cache; + info->state = PS_FREE; + info->pbn = NO_PAGE; + + if (cache->layer->createMetadataVIO != NULL) { + int result = createVIO(cache->layer, VIO_TYPE_BLOCK_MAP, + VIO_PRIORITY_METADATA, info, getPageBuffer(info), + &info->vio); + if (result != VDO_SUCCESS) { + return result; + } + + // The thread ID should never change. 
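+      // Editorial note (not from the original source): the VIO's callbacks
+      // are pinned to the block map zone's thread here; assertOnCacheThread()
+      // below enforces that all cache operations run on that same thread.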
+ info->vio->completion.callbackThreadID = cache->zone->threadID; + } + + initializeRing(&info->listNode); + pushRingNode(&cache->freeList, &info->listNode); + initializeRing(&info->lruNode); + } + + relaxedStore64(&cache->stats.counts.freePages, cache->pageCount); + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void writeDirtyPagesCallback(RingNode *node, void *context); + +/**********************************************************************/ +int makeVDOPageCache(PhysicalLayer *layer, + PageCount pageCount, + VDOPageReadFunction *readHook, + VDOPageWriteFunction *writeHook, + size_t pageContextSize, + BlockCount maximumAge, + BlockMapZone *zone, + VDOPageCache **cachePtr) +{ + int result = ASSERT(pageContextSize <= MAX_PAGE_CONTEXT_SIZE, + "page context size %zu cannot exceed %u bytes", + pageContextSize, MAX_PAGE_CONTEXT_SIZE); + if (result != VDO_SUCCESS) { + return result; + } + + VDOPageCache *cache; + result = ALLOCATE(1, VDOPageCache, "page cache", &cache); + if (result != UDS_SUCCESS) { + return result; + } + + cache->layer = layer; + cache->pageCount = pageCount; + cache->readHook = readHook; + cache->writeHook = writeHook; + cache->zone = zone; + + result = allocateCacheComponents(cache); + if (result != VDO_SUCCESS) { + freeVDOPageCache(&cache); + return result; + } + + result = initializeInfo(cache); + if (result != VDO_SUCCESS) { + freeVDOPageCache(&cache); + return result; + } + + result = makeDirtyLists(maximumAge, writeDirtyPagesCallback, cache, + &cache->dirtyLists); + if (result != VDO_SUCCESS) { + freeVDOPageCache(&cache); + return result; + } + + // initialize empty circular queues + initializeRing(&cache->lruList); + initializeRing(&cache->outgoingList); + + *cachePtr = cache; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeVDOPageCache(VDOPageCache **cachePtr) +{ + VDOPageCache *cache = *cachePtr; + if (cache == NULL) { + return; + } + + if (cache->infos != NULL) { + PageInfo *info; + for (info = cache->infos; info < cache->infos + cache->pageCount; ++info) { + freeVIO(&info->vio); + } + } + + freeDirtyLists(&cache->dirtyLists); + freeIntMap(&cache->pageMap); + FREE(cache->infos); + FREE(cache->pages); + FREE(cache); + *cachePtr = NULL; +} + +/**********************************************************************/ +void setVDOPageCacheInitialPeriod(VDOPageCache *cache, SequenceNumber period) +{ + setCurrentPeriod(cache->dirtyLists, period); +} + +/**********************************************************************/ +void setVDOPageCacheRebuildMode(VDOPageCache *cache, bool rebuilding) +{ + cache->rebuilding = rebuilding; +} + +/** + * Assert that a function has been called on the VDO page cache's thread. + * + * @param cache the page cache + * @param functionName the name of the function + **/ +static inline void assertOnCacheThread(VDOPageCache *cache, + const char *functionName) +{ + ThreadID threadID = getCallbackThreadID(); + ASSERT_LOG_ONLY((threadID == cache->zone->threadID), + "%s() must only be called on cache thread %d, not thread %d", + functionName, cache->zone->threadID, threadID); +} + +/** + * Assert that a page cache may issue I/O. + * + * @param cache the page cache + **/ +static inline void assertIOAllowed(VDOPageCache *cache) +{ + ASSERT_LOG_ONLY(!isQuiescent(&cache->zone->state), + "VDO page cache may issue I/O"); +} + +/** + * Log and, if enabled, report cache pressure. 
+ * + * @param cache the page cache + **/ +static void reportCachePressure(VDOPageCache *cache) +{ + relaxedAdd64(&cache->stats.cachePressure, 1); + if (cache->waiterCount > cache->pageCount) { + if ((cache->pressureReport % LOG_INTERVAL) == 0) { + logInfo("page cache pressure %llu", + relaxedLoad64(&cache->stats.cachePressure)); + } + + if (++cache->pressureReport >= DISPLAY_INTERVAL) { + cache->pressureReport = 0; + } + } +} + +/**********************************************************************/ +const char *vpcPageStateName(PageState state) +{ + static const char *stateNames[] = { + "FREE", + "INCOMING", + "FAILED", + "RESIDENT", + "DIRTY", + "OUTGOING" + }; + STATIC_ASSERT(COUNT_OF(stateNames) == PAGE_STATE_COUNT); + + int result = ASSERT(state < COUNT_OF(stateNames), + "Unknown PageState value %d", state); + if (result != UDS_SUCCESS) { + return "[UNKNOWN PAGE STATE]"; + } + + return stateNames[state]; +} + +/** + * Update the counter associated with a given state. + * + * @param info the page info to count + * @param delta the delta to apply to the counter + **/ +static void updateCounter(PageInfo *info, int32_t delta) +{ + VDOPageCache *cache = info->cache; + switch (info->state) { + case PS_FREE: + relaxedAdd64(&cache->stats.counts.freePages, delta); + return; + + case PS_INCOMING: + relaxedAdd64(&cache->stats.counts.incomingPages, delta); + return; + + case PS_OUTGOING: + relaxedAdd64(&cache->stats.counts.outgoingPages, delta); + return; + + case PS_FAILED: + relaxedAdd64(&cache->stats.counts.failedPages, delta); + return; + + case PS_RESIDENT: + relaxedAdd64(&cache->stats.counts.cleanPages, delta); + return; + + case PS_DIRTY: + relaxedAdd64(&cache->stats.counts.dirtyPages, delta); + return; + + default: + return; + } +} + +/** + * Update the lru information for an active page. + **/ +static void updateLru(PageInfo *info) +{ + VDOPageCache *cache = info->cache; + + if (cache->lruList.prev != &info->lruNode) { + pushRingNode(&cache->lruList, &info->lruNode); + } +} + +/** + * Set the state of a PageInfo and put it on the right list, adjusting + * counters. + * + * @param info the PageInfo to modify + * @param newState the new state for the PageInfo + **/ +static void setInfoState(PageInfo *info, PageState newState) +{ + if (newState == info->state) { + return; + } + + updateCounter(info, -1); + info->state = newState; + updateCounter(info, 1); + + switch (info->state) { + case PS_FREE: + case PS_FAILED: + pushRingNode(&info->cache->freeList, &info->listNode); + return; + + case PS_OUTGOING: + pushRingNode(&info->cache->outgoingList, &info->listNode); + return; + + case PS_DIRTY: + return; + + default: + unspliceRingNode(&info->listNode); + } +} + +/** + * Set the pbn for an info, updating the map as needed. + * + * @param info The page info + * @param pbn The physical block number to set + **/ +__attribute__((warn_unused_result)) +static int setInfoPBN(PageInfo *info, PhysicalBlockNumber pbn) +{ + VDOPageCache *cache = info->cache; + + // Either the new or the old page number must be NO_PAGE. + int result = ASSERT((pbn == NO_PAGE) || (info->pbn == NO_PAGE), + "Must free a page before reusing it."); + if (result != VDO_SUCCESS) { + return result; + } + + if (info->pbn != NO_PAGE) { + intMapRemove(cache->pageMap, info->pbn); + } + + info->pbn = pbn; + + if (pbn != NO_PAGE) { + result = intMapPut(cache->pageMap, pbn, info, true, NULL); + if (result != UDS_SUCCESS) { + return result; + } + } + return VDO_SUCCESS; +} + +/** + * Reset page info to represent an unallocated page. 
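+ *
+ * @param info  The page info to reset
+ *
+ * @return VDO_SUCCESS or an error (documentation added editorially; the
+ *         assertions below reject a page that is still busy or has waiters)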
+ **/ +static int resetPageInfo(PageInfo *info) +{ + int result = ASSERT(info->busy == 0, "VDO Page must not be busy"); + if (result != UDS_SUCCESS) { + return result; + } + + result = ASSERT(!hasWaiters(&info->waiting), + "VDO Page must not have waiters"); + if (result != UDS_SUCCESS) { + return result; + } + + result = setInfoPBN(info, NO_PAGE); + setInfoState(info, PS_FREE); + unspliceRingNode(&info->lruNode); + return result; +} + +/** + * Find a free page. + * + * @param cache the page cache + * + * @return a pointer to the page info structure (if found), NULL otherwise + **/ +__attribute__((warn_unused_result)) +static PageInfo *findFreePage(VDOPageCache *cache) +{ + if (cache->freeList.next == &cache->freeList) { + return NULL; + } + PageInfo *info = pageInfoFromListNode(cache->freeList.next); + unspliceRingNode(&info->listNode); + return info; +} + +/**********************************************************************/ +PageInfo *vpcFindPage(VDOPageCache *cache, PhysicalBlockNumber pbn) +{ + if ((cache->lastFound != NULL) + && (cache->lastFound->pbn == pbn)) { + return cache->lastFound; + } + cache->lastFound = intMapGet(cache->pageMap, pbn); + return cache->lastFound; +} + +/** + * Determine which page is least recently used. + * + * @param cache the page cache structure + * + * @return a pointer to the info structure for a relevant page, + * or NULL if no such page can be found. The page can be + * dirty or resident. + * + * @note Picks the least recently used from among the non-busy entries + * at the front of each of the lru ring. + * Since whenever we mark a page busy we also put it to the end + * of the ring it is unlikely that the entries at the front + * are busy unless the queue is very short, but not impossible. + **/ +__attribute__((warn_unused_result)) +static PageInfo *selectLRUPage(VDOPageCache *cache) +{ + PageInfoNode *lru; + for (lru = cache->lruList.next; + lru != &cache->lruList; + lru = lru->next) { + PageInfo *info = pageInfoFromLRUNode(lru); + if ((info->busy == 0) && !isInFlight(info)) { + return info; + } + } + + return NULL; +} + +/**********************************************************************/ +AtomicPageCacheStatistics *getVDOPageCacheStatistics(VDOPageCache *cache) +{ + return &cache->stats; +} + +// ASYNCHRONOUS INTERFACE BEYOND THIS POINT + +/** + * Helper to complete the VDO Page Completion request successfully. + * + * @param info the page info representing the result page + * @param vdoPageComp the VDO page completion to complete + **/ +static void completeWithPage(PageInfo *info, VDOPageCompletion *vdoPageComp) +{ + bool available = vdoPageComp->writable ? isPresent(info) : isValid(info); + if (!available) { + logErrorWithStringError(VDO_BAD_PAGE, + "Requested cache page %llu in state %s is" + " not %s", + info->pbn, vpcPageStateName(info->state), + vdoPageComp->writable ? "present" : "valid"); + finishCompletion(&vdoPageComp->completion, VDO_BAD_PAGE); + return; + } + + vdoPageComp->info = info; + vdoPageComp->ready = true; + finishCompletion(&vdoPageComp->completion, VDO_SUCCESS); +} + +/** + * Complete a page completion with an error code. Implements WaiterCallback. + * + * @param waiter The page completion, as a waiter + * @param resultPtr A pointer to the error code. 
+ **/ +static void completeWaiterWithError(Waiter *waiter, void *resultPtr) +{ + int *result = resultPtr; + VDOPageCompletion *completion = pageCompletionFromWaiter(waiter); + finishCompletion(&completion->completion, *result); +} + +/** + * Complete a queue of VDOPageCompletions with an error code. + * + * @param [in] result the error result + * @param [in, out] queue a pointer to the queue + * + * @note upon completion the queue will be empty + **/ +static void distributeErrorOverQueue(int result, WaitQueue *queue) +{ + notifyAllWaiters(queue, completeWaiterWithError, &result); +} + +/** + * Complete a page completion with a page. Implements WaiterCallback. + * + * @param waiter The page completion, as a waiter + * @param pageInfo The page info to complete with + **/ +static void completeWaiterWithPage(Waiter *waiter, void *pageInfo) +{ + PageInfo *info = pageInfo; + VDOPageCompletion *completion = pageCompletionFromWaiter(waiter); + completeWithPage(info, completion); +} + +/** + * Complete a queue of VDOPageCompletions with a page result. + * + * @param [in] info the page info describing the page + * @param [in, out] queue a pointer to a queue of waiters + * + * @return the number of pages distributed + * + * @note upon completion the queue will be empty + * + **/ +static unsigned int distributePageOverQueue(PageInfo *info, WaitQueue *queue) +{ + updateLru(info); + + size_t pages = countWaiters(queue); + + /* + * Increment the busy count once for each pending completion so that + * this page does not stop being busy until all completions have + * been processed (VDO-83). + */ + info->busy += pages; + + notifyAllWaiters(queue, completeWaiterWithPage, info); + return pages; +} + +/** + * Set a persistent error which all requests will receive in the future. + * + * @param cache the page cache + * @param context a string describing what triggered the error + * @param result the error result + * + * Once triggered, all enqueued completions will get this error. + * Any future requests will result in this error as well. + **/ +static void setPersistentError(VDOPageCache *cache, + const char *context, + int result) +{ + // If we're already read-only, there's no need to log. 
+ ReadOnlyNotifier *notifier = cache->zone->readOnlyNotifier; + if ((result != VDO_READ_ONLY) && !isReadOnly(notifier)) { + logErrorWithStringError(result, "VDO Page Cache persistent error: %s", + context); + enterReadOnlyMode(notifier, result); + } + + assertOnCacheThread(cache, __func__); + + distributeErrorOverQueue(result, &cache->freeWaiters); + cache->waiterCount = 0; + + PageInfo *info; + for (info = cache->infos; info < cache->infos + cache->pageCount; ++info) { + distributeErrorOverQueue(result, &info->waiting); + } +} + +/**********************************************************************/ +void initVDOPageCompletion(VDOPageCompletion *pageCompletion, + VDOPageCache *cache, + PhysicalBlockNumber pbn, + bool writable, + void *parent, + VDOAction *callback, + VDOAction *errorHandler) +{ + ASSERT_LOG_ONLY((pageCompletion->waiter.nextWaiter == NULL), + "New page completion was not already on a wait queue"); + + *pageCompletion = (VDOPageCompletion) { + .pbn = pbn, + .writable = writable, + .cache = cache, + }; + + VDOCompletion *completion = &pageCompletion->completion; + initializeCompletion(completion, VDO_PAGE_COMPLETION, cache->layer); + prepareCompletion(completion, callback, errorHandler, cache->zone->threadID, + parent); +} + +/** + * Helper function to check that a completion represents a successfully + * completed VDO Page Completion referring to a valid page. + * + * @param completion a VDO completion + * @param writable whether a writable page is required + * + * @return the embedding completion if valid, NULL if not + **/ +__attribute__((warn_unused_result)) +static VDOPageCompletion *validateCompletedPage(VDOCompletion *completion, + bool writable) +{ + VDOPageCompletion *vpc = asVDOPageCompletion(completion); + + int result = ASSERT(vpc->ready, "VDO Page completion not ready"); + if (result != UDS_SUCCESS) { + return NULL; + } + + result = ASSERT(vpc->info != NULL, "VDO Page Completion must be complete"); + if (result != UDS_SUCCESS) { + return NULL; + } + + result = ASSERT(vpc->info->pbn == vpc->pbn, + "VDO Page Completion pbn must be consistent"); + if (result != UDS_SUCCESS) { + return NULL; + } + + result = ASSERT(isValid(vpc->info), + "VDO Page Completion page must be valid"); + if (result != UDS_SUCCESS) { + return NULL; + } + + if (writable) { + result = ASSERT(vpc->writable, "VDO Page Completion is writable"); + if (result != UDS_SUCCESS) { + return NULL; + } + } + + return vpc; +} + +/**********************************************************************/ +bool isPageCacheActive(VDOPageCache *cache) +{ + return ((cache->outstandingReads != 0) || (cache->outstandingWrites != 0)); +} + +/** + * VIO callback used when a page has been loaded. + * + * @param completion A completion for the VIO, the parent of which is a + * PageInfo. + **/ +static void pageIsLoaded(VDOCompletion *completion) +{ + PageInfo *info = completion->parent; + VDOPageCache *cache = info->cache; + assertOnCacheThread(cache, __func__); + + setInfoState(info, PS_RESIDENT); + distributePageOverQueue(info, &info->waiting); + + /* + * Don't decrement until right before calling checkForDrainComplete() to + * ensure that the above work can't cause the page cache to be freed out from + * under us. + */ + cache->outstandingReads--; + checkForDrainComplete(cache->zone); +} + +/** + * Handle page load errors. 
+ * + * @param completion The page read VIO + **/ +static void handleLoadError(VDOCompletion *completion) +{ + int result = completion->result; + PageInfo *info = completion->parent; + VDOPageCache *cache = info->cache; + assertOnCacheThread(cache, __func__); + + enterReadOnlyMode(cache->zone->readOnlyNotifier, result); + relaxedAdd64(&cache->stats.failedReads, 1); + setInfoState(info, PS_FAILED); + distributeErrorOverQueue(result, &info->waiting); + resetPageInfo(info); + + /* + * Don't decrement until right before calling checkForDrainComplete() to + * ensure that the above work can't cause the page cache to be freed out from + * under us. + */ + cache->outstandingReads--; + checkForDrainComplete(cache->zone); +} + +/** + * Run the read hook after a page is loaded. This callback is registered in + * launchPageLoad() when there is a read hook. + * + * @param completion The page load completion + **/ +static void runReadHook(VDOCompletion *completion) +{ + PageInfo *info = completion->parent; + completion->callback = pageIsLoaded; + resetCompletion(completion); + int result = info->cache->readHook(getPageBuffer(info), info->pbn, + info->cache->zone, info->context); + continueCompletion(completion, result); +} + +/** + * Handle a read error during a read-only rebuild. + * + * @param completion The page load completion + **/ +static void handleRebuildReadError(VDOCompletion *completion) +{ + PageInfo *info = completion->parent; + VDOPageCache *cache = info->cache; + assertOnCacheThread(cache, __func__); + + // We are doing a read-only rebuild, so treat this as a successful read + // of an uninitialized page. + relaxedAdd64(&cache->stats.failedReads, 1); + memset(getPageBuffer(info), 0, VDO_BLOCK_SIZE); + resetCompletion(completion); + if (cache->readHook != NULL) { + runReadHook(completion); + } else { + pageIsLoaded(completion); + } +} + +/** + * Begin the process of loading a page. + * + * @param info the page info representing where to load the page + * @param pbn the absolute pbn of the desired page + * + * @return VDO_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int launchPageLoad(PageInfo *info, PhysicalBlockNumber pbn) +{ + VDOPageCache *cache = info->cache; + assertIOAllowed(cache); + + int result = setInfoPBN(info, pbn); + if (result != VDO_SUCCESS) { + return result; + } + + result = ASSERT((info->busy == 0), "Page is not busy before loading."); + if (result != VDO_SUCCESS) { + return result; + } + + setInfoState(info, PS_INCOMING); + cache->outstandingReads++; + relaxedAdd64(&cache->stats.pagesLoaded, 1); + launchReadMetadataVIO(info->vio, pbn, + (cache->readHook != NULL) ? runReadHook : pageIsLoaded, + (cache->rebuilding + ? handleRebuildReadError : handleLoadError)); + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void writePages(VDOCompletion *completion); + +/** + * Handle errors flushing the layer. + * + * @param completion The flush VIO + **/ +static void handleFlushError(VDOCompletion *completion) +{ + VDOPageCache *cache = ((PageInfo *) completion->parent)->cache; + setPersistentError(cache, "flush failed", completion->result); + writePages(completion); +} + +/** + * Attempt to save the outgoing pages by first flushing the layer. 
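+ *
+ * Editorial note (summarizing the body below, not from the original source):
+ * the flush makes the recovery journal entries covering these pages durable
+ * before the pages themselves are written; with a sync write policy the
+ * journal blocks are already written with FUA, so the flush is skipped and
+ * writePages() is called directly.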
+ * + * @param cache The cache + **/ +static void savePages(VDOPageCache *cache) +{ + if ((cache->pagesInFlush > 0) || (cache->pagesToFlush == 0)) { + return; + } + + assertIOAllowed(cache); + + PageInfo *info = pageInfoFromListNode(cache->outgoingList.next); + cache->pagesInFlush = cache->pagesToFlush; + cache->pagesToFlush = 0; + relaxedAdd64(&cache->stats.flushCount, 1); + + VIO *vio = info->vio; + PhysicalLayer *layer = vio->completion.layer; + + /* + * We must make sure that the recovery journal entries that changed these + * pages were successfully persisted, and thus must issue a flush before + * each batch of pages is written to ensure this. However, in sync mode, + * every journal block is written with FUA, thus guaranteeing the journal + * persisted already. + */ + if (layer->getWritePolicy(layer) != WRITE_POLICY_SYNC) { + launchFlush(vio, writePages, handleFlushError); + return; + } + + writePages(&vio->completion); +} + +/** + * Add a page to the outgoing list of pages waiting to be saved. Once in the + * list, a page may not be used until it has been written out. + * + * @param info The page to save + **/ +static void schedulePageSave(PageInfo *info) +{ + if (info->busy > 0) { + info->writeStatus = WRITE_STATUS_DEFERRED; + return; + } + + info->cache->pagesToFlush++; + info->cache->outstandingWrites++; + setInfoState(info, PS_OUTGOING); +} + +/**********************************************************************/ +static void writeDirtyPagesCallback(RingNode *expired, void *context) +{ + while (!isRingEmpty(expired)) { + schedulePageSave(pageInfoFromListNode(chopRingNode(expired))); + } + + savePages((VDOPageCache *) context); +} + +/** + * Add a page to outgoing pages waiting to be saved, and then start saving + * pages if another save is not in progress. + * + * @param info The page to save + **/ +static void launchPageSave(PageInfo *info) +{ + schedulePageSave(info); + savePages(info->cache); +} + +/** + * Determine whether a given VDOPageCompletion (as a waiter) is requesting a + * given page number. Implements WaiterMatch. + * + * @param waiter The page completion in question + * @param context A pointer to the pbn of the desired page + * + * @return true if the page completion is for the desired page number + **/ +static bool completionNeedsPage(Waiter *waiter, void *context) +{ + PhysicalBlockNumber *pbn = context; + return (pageCompletionFromWaiter(waiter)->pbn == *pbn); +} + +/** + * Allocate a free page to the first completion in the waiting queue, + * and any other completions that match it in page number. + **/ +static void allocateFreePage(PageInfo *info) +{ + VDOPageCache *cache = info->cache; + assertOnCacheThread(cache, __func__); + + if (!hasWaiters(&cache->freeWaiters)) { + if (relaxedLoad64(&cache->stats.cachePressure) > 0) { + logInfo("page cache pressure relieved"); + relaxedStore64(&cache->stats.cachePressure, 0); + } + return; + } + + int result = resetPageInfo(info); + if (result != VDO_SUCCESS) { + setPersistentError(cache, "cannot reset page info", result); + return; + } + + Waiter *oldestWaiter = getFirstWaiter(&cache->freeWaiters); + PhysicalBlockNumber pbn = pageCompletionFromWaiter(oldestWaiter)->pbn; + + // Remove all entries which match the page number in question + // and push them onto the page info's wait queue. 
+ dequeueMatchingWaiters(&cache->freeWaiters, completionNeedsPage, + &pbn, &info->waiting); + cache->waiterCount -= countWaiters(&info->waiting); + + result = launchPageLoad(info, pbn); + if (result != VDO_SUCCESS) { + distributeErrorOverQueue(result, &info->waiting); + } +} + +/** + * Begin the process of discarding a page. + * + * @param cache the page cache + * + * @note If no page is discardable, increments a count of deferred frees so + * that the next release of a page which is no longer busy will kick + * off another discard cycle. This is an indication that the cache is + * not big enough. + * + * @note If the selected page is not dirty, immediately allocates the page + * to the oldest completion waiting for a free page. + **/ +static void discardAPage(VDOPageCache *cache) +{ + PageInfo *info = selectLRUPage(cache); + if (info == NULL) { + reportCachePressure(cache); + return; + } + + if (!isDirty(info)) { + allocateFreePage(info); + return; + } + + ASSERT_LOG_ONLY(!isInFlight(info), + "page selected for discard is not in flight"); + + ++cache->discardCount; + info->writeStatus = WRITE_STATUS_DISCARD; + launchPageSave(info); +} + +/** + * Helper used to trigger a discard so that the completion can get a different + * page. + * + * @param vdoPageComp the VDO Page completion + **/ +static void discardPageForCompletion(VDOPageCompletion *vdoPageComp) +{ + VDOPageCache *cache = vdoPageComp->cache; + + ++cache->waiterCount; + + int result = enqueueWaiter(&cache->freeWaiters, &vdoPageComp->waiter); + if (result != VDO_SUCCESS) { + setPersistentError(cache, "cannot enqueue waiter", result); + } + + discardAPage(cache); +} + +/** + * Helper used to trigger a discard if the cache needs another free page. + * + * @param cache the page cache + **/ +static void discardPageIfNeeded(VDOPageCache *cache) +{ + if (cache->waiterCount > cache->discardCount) { + discardAPage(cache); + } +} + +/**********************************************************************/ +void advanceVDOPageCachePeriod(VDOPageCache *cache, SequenceNumber period) +{ + assertOnCacheThread(cache, __func__); + advancePeriod(cache->dirtyLists, period); +} + +/** + * Inform the cache that a write has finished (possibly with an error). + * + * @param info The info structure for the page whose write just completed + * + * @return true if the page write was a discard + **/ +static bool writeHasFinished(PageInfo *info) +{ + assertOnCacheThread(info->cache, __func__); + info->cache->outstandingWrites--; + + bool wasDiscard = (info->writeStatus == WRITE_STATUS_DISCARD); + info->writeStatus = WRITE_STATUS_NORMAL; + return wasDiscard; +} + +/** + * Handler for page write errors. + * + * @param completion The page write VIO + **/ +static void handlePageWriteError(VDOCompletion *completion) +{ + int result = completion->result; + PageInfo *info = completion->parent; + VDOPageCache *cache = info->cache; + + // If we're already read-only, write failures are to be expected. 
+ if (result != VDO_READ_ONLY) { +#if __KERNEL__ + static DEFINE_RATELIMIT_STATE(errorLimiter, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + if (__ratelimit(&errorLimiter)) { + logError("failed to write block map page %llu", info->pbn); + } +#else + logError("failed to write block map page %llu", info->pbn); +#endif + } + + setInfoState(info, PS_DIRTY); + relaxedAdd64(&cache->stats.failedWrites, 1); + setPersistentError(cache, "cannot write page", result); + + if (!writeHasFinished(info)) { + discardPageIfNeeded(cache); + } + + checkForDrainComplete(cache->zone); +} + +/** + * VIO callback used when a page has been written out. + * + * @param completion A completion for the VIO, the parent of which + * is embedded in PageInfo. + **/ +static void pageIsWrittenOut(VDOCompletion *completion) +{ + PageInfo *info = completion->parent; + VDOPageCache *cache = info->cache; + + if (cache->writeHook != NULL) { + bool rewrite = cache->writeHook(getPageBuffer(info), cache->zone, + info->context); + if (rewrite) { + launchWriteMetadataVIOWithFlush(info->vio, info->pbn, pageIsWrittenOut, + handlePageWriteError, true, false); + return; + } + } + + bool wasDiscard = writeHasFinished(info); + bool reclaimed = (!wasDiscard || (info->busy > 0) + || hasWaiters(&info->waiting)); + + setInfoState(info, PS_RESIDENT); + + uint32_t reclamations = distributePageOverQueue(info, &info->waiting); + relaxedAdd64(&cache->stats.reclaimed, reclamations); + + if (wasDiscard) { + cache->discardCount--; + } + + if (reclaimed) { + discardPageIfNeeded(cache); + } else { + allocateFreePage(info); + } + + checkForDrainComplete(cache->zone); +} + +/** + * Write the batch of pages which were covered by the layer flush which just + * completed. This callback is registered in savePages(). + * + * @param flushCompletion The flush VIO + **/ +static void writePages(VDOCompletion *flushCompletion) +{ + VDOPageCache *cache = ((PageInfo *) flushCompletion->parent)->cache; + + /* + * We need to cache these two values on the stack since in the error case + * below, it is possible for the last page info to cause the page cache to + * get freed. Hence once we launch the last page, it may be unsafe to + * dereference the cache [VDO-4724]. + */ + bool hasUnflushedPages = (cache->pagesToFlush > 0); + PageCount pagesInFlush = cache->pagesInFlush; + cache->pagesInFlush = 0; + while (pagesInFlush-- > 0) { + PageInfo *info = pageInfoFromListNode(chopRingNode(&cache->outgoingList)); + if (isReadOnly(info->cache->zone->readOnlyNotifier)) { + VDOCompletion *completion = &info->vio->completion; + resetCompletion(completion); + completion->callback = pageIsWrittenOut; + completion->errorHandler = handlePageWriteError; + finishCompletion(completion, VDO_READ_ONLY); + continue; + } + relaxedAdd64(&info->cache->stats.pagesSaved, 1); + launchWriteMetadataVIO(info->vio, info->pbn, pageIsWrittenOut, + handlePageWriteError); + } + + if (hasUnflushedPages) { + // If there are unflushed pages, the cache can't have been freed, so this + // call is safe. 
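The stack-caching at the top of writePages(), copying hasUnflushedPages and pagesInFlush to locals, is the load-bearing detail here: launching the final page write can cause the cache itself to be freed, so nothing may read through the cache pointer afterwards unless, as in this branch, it is known to still be referenced. A minimal standalone sketch of that discipline, with invented names and a do-nothing stand-in for the write launch:

    #include <stdbool.h>

    typedef struct {
      int pendingWrites;   /* writes waiting for the next batch */
      int writesInBatch;   /* writes to launch right now */
    } ExampleOwner;

    /* Stand-in for launching one asynchronous write. In the real code the
     * completion of the final write may free the owner structure. */
    static void exampleLaunchWrite(ExampleOwner *owner)
    {
      (void) owner;
    }

    static void exampleLaunchBatch(ExampleOwner *owner)
    {
      /* Copy everything needed after the loop before launching anything. */
      bool moreToDo = (owner->pendingWrites > 0);
      int  count    = owner->writesInBatch;
      owner->writesInBatch = 0;

      while (count-- > 0) {
        /* After the last launch the owner may no longer be safe to touch. */
        exampleLaunchWrite(owner);
      }

      if (moreToDo) {
        /* Safe only because moreToDo implies the owner is still referenced,
         * mirroring the reasoning in writePages() above. */
        exampleLaunchWrite(owner);
      }
    }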
+ savePages(cache); + } +} + +/**********************************************************************/ +void releaseVDOPageCompletion(VDOCompletion *completion) +{ + if (completion == NULL) { + return; + } + + PageInfo *discardInfo = NULL; + VDOPageCompletion *pageCompletion; + if (completion->result == VDO_SUCCESS) { + pageCompletion = validateCompletedPage(completion, false); + if (--pageCompletion->info->busy == 0) { + discardInfo = pageCompletion->info; + } + } else { + // Do not check for errors if the completion was not successful. + pageCompletion = asVDOPageCompletion(completion); + } + ASSERT_LOG_ONLY((pageCompletion->waiter.nextWaiter == NULL), + "Page being released after leaving all queues"); + + VDOPageCache *cache = pageCompletion->cache; + assertOnCacheThread(cache, __func__); + memset(pageCompletion, 0, sizeof(VDOPageCompletion)); + + if (discardInfo != NULL) { + if (discardInfo->writeStatus == WRITE_STATUS_DEFERRED) { + discardInfo->writeStatus = WRITE_STATUS_NORMAL; + launchPageSave(discardInfo); + } + // if there are excess requests for pages (that have not already started + // discards) we need to discard some page (which may be this one) + discardPageIfNeeded(cache); + } +} + +/** + * Helper function to load a page as described by a VDO Page Completion. + * + * @param info the page info representing where to load the page + * @param vdoPageComp the VDO Page Completion describing the page + **/ +static void loadPageForCompletion(PageInfo *info, + VDOPageCompletion *vdoPageComp) +{ + int result = enqueueWaiter(&info->waiting, &vdoPageComp->waiter); + if (result != VDO_SUCCESS) { + finishCompletion(&vdoPageComp->completion, result); + return; + } + + result = launchPageLoad(info, vdoPageComp->pbn); + if (result != VDO_SUCCESS) { + distributeErrorOverQueue(result, &info->waiting); + } +} + +/**********************************************************************/ +void getVDOPageAsync(VDOCompletion *completion) +{ + VDOPageCompletion *vdoPageComp = asVDOPageCompletion(completion); + VDOPageCache *cache = vdoPageComp->cache; + assertOnCacheThread(cache, __func__); + + if (vdoPageComp->writable && isReadOnly(cache->zone->readOnlyNotifier)) { + finishCompletion(completion, VDO_READ_ONLY); + return; + } + + if (vdoPageComp->writable) { + relaxedAdd64(&cache->stats.writeCount, 1); + } else { + relaxedAdd64(&cache->stats.readCount, 1); + } + + PageInfo *info = vpcFindPage(cache, vdoPageComp->pbn); + if (info != NULL) { + // The page is in the cache already. + if ((info->writeStatus == WRITE_STATUS_DEFERRED) || isIncoming(info) + || (isOutgoing(info) && vdoPageComp->writable)) { + // The page is unusable until it has finished I/O. + relaxedAdd64(&cache->stats.waitForPage, 1); + int result = enqueueWaiter(&info->waiting, &vdoPageComp->waiter); + if (result != VDO_SUCCESS) { + finishCompletion(&vdoPageComp->completion, result); + } + + return; + } + + if (isValid(info)) { + // The page is usable. + relaxedAdd64(&cache->stats.foundInCache, 1); + if (!isPresent(info)) { + relaxedAdd64(&cache->stats.readOutgoing, 1); + } + updateLru(info); + ++info->busy; + completeWithPage(info, vdoPageComp); + return; + } + // Something horrible has gone wrong. + ASSERT_LOG_ONLY(false, "Info found in a usable state."); + } + + // The page must be fetched. + info = findFreePage(cache); + if (info != NULL) { + relaxedAdd64(&cache->stats.fetchRequired, 1); + loadPageForCompletion(info, vdoPageComp); + return; + } + + // The page must wait for a page to be discarded. 
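Requests that reach this point are tallied in stats.discardRequired; together with waitForPage it is the clearest sign that the configured page cache is too small for the working set. A rough monitoring sketch, assuming getVDOPageCacheStatistics() and relaxedLoad64() behave as declared elsewhere in this patch; the helper name and the one-percent threshold are invented for illustration:

    /* Hypothetical helper: log when too many page gets had to force a discard. */
    static void checkCachePressure(VDOPageCache *cache)
    {
      AtomicPageCacheStatistics *stats = getVDOPageCacheStatistics(cache);
      uint64_t reads    = relaxedLoad64(&stats->readCount);
      uint64_t writes   = relaxedLoad64(&stats->writeCount);
      uint64_t discards = relaxedLoad64(&stats->discardRequired);

      /* Illustrative threshold: more than 1% of gets forced a discard. */
      if (((reads + writes) > 0) && ((discards * 100) > (reads + writes))) {
        logInfo("block map cache under pressure: %llu of %llu gets required a discard",
                (unsigned long long) discards,
                (unsigned long long) (reads + writes));
      }
    }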
+ relaxedAdd64(&cache->stats.discardRequired, 1); + discardPageForCompletion(vdoPageComp); +} + +/**********************************************************************/ +void markCompletedVDOPageDirty(VDOCompletion *completion, + SequenceNumber oldDirtyPeriod, + SequenceNumber newDirtyPeriod) +{ + VDOPageCompletion *vdoPageComp = validateCompletedPage(completion, true); + if (vdoPageComp == NULL) { + return; + } + + PageInfo *info = vdoPageComp->info; + setInfoState(info, PS_DIRTY); + addToDirtyLists(info->cache->dirtyLists, &info->listNode, oldDirtyPeriod, + newDirtyPeriod); +} + +/**********************************************************************/ +void requestVDOPageWrite(VDOCompletion *completion) +{ + VDOPageCompletion *vdoPageComp = validateCompletedPage(completion, true); + if (vdoPageComp == NULL) { + return; + } + + PageInfo *info = vdoPageComp->info; + setInfoState(info, PS_DIRTY); + launchPageSave(info); +} + +/**********************************************************************/ +static void *dereferencePageCompletion(VDOPageCompletion *completion) +{ + return ((completion != NULL) ? getPageBuffer(completion->info) : NULL); +} + +/**********************************************************************/ +const void *dereferenceReadableVDOPage(VDOCompletion *completion) +{ + return dereferencePageCompletion(validateCompletedPage(completion, false)); +} + +/**********************************************************************/ +void *dereferenceWritableVDOPage(VDOCompletion *completion) +{ + return dereferencePageCompletion(validateCompletedPage(completion, true)); +} + +/**********************************************************************/ +void *getVDOPageCompletionContext(VDOCompletion *completion) +{ + VDOPageCompletion *pageCompletion = asVDOPageCompletion(completion); + PageInfo *info = ((pageCompletion != NULL) ? pageCompletion->info : NULL); + return (((info != NULL) && isValid(info)) ? info->context : NULL); +} + +/**********************************************************************/ +void drainVDOPageCache(VDOPageCache *cache) +{ + assertOnCacheThread(cache, __func__); + ASSERT_LOG_ONLY(isDraining(&cache->zone->state), + "drainVDOPageCache() called during block map drain"); + + if (!isSuspending(&cache->zone->state)) { + flushDirtyLists(cache->dirtyLists); + savePages(cache); + } +} + +/**********************************************************************/ +int invalidateVDOPageCache(VDOPageCache *cache) +{ + assertOnCacheThread(cache, __func__); + + // Make sure we don't throw away any dirty pages. + PageInfo *info; + for (info = cache->infos; info < cache->infos + cache->pageCount; info++) { + int result = ASSERT(!isDirty(info), "cache must have no dirty pages"); + if (result != VDO_SUCCESS) { + return result; + } + } + + // Reset the pageMap by re-allocating it. + freeIntMap(&cache->pageMap); + return makeIntMap(cache->pageCount, 0, &cache->pageMap); +} diff --git a/source/vdo/base/vdoPageCache.h b/source/vdo/base/vdoPageCache.h new file mode 100644 index 0000000..e6a944d --- /dev/null +++ b/source/vdo/base/vdoPageCache.h @@ -0,0 +1,385 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoPageCache.h#7 $ + */ + +#ifndef VDO_PAGE_CACHE_H +#define VDO_PAGE_CACHE_H + +#include "adminState.h" +#include "atomic.h" +#include "completion.h" +#include "types.h" +#include "waitQueue.h" + +/** + * Structure describing page meta data (defined internally). + **/ +typedef struct pageInfo PageInfo; + +/** + * Structure describing entire page cache. + * (Unfortunately the name "PageCache" is already taken by Albireo.) + **/ +typedef struct vdoPageCache VDOPageCache; + +/** + * Generation counter for page references. + **/ +typedef uint32_t VDOPageGeneration; + +/** + * Page-state count statistics sub-structure. + **/ +typedef struct { + /* free pages */ + Atomic64 freePages; + /* clean (resident) pages */ + Atomic64 cleanPages; + /* dirty pages per era */ + Atomic64 dirtyPages; + /* pages incoming */ + Atomic64 incomingPages; + /* pages outgoing */ + Atomic64 outgoingPages; + /* pages in failed state */ + Atomic64 failedPages; +} AtomicPageStateCounts; + +/** + * Statistics and debugging fields for the page cache. + */ +typedef struct { + /* counts of how many pages are in each state */ + AtomicPageStateCounts counts; + /* how many times free page not available */ + Atomic64 cachePressure; + /* number of getVDOPageAsync() for read */ + Atomic64 readCount; + /* number or getVDOPageAsync() for write */ + Atomic64 writeCount; + /* number of times pages failed to read */ + Atomic64 failedReads; + /* number of times pages failed to write */ + Atomic64 failedWrites; + /* number of gets that are reclaimed */ + Atomic64 reclaimed; + /* number of gets for outgoing pages */ + Atomic64 readOutgoing; + /* number of gets that were already there */ + Atomic64 foundInCache; + /* number of gets requiring discard */ + Atomic64 discardRequired; + /* number of gets enqueued for their page */ + Atomic64 waitForPage; + /* number of gets that have to fetch */ + Atomic64 fetchRequired; + /* number of page fetches */ + Atomic64 pagesLoaded; + /* number of page saves */ + Atomic64 pagesSaved; + /* number of flushes initiated */ + Atomic64 flushCount; +} AtomicPageCacheStatistics; + +/** + * Signature for a function to call when a page is read into the cache. + * + *
If specified, this function is called when a page is fetched from disk. + * + * @param rawPage The raw memory of the freshly-fetched page + * @param pbn The absolute physical block number of the page + * @param zone The block map zone to which the cache belongs + * @param pageContext A pointer to client-specific data for the new page + * + * @return VDO_SUCCESS on success or VDO_BAD_PAGE if the page is incorrectly + * formatted + **/ +typedef int VDOPageReadFunction(void *rawPage, + PhysicalBlockNumber pbn, + BlockMapZone *zone, + void *pageContext); + +/** + * Signature for a function to call when a page is written from the cache. + * + *
If specified, this function is called when a page is written to disk. + * + * @param rawPage The raw memory of the freshly-written page + * @param zone The block map zone to which the cache belongs + * @param pageContext A pointer to client-specific data for the new page + * + * @return whether the page needs to be rewritten + **/ +typedef bool VDOPageWriteFunction(void *rawPage, + BlockMapZone *zone, + void *pageContext); + +/** + * Construct a PageCache. + * + * @param [in] layer The physical layer to read and write + * @param [in] pageCount The number of cache pages to hold + * @param [in] readHook The function to be called when a page is read + * into the cache + * @param [in] writeHook The function to be called after a page is + * written from the cache + * @param [in] pageContextSize The size of the per-page context that will be + * passed to the read and write hooks + * @param [in] maximumAge The number of journal blocks before a dirtied + * page is considered old and must be written + * out + * @param [in] zone The block map zone which owns this cache + * @param [out] cachePtr A pointer to hold the cache + * + * @return a success or error code + **/ +int makeVDOPageCache(PhysicalLayer *layer, + PageCount pageCount, + VDOPageReadFunction *readHook, + VDOPageWriteFunction *writeHook, + size_t pageContextSize, + BlockCount maximumAge, + BlockMapZone *zone, + VDOPageCache **cachePtr) + __attribute__((warn_unused_result)); + +/** + * Free the page cache structure and null out the reference to it. + * + * @param cachePtr a pointer to the cache to free + **/ +void freeVDOPageCache(VDOPageCache **cachePtr); + +/** + * Set the initial dirty period for a page cache. + * + * @param cache The cache + * @param period The initial dirty period to set + **/ +void setVDOPageCacheInitialPeriod(VDOPageCache *cache, SequenceNumber period); + +/** + * Switch the page cache into or out of read-only rebuild mode. + * + * @param cache The cache + * @param rebuilding true if the cache should be put into + * read-only rebuild mode, false otherwise + **/ +void setVDOPageCacheRebuildMode(VDOPageCache *cache, bool rebuilding); + +/** + * Check whether a page cache is active (i.e. has any active lookups, + * outstanding I/O, or pending I/O). + * + * @param cache The cache to check + * + * @return true if the cache is active + **/ +bool isPageCacheActive(VDOPageCache *cache) + __attribute__((warn_unused_result)); + +/** + * Advance the dirty period for a page cache. + * + * @param cache The cache to advance + * @param period The new dirty period + **/ +void advanceVDOPageCachePeriod(VDOPageCache *cache, SequenceNumber period); + +/** + * Write one or more batches of dirty pages. + * + * All writable pages in the ancient era and some number in the old era + * are scheduled for writing. + * + * @param cache the VDO page cache + * @param batches how many batches to write now + * @param total how many batches (including those being written now) remain + * in this era + **/ +void writeVDOPageCachePages(VDOPageCache *cache, + size_t batches, + size_t total); + +/** + * Rotate the dirty page eras. + * + * Move all pages in the old era to the ancient era and then move + * the current era bin into the old era. + * + * @param cache the VDO page cache + **/ +void rotateVDOPageCacheEras(VDOPageCache *cache); + +// ASYNC + +/** + * A completion awaiting a specific page. Also a live reference into the + * page once completed, until freed. 
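A sketch of the intended life cycle, using only functions declared in this header and consistent with how processFetchedPage() in vdoRecovery.c consumes a fetched page: embed a VDOPageCompletion in the caller, initialize it, launch the asynchronous get, and in the callback dereference and then release the page. The ExampleReader type and the callback names are invented for illustration; the callback runs on the cache's thread and the page stays pinned until releaseVDOPageCompletion() is called.

    #include "vdoPageCache.h"

    /* Hypothetical caller; in VDO proper this role is played by block map
     * lookup structures, not by a type like this. */
    typedef struct {
      VDOPageCompletion   pageCompletion;
      PhysicalBlockNumber pbn;
    } ExampleReader;

    /* Runs on the cache's thread once the page is resident and busy. */
    static void examplePageReady(VDOCompletion *completion)
    {
      const void *page = dereferenceReadableVDOPage(completion);
      if (page != NULL) {
        /* ... examine the cached block map page here ... */
      }
      /* Unpin the page; this may trigger a deferred write or another discard. */
      releaseVDOPageCompletion(completion);
    }

    static void examplePageError(VDOCompletion *completion)
    {
      /* releaseVDOPageCompletion() also accepts completions that failed. */
      releaseVDOPageCompletion(completion);
    }

    /* Request read-only access to the page holding reader->pbn. */
    static void exampleReadPage(ExampleReader *reader, VDOPageCache *cache)
    {
      initVDOPageCompletion(&reader->pageCompletion, cache, reader->pbn,
                            false,   /* read-only */
                            reader,  /* parent, available as completion->parent */
                            examplePageReady, examplePageError);
      getVDOPageAsync(&reader->pageCompletion.completion);
    }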
+ **/ +typedef struct { + /** The generic completion */ + VDOCompletion completion; + /** The cache involved */ + VDOPageCache *cache; + /** The waiter for the pending list */ + Waiter waiter; + /** The absolute physical block number of the page on disk */ + PhysicalBlockNumber pbn; + /** Whether the page may be modified */ + bool writable; + /** Whether the page is available */ + bool ready; + /** The info structure for the page, only valid when ready */ + PageInfo *info; +} VDOPageCompletion; + +/** + * Initialize a VDO Page Completion, requesting a particular page from the + * cache. + * + * @param pageCompletion The VDOPageCompletion to initialize + * @param cache The VDO page cache + * @param pbn The absolute physical block of the desired page + * @param writable Whether the page can be modified + * @param parent The parent object + * @param callback The completion callback + * @param errorHandler The handler for page errors + * + * @note Once a completion has occurred for the getVDOPageAsync operation, + * the underlying page shall be busy (stuck in memory) until the + * VDOCompletion returned by this operation has been released. + **/ +void initVDOPageCompletion(VDOPageCompletion *pageCompletion, + VDOPageCache *cache, + PhysicalBlockNumber pbn, + bool writable, + void *parent, + VDOAction *callback, + VDOAction *errorHandler); + +/** + * Release a VDO Page Completion. + * + * The page referenced by this completion (if any) will no longer be + * held busy by this completion. If a page becomes discardable and + * there are completions awaiting free pages then a new round of + * page discarding is started. + * + * @param completion The completion to release + **/ +void releaseVDOPageCompletion(VDOCompletion *completion); + +/** + * Asynchronous operation to get a VDO page. + * + * May cause another page to be discarded (potentially writing a dirty page) + * and the one nominated by the completion to be loaded from disk. + * + * When the page becomes available the callback registered in the completion + * provided is triggered. Once triggered the page is marked busy until + * the completion is destroyed. + * + * @param completion the completion initialized my initVDOPageCompletion(). + **/ +void getVDOPageAsync(VDOCompletion *completion); + +/** + * Mark a VDO page referenced by a completed VDOPageCompletion as dirty. + * + * @param completion a VDO Page Completion whose callback has been called + * @param oldDirtyPeriod the period in which the page was already dirty (0 if + * it wasn't) + * @param newDirtyPeriod the period in which the page is now dirty + **/ +void markCompletedVDOPageDirty(VDOCompletion *completion, + SequenceNumber oldDirtyPeriod, + SequenceNumber newDirtyPeriod); + +/** + * Request that a VDO page be written out as soon as it is not busy. + * + * @param completion the VDOPageCompletion containing the page + **/ +void requestVDOPageWrite(VDOCompletion *completion); + +/** + * Access the raw memory for a read-only page of a completed VDOPageCompletion. + * + * @param completion a vdo page completion whose callback has been called + * + * @return a pointer to the raw memory at the beginning of the page, or + * NULL if the page is not available. + **/ +const void *dereferenceReadableVDOPage(VDOCompletion *completion); + +/** + * Access the raw memory for a writable page of a completed VDOPageCompletion. 
+ * + * @param completion a vdo page completion whose callback has been called + * + * @return a pointer to the raw memory at the beginning of the page, or + * NULL if the page is not available, or if the page is read-only + **/ +void *dereferenceWritableVDOPage(VDOCompletion *completion); + +/** + * Get the per-page client context for the page in a page completion whose + * callback has been invoked. Should only be called after dereferencing the + * page completion to validate the page. + * + * @param completion a vdo page completion whose callback has been invoked + * + * @return a pointer to the per-page client context, or NULL if + * the page is not available + **/ +void *getVDOPageCompletionContext(VDOCompletion *completion); + +/** + * Drain I/O for a page cache. + * + * @param cache The cache to drain + **/ +void drainVDOPageCache(VDOPageCache *cache); + +/** + * Invalidate all entries in the VDO page cache. There must not be any + * dirty pages in the cache. + * + * @param cache the cache to invalidate + * + * @return a success or error code + **/ +int invalidateVDOPageCache(VDOPageCache *cache) + __attribute__((warn_unused_result)); + +// STATISTICS & TESTING + +/** + * Get current cache statistics. + * + * @param cache the page cache + * + * @return the statistics + **/ +AtomicPageCacheStatistics *getVDOPageCacheStatistics(VDOPageCache *cache) + __attribute__((warn_unused_result)); + +#endif // VDO_PAGE_CACHE_H diff --git a/source/vdo/base/vdoPageCacheInternals.h b/source/vdo/base/vdoPageCacheInternals.h new file mode 100644 index 0000000..4e2c67f --- /dev/null +++ b/source/vdo/base/vdoPageCacheInternals.h @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoPageCacheInternals.h#8 $ + */ + +#ifndef VDO_PAGE_CACHE_INTERNALS_H +#define VDO_PAGE_CACHE_INTERNALS_H + +#include "vdoPageCache.h" + +#ifndef __KERNEL__ +# include +#endif + +#include "blockMapInternals.h" +#include "completion.h" +#include "dirtyLists.h" +#include "intMap.h" +#include "physicalLayer.h" +#include "ringNode.h" + +enum { + MAX_PAGE_CONTEXT_SIZE = 8, +}; + +static const PhysicalBlockNumber NO_PAGE = 0xFFFFFFFFFFFFFFFF; + +/** + * A PageInfoNode is a ring node. + **/ +typedef RingNode PageInfoNode; + +/** + * The VDO Page Cache abstraction. 
+ **/ +struct vdoPageCache { + /** the physical layer to page to */ + PhysicalLayer *layer; + /** number of pages in cache */ + PageCount pageCount; + /** function to call on page read */ + VDOPageReadFunction *readHook; + /** function to call on page write */ + VDOPageWriteFunction *writeHook; + /** number of pages to write in the current batch */ + PageCount pagesInBatch; + /** Whether the VDO is doing a read-only rebuild */ + bool rebuilding; + + /** array of page information entries */ + PageInfo *infos; + /** raw memory for pages */ + char *pages; + /** cache last found page info */ + PageInfo *lastFound; + /** map of page number to info */ + IntMap *pageMap; + /** master LRU list (all infos) */ + PageInfoNode lruList; + /** dirty pages by period */ + DirtyLists *dirtyLists; + /** free page list (oldest first) */ + PageInfoNode freeList; + /** outgoing page list */ + PageInfoNode outgoingList; + /** number of read I/O operations pending */ + PageCount outstandingReads; + /** number of write I/O operations pending */ + PageCount outstandingWrites; + /** number of pages covered by the current flush */ + PageCount pagesInFlush; + /** number of pages waiting to be included in the next flush */ + PageCount pagesToFlush; + /** number of discards in progress */ + unsigned int discardCount; + /** how many VPCs waiting for free page */ + unsigned int waiterCount; + /** queue of waiters who want a free page */ + WaitQueue freeWaiters; + /** statistics */ + AtomicPageCacheStatistics stats; + /** counter for pressure reports */ + uint32_t pressureReport; + /** the block map zone to which this cache belongs */ + BlockMapZone *zone; +}; + +/** + * The state of a page buffer. If the page buffer is free no particular page is + * bound to it, otherwise the page buffer is bound to particular page whose + * absolute pbn is in the pbn field. If the page is resident or dirty the page + * data is stable and may be accessed. Otherwise the page is in flight + * (incoming or outgoing) and its data should not be accessed. + * + * @note Update the static data in vpcPageStateName() and vpcPageStateFlag() + * if you change this enumeration. + **/ +typedef enum __attribute__((packed)) pageState { + /* this page buffer is not being used */ + PS_FREE, + /* this page is being read from store */ + PS_INCOMING, + /* attempt to load this page failed */ + PS_FAILED, + /* this page is valid and un-modified */ + PS_RESIDENT, + /* this page is valid and modified */ + PS_DIRTY, + /* this page is being written and should not be used */ + PS_OUTGOING, + /* not a state */ + PAGE_STATE_COUNT, +} PageState; + +/** + * The write status of page + **/ +typedef enum __attribute__((packed)) { + WRITE_STATUS_NORMAL, + WRITE_STATUS_DISCARD, + WRITE_STATUS_DEFERRED, +} WriteStatus; + +/** + * Per-page-slot information. 
+ **/ +struct pageInfo { + /** Preallocated page VIO */ + VIO *vio; + /** back-link for references */ + VDOPageCache *cache; + /** the pbn of the page */ + PhysicalBlockNumber pbn; + /** page is busy (temporarily locked) */ + uint16_t busy; + /** the write status the page */ + WriteStatus writeStatus; + /** page state */ + PageState state; + /** queue of completions awaiting this item */ + WaitQueue waiting; + /** state linked list node */ + PageInfoNode listNode; + /** LRU node */ + PageInfoNode lruNode; + /** Space for per-page client data */ + byte context[MAX_PAGE_CONTEXT_SIZE]; +}; + +// PAGE INFO LIST OPERATIONS + +/**********************************************************************/ +static inline PageInfo *pageInfoFromListNode(PageInfoNode *node) +{ + if (node == NULL) { + return NULL; + } + return (PageInfo *) ((uintptr_t) node - offsetof(PageInfo, listNode)); +} + +/**********************************************************************/ +static inline PageInfo *pageInfoFromLRUNode(PageInfoNode *node) +{ + if (node == NULL) { + return NULL; + } + return (PageInfo *) ((uintptr_t) node - offsetof(PageInfo, lruNode)); +} + +// PAGE INFO STATE ACCESSOR FUNCTIONS + +/**********************************************************************/ +static inline bool isFree(const PageInfo *info) +{ + return info->state == PS_FREE; +} + +/**********************************************************************/ +static inline bool isAvailable(const PageInfo *info) +{ + return (info->state == PS_FREE) || (info->state == PS_FAILED); +} + +/**********************************************************************/ +static inline bool isPresent(const PageInfo *info) +{ + return (info->state == PS_RESIDENT) || (info->state == PS_DIRTY); +} + +/**********************************************************************/ +static inline bool isDirty(const PageInfo *info) +{ + return info->state == PS_DIRTY; +} + +/**********************************************************************/ +static inline bool isResident(const PageInfo *info) +{ + return info->state == PS_RESIDENT; +} + +/**********************************************************************/ +static inline bool isInFlight(const PageInfo *info) +{ + return (info->state == PS_INCOMING) || (info->state == PS_OUTGOING); +} + +/**********************************************************************/ +static inline bool isIncoming(const PageInfo *info) +{ + return info->state == PS_INCOMING; +} + +/**********************************************************************/ +static inline bool isOutgoing(const PageInfo *info) +{ + return info->state == PS_OUTGOING; +} + +/**********************************************************************/ +static inline bool isValid(const PageInfo *info) +{ + return isPresent(info) || isOutgoing(info); +} + +// COMPLETION CONVERSIONS + +/**********************************************************************/ +static inline VDOPageCompletion *asVDOPageCompletion(VDOCompletion *completion) +{ + assertCompletionType(completion->type, VDO_PAGE_COMPLETION); + return (VDOPageCompletion *) ((uintptr_t) completion + - offsetof(VDOPageCompletion, completion)); +} + +/**********************************************************************/ +static inline +VDOPageCompletion *pageCompletionFromWaiter(Waiter *waiter) +{ + if (waiter == NULL) { + return NULL; + } + + VDOPageCompletion *completion = (VDOPageCompletion *) + ((uintptr_t) waiter - offsetof(VDOPageCompletion, waiter)); + assertCompletionType(completion->completion.type, 
VDO_PAGE_COMPLETION); + return completion; +} + +// COMMONLY USED FUNCTIONS + +// All of these functions are prefixed "vpc" in order to prevent namespace +// issues (ordinarily they would be static). + +/** + * Find the page info (if any) associated with a given pbn. + * + * @param cache the page cache + * @param pbn the absolute physical block number of the page + * + * @return the page info for the page if available, or NULL if not + **/ +PageInfo *vpcFindPage(VDOPageCache *cache, PhysicalBlockNumber pbn) + __attribute__((warn_unused_result)); + +/** + * Return the name of a page state. + * + * @param state a page state + * + * @return a pointer to a static page state name + * + * @note If the page state is invalid a static string is returned and the + * invalid state is logged. + **/ +const char *vpcPageStateName(PageState state) + __attribute__((warn_unused_result)); + +#endif // VDO_PAGE_CACHE_INTERNALS_H diff --git a/source/vdo/base/vdoRecovery.c b/source/vdo/base/vdoRecovery.c new file mode 100644 index 0000000..97e72eb --- /dev/null +++ b/source/vdo/base/vdoRecovery.c @@ -0,0 +1,1257 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoRecovery.c#16 $ + */ + +#include "vdoRecoveryInternals.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "blockAllocator.h" +#include "blockAllocatorInternals.h" +#include "blockMapInternals.h" +#include "blockMapPage.h" +#include "blockMapRecovery.h" +#include "completion.h" +#include "numUtils.h" +#include "packedRecoveryJournalBlock.h" +#include "recoveryJournal.h" +#include "recoveryUtils.h" +#include "slab.h" +#include "slabDepot.h" +#include "slabJournal.h" +#include "slabJournalInternals.h" +#include "vdoInternal.h" +#include "waitQueue.h" + +enum { + // The int map needs capacity of twice the number of VIOs in the system. + INT_MAP_CAPACITY = MAXIMUM_USER_VIOS * 2, + // There can be as many missing decrefs as there are VIOs in the system. + MAXIMUM_SYNTHESIZED_DECREFS = MAXIMUM_USER_VIOS, +}; + +typedef struct missingDecref { + /** A waiter for queueing this object */ + Waiter waiter; + /** The parent of this object */ + RecoveryCompletion *recovery; + /** Whether this decref is complete */ + bool complete; + /** The slot for which the last decref was lost */ + BlockMapSlot slot; + /** The penultimate block map entry for this LBN */ + DataLocation penultimateMapping; + /** The page completion used to fetch the block map page for this LBN */ + VDOPageCompletion pageCompletion; + /** The journal point which will be used for this entry */ + JournalPoint journalPoint; + /** The slab journal to which this entry will be applied */ + SlabJournal *slabJournal; +} MissingDecref; + +/** + * Convert a Waiter to the missing decref of which it is a part. 
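pageCompletionFromWaiter() above and asMissingDecref() below are both instances of the container-of idiom: a small node (a Waiter, a RingNode) is embedded in a larger structure, and offsetof() recovers the enclosing object from a pointer to the member; asMissingDecref() gets away with a plain cast only because a STATIC_ASSERT pins the Waiter to offset zero. A standalone sketch of the general form, with invented names:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    typedef struct {
      int payload;
    } ExampleNode;

    typedef struct {
      char        before;   /* the node deliberately not placed first */
      ExampleNode node;
    } ExampleContainer;

    /* Recover the container from a pointer to its embedded node. */
    static ExampleContainer *containerFromNode(ExampleNode *node)
    {
      return (ExampleContainer *)
        ((uintptr_t) node - offsetof(ExampleContainer, node));
    }

    int main(void)
    {
      ExampleContainer c = { .before = 'x', .node = { .payload = 42 } };
      assert(containerFromNode(&c.node) == &c);
      assert(containerFromNode(&c.node)->node.payload == 42);
      return 0;
    }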
+ * + * @param waiter The Waiter to convert + * + * @return The MissingDecref wrapping the Waiter + **/ +__attribute__((warn_unused_result)) +static inline MissingDecref *asMissingDecref(Waiter *waiter) +{ + STATIC_ASSERT(offsetof(MissingDecref, waiter) == 0); + return (MissingDecref *) waiter; +} + +/** + * Enqueue a MissingDecref. If the enqueue fails, enter read-only mode. + * + * @param queue The queue on which to enqueue the decref + * @param decref The MissingDecref to enqueue + * + * @return VDO_SUCCESS or an error + **/ +static int enqueueMissingDecref(WaitQueue *queue, MissingDecref *decref) +{ + int result = enqueueWaiter(queue, &decref->waiter); + if (result != VDO_SUCCESS) { + enterReadOnlyMode(decref->recovery->vdo->readOnlyNotifier, result); + setCompletionResult(&decref->recovery->completion, result); + FREE(decref); + } + + return result; +} + +/** + * Convert a BlockMapSlot into a unique uint64_t. + * + * @param slot The block map slot to convert. + * + * @return a one-to-one mappable uint64_t. + **/ +static uint64_t slotAsNumber(BlockMapSlot slot) +{ + return (((uint64_t) slot.pbn << 10) + slot.slot); +} + +/** + * Create a MissingDecref and enqueue it to wait for a determination of its + * penultimate mapping. + * + * @param [in] recovery The parent recovery completion + * @param [in] entry The recovery journal entry for the increment which is + * missing a decref + * @param [out] decrefPtr A pointer to hold the new MissingDecref + * + * @return VDO_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int makeMissingDecref(RecoveryCompletion *recovery, + RecoveryJournalEntry entry, + MissingDecref **decrefPtr) +{ + MissingDecref *decref; + int result = ALLOCATE(1, MissingDecref, __func__, &decref); + if (result != VDO_SUCCESS) { + return result; + } + + decref->recovery = recovery; + result = enqueueMissingDecref(&recovery->missingDecrefs[0], decref); + if (result != VDO_SUCCESS) { + return result; + } + + /* + * Each synthsized decref needs a unique journal point. Otherwise, in the + * event of a crash, we would be unable to tell which synthesized decrefs had + * already been committed in the slab journals. Instead of using real + * recovery journal space for this, we can use fake journal points between + * the last currently valid entry in the tail block and the first journal + * entry in the next block. We can't overflow the entry count since the + * number of synthesized decrefs is bounded by the DataVIO limit. + * + * It is vital that any given missing decref always have the same fake + * journal point since a failed recovery may be retried with a different + * number of zones after having written out some slab journal blocks. Since + * the missing decrefs are always read out of the journal in the same order, + * we can assign them a journal point when they are read. Their subsequent + * use will ensure that, for any given slab journal, they are applied in + * the order dictated by these assigned journal points. + */ + decref->slot = entry.slot; + decref->journalPoint = recovery->nextSynthesizedJournalPoint; + recovery->nextSynthesizedJournalPoint.entryCount++; + recovery->missingDecrefCount++; + recovery->incompleteDecrefCount++; + + *decrefPtr = decref; + return VDO_SUCCESS; +} + +/** + * Move the given recovery point forward by one entry. 
+ * + * @param point The recovery point to alter + **/ +static void incrementRecoveryPoint(RecoveryPoint *point) +{ + point->entryCount++; + if ((point->sectorCount == (SECTORS_PER_BLOCK - 1)) + && (point->entryCount == RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR)) { + point->sequenceNumber++; + point->sectorCount = 1; + point->entryCount = 0; + } + + if (point->entryCount == RECOVERY_JOURNAL_ENTRIES_PER_SECTOR) { + point->sectorCount++; + point->entryCount = 0; + return; + } +} + +/** + * Move the given recovery point backwards by one entry. + * + * @param point The recovery point to alter + **/ +static void decrementRecoveryPoint(RecoveryPoint *point) +{ + STATIC_ASSERT(RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR > 0); + + if ((point->sectorCount <= 1) && (point->entryCount == 0)) { + point->sequenceNumber--; + point->sectorCount = SECTORS_PER_BLOCK - 1; + point->entryCount = RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR - 1; + return; + } + + if (point->entryCount == 0) { + point->sectorCount--; + point->entryCount = RECOVERY_JOURNAL_ENTRIES_PER_SECTOR - 1; + return; + } + + point->entryCount--; +} + +/** + * Check whether the first point precedes the second point. + * + * @param first The first recovery point + * @param second The second recovery point + * + * @return true if the first point precedes the second point + **/ +__attribute__((warn_unused_result)) +static bool beforeRecoveryPoint(const RecoveryPoint *first, + const RecoveryPoint *second) +{ + if (first->sequenceNumber < second->sequenceNumber) { + return true; + } + + if (first->sequenceNumber > second->sequenceNumber) { + return false; + } + + if (first->sectorCount < second->sectorCount) { + return true; + } + + return ((first->sectorCount == second->sectorCount) + && (first->entryCount < second->entryCount)); +} + +/** + * Prepare the sub-task completion. + * + * @param recovery The RecoveryCompletion whose sub-task completion is to + * be prepared + * @param callback The callback to register for the next sub-task + * @param errorHandler The error handler for the next sub-task + * @param zoneType The type of zone on which the callback or errorHandler + * should run + **/ +static void prepareSubTask(RecoveryCompletion *recovery, + VDOAction callback, + VDOAction errorHandler, + ZoneType zoneType) +{ + const ThreadConfig *threadConfig = getThreadConfig(recovery->vdo); + ThreadID threadID; + switch (zoneType) { + case ZONE_TYPE_LOGICAL: + // All blockmap access is done on single thread, so use logical zone 0. 
+ threadID = getLogicalZoneThread(threadConfig, 0); + break; + + case ZONE_TYPE_PHYSICAL: + threadID = recovery->allocator->threadID; + break; + + case ZONE_TYPE_ADMIN: + default: + threadID = getAdminThread(threadConfig); + } + + prepareCompletion(&recovery->subTaskCompletion, callback, errorHandler, + threadID, recovery); +} + +/**********************************************************************/ +int makeRecoveryCompletion(VDO *vdo, RecoveryCompletion **recoveryPtr) +{ + const ThreadConfig *threadConfig = getThreadConfig(vdo); + RecoveryCompletion *recovery; + int result = ALLOCATE_EXTENDED(RecoveryCompletion, + threadConfig->physicalZoneCount, RingNode, + __func__, &recovery); + if (result != VDO_SUCCESS) { + return result; + } + + recovery->vdo = vdo; + for (ZoneCount z = 0; z < threadConfig->physicalZoneCount; z++) { + initializeWaitQueue(&recovery->missingDecrefs[z]); + } + + result = initializeEnqueueableCompletion(&recovery->completion, + RECOVERY_COMPLETION, vdo->layer); + if (result != VDO_SUCCESS) { + freeRecoveryCompletion(&recovery); + return result; + } + + result = initializeEnqueueableCompletion(&recovery->subTaskCompletion, + SUB_TASK_COMPLETION, vdo->layer); + if (result != VDO_SUCCESS) { + freeRecoveryCompletion(&recovery); + return result; + } + + result = makeIntMap(INT_MAP_CAPACITY, 0, &recovery->slotEntryMap); + if (result != VDO_SUCCESS) { + freeRecoveryCompletion(&recovery); + return result; + } + + *recoveryPtr = recovery; + return VDO_SUCCESS; +} + +/** + * A waiter callback to free MissingDecrefs. + * + * Implements WaiterCallback. + **/ +static void freeMissingDecref(Waiter *waiter, + void *context __attribute__((unused))) +{ + FREE(asMissingDecref(waiter)); +} + +/**********************************************************************/ +void freeRecoveryCompletion(RecoveryCompletion **recoveryPtr) +{ + RecoveryCompletion *recovery = *recoveryPtr; + if (recovery == NULL) { + return; + } + + freeIntMap(&recovery->slotEntryMap); + const ThreadConfig *threadConfig = getThreadConfig(recovery->vdo); + for (ZoneCount z = 0; z < threadConfig->physicalZoneCount; z++) { + notifyAllWaiters(&recovery->missingDecrefs[z], freeMissingDecref, NULL); + } + + FREE(recovery->journalData); + FREE(recovery->entries); + destroyEnqueueable(&recovery->subTaskCompletion); + destroyEnqueueable(&recovery->completion); + FREE(recovery); + *recoveryPtr = NULL; +} + +/** + * Finish recovering, free the recovery completion and notify the parent. + * + * @param completion The recovery completion + **/ +static void finishRecovery(VDOCompletion *completion) +{ + VDOCompletion *parent = completion->parent; + RecoveryCompletion *recovery = asRecoveryCompletion(completion); + VDO *vdo = recovery->vdo; + uint64_t recoveryCount = ++vdo->completeRecoveries; + initializeRecoveryJournalPostRecovery(vdo->recoveryJournal, + recoveryCount, recovery->highestTail); + freeRecoveryCompletion(&recovery); + logInfo("Rebuild complete."); + + // Now that we've freed the recovery completion and its vast array of + // journal entries, we can allocate refcounts. + int result = allocateSlabRefCounts(vdo->depot); + finishCompletion(parent, result); +} + +/** + * Handle a recovery error. 
+ * + * @param completion The recovery completion + **/ +static void abortRecovery(VDOCompletion *completion) +{ + VDOCompletion *parent = completion->parent; + int result = completion->result; + RecoveryCompletion *recovery = asRecoveryCompletion(completion); + freeRecoveryCompletion(&recovery); + logWarning("Recovery aborted"); + finishCompletion(parent, result); +} + +/** + * Abort a recovery if there is an error. + * + * @param result The result to check + * @param recovery The recovery completion + * + * @return true if the result was an error + **/ +__attribute__((warn_unused_result)) +static bool abortRecoveryOnError(int result, RecoveryCompletion *recovery) +{ + if (result == VDO_SUCCESS) { + return false; + } + + finishCompletion(&recovery->completion, result); + return true; +} + +/** + * Unpack the recovery journal entry associated with the given recovery point. + * + * @param recovery The recovery completion + * @param point The recovery point + * + * @return The unpacked contents of the matching recovery journal entry + **/ +static RecoveryJournalEntry getEntry(const RecoveryCompletion *recovery, + const RecoveryPoint *point) +{ + RecoveryJournal *journal = recovery->vdo->recoveryJournal; + PhysicalBlockNumber blockNumber + = getRecoveryJournalBlockNumber(journal, point->sequenceNumber); + off_t sectorOffset + = (blockNumber * VDO_BLOCK_SIZE) + (point->sectorCount * VDO_SECTOR_SIZE); + PackedJournalSector *sector + = (PackedJournalSector *) &recovery->journalData[sectorOffset]; + return unpackRecoveryJournalEntry(§or->entries[point->entryCount]); +} + +/** + * Create an array of all valid journal entries, in order, and store it in the + * recovery completion. + * + * @param recovery The recovery completion + * + * @return VDO_SUCCESS or an error code + **/ +static int extractJournalEntries(RecoveryCompletion *recovery) +{ + // Allocate a NumberedBlockMapping array just large enough to transcribe + // every increment PackedRecoveryJournalEntry from every valid journal block. + int result = ALLOCATE(recovery->increfCount, NumberedBlockMapping, __func__, + &recovery->entries); + if (result != VDO_SUCCESS) { + return result; + } + + RecoveryPoint recoveryPoint = { + .sequenceNumber = recovery->blockMapHead, + .sectorCount = 1, + .entryCount = 0, + }; + while (beforeRecoveryPoint(&recoveryPoint, &recovery->tailRecoveryPoint)) { + RecoveryJournalEntry entry = getEntry(recovery, &recoveryPoint); + result = validateRecoveryJournalEntry(recovery->vdo, &entry); + if (result != VDO_SUCCESS) { + enterReadOnlyMode(recovery->vdo->readOnlyNotifier, result); + return result; + } + + if (isIncrementOperation(entry.operation)) { + recovery->entries[recovery->entryCount] = (NumberedBlockMapping) { + .blockMapSlot = entry.slot, + .blockMapEntry = packPBN(entry.mapping.pbn, entry.mapping.state), + .number = recovery->entryCount, + }; + recovery->entryCount++; + } + + incrementRecoveryPoint(&recoveryPoint); + } + + result = ASSERT((recovery->entryCount <= recovery->increfCount), + "approximate incref count is an upper bound"); + if (result != VDO_SUCCESS) { + enterReadOnlyMode(recovery->vdo->readOnlyNotifier, result); + } + + return result; +} + +/** + * Extract journal entries and recover the block map. This callback is + * registered in startSuperBlockSave(). 
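getEntry() above locates a journal entry in the flat journalData buffer by composing three coordinates: the block that the sequence number maps to, the sector within that block, and the entry within that sector. A tiny arithmetic sketch of that addressing; 4096 and 512 are assumed here as the conventional values of VDO_BLOCK_SIZE and VDO_SECTOR_SIZE:

    #include <stdint.h>
    #include <stdio.h>

    enum {
      EXAMPLE_BLOCK_SIZE  = 4096,  /* assumed value of VDO_BLOCK_SIZE */
      EXAMPLE_SECTOR_SIZE = 512,   /* assumed value of VDO_SECTOR_SIZE */
    };

    /* Byte offset of a journal sector within the in-memory journal copy. */
    static uint64_t exampleSectorOffset(uint64_t blockNumber, uint8_t sectorCount)
    {
      return (blockNumber * EXAMPLE_BLOCK_SIZE)
             + ((uint64_t) sectorCount * EXAMPLE_SECTOR_SIZE);
    }

    int main(void)
    {
      /* Sector 3 of journal block 5 starts 5 * 4096 + 3 * 512 = 22016 bytes in. */
      printf("offset = %llu\n",
             (unsigned long long) exampleSectorOffset(5, 3));
      return 0;
    }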
+ * + * @param completion The sub-task completion + **/ +static void launchBlockMapRecovery(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + VDO *vdo = recovery->vdo; + assertOnLogicalZoneThread(vdo, 0, __func__); + + // Extract the journal entries for the block map recovery. + int result = extractJournalEntries(recovery); + if (abortRecoveryOnError(result, recovery)) { + return; + } + + prepareToFinishParent(completion, &recovery->completion); + recoverBlockMap(vdo, recovery->entryCount, recovery->entries, completion); +} + +/** + * Finish flushing all slab journals and start a write of the super block. + * This callback is registered in addSynthesizedEntries(). + * + * @param completion The sub-task completion + **/ +static void startSuperBlockSave(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + VDO *vdo = recovery->vdo; + assertOnAdminThread(vdo, __func__); + + logInfo("Saving recovery progress"); + vdo->state = VDO_REPLAYING; + + // The block map access which follows the super block save must be done + // on a logical thread. + prepareSubTask(recovery, launchBlockMapRecovery, finishParentCallback, + ZONE_TYPE_LOGICAL); + saveVDOComponentsAsync(vdo, completion); +} + +/** + * The callback from loading the slab depot. It will update the logical blocks + * and block map data blocks counts in the recovery journal and then drain the + * slab depot in order to commit the recovered slab journals. It is registered + * in applyToDepot(). + * + * @param completion The sub-task completion + **/ +static void finishRecoveringDepot(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + VDO *vdo = recovery->vdo; + assertOnAdminThread(vdo, __func__); + + logInfo("Replayed %zu journal entries into slab journals", + recovery->entriesAddedToSlabJournals); + logInfo("Synthesized %zu missing journal entries", + recovery->missingDecrefCount); + vdo->recoveryJournal->logicalBlocksUsed = recovery->logicalBlocksUsed; + vdo->recoveryJournal->blockMapDataBlocks = recovery->blockMapDataBlocks; + + prepareSubTask(recovery, startSuperBlockSave, finishParentCallback, + ZONE_TYPE_ADMIN); + drainSlabDepot(vdo->depot, ADMIN_STATE_RECOVERING, completion); +} + +/** + * The error handler for recovering slab journals. It will skip any remaining + * recovery on the current zone and propagate the error. It is registered in + * addSlabJournalEntries() and addSynthesizedEntries(). + * + * @param completion The completion of the block allocator being recovered + **/ +static void handleAddSlabJournalEntryError(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + notifySlabJournalsAreRecovered(recovery->allocator, completion->result); +} + +/** + * Add synthesized entries into slab journals, waiting when necessary. 
+ * + * @param completion The allocator completion + **/ +static void addSynthesizedEntries(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + + // Get ready in case we need to enqueue again + prepareCompletion(completion, addSynthesizedEntries, + handleAddSlabJournalEntryError, + completion->callbackThreadID, recovery); + WaitQueue *missingDecrefs + = &recovery->missingDecrefs[recovery->allocator->zoneNumber]; + while (hasWaiters(missingDecrefs)) { + MissingDecref *decref = asMissingDecref(getFirstWaiter(missingDecrefs)); + if (!attemptReplayIntoSlabJournal(decref->slabJournal, + decref->penultimateMapping.pbn, + DATA_DECREMENT, &decref->journalPoint, + completion)) { + return; + } + + dequeueNextWaiter(missingDecrefs); + FREE(decref); + } + + notifySlabJournalsAreRecovered(recovery->allocator, VDO_SUCCESS); +} + +/** + * Determine the LBNs used count as of the end of the journal (but + * not including any changes to that count from entries that will be + * synthesized later). + * + * @param recovery The recovery completion + * + * @return VDO_SUCCESS or an error + **/ +static int computeUsages(RecoveryCompletion *recovery) +{ + RecoveryJournal *journal = recovery->vdo->recoveryJournal; + PackedJournalHeader *tailHeader + = getJournalBlockHeader(journal, recovery->journalData, recovery->tail); + + RecoveryBlockHeader unpacked; + unpackRecoveryBlockHeader(tailHeader, &unpacked); + recovery->logicalBlocksUsed = unpacked.logicalBlocksUsed; + recovery->blockMapDataBlocks = unpacked.blockMapDataBlocks; + + RecoveryPoint recoveryPoint = { + .sequenceNumber = recovery->tail, + .sectorCount = 1, + .entryCount = 0, + }; + while (beforeRecoveryPoint(&recoveryPoint, &recovery->tailRecoveryPoint)) { + RecoveryJournalEntry entry = getEntry(recovery, &recoveryPoint); + if (isMappedLocation(&entry.mapping)) { + switch (entry.operation) { + case DATA_INCREMENT: + recovery->logicalBlocksUsed++; + break; + + case DATA_DECREMENT: + recovery->logicalBlocksUsed--; + break; + + case BLOCK_MAP_INCREMENT: + recovery->blockMapDataBlocks++; + break; + + default: + return logErrorWithStringError(VDO_CORRUPT_JOURNAL, + "Recovery journal entry at " + "sequence number %" PRIu64 + ", sector %u, entry %u had invalid " + "operation %u", + recoveryPoint.sequenceNumber, + recoveryPoint.sectorCount, + recoveryPoint.entryCount, + entry.operation); + } + } + + incrementRecoveryPoint(&recoveryPoint); + } + + return VDO_SUCCESS; +} + +/** + * Advance the current recovery and journal points. + * + * @param recovery The RecoveryCompletion whose points are to be + * advanced + * @param entriesPerBlock The number of entries in a recovery journal block + **/ +static void advancePoints(RecoveryCompletion *recovery, + JournalEntryCount entriesPerBlock) +{ + incrementRecoveryPoint(&recovery->nextRecoveryPoint); + advanceJournalPoint(&recovery->nextJournalPoint, entriesPerBlock); +} + +/** + * Replay recovery journal entries into the slab journals of the allocator + * currently being recovered, waiting for slab journal tailblock space when + * necessary. This method is its own callback. + * + * @param completion The allocator completion + **/ +static void addSlabJournalEntries(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + VDO *vdo = recovery->vdo; + RecoveryJournal *journal = vdo->recoveryJournal; + + // Get ready in case we need to enqueue again. 
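addSynthesizedEntries() above and addSlabJournalEntries() here share a resumable-callback idiom: the function re-registers itself as the completion's callback before doing any work, then returns as soon as a replay attempt has to wait for slab journal space; when space frees up, the queued completion re-enters the same function and the loop resumes from the persisted recovery point or wait queue. A standalone model of that control flow; every name below is invented for illustration:

    #include <stdbool.h>
    #include <stdio.h>

    typedef struct Task Task;
    typedef void TaskFn(Task *task);

    struct Task {
      TaskFn *callback;  /* re-registered on each pass, like prepareCompletion() */
      int     next;      /* persistent progress through the work items */
      int     total;
      bool    stalled;   /* set when an item had to wait; cleared on retry */
    };

    /* Stand-in for attemptReplayIntoSlabJournal(): pretends item 3 cannot be
     * handled on the first attempt and must wait for a later retry. */
    static bool tryProcess(Task *task, int item)
    {
      if ((item == 3) && !task->stalled) {
        task->stalled = true;
        return false;            /* the task stays queued for a later retry */
      }
      task->stalled = false;
      printf("processed item %d\n", item);
      return true;
    }

    static void processItems(Task *task)
    {
      task->callback = processItems;          /* get ready to be re-entered */
      for (; task->next < task->total; task->next++) {
        if (!tryProcess(task, task->next)) {
          return;                             /* resume later from task->next */
        }
      }
      printf("all items processed\n");
    }

    int main(void)
    {
      Task task = { .callback = NULL, .next = 0, .total = 5, .stalled = false };
      processItems(&task);                    /* stalls at item 3 */
      task.callback(&task);                   /* simulated re-dispatch; resumes */
      return 0;
    }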
+ prepareCompletion(completion, addSlabJournalEntries, + handleAddSlabJournalEntryError, + completion->callbackThreadID, recovery); + for (RecoveryPoint *recoveryPoint = &recovery->nextRecoveryPoint; + beforeRecoveryPoint(recoveryPoint, &recovery->tailRecoveryPoint); + advancePoints(recovery, journal->entriesPerBlock)) { + RecoveryJournalEntry entry = getEntry(recovery, recoveryPoint); + int result = validateRecoveryJournalEntry(vdo, &entry); + if (result != VDO_SUCCESS) { + enterReadOnlyMode(journal->readOnlyNotifier, result); + finishCompletion(completion, result); + return; + } + + if (entry.mapping.pbn == ZERO_BLOCK) { + continue; + } + + Slab *slab = getSlab(vdo->depot, entry.mapping.pbn); + if (slab->allocator != recovery->allocator) { + continue; + } + + if (!attemptReplayIntoSlabJournal(slab->journal, entry.mapping.pbn, + entry.operation, + &recovery->nextJournalPoint, + completion)) { + return; + } + + recovery->entriesAddedToSlabJournals++; + } + + logInfo("Recreating missing journal entries for zone %u", + recovery->allocator->zoneNumber); + addSynthesizedEntries(completion); +} + +/**********************************************************************/ +void replayIntoSlabJournals(BlockAllocator *allocator, + VDOCompletion *completion, + void *context) +{ + RecoveryCompletion *recovery = context; + assertOnPhysicalZoneThread(recovery->vdo, allocator->zoneNumber, __func__); + if ((recovery->journalData == NULL) || isReplaying(recovery->vdo)) { + // there's nothing to replay + notifySlabJournalsAreRecovered(allocator, VDO_SUCCESS); + return; + } + + recovery->allocator = allocator; + recovery->nextRecoveryPoint = (RecoveryPoint) { + .sequenceNumber = recovery->slabJournalHead, + .sectorCount = 1, + .entryCount = 0, + }; + + recovery->nextJournalPoint = (JournalPoint) { + .sequenceNumber = recovery->slabJournalHead, + .entryCount = 0, + }; + + logInfo("Replaying entries into slab journals for zone %u", + allocator->zoneNumber); + completion->parent = recovery; + addSlabJournalEntries(completion); +} + +/** + * A waiter callback to enqueue a MissingDecref on the queue for the physical + * zone in which it will be applied. + * + * Implements WaiterCallback. + **/ +static void queueOnPhysicalZone(Waiter *waiter, void *context) +{ + MissingDecref *decref = asMissingDecref(waiter); + DataLocation mapping = decref->penultimateMapping; + if (isMappedLocation(&mapping)) { + decref->recovery->logicalBlocksUsed--; + } + + if (mapping.pbn == ZERO_BLOCK) { + // Decrefs of zero are not applied to slab journals. + FREE(decref); + return; + } + + decref->slabJournal = getSlabJournal((SlabDepot *) context, mapping.pbn); + ZoneCount zoneNumber = decref->slabJournal->slab->allocator->zoneNumber; + enqueueMissingDecref(&decref->recovery->missingDecrefs[zoneNumber], decref); +} + +/** + * Queue each missing decref on the slab journal to which it is to be applied + * then load the slab depot. This callback is registered in + * findSlabJournalEntries(). 
+ * + * @param completion The sub-task completion + **/ +static void applyToDepot(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + assertOnAdminThread(recovery->vdo, __func__); + prepareSubTask(recovery, finishRecoveringDepot, finishParentCallback, + ZONE_TYPE_ADMIN); + + SlabDepot *depot = getSlabDepot(recovery->vdo); + notifyAllWaiters(&recovery->missingDecrefs[0], queueOnPhysicalZone, depot); + if (abortRecoveryOnError(recovery->completion.result, recovery)) { + return; + } + + loadSlabDepot(depot, ADMIN_STATE_LOADING_FOR_RECOVERY, completion, recovery); +} + +/** + * Validate the location of the penultimate mapping for a MissingDecref. If it + * is valid, enqueue it for the appropriate physical zone or account for it. + * Otherwise, dispose of it and signal an error. + * + * @param decref The decref whose penultimate mapping has just been found + * @param location The penultimate mapping + * @param errorCode The error code to use if the location is invalid + **/ +static int recordMissingDecref(MissingDecref *decref, + DataLocation location, + int errorCode) +{ + RecoveryCompletion *recovery = decref->recovery; + recovery->incompleteDecrefCount--; + if (isValidLocation(&location) + && isPhysicalDataBlock(recovery->vdo->depot, location.pbn)) { + decref->penultimateMapping = location; + decref->complete = true; + return VDO_SUCCESS; + } + + // The location was invalid + enterReadOnlyMode(recovery->vdo->readOnlyNotifier, errorCode); + setCompletionResult(&recovery->completion, errorCode); + logErrorWithStringError(errorCode, + "Invalid mapping for pbn %llu with state %u", + location.pbn, location.state); + return errorCode; +} + +/** + * Find the block map slots with missing decrefs. + * + * To find the slots missing decrefs, we iterate through the journal in reverse + * so we see decrefs before increfs; if we see an incref before its paired + * decref, we instantly know this incref is missing its decref. + * + * Simultaneously, we attempt to determine the missing decref. If there is a + * missing decref, and at least two increfs for that slot, we know we should + * decref the PBN from the penultimate incref. Otherwise, there is only one + * incref for that slot: we must synthesize the decref out of the block map + * instead of the recovery journal. + * + * @param recovery The recovery completion + * + * @return VDO_SUCCESS or an error code + **/ +__attribute__((warn_unused_result)) +static int findMissingDecrefs(RecoveryCompletion *recovery) +{ + IntMap *slotEntryMap = recovery->slotEntryMap; + // This placeholder decref is used to mark lbns for which we have observed a + // decref but not the paired incref (going backwards through the journal). + MissingDecref foundDecref; + + // A buffer is allocated based on the number of incRef entries found, so use + // the earliest head. + SequenceNumber head = minSequenceNumber(recovery->blockMapHead, + recovery->slabJournalHead); + RecoveryPoint headPoint = { + .sequenceNumber = head, + .sectorCount = 1, + .entryCount = 0, + }; + + // Set up for the first fake journal point that will be used for a + // synthesized entry. 
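+  // Using the tail block with a full entryCount should sort these synthesized
+  // points after every entry actually present in the journal.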
+ recovery->nextSynthesizedJournalPoint = (JournalPoint) { + .sequenceNumber = recovery->tail, + .entryCount = recovery->vdo->recoveryJournal->entriesPerBlock, + }; + + RecoveryPoint recoveryPoint = recovery->tailRecoveryPoint; + while (beforeRecoveryPoint(&headPoint, &recoveryPoint)) { + decrementRecoveryPoint(&recoveryPoint); + RecoveryJournalEntry entry = getEntry(recovery, &recoveryPoint); + + if (!isIncrementOperation(entry.operation)) { + // Observe that we've seen a decref before its incref, but only if + // the IntMap does not contain an unpaired incref for this lbn. + int result = intMapPut(slotEntryMap, slotAsNumber(entry.slot), + &foundDecref, false, NULL); + if (result != VDO_SUCCESS) { + return result; + } + + continue; + } + + recovery->increfCount++; + + MissingDecref *decref + = intMapRemove(slotEntryMap, slotAsNumber(entry.slot)); + if (entry.operation == BLOCK_MAP_INCREMENT) { + if (decref != NULL) { + return logErrorWithStringError(VDO_CORRUPT_JOURNAL, + "decref found for block map block %" + PRIu64 " with state %u", + entry.mapping.pbn, entry.mapping.state); + } + + // There are no decrefs for block map pages, so they can't be missing. + continue; + } + + if (decref == &foundDecref) { + // This incref already had a decref in the intmap, so we know it is + // not missing its decref. + continue; + } + + if (decref == NULL) { + // This incref is missing a decref. Add a missing decref object. + int result = makeMissingDecref(recovery, entry, &decref); + if (result != VDO_SUCCESS) { + return result; + } + + result = intMapPut(slotEntryMap, slotAsNumber(entry.slot), decref, + false, NULL); + if (result != VDO_SUCCESS) { + return result; + } + + continue; + } + + /* + * This MissingDecref was left here by an incref without a decref. + * We now know what its penultimate mapping is, and all entries + * before here in the journal are paired, decref before incref, so + * we needn't remember it in the intmap any longer. + */ + int result = recordMissingDecref(decref, entry.mapping, + VDO_CORRUPT_JOURNAL); + if (result != VDO_SUCCESS) { + return result; + } + } + + return VDO_SUCCESS; +} + +/** + * Process a fetched block map page for a missing decref. This callback is + * registered in findSlabJournalEntries(). + * + * @param completion The page completion which has just finished loading + **/ +static void processFetchedPage(VDOCompletion *completion) +{ + MissingDecref *currentDecref = completion->parent; + RecoveryCompletion *recovery = currentDecref->recovery; + assertOnLogicalZoneThread(recovery->vdo, 0, __func__); + + const BlockMapPage *page = dereferenceReadableVDOPage(completion); + DataLocation location + = unpackBlockMapEntry(&page->entries[currentDecref->slot.slot]); + releaseVDOPageCompletion(completion); + recordMissingDecref(currentDecref, location, VDO_BAD_MAPPING); + if (recovery->incompleteDecrefCount == 0) { + completeCompletion(&recovery->subTaskCompletion); + } +} + +/** + * Handle an error fetching a block map page for a missing decref. + * This error handler is registered in findSlabJournalEntries(). 
+ * + * @param completion The page completion which has just finished loading + **/ +static void handleFetchError(VDOCompletion *completion) +{ + MissingDecref *decref = completion->parent; + RecoveryCompletion *recovery = decref->recovery; + assertOnLogicalZoneThread(recovery->vdo, 0, __func__); + + // If we got a VDO_OUT_OF_RANGE error, it is because the pbn we read from + // the journal was bad, so convert the error code + setCompletionResult(&recovery->subTaskCompletion, + ((completion->result == VDO_OUT_OF_RANGE) + ? VDO_CORRUPT_JOURNAL : completion->result)); + releaseVDOPageCompletion(completion); + if (--recovery->incompleteDecrefCount == 0) { + completeCompletion(&recovery->subTaskCompletion); + } +} + +/** + * The waiter callback to requeue a missing decref and launch its page fetch. + * + * Implements WaiterCallback. + **/ +static void launchFetch(Waiter *waiter, void *context) +{ + MissingDecref *decref = asMissingDecref(waiter); + RecoveryCompletion *recovery = decref->recovery; + if (enqueueMissingDecref(&recovery->missingDecrefs[0], decref) + != VDO_SUCCESS) { + return; + } + + if (decref->complete) { + // We've already found the mapping for this decref, no fetch needed. + return; + } + + BlockMapZone *zone = context; + initVDOPageCompletion(&decref->pageCompletion, zone->pageCache, + decref->slot.pbn, false, decref, processFetchedPage, + handleFetchError); + getVDOPageAsync(&decref->pageCompletion.completion); +} + +/** + * Find all entries which need to be replayed into the slab journals. + * + * @param completion The sub-task completion + **/ +static void findSlabJournalEntries(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + VDO *vdo = recovery->vdo; + + // We need to be on logical zone 0's thread since we are going to use its + // page cache. + assertOnLogicalZoneThread(vdo, 0, __func__); + int result = findMissingDecrefs(recovery); + if (abortRecoveryOnError(result, recovery)) { + return; + } + + prepareSubTask(recovery, applyToDepot, finishParentCallback, + ZONE_TYPE_ADMIN); + + /* + * Increment the incompleteDecrefCount so that the fetch callback can't + * complete the sub-task while we are still processing the queue of missing + * decrefs. + */ + if (recovery->incompleteDecrefCount++ > 0) { + // Fetch block map pages to fill in the incomplete missing decrefs. + notifyAllWaiters(&recovery->missingDecrefs[0], launchFetch, + getBlockMapZone(getBlockMap(vdo), 0)); + } + + if (--recovery->incompleteDecrefCount == 0) { + completeCompletion(completion); + } +} + +/** + * Find the contiguous range of journal blocks. 
+ * + * @param recovery The recovery completion + * + * @return true if there were valid journal blocks + **/ +static bool findContiguousRange(RecoveryCompletion *recovery) +{ + RecoveryJournal *journal = recovery->vdo->recoveryJournal; + SequenceNumber head + = minSequenceNumber(recovery->blockMapHead, recovery->slabJournalHead); + + bool foundEntries = false; + for (SequenceNumber i = head; i <= recovery->highestTail; i++) { + recovery->tail = i; + recovery->tailRecoveryPoint = (RecoveryPoint) { + .sequenceNumber = i, + .sectorCount = 0, + .entryCount = 0, + }; + + PackedJournalHeader *packedHeader + = getJournalBlockHeader(journal, recovery->journalData, i); + RecoveryBlockHeader header; + unpackRecoveryBlockHeader(packedHeader, &header); + + if (!isExactRecoveryJournalBlock(journal, &header, i) + || (header.entryCount > journal->entriesPerBlock)) { + // A bad block header was found so this must be the end of the journal. + break; + } + + JournalEntryCount blockEntries = header.entryCount; + // Examine each sector in turn to determine the last valid sector. + for (uint8_t j = 1; j < SECTORS_PER_BLOCK; j++) { + PackedJournalSector *sector = getJournalBlockSector(packedHeader, j); + + // A bad sector means that this block was torn. + if (!isValidRecoveryJournalSector(&header, sector)) { + break; + } + + JournalEntryCount sectorEntries = minBlock(sector->entryCount, + blockEntries); + if (sectorEntries > 0) { + foundEntries = true; + recovery->tailRecoveryPoint.sectorCount++; + recovery->tailRecoveryPoint.entryCount = sectorEntries; + blockEntries -= sectorEntries; + } + + // If this sector is short, the later sectors can't matter. + if ((sectorEntries < RECOVERY_JOURNAL_ENTRIES_PER_SECTOR) + || (blockEntries == 0)) { + break; + } + } + + // If this block was not filled, or if it tore, no later block can matter. + if ((header.entryCount != journal->entriesPerBlock) + || (blockEntries > 0)) { + break; + } + } + + // Set the tail to the last valid tail block, if there is one. + if (foundEntries && (recovery->tailRecoveryPoint.sectorCount == 0)) { + recovery->tail--; + } + + return foundEntries; +} + +/** + * Count the number of increment entries in the journal. + * + * @param recovery The recovery completion + **/ +static int countIncrementEntries(RecoveryCompletion *recovery) +{ + RecoveryPoint recoveryPoint = { + .sequenceNumber = recovery->blockMapHead, + .sectorCount = 1, + .entryCount = 0, + }; + while (beforeRecoveryPoint(&recoveryPoint, &recovery->tailRecoveryPoint)) { + RecoveryJournalEntry entry = getEntry(recovery, &recoveryPoint); + int result = validateRecoveryJournalEntry(recovery->vdo, &entry); + if (result != VDO_SUCCESS) { + enterReadOnlyMode(recovery->vdo->readOnlyNotifier, result); + return result; + } + if (isIncrementOperation(entry.operation)) { + recovery->increfCount++; + } + incrementRecoveryPoint(&recoveryPoint); + } + + return VDO_SUCCESS; +} + +/** + * Determine the limits of the valid recovery journal and prepare to replay + * into the slab journals and block map. 
+ * + * @param completion The sub-task completion + **/ +static void prepareToApplyJournalEntries(VDOCompletion *completion) +{ + RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); + VDO *vdo = recovery->vdo; + RecoveryJournal *journal = vdo->recoveryJournal; + logInfo("Finished reading recovery journal"); + bool foundEntries = findHeadAndTail(journal, recovery->journalData, + &recovery->highestTail, + &recovery->blockMapHead, + &recovery->slabJournalHead); + if (foundEntries) { + foundEntries = findContiguousRange(recovery); + } + + // Both reap heads must be behind the tail. + if ((recovery->blockMapHead > recovery->tail) + || (recovery->slabJournalHead > recovery->tail)) { + int result = logErrorWithStringError(VDO_CORRUPT_JOURNAL, + "Journal tail too early. " + "block map head: %" PRIu64 + ", slab journal head: %" PRIu64 + ", tail: %llu", + recovery->blockMapHead, + recovery->slabJournalHead, + recovery->tail); + finishCompletion(&recovery->completion, result); + return; + } + + if (!foundEntries) { + // This message must be recognizable by VDOTest::RebuildBase. + logInfo("Replaying 0 recovery entries into block map"); + // We still need to load the SlabDepot. + FREE(recovery->journalData); + recovery->journalData = NULL; + prepareSubTask(recovery, finishParentCallback, finishParentCallback, + ZONE_TYPE_ADMIN); + loadSlabDepot(getSlabDepot(vdo), ADMIN_STATE_LOADING_FOR_RECOVERY, + completion, recovery); + return; + } + + logInfo("Highest-numbered recovery journal block has sequence number" + " %llu, and the highest-numbered usable block is %" + PRIu64, recovery->highestTail, recovery->tail); + + if (isReplaying(vdo)) { + // We need to know how many entries the block map rebuild completion will + // need to hold. + int result = countIncrementEntries(recovery); + if (result != VDO_SUCCESS) { + finishCompletion(&recovery->completion, result); + return; + } + + // We need to access the block map from a logical zone. + prepareSubTask(recovery, launchBlockMapRecovery, finishParentCallback, + ZONE_TYPE_LOGICAL); + loadSlabDepot(vdo->depot, ADMIN_STATE_LOADING_FOR_RECOVERY, completion, + recovery); + return; + } + + int result = computeUsages(recovery); + if (abortRecoveryOnError(result, recovery)) { + return; + } + + prepareSubTask(recovery, findSlabJournalEntries, finishParentCallback, + ZONE_TYPE_LOGICAL); + invokeCallback(completion); +} + +/**********************************************************************/ +void launchRecovery(VDO *vdo, VDOCompletion *parent) +{ + // Note: This message must be recognizable by Permabit::VDODeviceBase. + logWarning("Device was dirty, rebuilding reference counts"); + + RecoveryCompletion *recovery; + int result = makeRecoveryCompletion(vdo, &recovery); + if (result != VDO_SUCCESS) { + finishCompletion(parent, result); + return; + } + + VDOCompletion *completion = &recovery->completion; + prepareCompletion(completion, finishRecovery, abortRecovery, + parent->callbackThreadID, parent); + prepareSubTask(recovery, prepareToApplyJournalEntries, finishParentCallback, + ZONE_TYPE_ADMIN); + loadJournalAsync(vdo->recoveryJournal, &recovery->subTaskCompletion, + &recovery->journalData); +} diff --git a/source/vdo/base/vdoRecovery.h b/source/vdo/base/vdoRecovery.h new file mode 100644 index 0000000..f817a05 --- /dev/null +++ b/source/vdo/base/vdoRecovery.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoRecovery.h#2 $
+ */
+
+#ifndef VDO_RECOVERY_H
+#define VDO_RECOVERY_H
+
+#include "completion.h"
+#include "vdo.h"
+
+/**
+ * Replay recovery journal entries into the slab journals of slabs owned by a
+ * given BlockAllocator.
+ *
+ * @param allocator   The allocator whose slab journals are to be recovered
+ * @param completion  The completion to use for waiting on slab journal space
+ * @param context     The slab depot load context supplied by a recovery when
+ *                    it loads the depot
+ **/
+void replayIntoSlabJournals(BlockAllocator *allocator,
+                            VDOCompletion *completion,
+                            void *context);
+
+/**
+ * Construct a recovery completion and launch it. Apply all valid journal block
+ * entries to all VDO structures. This function performs the offline portion of
+ * recovering a VDO from a crash.
+ *
+ * @param vdo     The vdo to recover
+ * @param parent  The completion to notify when the offline portion of the
+ *                recovery is complete
+ **/
+void launchRecovery(VDO *vdo, VDOCompletion *parent);
+
+#endif // VDO_RECOVERY_H
diff --git a/source/vdo/base/vdoRecoveryInternals.h b/source/vdo/base/vdoRecoveryInternals.h
new file mode 100644
index 0000000..b0414c1
--- /dev/null
+++ b/source/vdo/base/vdoRecoveryInternals.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoRecoveryInternals.h#2 $
+ */
+
+#ifndef VDO_RECOVERY_INTERNALS_H
+#define VDO_RECOVERY_INTERNALS_H
+
+#include "vdoRecovery.h"
+
+#include "blockMapRecovery.h"
+#include "intMap.h"
+#include "journalPoint.h"
+#include "ringNode.h"
+#include "types.h"
+#include "waitQueue.h"
+
+/**
+ * The absolute position of an entry in the recovery journal, including
+ * the sector number and the entry number within the sector.
+ **/ +typedef struct { + SequenceNumber sequenceNumber; // Block sequence number + uint8_t sectorCount; // Sector number + JournalEntryCount entryCount; // Entry number +} RecoveryPoint; + +typedef struct { + /** The completion header */ + VDOCompletion completion; + /** The sub-task completion */ + VDOCompletion subTaskCompletion; + /** The VDO in question */ + VDO *vdo; + /** The BlockAllocator whose journals are being recovered */ + BlockAllocator *allocator; + /** A buffer to hold the data read off disk */ + char *journalData; + /** The number of increfs */ + size_t increfCount; + + /** The entry data for the block map recovery */ + NumberedBlockMapping *entries; + /** The number of entries in the entry array */ + size_t entryCount; + /** The sequence number of the first valid block for block map recovery */ + SequenceNumber blockMapHead; + /** The sequence number of the first valid block for slab journal replay */ + SequenceNumber slabJournalHead; + /** The sequence number of the last valid block of the journal (if known) */ + SequenceNumber tail; + /** + * The highest sequence number of the journal, not the same as the tail, + * since the tail ignores blocks after the first hole. + */ + SequenceNumber highestTail; + + /** A location just beyond the last valid entry of the journal */ + RecoveryPoint tailRecoveryPoint; + /** The location of the next recovery journal entry to apply */ + RecoveryPoint nextRecoveryPoint; + /** The number of logical blocks currently known to be in use */ + BlockCount logicalBlocksUsed; + /** The number of block map data blocks known to be allocated */ + BlockCount blockMapDataBlocks; + /** The journal point to give to the next synthesized decref */ + JournalPoint nextJournalPoint; + /** The number of entries played into slab journals */ + size_t entriesAddedToSlabJournals; + + // Decref synthesis fields + + /** An intMap for use in finding which slots are missing decrefs */ + IntMap *slotEntryMap; + /** The number of synthesized decrefs */ + size_t missingDecrefCount; + /** The number of incomplete decrefs */ + size_t incompleteDecrefCount; + /** The fake journal point of the next missing decref */ + JournalPoint nextSynthesizedJournalPoint; + /** The queue of missing decrefs */ + WaitQueue missingDecrefs[]; +} RecoveryCompletion; + +/** + * Convert a generic completion to a RecoveryCompletion. + * + * @param completion The completion to convert + * + * @return The RecoveryCompletion + **/ +__attribute__((warn_unused_result)) +static inline RecoveryCompletion * +asRecoveryCompletion(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(RecoveryCompletion, completion) == 0); + assertCompletionType(completion->type, RECOVERY_COMPLETION); + return (RecoveryCompletion *) completion; +} + +/** + * Allocate and initialize a RecoveryCompletion. + * + * @param vdo The VDO in question + * @param recoveryPtr A pointer to hold the new RecoveryCompletion + * + * @return VDO_SUCCESS or a status code + **/ +int makeRecoveryCompletion(VDO *vdo, RecoveryCompletion **recoveryPtr) + __attribute__((warn_unused_result)); + +/** + * Free a RecoveryCompletion and all underlying structures. 
+ * + * @param recoveryPtr A pointer to the recovery completion to free + **/ +void freeRecoveryCompletion(RecoveryCompletion **recoveryPtr); + +#endif // VDO_RECOVERY_INTERNALS_H diff --git a/source/vdo/base/vdoResize.c b/source/vdo/base/vdoResize.c new file mode 100644 index 0000000..ee3271d --- /dev/null +++ b/source/vdo/base/vdoResize.c @@ -0,0 +1,250 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoResize.c#15 $ + */ + +#include "vdoResize.h" + +#include "logger.h" + +#include "adminCompletion.h" +#include "completion.h" +#include "recoveryJournal.h" +#include "slabDepot.h" +#include "slabSummary.h" +#include "vdoInternal.h" +#include "vdoLayout.h" + +typedef enum { + GROW_PHYSICAL_PHASE_START = 0, + GROW_PHYSICAL_PHASE_COPY_SUMMARY, + GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS, + GROW_PHYSICAL_PHASE_USE_NEW_SLABS, + GROW_PHYSICAL_PHASE_END, + GROW_PHYSICAL_PHASE_ERROR, +} GrowPhysicalPhase; + +static const char *GROW_PHYSICAL_PHASE_NAMES[] = { + "GROW_PHYSICAL_PHASE_START", + "GROW_PHYSICAL_PHASE_COPY_SUMMARY", + "GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS", + "GROW_PHYSICAL_PHASE_USE_NEW_SLABS", + "GROW_PHYSICAL_PHASE_END", + "GROW_PHYSICAL_PHASE_ERROR", +}; + +/** + * Implements ThreadIDGetterForPhase. + **/ +__attribute__((warn_unused_result)) +static ThreadID getThreadIDForPhase(AdminCompletion *adminCompletion) +{ + return getAdminThread(getThreadConfig(adminCompletion->completion.parent)); +} + +/** + * Callback to initiate a grow physical, registered in performGrowPhysical(). + * + * @param completion The sub-task completion + **/ +static void growPhysicalCallback(VDOCompletion *completion) +{ + AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); + assertAdminOperationType(adminCompletion, ADMIN_OPERATION_GROW_PHYSICAL); + assertAdminPhaseThread(adminCompletion, __func__, GROW_PHYSICAL_PHASE_NAMES); + + VDO *vdo = adminCompletion->completion.parent; + switch (adminCompletion->phase++) { + case GROW_PHYSICAL_PHASE_START: + if (isReadOnly(vdo->readOnlyNotifier)) { + logErrorWithStringError(VDO_READ_ONLY, + "Can't grow physical size of a read-only VDO"); + setCompletionResult(resetAdminSubTask(completion), VDO_READ_ONLY); + break; + } + + if (startOperationWithWaiter(&vdo->adminState, + ADMIN_STATE_SUSPENDED_OPERATION, + &adminCompletion->completion, NULL)) { + // Copy the journal into the new layout. 
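+      // (The slab summary partition is copied in the following phase.)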
+ copyPartition(vdo->layout, RECOVERY_JOURNAL_PARTITION, + resetAdminSubTask(completion)); + } + return; + + case GROW_PHYSICAL_PHASE_COPY_SUMMARY: + copyPartition(vdo->layout, SLAB_SUMMARY_PARTITION, + resetAdminSubTask(completion)); + return; + + case GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS: + vdo->config.physicalBlocks = growVDOLayout(vdo->layout); + updateSlabDepotSize(vdo->depot); + saveVDOComponentsAsync(vdo, resetAdminSubTask(completion)); + return; + + case GROW_PHYSICAL_PHASE_USE_NEW_SLABS: + useNewSlabs(vdo->depot, resetAdminSubTask(completion)); + return; + + case GROW_PHYSICAL_PHASE_END: + setSlabSummaryOrigin(getSlabSummary(vdo->depot), + getVDOPartition(vdo->layout, SLAB_SUMMARY_PARTITION)); + setRecoveryJournalPartition(vdo->recoveryJournal, + getVDOPartition(vdo->layout, + RECOVERY_JOURNAL_PARTITION)); + break; + + case GROW_PHYSICAL_PHASE_ERROR: + enterReadOnlyMode(vdo->readOnlyNotifier, completion->result); + break; + + default: + setCompletionResult(resetAdminSubTask(completion), UDS_BAD_STATE); + } + + finishVDOLayoutGrowth(vdo->layout); + finishOperationWithResult(&vdo->adminState, completion->result); +} + +/** + * Handle an error during the grow physical process. + * + * @param completion The sub-task completion + **/ +static void handleGrowthError(VDOCompletion *completion) +{ + adminCompletionFromSubTask(completion)->phase = GROW_PHYSICAL_PHASE_ERROR; + growPhysicalCallback(completion); +} + +/**********************************************************************/ +int performGrowPhysical(VDO *vdo, BlockCount newPhysicalBlocks) +{ + BlockCount oldPhysicalBlocks = vdo->config.physicalBlocks; + + // Skip any noop grows. + if (oldPhysicalBlocks == newPhysicalBlocks) { + return VDO_SUCCESS; + } + + if (newPhysicalBlocks != getNextVDOLayoutSize(vdo->layout)) { + /* + * Either the VDO isn't prepared to grow, or it was prepared to grow + * to a different size. Doing this check here relies on the fact that + * the call to this method is done under the dmsetup message lock. + */ + finishVDOLayoutGrowth(vdo->layout); + abandonNewSlabs(vdo->depot); + return VDO_PARAMETER_MISMATCH; + } + + // Validate that we are prepared to grow appropriately. + BlockCount newDepotSize = getNextBlockAllocatorPartitionSize(vdo->layout); + BlockCount preparedDepotSize = getNewDepotSize(vdo->depot); + if (preparedDepotSize != newDepotSize) { + return VDO_PARAMETER_MISMATCH; + } + + int result = performAdminOperation(vdo, ADMIN_OPERATION_GROW_PHYSICAL, + getThreadIDForPhase, growPhysicalCallback, + handleGrowthError); + if (result != VDO_SUCCESS) { + return result; + } + + logInfo("Physical block count was %llu, now %llu", + oldPhysicalBlocks, newPhysicalBlocks); + return VDO_SUCCESS; +} + +/** + * Callback to check that we're not in recovery mode, used in + * prepareToGrowPhysical(). + * + * @param completion The sub-task completion + **/ +static void checkMayGrowPhysical(VDOCompletion *completion) +{ + AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); + assertAdminOperationType(adminCompletion, + ADMIN_OPERATION_PREPARE_GROW_PHYSICAL); + + VDO *vdo = adminCompletion->completion.parent; + assertOnAdminThread(vdo, __func__); + + resetAdminSubTask(completion); + + // This check can only be done from a base code thread. + if (isReadOnly(vdo->readOnlyNotifier)) { + finishCompletion(completion->parent, VDO_READ_ONLY); + return; + } + + // This check should only be done from a base code thread. 
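+  // A grow is refused while the VDO is in recovery mode;
+  // VDO_RETRY_AFTER_REBUILD tells the caller to try again once the rebuild
+  // completes.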
+ if (inRecoveryMode(vdo)) { + finishCompletion(completion->parent, VDO_RETRY_AFTER_REBUILD); + return; + } + + completeCompletion(completion->parent); +} + +/**********************************************************************/ +int prepareToGrowPhysical(VDO *vdo, BlockCount newPhysicalBlocks) +{ + BlockCount currentPhysicalBlocks = vdo->config.physicalBlocks; + if (newPhysicalBlocks < currentPhysicalBlocks) { + return logErrorWithStringError(VDO_NOT_IMPLEMENTED, + "Removing physical storage from a VDO is " + "not supported"); + } + + if (newPhysicalBlocks == currentPhysicalBlocks) { + logWarning("Requested physical block count %" PRIu64 + " not greater than %llu", + newPhysicalBlocks, currentPhysicalBlocks); + finishVDOLayoutGrowth(vdo->layout); + abandonNewSlabs(vdo->depot); + return VDO_PARAMETER_MISMATCH; + } + + int result = performAdminOperation(vdo, + ADMIN_OPERATION_PREPARE_GROW_PHYSICAL, + getThreadIDForPhase, checkMayGrowPhysical, + finishParentCallback); + if (result != VDO_SUCCESS) { + return result; + } + + result = prepareToGrowVDOLayout(vdo->layout, currentPhysicalBlocks, + newPhysicalBlocks, vdo->layer); + if (result != VDO_SUCCESS) { + return result; + } + + BlockCount newDepotSize = getNextBlockAllocatorPartitionSize(vdo->layout); + result = prepareToGrowSlabDepot(vdo->depot, newDepotSize); + if (result != VDO_SUCCESS) { + finishVDOLayoutGrowth(vdo->layout); + return result; + } + + return VDO_SUCCESS; +} diff --git a/source/vdo/base/vdoResize.h b/source/vdo/base/vdoResize.h new file mode 100644 index 0000000..76bfc1f --- /dev/null +++ b/source/vdo/base/vdoResize.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoResize.h#1 $ + */ + +#ifndef VDO_RESIZE_H +#define VDO_RESIZE_H + +#include "types.h" + +/** + * Make the completion for an asynchronous resize. + * + * @param vdo The VDO + * @param newPhysicalBlocks The new physical size in blocks + * @param completionPtr A pointer to hold the completion + * + * @return VDO_SUCCESS or an error + **/ +int makeResizeVDOCompletion(VDO *vdo, + BlockCount newPhysicalBlocks, + VDOCompletion **completionPtr) + __attribute__((warn_unused_result)); + +/** + * Free the completion for an asynchronous resize, and NULL out the + * reference to it. + * + * @param completionPtr A reference to the completion to free + **/ +void freeResizeVDOCompletion(VDOCompletion **completionPtr); + +/** + * Grow the physical size of the VDO. This method may only be called when the + * VDO has been suspended and must not be called from a base thread. 
+ * + * @param vdo The VDO to resize + * @param newPhysicalBlocks The new physical size in blocks + * + * @return VDO_SUCCESS or an error + **/ +int performGrowPhysical(VDO *vdo, BlockCount newPhysicalBlocks); + +/** + * Prepare to resize the VDO, allocating memory as needed. + * + * @param vdo The VDO + * @param newPhysicalBlocks The new physical size in blocks + **/ +int prepareToGrowPhysical(VDO *vdo, BlockCount newPhysicalBlocks) + __attribute__((warn_unused_result)); + +#endif /* VDO_RESIZE_H */ diff --git a/source/vdo/base/vdoResizeLogical.c b/source/vdo/base/vdoResizeLogical.c new file mode 100644 index 0000000..97a06d1 --- /dev/null +++ b/source/vdo/base/vdoResizeLogical.c @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoResizeLogical.c#6 $ + */ + +#include "vdoResizeLogical.h" + +#include "logger.h" + +#include "adminCompletion.h" +#include "blockMap.h" +#include "completion.h" +#include "vdoInternal.h" + +typedef enum { + GROW_LOGICAL_PHASE_START = 0, + GROW_LOGICAL_PHASE_GROW_BLOCK_MAP, + GROW_LOGICAL_PHASE_END, + GROW_LOGICAL_PHASE_ERROR, +} GrowLogicalPhase; + +static const char *GROW_LOGICAL_PHASE_NAMES[] = { + "GROW_LOGICAL_PHASE_START", + "GROW_LOGICAL_PHASE_GROW_BLOCK_MAP", + "GROW_LOGICAL_PHASE_END", + "GROW_LOGICAL_PHASE_ERROR", +}; + +/** + * Implements ThreadIDGetterForPhase. + **/ +__attribute__((warn_unused_result)) +static ThreadID getThreadIDForPhase(AdminCompletion *adminCompletion) +{ + return getAdminThread(getThreadConfig(adminCompletion->completion.parent)); +} + +/** + * Callback to initiate a grow logical, registered in performGrowLogical(). 
+ *
+ * @param completion  The sub-task completion
+ **/
+static void growLogicalCallback(VDOCompletion *completion)
+{
+  AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion);
+  assertAdminOperationType(adminCompletion, ADMIN_OPERATION_GROW_LOGICAL);
+  assertAdminPhaseThread(adminCompletion, __func__, GROW_LOGICAL_PHASE_NAMES);
+
+  VDO *vdo = adminCompletion->completion.parent;
+  switch (adminCompletion->phase++) {
+  case GROW_LOGICAL_PHASE_START:
+    if (isReadOnly(vdo->readOnlyNotifier)) {
+      logErrorWithStringError(VDO_READ_ONLY,
+                              "Can't grow logical size of a read-only VDO");
+      finishCompletion(resetAdminSubTask(completion), VDO_READ_ONLY);
+      return;
+    }
+
+    if (startOperationWithWaiter(&vdo->adminState,
+                                 ADMIN_STATE_SUSPENDED_OPERATION,
+                                 &adminCompletion->completion, NULL)) {
+
+      vdo->config.logicalBlocks = getNewEntryCount(getBlockMap(vdo));
+      saveVDOComponentsAsync(vdo, resetAdminSubTask(completion));
+    }
+
+    return;
+
+  case GROW_LOGICAL_PHASE_GROW_BLOCK_MAP:
+    growBlockMap(getBlockMap(vdo), resetAdminSubTask(completion));
+    return;
+
+  case GROW_LOGICAL_PHASE_END:
+    break;
+
+  case GROW_LOGICAL_PHASE_ERROR:
+    enterReadOnlyMode(vdo->readOnlyNotifier, completion->result);
+    break;
+
+  default:
+    setCompletionResult(resetAdminSubTask(completion), UDS_BAD_STATE);
+  }
+
+  finishOperationWithResult(&vdo->adminState, completion->result);
+}
+
+/**
+ * Handle an error during the grow logical process.
+ *
+ * @param completion  The sub-task completion
+ **/
+static void handleGrowthError(VDOCompletion *completion)
+{
+  AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion);
+  if (adminCompletion->phase == GROW_LOGICAL_PHASE_GROW_BLOCK_MAP) {
+    // We've failed to write the new size in the super block, so set our
+    // in-memory config back to the old size.
+    VDO *vdo = adminCompletion->completion.parent;
+    BlockMap *map = getBlockMap(vdo);
+    vdo->config.logicalBlocks = getNumberOfBlockMapEntries(map);
+    abandonBlockMapGrowth(map);
+  }
+
+  adminCompletion->phase = GROW_LOGICAL_PHASE_ERROR;
+  growLogicalCallback(completion);
+}
+
+/**********************************************************************/
+int performGrowLogical(VDO *vdo, BlockCount newLogicalBlocks)
+{
+  if (getNewEntryCount(getBlockMap(vdo)) != newLogicalBlocks) {
+    return VDO_PARAMETER_MISMATCH;
+  }
+
+  return performAdminOperation(vdo, ADMIN_OPERATION_GROW_LOGICAL,
+                               getThreadIDForPhase, growLogicalCallback,
+                               handleGrowthError);
+}
+
+/**********************************************************************/
+int prepareToGrowLogical(VDO *vdo, BlockCount newLogicalBlocks)
+{
+  if (newLogicalBlocks < vdo->config.logicalBlocks) {
+    return logErrorWithStringError(VDO_PARAMETER_MISMATCH,
+                                   "Can't shrink VDO logical size from its "
+                                   "current value of %llu",
+                                   vdo->config.logicalBlocks);
+  }
+
+  if (newLogicalBlocks == vdo->config.logicalBlocks) {
+    return logErrorWithStringError(VDO_PARAMETER_MISMATCH,
+                                   "Can't grow VDO logical size to its "
+                                   "current value of %llu",
+                                   vdo->config.logicalBlocks);
+  }
+
+  return prepareToGrowBlockMap(getBlockMap(vdo), newLogicalBlocks);
+}
diff --git a/source/vdo/base/vdoResizeLogical.h b/source/vdo/base/vdoResizeLogical.h
new file mode 100644
index 0000000..fbea60d
--- /dev/null
+++ b/source/vdo/base/vdoResizeLogical.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoResizeLogical.h#1 $ + */ + +#ifndef VDO_RESIZE_LOGICAL_H +#define VDO_RESIZE_LOGICAL_H + +#include "types.h" + +/** + * Grow the logical size of the VDO. This method may only be called when the + * VDO has been suspended and must not be called from a base thread. + * + * @param vdo The VDO to grow + * @param newLogicalBlocks The size to which the VDO should be grown + * + * @return VDO_SUCCESS or an error + **/ +int performGrowLogical(VDO *vdo, BlockCount newLogicalBlocks); + +/** + * Prepare to grow the logical size of the VDO. This method may only be called + * while the VDO is running. + * + * @param vdo The VDO to prepare for growth + * @param newLogicalBlocks The size to which the VDO should be grown + * + * @return VDO_SUCCESS or an error + **/ +int prepareToGrowLogical(VDO *vdo, BlockCount newLogicalBlocks); + +#endif /* VDO_RESIZE_LOGICAL_H */ diff --git a/source/vdo/base/vdoResume.c b/source/vdo/base/vdoResume.c new file mode 100644 index 0000000..a10c2ef --- /dev/null +++ b/source/vdo/base/vdoResume.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoResume.c#3 $ + */ + +#include "vdoResume.h" + +#include "logger.h" + +#include "adminCompletion.h" +#include "blockMap.h" +#include "completion.h" +#include "logicalZone.h" +#include "recoveryJournal.h" +#include "slabDepot.h" +#include "slabSummary.h" +#include "threadConfig.h" +#include "vdoInternal.h" + +typedef enum { + RESUME_PHASE_START = 0, + RESUME_PHASE_ALLOW_READ_ONLY_MODE, + RESUME_PHASE_DEPOT, + RESUME_PHASE_JOURNAL, + RESUME_PHASE_BLOCK_MAP, + RESUME_PHASE_LOGICAL_ZONES, + RESUME_PHASE_PACKER, + RESUME_PHASE_END, +} ResumePhase; + +static const char *RESUME_PHASE_NAMES[] = { + "RESUME_PHASE_START", + "RESUME_PHASE_ALLOW_READ_ONLY_MODE", + "RESUME_PHASE_DEPOT", + "RESUME_PHASE_JOURNAL", + "RESUME_PHASE_BLOCK_MAP", + "RESUME_PHASE_LOGICAL_ZONES", + "RESUME_PHASE_PACKER", + "RESUME_PHASE_END", +}; + +/** + * Implements ThreadIDGetterForPhase. 
+ **/ +__attribute__((warn_unused_result)) +static ThreadID getThreadIDForPhase(AdminCompletion *adminCompletion) +{ + const ThreadConfig *threadConfig + = getThreadConfig(adminCompletion->completion.parent); + switch (adminCompletion->phase) { + case RESUME_PHASE_JOURNAL: + return getJournalZoneThread(threadConfig); + + case RESUME_PHASE_PACKER: + return getPackerZoneThread(threadConfig); + + default: + return getAdminThread(threadConfig); + } +} + +/** + * Update the VDO state and save the super block. + * + * @param vdo The VDO being resumed + * @param completion The AdminCompletion's sub-task completion + **/ +static void writeSuperBlock(VDO *vdo, VDOCompletion *completion) +{ + switch (vdo->state) { + case VDO_CLEAN: + case VDO_NEW: + vdo->state = VDO_DIRTY; + saveVDOComponentsAsync(vdo, completion); + return; + + case VDO_DIRTY: + case VDO_READ_ONLY_MODE: + case VDO_FORCE_REBUILD: + case VDO_RECOVERING: + case VDO_REBUILD_FOR_UPGRADE: + // No need to write the super block in these cases + completeCompletion(completion); + return; + + case VDO_REPLAYING: + default: + finishCompletion(completion, UDS_BAD_STATE); + } +} + +/** + * Callback to resume a VDO. + * + * @param completion The sub-task completion + **/ +static void resumeCallback(VDOCompletion *completion) +{ + AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); + assertAdminOperationType(adminCompletion, ADMIN_OPERATION_RESUME); + assertAdminPhaseThread(adminCompletion, __func__, RESUME_PHASE_NAMES); + + VDO *vdo = adminCompletion->completion.parent; + switch (adminCompletion->phase++) { + case RESUME_PHASE_START: + if (startResuming(&vdo->adminState, ADMIN_STATE_RESUMING, + &adminCompletion->completion, NULL)) { + writeSuperBlock(vdo, completion); + } + return; + + case RESUME_PHASE_ALLOW_READ_ONLY_MODE: + allowReadOnlyModeEntry(vdo->readOnlyNotifier, + resetAdminSubTask(completion)); + return; + + case RESUME_PHASE_DEPOT: + resumeSlabDepot(vdo->depot, resetAdminSubTask(completion)); + return; + + case RESUME_PHASE_JOURNAL: + resumeRecoveryJournal(vdo->recoveryJournal, resetAdminSubTask(completion)); + return; + + case RESUME_PHASE_BLOCK_MAP: + resumeBlockMap(vdo->blockMap, resetAdminSubTask(completion)); + return; + + case RESUME_PHASE_LOGICAL_ZONES: + resumeLogicalZones(vdo->logicalZones,resetAdminSubTask(completion)); + return; + + case RESUME_PHASE_PACKER: + resumePacker(vdo->packer, resetAdminSubTask(completion)); + return; + + case RESUME_PHASE_END: + break; + + default: + setCompletionResult(resetAdminSubTask(completion), UDS_BAD_STATE); + } + + finishResumingWithResult(&vdo->adminState, completion->result); +} + +/**********************************************************************/ +int performVDOResume(VDO *vdo) +{ + return performAdminOperation(vdo, ADMIN_OPERATION_RESUME, + getThreadIDForPhase, resumeCallback, + preserveErrorAndContinue); +} diff --git a/source/vdo/base/vdoResume.h b/source/vdo/base/vdoResume.h new file mode 100644 index 0000000..1ef25b2 --- /dev/null +++ b/source/vdo/base/vdoResume.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoResume.h#1 $ + */ + +#ifndef VDO_RESUME_H +#define VDO_RESUME_H + +#include "types.h" + +/** + * Resume a suspended VDO. + * + * @param vdo The VDO to resume + * + * @return VDO_SUCCESS or an error + **/ +int performVDOResume(VDO *vdo); + +#endif /* VDO_RESUME_H */ diff --git a/source/vdo/base/vdoState.c b/source/vdo/base/vdoState.c new file mode 100644 index 0000000..00d3986 --- /dev/null +++ b/source/vdo/base/vdoState.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoState.c#1 $ + */ + +#include "vdoState.h" + +#include "permassert.h" + +static const char *VDO_STATE_NAMES[] = { + [VDO_CLEAN] = "CLEAN", + [VDO_DIRTY] = "DIRTY", + [VDO_FORCE_REBUILD] = "FORCE_REBUILD", + [VDO_NEW] = "NEW", + [VDO_READ_ONLY_MODE] = "READ_ONLY_MODE", + [VDO_REBUILD_FOR_UPGRADE] = "REBUILD_FOR_UPGRADE", + [VDO_RECOVERING] = "RECOVERING", + [VDO_REPLAYING] = "REPLAYING", +}; + +/**********************************************************************/ +const char *getVDOStateName(VDOState state) +{ + // Catch if a state has been added without updating the name array. + STATIC_ASSERT(COUNT_OF(VDO_STATE_NAMES) == VDO_STATE_COUNT); + + int result = ASSERT(state < COUNT_OF(VDO_STATE_NAMES), + "VDOState value %u must have a registered name", state); + if (result != UDS_SUCCESS) { + return "INVALID VDO STATE CODE"; + } + + return VDO_STATE_NAMES[state]; +} + +/**********************************************************************/ +const char *describeVDOState(VDOState state) +{ + // These strings should all fit in the 15 chars of VDOStatistics.mode. + switch (state) { + case VDO_RECOVERING: + return "recovering"; + + case VDO_READ_ONLY_MODE: + return "read-only"; + + default: + return "normal"; + } +} diff --git a/source/vdo/base/vdoState.h b/source/vdo/base/vdoState.h new file mode 100644 index 0000000..5843565 --- /dev/null +++ b/source/vdo/base/vdoState.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoState.h#2 $ + */ + +#ifndef VDO_STATE_H +#define VDO_STATE_H + +/** + * The current operating mode of the VDO. These are persistent on disk + * so the values must not change. + **/ +typedef enum { + VDO_DIRTY = 0, + VDO_NEW = 1, + VDO_CLEAN = 2, + VDO_READ_ONLY_MODE = 3, + VDO_FORCE_REBUILD = 4, + VDO_RECOVERING = 5, + VDO_REPLAYING = 6, + VDO_REBUILD_FOR_UPGRADE = 7, + + // Keep VDO_STATE_COUNT at the bottom. + VDO_STATE_COUNT +} VDOState; + +/** + * Get the name of a VDO state code for logging purposes. + * + * @param state The state code + * + * @return The name of the state code + **/ +const char *getVDOStateName(VDOState state) + __attribute__((warn_unused_result)); + +/** + * Return a user-visible string describing the current VDO state. + * + * @param state The VDO state to describe + * + * @return A string constant describing the state + **/ +const char *describeVDOState(VDOState state) + __attribute__((warn_unused_result)); + +#endif // VDO_STATE_H diff --git a/source/vdo/base/vdoSuspend.c b/source/vdo/base/vdoSuspend.c new file mode 100644 index 0000000..e919f19 --- /dev/null +++ b/source/vdo/base/vdoSuspend.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoSuspend.c#4 $ + */ + +#include "vdoSuspend.h" + +#include "logger.h" + +#include "adminCompletion.h" +#include "blockMap.h" +#include "completion.h" +#include "logicalZone.h" +#include "recoveryJournal.h" +#include "slabDepot.h" +#include "slabSummary.h" +#include "threadConfig.h" +#include "vdoInternal.h" + +typedef enum { + SUSPEND_PHASE_START = 0, + SUSPEND_PHASE_PACKER, + SUSPEND_PHASE_LOGICAL_ZONES, + SUSPEND_PHASE_BLOCK_MAP, + SUSPEND_PHASE_JOURNAL, + SUSPEND_PHASE_DEPOT, + SUSPEND_PHASE_WRITE_SUPER_BLOCK, + SUSPEND_PHASE_END, +} SuspendPhase; + +static const char *SUSPEND_PHASE_NAMES[] = { + "SUSPEND_PHASE_START", + "SUSPEND_PHASE_PACKER", + "SUSPEND_PHASE_LOGICAL_ZONES", + "SUSPEND_PHASE_BLOCK_MAP", + "SUSPEND_PHASE_JOURNAL", + "SUSPEND_PHASE_DEPOT", + "SUSPEND_PHASE_WRITE_SUPER_BLOCK", + "SUSPEND_PHASE_END", +}; + +/** + * Implements ThreadIDGetterForPhase. 
+ **/ +__attribute__((warn_unused_result)) +static ThreadID getThreadIDForPhase(AdminCompletion *adminCompletion) +{ + const ThreadConfig *threadConfig + = getThreadConfig(adminCompletion->completion.parent); + switch (adminCompletion->phase) { + case SUSPEND_PHASE_PACKER: + return getPackerZoneThread(threadConfig); + + case SUSPEND_PHASE_JOURNAL: + return getJournalZoneThread(threadConfig); + + default: + return getAdminThread(threadConfig); + } +} + +/** + * Update the VDO state and save the super block. + * + * @param vdo The VDO being suspended + * @param completion The AdminCompletion's sub-task completion + **/ +static void writeSuperBlock(VDO *vdo, VDOCompletion *completion) +{ + switch (vdo->state) { + case VDO_DIRTY: + case VDO_NEW: + vdo->state = VDO_CLEAN; + break; + + case VDO_CLEAN: + case VDO_READ_ONLY_MODE: + case VDO_FORCE_REBUILD: + case VDO_RECOVERING: + case VDO_REBUILD_FOR_UPGRADE: + break; + + case VDO_REPLAYING: + default: + finishCompletion(completion, UDS_BAD_STATE); + return; + } + + saveVDOComponentsAsync(vdo, completion); +} + +/** + * Callback to initiate a suspend, registered in performVDOSuspend(). + * + * @param completion The sub-task completion + **/ +static void suspendCallback(VDOCompletion *completion) +{ + AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); + ASSERT_LOG_ONLY(((adminCompletion->type == ADMIN_OPERATION_SUSPEND) + || (adminCompletion->type == ADMIN_OPERATION_SAVE)), + "unexpected admin operation type %u is neither " + "suspend nor save", adminCompletion->type); + assertAdminPhaseThread(adminCompletion, __func__, SUSPEND_PHASE_NAMES); + + VDO *vdo = adminCompletion->completion.parent; + switch (adminCompletion->phase++) { + case SUSPEND_PHASE_START: + if (!startDraining(&vdo->adminState, + ((adminCompletion->type == ADMIN_OPERATION_SUSPEND) + ? ADMIN_STATE_SUSPENDING : ADMIN_STATE_SAVING), + &adminCompletion->completion, NULL)) { + return; + } + + if (!vdo->closeRequired) { + // There's nothing to do. + break; + } + + waitUntilNotEnteringReadOnlyMode(vdo->readOnlyNotifier, + resetAdminSubTask(completion)); + return; + + case SUSPEND_PHASE_PACKER: + /* + * If the VDO was already resumed from a prior suspend while read-only, + * some of the components may not have been resumed. By setting a read-only + * error here, we guarantee that the result of this suspend will be + * VDO_READ_ONLY and not VDO_INVALID_ADMIN_STATE in that case. + */ + if (inReadOnlyMode(vdo)) { + setCompletionResult(&adminCompletion->completion, VDO_READ_ONLY); + } + + drainPacker(vdo->packer, resetAdminSubTask(completion)); + return; + + case SUSPEND_PHASE_LOGICAL_ZONES: + drainLogicalZones(vdo->logicalZones, vdo->adminState.state, + resetAdminSubTask(completion)); + return; + + case SUSPEND_PHASE_BLOCK_MAP: + drainBlockMap(vdo->blockMap, vdo->adminState.state, + resetAdminSubTask(completion)); + return; + + case SUSPEND_PHASE_JOURNAL: + drainRecoveryJournal(vdo->recoveryJournal, vdo->adminState.state, + resetAdminSubTask(completion)); + return; + + case SUSPEND_PHASE_DEPOT: + drainSlabDepot(vdo->depot, vdo->adminState.state, + resetAdminSubTask(completion)); + return; + + case SUSPEND_PHASE_WRITE_SUPER_BLOCK: + if (isSuspending(&vdo->adminState) + || (adminCompletion->completion.result != VDO_SUCCESS)) { + // If we didn't save the VDO or there was an error, we're done. 
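+      // (isSuspending() is true for a plain suspend; only a save, which
+      // drains with ADMIN_STATE_SAVING, falls through to write the super
+      // block.)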
+ break; + } + + writeSuperBlock(vdo, resetAdminSubTask(completion)); + return; + + case SUSPEND_PHASE_END: + break; + + default: + setCompletionResult(completion, UDS_BAD_STATE); + } + + finishDrainingWithResult(&vdo->adminState, completion->result); +} + +/**********************************************************************/ +int performVDOSuspend(VDO *vdo, bool save) +{ + return performAdminOperation(vdo, (save + ? ADMIN_OPERATION_SAVE + : ADMIN_OPERATION_SUSPEND), + getThreadIDForPhase, suspendCallback, + preserveErrorAndContinue); +} diff --git a/source/vdo/base/vdoSuspend.h b/source/vdo/base/vdoSuspend.h new file mode 100644 index 0000000..39172dc --- /dev/null +++ b/source/vdo/base/vdoSuspend.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoSuspend.h#1 $ + */ + +#ifndef VDO_SUSPEND_H +#define VDO_SUSPEND_H + +#include "types.h" + +/** + * Ensure that the VDO has no outstanding I/O and will issue none until it is + * resumed. + * + * @param vdo The VDO to suspend + * @param save If true, all dirty metadata will be flushed as + * well + * + * @return VDO_SUCCESS or an error + **/ +int performVDOSuspend(VDO *vdo, bool save); + +#endif /* VDO_SUSPEND_H */ diff --git a/source/vdo/base/vio.c b/source/vdo/base/vio.c new file mode 100644 index 0000000..9bd678d --- /dev/null +++ b/source/vdo/base/vio.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vio.c#5 $ + */ + +#include "vio.h" + +#include "logger.h" + +#include "dataVIO.h" +#include "vdoInternal.h" + +#ifdef __KERNEL__ +#include +#endif + +/**********************************************************************/ +void freeVIO(VIO **vioPtr) +{ + VIO *vio = *vioPtr; + if (vio == NULL) { + return; + } + + vio->completion.layer->freeVIO(vioPtr); +} + +/**********************************************************************/ +void initializeVIO(VIO *vio, + VIOType type, + VIOPriority priority, + VDOCompletion *parent, + VDO *vdo, + PhysicalLayer *layer) +{ + vio->vdo = vdo; + vio->type = type; + vio->priority = priority; + + VDOCompletion *completion = vioAsCompletion(vio); + initializeCompletion(completion, VIO_COMPLETION, layer); + completion->parent = parent; +} + +/**********************************************************************/ +void vioDoneCallback(VDOCompletion *completion) +{ + VIO *vio = asVIO(completion); + completion->callback = vio->callback; + completion->errorHandler = vio->errorHandler; + completeCompletion(completion); +} + +/**********************************************************************/ +const char *getVIOReadWriteFlavor(const VIO *vio) +{ + if (isReadVIO(vio)) { + return "read"; + } + return (isWriteVIO(vio) ? "write" : "read-modify-write"); +} + +/**********************************************************************/ +void updateVIOErrorStats(VIO *vio, const char *format, ...) +{ + int priority; + int result = vioAsCompletion(vio)->result; + switch (result) { + case VDO_READ_ONLY: + atomicAdd64(&vio->vdo->errorStats.readOnlyErrorCount, 1); + return; + + case VDO_NO_SPACE: + atomicAdd64(&vio->vdo->errorStats.noSpaceErrorCount, 1); + priority = LOG_DEBUG; + break; + + default: + priority = LOG_ERR; + } + +#ifdef __KERNEL__ + static DEFINE_RATELIMIT_STATE(errorLimiter, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + if (!__ratelimit(&errorLimiter)) { + return; + } +#endif + + va_list args; + va_start(args, format); + vLogWithStringError(priority, result, format, args); + va_end(args); +} + +/** + * Handle an error from a metadata I/O. + * + * @param completion The VIO + **/ +static void handleMetadataIOError(VDOCompletion *completion) +{ + VIO *vio = asVIO(completion); + updateVIOErrorStats(vio, + "Completing %s VIO of type %u for physical block %" + PRIu64 " with error", + getVIOReadWriteFlavor(vio), vio->type, vio->physical); + vioDoneCallback(completion); +} + +/**********************************************************************/ +void launchMetadataVIO(VIO *vio, + PhysicalBlockNumber physical, + VDOAction *callback, + VDOAction *errorHandler, + VIOOperation operation) +{ + vio->operation = operation; + vio->physical = physical; + vio->callback = callback; + vio->errorHandler = errorHandler; + + VDOCompletion *completion = vioAsCompletion(vio); + resetCompletion(completion); + completion->callback = vioDoneCallback; + completion->errorHandler = handleMetadataIOError; + + if (isReadVIO(vio)) { + completion->layer->readMetadata(vio); + } else { + completion->layer->writeMetadata(vio); + } +} + +/** + * Handle a flush error. 
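+ *
+ * The failure is logged, and the VIO's own error handler (saved aside by
+ * launchFlush()) is restored on the completion before it is finished, so the
+ * original caller still sees the error.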
+ * + * @param completion The flush VIO + **/ +static void handleFlushError(VDOCompletion *completion) +{ + logErrorWithStringError(completion->result, "Error flushing layer"); + completion->errorHandler = asVIO(completion)->errorHandler; + completeCompletion(completion); +} + +/**********************************************************************/ +void launchFlush(VIO *vio, VDOAction *callback, VDOAction *errorHandler) +{ + VDOCompletion *completion = vioAsCompletion(vio); + resetCompletion(completion); + completion->callback = callback; + completion->errorHandler = handleFlushError; + vio->errorHandler = errorHandler; + vio->operation = VIO_FLUSH_BEFORE; + vio->physical = ZERO_BLOCK; + + PhysicalLayer *layer = completion->layer; + if (layer->getWritePolicy(layer) == WRITE_POLICY_SYNC) { + // XXX It is dangerous to be subtly dropping flushes possibly + // needed for correctness in sync mode. + finishCompletion(completion, VDO_SUCCESS); + return; + } + + layer->flush(vio); +} diff --git a/source/vdo/base/vio.h b/source/vdo/base/vio.h new file mode 100644 index 0000000..8129cc6 --- /dev/null +++ b/source/vdo/base/vio.h @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vio.h#3 $ + */ + +#ifndef VIO_H +#define VIO_H + +#include + +#include "completion.h" +#include "trace.h" +#include "types.h" +#include "vdo.h" + +/** + * A representation of a single block which may be passed between the VDO base + * and the physical layer. + **/ +struct vio { + /* The completion for this VIO */ + VDOCompletion completion; + + /* The functions to call when this VIO's operation is complete */ + VDOAction *callback; + VDOAction *errorHandler; + + /* The VDO handling this VIO */ + VDO *vdo; + + /* The address on the underlying device of the block to be read/written */ + PhysicalBlockNumber physical; + + /* The type of request this VIO is servicing */ + VIOOperation operation; + + /* The queueing priority of the VIO operation */ + VIOPriority priority; + + /* The VIO type is used for statistics and instrumentation. */ + VIOType type; + + /* Used for logging and debugging */ + Trace *trace; +}; + +/** + * Convert a generic VDOCompletion to a VIO. + * + * @param completion The completion to convert + * + * @return The completion as a VIO + **/ +static inline VIO *asVIO(VDOCompletion *completion) +{ + STATIC_ASSERT(offsetof(VIO, completion) == 0); + assertCompletionType(completion->type, VIO_COMPLETION); + return (VIO *) completion; +} + +/** + * Convert a VIO to a generic completion. + * + * @param vio The VIO to convert + * + * @return The VIO as a completion + **/ +static inline VDOCompletion *vioAsCompletion(VIO *vio) +{ + return &vio->completion; +} + +/** + * Create a VIO. 
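+ *
+ * This simply delegates to the layer's createMetadataVIO method (see the
+ * body below). A minimal sketch of a caller (the variable names here are
+ * illustrative, not part of this patch):
+ *
+ *   VIO *vio;
+ *   int result = createVIO(layer, vioType, priority, parent, buffer, &vio);
+ *   if (result != VDO_SUCCESS) {
+ *     return result;
+ *   }
+ *   // ... use the VIO, then release it with freeVIO(&vio) when done.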
+ * + * @param [in] layer The physical layer + * @param [in] vioType The type of VIO to create + * @param [in] priority The relative priority to assign to the VIO + * @param [in] parent The parent of the VIO + * @param [in] data The buffer + * @param [out] vioPtr A pointer to hold the new VIO + * + * @return VDO_SUCCESS or an error + **/ +static inline int createVIO(PhysicalLayer *layer, + VIOType vioType, + VIOPriority priority, + void *parent, + char *data, + VIO **vioPtr) +{ + return layer->createMetadataVIO(layer, vioType, priority, parent, data, + vioPtr); +} + +/** + * Destroy a vio. The pointer to the VIO will be nulled out. + * + * @param vioPtr A pointer to the VIO to destroy + **/ +void freeVIO(VIO **vioPtr); + +/** + * Initialize a VIO. + * + * @param vio The VIO to initialize + * @param type The VIO type + * @param priority The relative priority of the VIO + * @param parent The parent (the extent completion) to assign to the VIO + * completion + * @param vdo The VDO for this VIO + * @param layer The layer for this VIO + **/ +void initializeVIO(VIO *vio, + VIOType type, + VIOPriority priority, + VDOCompletion *parent, + VDO *vdo, + PhysicalLayer *layer); + +/** + * The very last step in processing a VIO. Set the VIO's completion's callback + * and error handler from the fields set in the VIO itself on launch and then + * actually complete the VIO's completion. + * + * @param completion The VIO + **/ +void vioDoneCallback(VDOCompletion *completion); + +/** + * Get the name of a VIO's operation. + * + * @param vio The VIO + * + * @return The name of the VIO's operation (read, write, or read-modify-write) + **/ +const char *getVIOReadWriteFlavor(const VIO *vio); + +/** + * Update per-VIO error stats and log the error. + * + * @param vio The VIO which got an error + * @param format The format of the message to log (a printf style format) + **/ +void updateVIOErrorStats(VIO *vio, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/** + * Add a trace record for the current source location. + * + * @param vio The VIO structure to be updated + * @param location The source-location descriptor to be recorded + **/ +static inline void vioAddTraceRecord(VIO *vio, TraceLocation location) +{ + if (unlikely(vio->trace != NULL)) { + addTraceRecord(vio->trace, location); + } +} + +/** + * Check whether a VIO is servicing an external data request. + * + * @param vio The VIO to check + **/ +static inline bool isDataVIO(VIO *vio) +{ + return isDataVIOType(vio->type); +} + +/** + * Check whether a VIO is for compressed block writes + * + * @param vio The VIO to check + **/ +static inline bool isCompressedWriteVIO(VIO *vio) +{ + return isCompressedWriteVIOType(vio->type); +} + +/** + * Check whether a VIO is for metadata + * + * @param vio The VIO to check + **/ +static inline bool isMetadataVIO(VIO *vio) +{ + return isMetadataVIOType(vio->type); +} + +/** + * Check whether a VIO is a read. + * + * @param vio The VIO + * + * @return true if the VIO is a read + **/ +static inline bool isReadVIO(const VIO *vio) +{ + return ((vio->operation & VIO_READ_WRITE_MASK) == VIO_READ); +} + +/** + * Check whether a VIO is a read-modify-write. + * + * @param vio The VIO + * + * @return true if the VIO is a read-modify-write + **/ +static inline bool isReadModifyWriteVIO(const VIO *vio) +{ + return ((vio->operation & VIO_READ_WRITE_MASK) == VIO_READ_MODIFY_WRITE); +} + +/** + * Check whether a VIO is a write. 
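+ *
+ * Only the bits selected by VIO_READ_WRITE_MASK are compared here, as in
+ * isReadVIO() and isReadModifyWriteVIO(); assuming the flush flags lie
+ * outside that mask (their separate checks below suggest so), an operation
+ * such as (VIO_WRITE | VIO_FLUSH_BEFORE) is still reported as a write.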
+ * + * @param vio The VIO + * + * @return true if the VIO is a write + **/ +static inline bool isWriteVIO(const VIO *vio) +{ + return ((vio->operation & VIO_READ_WRITE_MASK) == VIO_WRITE); +} + +/** + * Check whether a VIO requires a flush before doing its I/O. + * + * @param vio The VIO + * + * @return true if the VIO requires a flush before + **/ +static inline bool vioRequiresFlushBefore(const VIO *vio) +{ + return ((vio->operation & VIO_FLUSH_BEFORE) == VIO_FLUSH_BEFORE); +} + +/** + * Check whether a VIO requires a flush after doing its I/O. + * + * @param vio The VIO + * + * @return true if the VIO requires a flush after + **/ +static inline bool vioRequiresFlushAfter(const VIO *vio) +{ + return ((vio->operation & VIO_FLUSH_AFTER) == VIO_FLUSH_AFTER); +} + +/** + * Launch a metadata VIO. + * + * @param vio The VIO to launch + * @param physical The physical block number to read or write + * @param callback The function to call when the VIO completes its I/O + * @param errorHandler The handler for write errors + * @param operation The operation to perform (read or write) + **/ +void launchMetadataVIO(VIO *vio, + PhysicalBlockNumber physical, + VDOAction *callback, + VDOAction *errorHandler, + VIOOperation operation); + +/** + * Launch a metadata read VIO. + * + * @param vio The VIO to launch + * @param physical The physical block number to read + * @param callback The function to call when the VIO completes its read + * @param errorHandler The handler for write errors + **/ +static inline void launchReadMetadataVIO(VIO *vio, + PhysicalBlockNumber physical, + VDOAction *callback, + VDOAction *errorHandler) +{ + launchMetadataVIO(vio, physical, callback, errorHandler, VIO_READ); +} + +/** + * Launch a metadata write VIO. + * + * @param vio The VIO to launch + * @param physical The physical block number to write + * @param callback The function to call when the VIO completes its write + * @param errorHandler The handler for write errors + **/ +static inline void launchWriteMetadataVIO(VIO *vio, + PhysicalBlockNumber physical, + VDOAction *callback, + VDOAction *errorHandler) +{ + launchMetadataVIO(vio, physical, callback, errorHandler, VIO_WRITE); +} + +/** + * Launch a metadata write VIO optionally flushing the layer before and/or + * after the write operation. + * + * @param vio The VIO to launch + * @param physical The physical block number to write + * @param callback The function to call when the VIO completes its + * operation + * @param errorHandler The handler for flush or write errors + * @param flushBefore Whether or not to flush before writing + * @param flushAfter Whether or not to flush after writing + **/ +static inline +void launchWriteMetadataVIOWithFlush(VIO *vio, + PhysicalBlockNumber physical, + VDOAction *callback, + VDOAction *errorHandler, + bool flushBefore, + bool flushAfter) +{ + launchMetadataVIO(vio, physical, callback, errorHandler, + (VIO_WRITE + | (flushBefore ? VIO_FLUSH_BEFORE : 0) + | (flushAfter ? VIO_FLUSH_AFTER : 0))); +} + +/** + * Issue a flush to the layer. If the layer does not require flushing, this + * method will immediately finish the VIO with which it was called. Care must + * be taken to avoid introducing a stack overflow in that case. 
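+ *
+ * In particular, when the layer's write policy is WRITE_POLICY_SYNC the
+ * completion is finished directly from this call (see launchFlush() in
+ * vio.c), so the callback may run on the caller's own stack. A hypothetical
+ * caller (the names flushVIO, flushDone, and flushFailed are illustrative
+ * only):
+ *
+ *   // Continue in flushDone() once the layer has been flushed; errors from
+ *   // the flush itself are routed to flushFailed().
+ *   launchFlush(flushVIO, flushDone, flushFailed);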
+ * + * @param vio The VIO to notify when the flush is complete + * @param callback The function to call when the flush is complete + * @param errorHandler The handler for flush errors + **/ +void launchFlush(VIO *vio, VDOAction *callback, VDOAction *errorHandler); + +#endif // VIO_H diff --git a/source/vdo/base/vioPool.c b/source/vdo/base/vioPool.c new file mode 100644 index 0000000..3d5ce07 --- /dev/null +++ b/source/vdo/base/vioPool.c @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioPool.c#5 $ + */ + +#include "vioPool.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" + +#include "constants.h" +#include "vio.h" +#include "types.h" + +/** + * An VIOPool is a collection of preallocated VIOs. + **/ +struct vioPool { + /** The number of objects managed by the pool */ + size_t size; + /** The list of objects which are available */ + RingNode available; + /** The queue of requestors waiting for objects from the pool */ + WaitQueue waiting; + /** The number of objects currently in use */ + size_t busyCount; + /** The list of objects which are in use */ + RingNode busy; + /** The number of requests when no object was available */ + uint64_t outageCount; + /** The ID of the thread on which this pool may be used */ + ThreadID threadID; + /** The buffer backing the pool's VIOs */ + char *buffer; + /** The pool entries */ + VIOPoolEntry entries[]; +}; + +/**********************************************************************/ +int makeVIOPool(PhysicalLayer *layer, + size_t poolSize, + ThreadID threadID, + VIOConstructor *vioConstructor, + void *context, + VIOPool **poolPtr) +{ + VIOPool *pool; + int result = ALLOCATE_EXTENDED(VIOPool, poolSize, VIOPoolEntry, __func__, + &pool); + if (result != VDO_SUCCESS) { + return result; + } + + pool->threadID = threadID; + initializeRing(&pool->available); + initializeRing(&pool->busy); + + result = ALLOCATE(poolSize * VDO_BLOCK_SIZE, char, "VIO pool buffer", + &pool->buffer); + if (result != VDO_SUCCESS) { + freeVIOPool(&pool); + return result; + } + + char *ptr = pool->buffer; + for (size_t i = 0; i < poolSize; i++) { + VIOPoolEntry *entry = &pool->entries[i]; + entry->buffer = ptr; + entry->context = context; + result = vioConstructor(layer, entry, ptr, &entry->vio); + if (result != VDO_SUCCESS) { + freeVIOPool(&pool); + return result; + } + + ptr += VDO_BLOCK_SIZE; + initializeRing(&entry->node); + pushRingNode(&pool->available, &entry->node); + pool->size++; + } + + *poolPtr = pool; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeVIOPool(VIOPool **poolPtr) +{ + if (*poolPtr == NULL) { + return; + } + + // Remove all available entries from the object pool. 
+ VIOPool *pool = *poolPtr; + ASSERT_LOG_ONLY(!hasWaiters(&pool->waiting), + "VIO pool must not have any waiters when being freed"); + ASSERT_LOG_ONLY((pool->busyCount == 0), + "VIO pool must not have %zu busy entries when being freed", + pool->busyCount); + ASSERT_LOG_ONLY(isRingEmpty(&pool->busy), + "VIO pool must not have busy entries when being freed"); + + VIOPoolEntry *entry; + while ((entry = asVIOPoolEntry(chopRingNode(&pool->available))) != NULL) { + freeVIO(&entry->vio); + } + + // Make sure every VIOPoolEntry has been removed. + for (size_t i = 0; i < pool->size; i++) { + VIOPoolEntry *entry = &pool->entries[i]; + ASSERT_LOG_ONLY(isRingEmpty(&entry->node), "VIO Pool entry still in use:" + " VIO is in use for physical block %" PRIu64 + " for operation %u", + entry->vio->physical, + entry->vio->operation); + } + + FREE(pool->buffer); + FREE(pool); + *poolPtr = NULL; +} + +/**********************************************************************/ +bool isVIOPoolBusy(VIOPool *pool) +{ + return (pool->busyCount != 0); +} + +/**********************************************************************/ +int acquireVIOFromPool(VIOPool *pool, Waiter *waiter) +{ + ASSERT_LOG_ONLY((pool->threadID == getCallbackThreadID()), + "acquire from active VIOPool called from correct thread"); + + if (isRingEmpty(&pool->available)) { + pool->outageCount++; + return enqueueWaiter(&pool->waiting, waiter); + } + + pool->busyCount++; + RingNode *entry = chopRingNode(&pool->available); + pushRingNode(&pool->busy, entry); + (*waiter->callback)(waiter, entry); + return VDO_SUCCESS; +} + +/**********************************************************************/ +void returnVIOToPool(VIOPool *pool, VIOPoolEntry *entry) +{ + ASSERT_LOG_ONLY((pool->threadID == getCallbackThreadID()), + "vio pool entry returned on same thread as it was acquired"); + entry->vio->completion.errorHandler = NULL; + if (hasWaiters(&pool->waiting)) { + notifyNextWaiter(&pool->waiting, NULL, entry); + return; + } + + pushRingNode(&pool->available, &entry->node); + --pool->busyCount; +} + +/**********************************************************************/ +uint64_t getVIOPoolOutageCount(VIOPool *pool) +{ + return pool->outageCount; +} diff --git a/source/vdo/base/vioPool.h b/source/vdo/base/vioPool.h new file mode 100644 index 0000000..bab3dbe --- /dev/null +++ b/source/vdo/base/vioPool.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioPool.h#4 $ + */ + +#ifndef VIO_POOL_H +#define VIO_POOL_H + +#include "permassert.h" + +#include "completion.h" +#include "types.h" +#include "waitQueue.h" + +/** + * A VIOPool is a collection of preallocated VIOs used to write arbitrary + * metadata blocks. 
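+ *
+ * A sketch of the expected lifecycle (the waiter, entry, and constructor
+ * names are illustrative, not part of this patch); note that the pool may
+ * only be used on the thread whose ID was passed to makeVIOPool():
+ *
+ *   VIOPool *pool;
+ *   int result = makeVIOPool(layer, poolSize, threadID, constructVIO,
+ *                            context, &pool);
+ *   // On the pool's thread: the waiter's callback receives a VIOPoolEntry
+ *   // (as its context argument) as soon as one is available.
+ *   result = acquireVIOFromPool(pool, &waiter);
+ *   // ... the callback uses entry->vio and entry->buffer ...
+ *   returnVIOToPool(pool, entry);
+ *   // Every entry must be returned before the pool is freed.
+ *   freeVIOPool(&pool);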
+ **/ + +/** + * An VIOPoolEntry is the pair of VIO and buffer whether in use or not. + **/ +typedef struct { + RingNode node; + VIO *vio; + void *buffer; + void *parent; + void *context; +} VIOPoolEntry; + +/** + * A function which constructs a VIO for a pool. + * + * @param [in] layer The physical layer in which the VIO will operate + * @param [in] parent The parent of the VIO + * @param [in] buffer The data buffer for the VIO + * @param [out] vioPtr A pointer to hold the new VIO + **/ +typedef int VIOConstructor(PhysicalLayer *layer, + void *parent, + void *buffer, + VIO **vioPtr); + +/** + * Create a new VIO pool. + * + * @param [in] layer the physical layer to write to and read from + * @param [in] poolSize the number of VIOs in the pool + * @param [in] threadID the ID of the thread using this pool + * @param [in] vioConstructor the constructor for VIOs in the pool + * @param [in] context the context that each entry will have + * @param [out] poolPtr the resulting pool + * + * @return a success or error code + **/ +int makeVIOPool(PhysicalLayer *layer, + size_t poolSize, + ThreadID threadID, + VIOConstructor *vioConstructor, + void *context, + VIOPool **poolPtr) + __attribute__((warn_unused_result)); + +/** + * Destroy a VIO pool + * + * @param poolPtr the pointer holding the pool, which will be nulled out + **/ +void freeVIOPool(VIOPool **poolPtr); + +/** + * Check whether an VIO pool has outstanding entries. + * + * @return true if the pool is busy + **/ +bool isVIOPoolBusy(VIOPool *pool) + __attribute__((warn_unused_result)); + +/** + * Acquire a VIO and buffer from the pool (asynchronous). + * + * @param pool the VIO pool + * @param waiter object that is requesting a VIO + * + * @return VDO_SUCCESS or an error + **/ +int acquireVIOFromPool(VIOPool *pool, Waiter *waiter); + +/** + * Return a VIO and its buffer to the pool. + * + * @param pool the VIO pool + * @param entry a VIO pool entry + **/ +void returnVIOToPool(VIOPool *pool, VIOPoolEntry *entry); + +/** + * Convert a RingNode to the VIOPoolEntry that contains it. + * + * @param node The RingNode to convert + * + * @return The VIOPoolEntry wrapping the RingNode + **/ +static inline VIOPoolEntry *asVIOPoolEntry(RingNode *node) +{ + STATIC_ASSERT(offsetof(VIOPoolEntry, node) == 0); + return (VIOPoolEntry *) node; +} + +/** + * Return the outage count of an VIO pool. + * + * @param pool The pool + * + * @return the number of times an acquisition request had to wait + **/ +uint64_t getVIOPoolOutageCount(VIOPool *pool) + __attribute__((warn_unused_result)); + +#endif // VIO_POOL_H diff --git a/source/vdo/base/vioRead.c b/source/vdo/base/vioRead.c new file mode 100644 index 0000000..ab73727 --- /dev/null +++ b/source/vdo/base/vioRead.c @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioRead.c#1 $ + */ + +#include "vioRead.h" + +#include "logger.h" + +#include "blockMap.h" +#include "dataVIO.h" +#include "vdoInternal.h" +#include "vioWrite.h" + +/** + * Do the modify-write part of a read-modify-write cycle. This callback is + * registered in readBlock(). + * + * @param completion The DataVIO which has just finished its read + **/ +static void modifyForPartialWrite(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + + if (completion->result != VDO_SUCCESS) { + completeDataVIO(completion); + return; + } + + completion->layer->applyPartialWrite(dataVIO); + VIO *vio = dataVIOAsVIO(dataVIO); + vio->operation = VIO_WRITE | (vio->operation & ~VIO_READ_WRITE_MASK); + dataVIO->isPartialWrite = true; + launchWriteDataVIO(dataVIO); +} + +/** + * Read a block asynchronously. This is the callback registered in + * readBlockMapping(). + * + * @param completion The DataVIO to read + **/ +static void readBlock(VDOCompletion *completion) +{ + if (completion->result != VDO_SUCCESS) { + completeDataVIO(completion); + return; + } + + DataVIO *dataVIO = asDataVIO(completion); + VIO *vio = asVIO(completion); + completion->callback + = (isReadVIO(vio) ? completeDataVIO : modifyForPartialWrite); + + if (dataVIO->mapped.pbn == ZERO_BLOCK) { + completion->layer->zeroDataVIO(dataVIO); + invokeCallback(completion); + return; + } + + vio->physical = dataVIO->mapped.pbn; + dataVIO->lastAsyncOperation = READ_DATA; + completion->layer->readData(dataVIO); +} + +/** + * Read the DataVIO's mapping from the block map. This callback is registered + * in launchReadDataVIO(). + * + * @param completion The DataVIO to be read + **/ +static void readBlockMapping(VDOCompletion *completion) +{ + if (completion->result != VDO_SUCCESS) { + completeDataVIO(completion); + return; + } + + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + setLogicalCallback(dataVIO, readBlock, THIS_LOCATION("$F;cb=readBlock")); + dataVIO->lastAsyncOperation = GET_MAPPED_BLOCK; + getMappedBlockAsync(dataVIO); +} + +/**********************************************************************/ +void launchReadDataVIO(DataVIO *dataVIO) +{ + assertInLogicalZone(dataVIO); + dataVIO->lastAsyncOperation = FIND_BLOCK_MAP_SLOT; + // Go find the block map slot for the LBN mapping. + findBlockMapSlotAsync(dataVIO, readBlockMapping, + getLogicalZoneThreadID(dataVIO->logical.zone)); +} + +/** + * Release the logical block lock which a read DataVIO obtained now that it + * is done. + * + * @param completion The DataVIO + **/ +static void releaseLogicalLock(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + releaseLogicalBlockLock(dataVIO); + vioDoneCallback(completion); +} + +/** + * Clean up a DataVIO which has finished processing a read. + * + * @param dataVIO The DataVIO to clean up + **/ +void cleanupReadDataVIO(DataVIO *dataVIO) +{ + launchLogicalCallback(dataVIO, releaseLogicalLock, + THIS_LOCATION("$F;cb=releaseLL")); +} diff --git a/source/vdo/base/vioRead.h b/source/vdo/base/vioRead.h new file mode 100644 index 0000000..ae2fa37 --- /dev/null +++ b/source/vdo/base/vioRead.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioRead.h#1 $ + */ + +#ifndef VIO_READ_H +#define VIO_READ_H + +#include "types.h" + +/** + * Start the asynchronous processing of the DataVIO for a read or + * read-modify-write request which has acquired a lock on its logical block. + * The first step is to perform a block map lookup. + * + * @param dataVIO The DataVIO doing the read + **/ +void launchReadDataVIO(DataVIO *dataVIO); + +/** + * Clean up a DataVIO which has finished processing a read. + * + * @param dataVIO The DataVIO to clean up + **/ +void cleanupReadDataVIO(DataVIO *dataVIO); + +#endif /* VIO_READ_H */ diff --git a/source/vdo/base/vioWrite.c b/source/vdo/base/vioWrite.c new file mode 100644 index 0000000..ac2bb53 --- /dev/null +++ b/source/vdo/base/vioWrite.c @@ -0,0 +1,1201 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioWrite.c#9 $ + */ + +/* + * This file contains almost all of the VDO write path, which begins with + * writeExtent(). The progression through the callbacks which make up the + * write path depends upon whether or not the write policy is synchronous or + * asynchronous. The paths would proceed as outlined in the pseudo-code here + * if this were normal, synchronous code without callbacks. Complications + * involved in waiting on locks are not included. 
+ * + * ###################################################################### + * writeExtentSynchronous(extent) + * { + * foreach (vio in extent) { + * launchWriteVIO() + * # allocateBlockForWrite() + * if (!trim and !zero-block) { + * allocate block + * if (vio is compressed) { + * completeCompressedBlockWrite() + * finishVIO() + * return + * } + * writeBlock() + * } + * finishBlockWrite() + * addJournalEntry() # Increment + * if (vio->newMapped is not ZERO_BLOCK) { + * journalIncrementForWrite() + * } + * acknowledgeWriteCallback() + * readOldBlockMapping() + * journalUnmappingForWrite() + * if (vio->mapped is not ZERO_BLOCK) { + * journalDecrementForWrite() + * } + * updateBlockMapForWrite() + * if (trim || zero-block) { + * finishVIO() + * return + * } + * + * prepareForDedupe() + * hashData() + * resolveHashZone() + * acquireHashLock() + * attemptDedupe() (query albireo) + * if (isDuplicate) { + * verifyAdvice() (read verify) + * if (isDuplicate and canAddReference) { + * shareBlock() + * addJournalEntryForDedupe() + * incrementForDedupe() + * journalUnmappingForDedupe() + * if (vio->mapped is not ZERO_BLOCK) { + * decrementForDedupe() + * } + * updateBlockMapForDedupe() + * finishVIO() + * return + * } + * } + * + * if (not canAddReference) { + * layer->updateAlbireo() + * } + * # compressData() + * if (compressing and not mooted and has no waiters) { + * layer->compressVIO() + * packCompressedData() + * if (compressed) { + * journalCompressedBlocks() + * incrementForDedupe() + * readOldBlockMappingForDedupe() + * journalUnmappingForDedupe() + * if (vio->mapped is not ZERO_BLOCK) { + * decrementForDedupe() + * } + * updateBlockMapForDedupe() + * } + * } + * + * finishVIO() + * } + * } + * + * ###################################################################### + * writeExtentAsynchronous(extent) + * { + * foreach (vio in extent) { + * launchWriteVIO() + * # allocateBlockForWrite() + * if (trim || zero-block) { + * acknowledgeWrite() + * } else { + * allocateAndLockBlock() + * if (vio is compressed) { + * writeBlock() + * completeCompressedBlockWrite() + * finishVIO() + * return + * } + * + * acknowledgeWrite() + * prepareForDedupe() + * hashData() + * resolveHashZone() + * acquireHashLock() + * attemptDedupe() (query albireo) + * if (isDuplicate) { + * verifyAdvice() (read verify) + * if (isDuplicate and canAddReference) { + * shareBlock() + * addJournalEntryForDedupe() + * incrementForDedupe() + * readOldBlockMappingForDedupe() + * journalUnmappingForDedupe() + * if (vio->mapped is not ZERO_BLOCK) { + * decrementForDedupe() + * } + * updateBlockMapForDedupe() + * finishVIO() + * return + * } + * } + * + * if (not canAddReference) { + * layer->updateAlbireo() + * } + * # compressData() + * if (compressing and not mooted and has no waiters) { + * layer->compressVIO() + * packCompressedData() + * if (compressed) { + * journalCompressedBlocks() + * journalIncrementForDedupe() + * readOldBlockMappingForDedupe() + * journalUnmappingForDedupe() + * if (vio->mapped is not ZERO_BLOCK) { + * decrementForDedupe() + * } + * updateBlockMapForDedupe() + * finishVIO() + * return + * } + * } + * + * writeBlock() + * } + * + * finishBlockWrite() + * addJournalEntry() # Increment + * if (vio->newMapped is not ZERO_BLOCK) { + * journalIncrementForWrite() + * } + * readOldBlockMappingForWrite() + * journalUnmappingForWrite() + * if (vio->mapped is not ZERO_BLOCK) { + * journalDecrementForWrite() + * } + * updateBlockMapForWrite() + * finishVIO() + * } + * } + */ + +#include "vioWrite.h" + +#include 
"logger.h" + +#include "allocatingVIO.h" +#include "atomic.h" +#include "blockMap.h" +#include "compressionState.h" +#include "dataVIO.h" +#include "hashLock.h" +#include "recoveryJournal.h" +#include "referenceOperation.h" +#include "slab.h" +#include "slabDepot.h" +#include "slabJournal.h" +#include "vdoInternal.h" +#include "vioRead.h" + +/** + * The steps taken cleaning up a VIO, in the order they are performed. + **/ +typedef enum dataVIOCleanupStage { + VIO_CLEANUP_START = 0, + VIO_RELEASE_ALLOCATED = VIO_CLEANUP_START, + VIO_RELEASE_RECOVERY_LOCKS, + VIO_RELEASE_HASH_LOCK, + VIO_RELEASE_LOGICAL, + VIO_CLEANUP_DONE +} DataVIOCleanupStage; + +/** + * Actions to take on error used by abortOnError(). + **/ +typedef enum { + NOT_READ_ONLY, + READ_ONLY_IF_ASYNC, + READ_ONLY, +} ReadOnlyAction; + +// Forward declarations required because of circular function references. +static void performCleanupStage(DataVIO *dataVIO, DataVIOCleanupStage stage); +static void writeBlock(DataVIO *dataVIO); + +/** + * Check whether we are in async mode. + * + * @param dataVIO A DataVIO containing a pointer to the VDO whose write + * policy we want to check + * + * @return true if we are in async mode + **/ +static inline bool isAsync(DataVIO *dataVIO) +{ + return (getWritePolicy(getVDOFromDataVIO(dataVIO)) != WRITE_POLICY_SYNC); +} + +/** + * Release the PBN lock and/or the reference on the allocated block at the + * end of processing a DataVIO. + * + * @param completion The DataVIO + **/ +static void releaseAllocatedLock(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInAllocatedZone(dataVIO); + releaseAllocationLock(dataVIOAsAllocatingVIO(dataVIO)); + performCleanupStage(dataVIO, VIO_RELEASE_RECOVERY_LOCKS); +} + +/** + * Release the logical block lock and flush generation lock at the end of + * processing a DataVIO. + * + * @param completion The DataVIO + **/ +static void releaseLogicalLock(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + releaseLogicalBlockLock(dataVIO); + releaseFlushGenerationLock(dataVIO); + performCleanupStage(dataVIO, VIO_CLEANUP_DONE); +} + +/** + * Release the hash lock at the end of processing a DataVIO. + * + * @param completion The DataVIO + **/ +static void cleanHashLock(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInHashZone(dataVIO); + releaseHashLock(dataVIO); + performCleanupStage(dataVIO, VIO_RELEASE_LOGICAL); +} + +/** + * Make some assertions about a DataVIO which has finished cleaning up + * and do its final callback. + * + * @param dataVIO The DataVIO which has finished cleaning up + **/ +static void finishCleanup(DataVIO *dataVIO) +{ + ASSERT_LOG_ONLY(dataVIOAsAllocatingVIO(dataVIO)->allocationLock == NULL, + "complete DataVIO has no allocation lock"); + ASSERT_LOG_ONLY(dataVIO->hashLock == NULL, + "complete DataVIO has no hash lock"); + vioDoneCallback(dataVIOAsCompletion(dataVIO)); +} + +/** + * Perform the next step in the process of cleaning up a DataVIO. 
+ * + * @param dataVIO The DataVIO to clean up + * @param stage The cleanup stage to perform + **/ +static void performCleanupStage(DataVIO *dataVIO, DataVIOCleanupStage stage) +{ + switch (stage) { + case VIO_RELEASE_ALLOCATED: + if (hasAllocation(dataVIO)) { + launchAllocatedZoneCallback(dataVIO, releaseAllocatedLock, + THIS_LOCATION("$F;cb=releaseAllocLock")); + return; + } + // fall through + + case VIO_RELEASE_RECOVERY_LOCKS: + if ((dataVIO->recoverySequenceNumber > 0) + && !isOrWillBeReadOnly(dataVIOAsVIO(dataVIO)->vdo->readOnlyNotifier) + && (dataVIOAsCompletion(dataVIO)->result != VDO_READ_ONLY)) { + logWarning("VDO not read-only when cleaning DataVIO with RJ lock"); + } + // fall through + + case VIO_RELEASE_HASH_LOCK: + if (dataVIO->hashLock != NULL) { + launchHashZoneCallback(dataVIO, cleanHashLock, + THIS_LOCATION("$F;cb=cleanHashLock")); + return; + } + // fall through + + case VIO_RELEASE_LOGICAL: + if (!isCompressedWriteDataVIO(dataVIO)) { + launchLogicalCallback(dataVIO, releaseLogicalLock, + THIS_LOCATION("$F;cb=releaseLL")); + return; + } + // fall through + + default: + finishCleanup(dataVIO); + } +} + +/** + * Return a DataVIO that encountered an error to its hash lock so it can + * update the hash lock state accordingly. This continuation is registered in + * abortOnError(), and must be called in the hash zone of the DataVIO. + * + * @param completion The completion of the DataVIO to return to its hash lock + **/ +static void finishWriteDataVIOWithError(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInHashZone(dataVIO); + continueHashLockOnError(dataVIO); +} + +/** + * Check whether a result is an error, and if so abort the DataVIO associated + * with the error. + * + * @param result The result to check + * @param dataVIO The DataVIO + * @param readOnlyAction The conditions under which the VDO should be put + * into read-only mode if the result is an error + * + * @return true if the result is an error + **/ +static bool abortOnError(int result, + DataVIO *dataVIO, + ReadOnlyAction readOnlyAction) +{ + if (result == VDO_SUCCESS) { + return false; + } + + if ((result == VDO_READ_ONLY) + || (readOnlyAction == READ_ONLY) + || ((readOnlyAction == READ_ONLY_IF_ASYNC) && isAsync(dataVIO))) { + ReadOnlyNotifier *notifier = dataVIOAsVIO(dataVIO)->vdo->readOnlyNotifier; + if (!isReadOnly(notifier)) { + if (result != VDO_READ_ONLY) { + logErrorWithStringError(result, "Preparing to enter read-only mode:" + " DataVIO for LBN %llu (becoming mapped" + " to %llu, previously mapped" + " to %llu, allocated %llu) is" + " completing with a fatal error after" + " operation %s", dataVIO->logical.lbn, + dataVIO->newMapped.pbn, dataVIO->mapped.pbn, + getDataVIOAllocation(dataVIO), + getOperationName(dataVIO)); + } + + enterReadOnlyMode(notifier, result); + } + } + + if (dataVIO->hashLock != NULL) { + launchHashZoneCallback(dataVIO, finishWriteDataVIOWithError, + THIS_LOCATION(NULL)); + } else { + finishDataVIO(dataVIO, result); + } + return true; +} + +/** + * Return a DataVIO that finished writing, compressing, or deduplicating to + * its hash lock so it can share the result with any DataVIOs waiting in the + * hash lock, or update albireo, or simply release its share of the lock. This + * continuation is registered in updateBlockMapForWrite(), + * updateBlockMapForDedupe(), and abortDeduplication(), and must be called in + * the hash zone of the DataVIO. 
+ * + * @param completion The completion of the DataVIO to return to its hash lock + **/ +static void finishWriteDataVIO(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInHashZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) { + return; + } + continueHashLock(dataVIO); +} + +/** + * Abort the data optimization process. + * + * @param dataVIO The DataVIO which does not deduplicate or compress + **/ +static void abortDeduplication(DataVIO *dataVIO) +{ + if (!hasAllocation(dataVIO)) { + // There was no space to write this block and we failed to deduplicate + // or compress it. + finishDataVIO(dataVIO, VDO_NO_SPACE); + return; + } + + if (isAsync(dataVIO)) { + // We failed to deduplicate or compress an async DataVIO, so now we need + // to actually write the data. + writeBlock(dataVIO); + return; + } + + if (dataVIO->hashLock == NULL) { + // We failed to compress a synchronous DataVIO that is a hash collision, + // which means it can't dedpe or be used for dedupe, so it's done now. + finishDataVIO(dataVIO, VDO_SUCCESS); + return; + } + + /* + * This synchronous DataVIO failed to compress and so is finished, but must + * now return to its hash lock so other DataVIOs with the same data can + * deduplicate against the uncompressed block it wrote. + */ + launchHashZoneCallback(dataVIO, finishWriteDataVIO, THIS_LOCATION(NULL)); +} + +/** + * Update the block map now that we've added an entry in the recovery journal + * for a block we have just shared. This is the callback registered in + * decrementForDedupe(). + * + * @param completion The completion of the write in progress + **/ +static void updateBlockMapForDedupe(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + if (dataVIO->hashLock != NULL) { + setHashZoneCallback(dataVIO, finishWriteDataVIO, THIS_LOCATION(NULL)); + } else { + completion->callback = completeDataVIO; + } + dataVIO->lastAsyncOperation = PUT_MAPPED_BLOCK_FOR_DEDUPE; + putMappedBlockAsync(dataVIO); +} + +/** + * Make a recovery journal increment. + * + * @param dataVIO The DataVIO + * @param lock The PBNLock on the block being incremented + **/ +static void journalIncrement(DataVIO *dataVIO, PBNLock *lock) +{ + setUpReferenceOperationWithLock(DATA_INCREMENT, dataVIO->newMapped.pbn, + dataVIO->newMapped.state, lock, + &dataVIO->operation); + addRecoveryJournalEntry(getVDOFromDataVIO(dataVIO)->recoveryJournal, + dataVIO); +} + +/** + * Make a recovery journal decrement entry. + * + * @param dataVIO The DataVIO + **/ +static void journalDecrement(DataVIO *dataVIO) +{ + setUpReferenceOperationWithZone(DATA_DECREMENT, dataVIO->mapped.pbn, + dataVIO->mapped.state, dataVIO->mapped.zone, + &dataVIO->operation); + addRecoveryJournalEntry(getVDOFromDataVIO(dataVIO)->recoveryJournal, + dataVIO); +} + +/** + * Make a reference count change. + * + * @param dataVIO The DataVIO + **/ +static void updateReferenceCount(DataVIO *dataVIO) +{ + SlabDepot *depot = getVDOFromDataVIO(dataVIO)->depot; + PhysicalBlockNumber pbn = dataVIO->operation.pbn; + int result = ASSERT(isPhysicalDataBlock(depot, pbn), + "Adding slab journal entry for impossible PBN %" PRIu64 + "for LBN %llu", pbn, dataVIO->logical.lbn); + if (abortOnError(result, dataVIO, READ_ONLY)) { + return; + } + + addSlabJournalEntry(getSlabJournal(depot, pbn), dataVIO); +} + +/** + * Do the decref after a successful dedupe or compression. 
This is the callback + * registered by journalUnmappingForDedupe(). + * + * @param completion The completion of the write in progress + **/ +static void decrementForDedupe(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInMappedZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + AllocatingVIO *allocatingVIO = dataVIOAsAllocatingVIO(dataVIO); + if (allocatingVIO->allocation == dataVIO->mapped.pbn) { + /* + * If we are about to release the reference on the allocated block, + * we must release the PBN lock on it first so that the allocator will + * not allocate a write-locked block. + */ + releaseAllocationLock(allocatingVIO); + } + + setLogicalCallback(dataVIO, updateBlockMapForDedupe, + THIS_LOCATION("$F;js=dec")); + dataVIO->lastAsyncOperation = JOURNAL_DECREMENT_FOR_DEDUPE; + updateReferenceCount(dataVIO); +} + +/** + * Write the appropriate journal entry for removing the mapping of logical to + * mapped, for dedupe or compression. This is the callback registered in + * readOldBlockMappingForDedupe(). + * + * @param completion The completion of the write in progress + **/ +static void journalUnmappingForDedupe(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInJournalZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + if (dataVIO->mapped.pbn == ZERO_BLOCK) { + setLogicalCallback(dataVIO, updateBlockMapForDedupe, + THIS_LOCATION("$F;j=dedupe;js=unmap;cb=updateBM")); + } else { + setMappedZoneCallback(dataVIO, decrementForDedupe, + THIS_LOCATION("$F;j=dedupe;js=unmap;cb=decDedupe")); + } + dataVIO->lastAsyncOperation = JOURNAL_UNMAPPING_FOR_DEDUPE; + journalDecrement(dataVIO); +} + +/** + * Get the previous PBN mapped to this LBN from the block map, so as to make + * an appropriate journal entry referencing the removal of this LBN->PBN + * mapping, for dedupe or compression. This callback is registered in + * incrementForDedupe(). + * + * @param completion The completion of the write in progress + **/ +static void readOldBlockMappingForDedupe(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + dataVIO->lastAsyncOperation = GET_MAPPED_BLOCK_FOR_DEDUPE; + setJournalCallback(dataVIO, journalUnmappingForDedupe, + THIS_LOCATION("$F;cb=journalUnmapDedupe")); + getMappedBlockAsync(dataVIO); +} + +/** + * Do the incref after compression. This is the callback registered by + * addRecoveryJournalEntryForCompression(). + * + * @param completion The completion of the write in progress + **/ +static void incrementForCompression(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInNewMappedZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + ASSERT_LOG_ONLY(isCompressed(dataVIO->newMapped.state), + "Impossible attempt to update reference counts for a block " + "which was not compressed (logical block %llu)", + dataVIO->logical.lbn); + + /* + * If we are synchronous and allocated a block, we know the one we + * allocated is the block we need to decrement, so there is no need + * to look in the block map. 
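+   * In the asynchronous case, or when there was no allocation, the old
+   * mapping must instead be read back from the block map first (via
+   * readOldBlockMappingForDedupe()).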
+ */ + if (isAsync(dataVIO) || !hasAllocation(dataVIO)) { + setLogicalCallback(dataVIO, readOldBlockMappingForDedupe, + THIS_LOCATION("$F;cb=readOldBlockMappingForDedupe")); + } else { + setJournalCallback(dataVIO, journalUnmappingForDedupe, + THIS_LOCATION("$F;cb=journalUnmappingForDedupe")); + } + dataVIO->lastAsyncOperation = JOURNAL_INCREMENT_FOR_COMPRESSION; + updateReferenceCount(dataVIO); +} + +/** + * Add a recovery journal entry for the increment resulting from compression. + * + * @param completion The DataVIO which has been compressed + **/ +static void addRecoveryJournalEntryForCompression(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInJournalZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) { + return; + } + + if (!isCompressed(dataVIO->newMapped.state)) { + abortDeduplication(dataVIO); + return; + } + + setNewMappedZoneCallback(dataVIO, incrementForCompression, + THIS_LOCATION("$F($dup);js=map/$dup;" + "cb=incCompress($dup)")); + dataVIO->lastAsyncOperation = JOURNAL_MAPPING_FOR_COMPRESSION; + journalIncrement(dataVIO, getDuplicateLock(dataVIO)); +} + +/** + * Attempt to pack the compressed DataVIO into a block. This is the callback + * registered in compressData(). + * + * @param completion The completion of a compressed DataVIO + **/ +static void packCompressedData(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInPackerZone(dataVIO); + + // XXX this is a callback, so there should probably be an error check here + // even if we think compression can't currently return one. + + if (!mayPackDataVIO(dataVIO)) { + abortDeduplication(dataVIO); + return; + } + + setJournalCallback(dataVIO, addRecoveryJournalEntryForCompression, + THIS_LOCATION("$F;cb=update(compress)")); + dataVIO->lastAsyncOperation = PACK_COMPRESSED_BLOCK; + attemptPacking(dataVIO); +} + +/**********************************************************************/ +void compressData(DataVIO *dataVIO) +{ + ASSERT_LOG_ONLY(!dataVIO->isDuplicate, + "compressing a non-duplicate block"); + if (!mayCompressDataVIO(dataVIO)) { + abortDeduplication(dataVIO); + return; + } + + dataVIO->lastAsyncOperation = COMPRESS_DATA; + setPackerCallback(dataVIO, packCompressedData, THIS_LOCATION("$F;cb=pack")); + dataVIOAsCompletion(dataVIO)->layer->compressDataVIO(dataVIO); +} + +/** + * Do the incref after deduplication. This is the callback registered by + * addRecoveryJournalEntryForDedupe(). + * + * @param completion The completion of the write in progress + **/ +static void incrementForDedupe(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInNewMappedZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + ASSERT_LOG_ONLY(dataVIO->isDuplicate, + "Impossible attempt to update reference counts for a block " + "which was not a duplicate (logical block %llu)", + dataVIO->logical.lbn); + + /* + * If we are synchronous and allocated a block, we know the one we + * allocated is the block we need to decrement, so there is no need + * to look in the block map. 
+ */ + if (isAsync(dataVIO) || !hasAllocation(dataVIO)) { + setLogicalCallback(dataVIO, readOldBlockMappingForDedupe, + THIS_LOCATION("$F;cb=readOldBlockMappingForDedupe")); + } else { + setJournalCallback(dataVIO, journalUnmappingForDedupe, + THIS_LOCATION("$F;cb=journalUnmappingForDedupe")); + } + dataVIO->lastAsyncOperation = JOURNAL_INCREMENT_FOR_DEDUPE; + updateReferenceCount(dataVIO); +} + +/** + * Add a recovery journal entry for the increment resulting from deduplication. + * This callback is registered in shareBlock(). + * + * @param completion The DataVIO which has been deduplicated + **/ +static void addRecoveryJournalEntryForDedupe(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInJournalZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) { + return; + } + + setNewMappedZoneCallback(dataVIO, incrementForDedupe, + THIS_LOCATION("$F($dup);js=map/$dup;" + "cb=incDedupe($dup)")); + dataVIO->lastAsyncOperation = JOURNAL_MAPPING_FOR_DEDUPE; + journalIncrement(dataVIO, getDuplicateLock(dataVIO)); +} + +/** + * Share a block in the block map if it is a duplicate. This is the lock + * callback registered in acquirePBNReadLock(). This is only public so + * test code can compare the function to the current callback in a completion. + * + * @param completion The completion of the write in progress + **/ +void shareBlock(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInDuplicateZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) { + return; + } + + if (!dataVIO->isDuplicate) { + compressData(dataVIO); + return; + } + + dataVIO->newMapped = dataVIO->duplicate; + launchJournalCallback(dataVIO, addRecoveryJournalEntryForDedupe, + THIS_LOCATION("$F;cb=addJournalEntryDup")); +} + +/** + * Route the DataVIO to the HashZone responsible for the chunk name to acquire + * a hash lock on that name, or join with a existing hash lock managing + * concurrent dedupe for that name. This is the callback registered in + * resolveHashZone(). + * + * @param completion The DataVIO to lock + **/ +static void lockHashInZone(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInHashZone(dataVIO); + // Shouldn't have had any errors since all we did was switch threads. + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + int result = acquireHashLock(dataVIO); + if (abortOnError(result, dataVIO, READ_ONLY)) { + return; + } + + if (dataVIO->hashLock == NULL) { + // It's extremely unlikely, but in the case of a hash collision, the + // DataVIO will not obtain a reference to the lock and cannot deduplicate. + compressData(dataVIO); + return; + } + + enterHashLock(dataVIO); +} + +/** + * Set the hash zone (and flag the chunk name as set) while still on the + * thread that just hashed the data to set the chunk name. This is the + * callback registered by prepareForDedupe(). + * + * @param completion The DataVIO whose chunk name was just generated, as a + * completion + **/ +static void resolveHashZone(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + // We don't care what thread we are on. 
+ if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + ASSERT_LOG_ONLY(!dataVIO->isZeroBlock, "zero blocks should not be hashed"); + + dataVIO->hashZone + = selectHashZone(getVDOFromDataVIO(dataVIO), &dataVIO->chunkName); + dataVIO->lastAsyncOperation = ACQUIRE_HASH_LOCK; + launchHashZoneCallback(dataVIO, lockHashInZone, THIS_LOCATION(NULL)); +} + +/** + * Prepare for the dedupe path after a synchronous write or an asynchronous + * allocation. This callback is registered in updateBlockMapForWrite() for + * sync, and continueWriteAfterAllocation() (via acknowledgeWrite()) for + * async. It is also called directly from the latter when allocation fails. + * + * @param completion The completion of the write in progress + **/ +static void prepareForDedupe(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + // We don't care what thread we are on + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + if (!isAsync(dataVIO)) { + // Remember which block we wrote so we will decrement the reference to it + // if we deduplicate. This avoids having to look it up in the block map. + dataVIO->mapped = dataVIO->newMapped; + } + + ASSERT_LOG_ONLY(!dataVIO->isZeroBlock, + "must not prepare to dedupe zero blocks"); + + // Before we can dedupe, we need to know the chunk name, so the first step + // is to hash the block data. + dataVIO->lastAsyncOperation = HASH_DATA; + // XXX this is the wrong thread to run this callback, but we don't yet have + // a mechanism for running it on the CPU thread immediately after hashing. + setAllocatedZoneCallback(dataVIO, resolveHashZone, THIS_LOCATION(NULL)); + completion->layer->hashData(dataVIO); +} + +/** + * Update the block map after a data write (or directly for a ZERO_BLOCK write + * or trim). This callback is registered in decrementForWrite() and + * journalUnmappingForWrite(). + * + * @param completion The completion of the write in progress + **/ +static void updateBlockMapForWrite(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + if (dataVIO->isZeroBlock || isTrimDataVIO(dataVIO)) { + completion->callback = completeDataVIO; + } else if (!isAsync(dataVIO)) { + // Synchronous DataVIOs branch off to the hash/dedupe path after finishing + // the uncompressed write of their data. + completion->callback = prepareForDedupe; + } else if (dataVIO->hashLock != NULL) { + // Async writes will be finished, but must return to the hash lock to + // allow other DataVIOs with the same data to dedupe against the write. + setHashZoneCallback(dataVIO, finishWriteDataVIO, THIS_LOCATION(NULL)); + } else { + // Async writes without a hash lock (hash collisions) will be finished. + completion->callback = completeDataVIO; + } + + dataVIO->lastAsyncOperation = PUT_MAPPED_BLOCK; + putMappedBlockAsync(dataVIO); +} + +/** + * Do the decref after a successful block write. This is the callback + * by journalUnmappingForWrite() if the old mapping was not the zero block. 
+ * + * @param completion The completion of the write in progress + **/ +static void decrementForWrite(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInMappedZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + dataVIO->lastAsyncOperation = JOURNAL_DECREMENT_FOR_WRITE; + setLogicalCallback(dataVIO, updateBlockMapForWrite, THIS_LOCATION(NULL)); + updateReferenceCount(dataVIO); +} + +/** + * Write the appropriate journal entry for unmapping logical to mapped for a + * write. This is the callback registered in readOldBlockMappingForWrite(). + * + * @param completion The completion of the write in progress + **/ +static void journalUnmappingForWrite(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInJournalZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + if (dataVIO->mapped.pbn == ZERO_BLOCK) { + setLogicalCallback(dataVIO, updateBlockMapForWrite, + THIS_LOCATION("$F;js=unmap;cb=updateBMwrite")); + } else { + setMappedZoneCallback(dataVIO, decrementForWrite, + THIS_LOCATION("$F;js=unmap;cb=decWrite")); + } + dataVIO->lastAsyncOperation = JOURNAL_UNMAPPING_FOR_WRITE; + journalDecrement(dataVIO); +} + +/** + * Get the previous PBN mapped to this LBN from the block map for a write, so + * as to make an appropriate journal entry referencing the removal of this + * LBN->PBN mapping. This callback is registered in finishBlockWrite() in the + * async path, and is registered in acknowledgeWrite() in the sync path. + * + * @param completion The completion of the write in progress + **/ +static void readOldBlockMappingForWrite(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInLogicalZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + setJournalCallback(dataVIO, journalUnmappingForWrite, + THIS_LOCATION("$F;cb=journalUnmapWrite")); + dataVIO->lastAsyncOperation = GET_MAPPED_BLOCK_FOR_WRITE; + getMappedBlockAsync(dataVIO); +} + +/** + * Acknowledge a write to the requestor. + * + * @param dataVIO The DataVIO being acknowledged + **/ +static void acknowledgeWrite(DataVIO *dataVIO) +{ + ASSERT_LOG_ONLY(dataVIO->hasFlushGenerationLock, + "write VIO to be acknowledged has a flush generation lock"); + dataVIO->lastAsyncOperation = ACKNOWLEDGE_WRITE; + dataVIOAsCompletion(dataVIO)->layer->acknowledgeDataVIO(dataVIO); +} + +/** + * Acknowledge a write now that we have made an entry in the recovery + * journal. This is the callback registered in finishBlockWrite() in + * synchronous mode. + * + * @param completion The completion of the write in progress + **/ +static void acknowledgeWriteCallback(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + if (abortOnError(completion->result, dataVIO, READ_ONLY)) { + return; + } + + setLogicalCallback(dataVIO, readOldBlockMappingForWrite, + THIS_LOCATION(NULL)); + acknowledgeWrite(dataVIO); +} + +/**********************************************************************/ +static VDOAction *getWriteIncrementCallback(DataVIO *dataVIO) +{ + return (isAsync(dataVIO) + ? readOldBlockMappingForWrite : acknowledgeWriteCallback); +} + +/** + * Do the incref after a successful block write. This is the callback + * registered by finishBlockWrite(). 
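+ *
+ * Besides adding the slab journal entry for the increment, this step also
+ * downgrades the allocation PBN write lock to a read lock (see the body
+ * below) so that other DataVIOs can deduplicate against the newly written
+ * block.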
+ * + * @param completion The completion of the write in progress + **/ +static void incrementForWrite(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInAllocatedZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) { + return; + } + + /* + * Now that the data has been written, it's safe to deduplicate against the + * block. Downgrade the allocation lock to a read lock so it can be used + * later by the hash lock (which we don't have yet in sync mode). + */ + downgradePBNWriteLock(dataVIOAsAllocatingVIO(dataVIO)->allocationLock); + + dataVIO->lastAsyncOperation = JOURNAL_INCREMENT_FOR_WRITE; + setLogicalCallback(dataVIO, getWriteIncrementCallback(dataVIO), + THIS_LOCATION(NULL)); + updateReferenceCount(dataVIO); +} + +/** + * Add an entry in the recovery journal after a successful block write. This is + * the callback registered by writeBlock(). It is also registered in + * allocateBlockForWrite(). + * + * @param completion The completion of the write in progress + **/ +static void finishBlockWrite(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + assertInJournalZone(dataVIO); + if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) { + return; + } + + if (dataVIO->newMapped.pbn == ZERO_BLOCK) { + setLogicalCallback(dataVIO, getWriteIncrementCallback(dataVIO), + THIS_LOCATION("$F;js=writeZero")); + } else { + setAllocatedZoneCallback(dataVIO, incrementForWrite, + THIS_LOCATION("$F;js=mapWrite")); + } + dataVIO->lastAsyncOperation = JOURNAL_MAPPING_FOR_WRITE; + journalIncrement(dataVIO, dataVIOAsAllocatingVIO(dataVIO)->allocationLock); +} + +/** + * Write data to the underlying storage. + * + * @param dataVIO The DataVIO to write + **/ +static void writeBlock(DataVIO *dataVIO) +{ + dataVIO->lastAsyncOperation = WRITE_DATA; + setJournalCallback(dataVIO, finishBlockWrite, + THIS_LOCATION("$F(data);cb=finishWrite")); + dataVIOAsCompletion(dataVIO)->layer->writeData(dataVIO); +} + +/** + * Continue the write path for a DataVIO now that block allocation is complete + * (the DataVIO may or may not have actually received an allocation). This + * callback is registered in continueWriteWithBlockMapSlot(). + * + * @param allocatingVIO The DataVIO which has finished the allocation process + * (as an AllocatingVIO) + **/ +static void continueWriteAfterAllocation(AllocatingVIO *allocatingVIO) +{ + DataVIO *dataVIO = allocatingVIOAsDataVIO(allocatingVIO); + if (abortOnError(dataVIOAsCompletion(dataVIO)->result, dataVIO, + NOT_READ_ONLY)) { + return; + } + + if (!hasAllocation(dataVIO)) { + prepareForDedupe(dataVIOAsCompletion(dataVIO)); + return; + } + + atomicStoreBool(&dataVIO->hasAllocation, true); + dataVIO->newMapped = (ZonedPBN) { + .zone = allocatingVIO->zone, + .pbn = allocatingVIO->allocation, + .state = MAPPING_STATE_UNCOMPRESSED, + }; + + if (!isAsync(dataVIO)) { + writeBlock(dataVIO); + return; + } + + // XXX prepareForDedupe can run from any thread, so this is a place where + // running the callback on the kernel thread would save a thread switch. + setAllocatedZoneCallback(dataVIO, prepareForDedupe, THIS_LOCATION(NULL)); + if (vioRequiresFlushAfter(allocatingVIOAsVIO(allocatingVIO))) { + invokeCallback(dataVIOAsCompletion(dataVIO)); + return; + } + + acknowledgeWrite(dataVIO); +} + +/** + * Continue the write path for a VIO now that block map slot resolution is + * complete. This callback is registered in launchWriteDataVIO(). 
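+ * In outline, the cases handled below are: a trim whose block map page was
+ * never allocated simply finishes; a zero block or trim with an allocated
+ * page skips data allocation and goes straight to the journal with a
+ * ZERO_BLOCK mapping; any other write allocates a data block and continues
+ * in continueWriteAfterAllocation().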
+ * + * @param completion The DataVIO to write + **/ +static void continueWriteWithBlockMapSlot(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + // We don't care what thread we're on. + if (abortOnError(completion->result, dataVIO, NOT_READ_ONLY)) { + return; + } + + if (dataVIO->treeLock.treeSlots[0].blockMapSlot.pbn == ZERO_BLOCK) { + int result = ASSERT(isTrimDataVIO(dataVIO), + "dataVIO with no block map page is a trim"); + if (abortOnError(result, dataVIO, READ_ONLY)) { + return; + } + + // This is a trim for a block on a block map page which has not been + // allocated, so there's nothing more we need to do. + finishDataVIO(dataVIO, VDO_SUCCESS); + return; + } + + if (dataVIO->isZeroBlock || isTrimDataVIO(dataVIO)) { + // We don't need to write any data, so skip allocation and just update + // the block map and reference counts (via the journal). + dataVIO->newMapped.pbn = ZERO_BLOCK; + launchJournalCallback(dataVIO, finishBlockWrite, + THIS_LOCATION("$F;cb=finishWrite")); + return; + } + + allocateDataBlock(dataVIOAsAllocatingVIO(dataVIO), + getAllocationSelector(dataVIO->logical.zone), + VIO_WRITE_LOCK, continueWriteAfterAllocation); +} + +/**********************************************************************/ +void launchWriteDataVIO(DataVIO *dataVIO) +{ + if (isReadOnly(dataVIOAsVIO(dataVIO)->vdo->readOnlyNotifier)) { + finishDataVIO(dataVIO, VDO_READ_ONLY); + return; + } + + // Write requests join the current flush generation. + int result = acquireFlushGenerationLock(dataVIO); + if (abortOnError(result, dataVIO, NOT_READ_ONLY)) { + return; + } + + // Go find the block map slot for the LBN mapping. + dataVIO->lastAsyncOperation = FIND_BLOCK_MAP_SLOT; + findBlockMapSlotAsync(dataVIO, continueWriteWithBlockMapSlot, + getLogicalZoneThreadID(dataVIO->logical.zone)); +} + +/**********************************************************************/ +void cleanupWriteDataVIO(DataVIO *dataVIO) +{ + performCleanupStage(dataVIO, VIO_CLEANUP_START); +} diff --git a/source/vdo/base/vioWrite.h b/source/vdo/base/vioWrite.h new file mode 100644 index 0000000..6effc91 --- /dev/null +++ b/source/vdo/base/vioWrite.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioWrite.h#1 $ + */ + +#ifndef VIO_WRITE_H +#define VIO_WRITE_H + +#include "types.h" + +/** + * Release the PBN read lock if it is held. + * + * @param dataVIO The possible lock holder + **/ +void releasePBNReadLock(DataVIO *dataVIO); + +/** + * Start the asynchronous processing of a DataVIO for a write request which has + * acquired a lock on its logical block by joining the current flush generation + * and then attempting to allocate a physical block. 
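+ * Roughly, the path implemented in vioWrite.c is: find the block map slot
+ * for the LBN, allocate a physical block for non-zero, non-trim data, write
+ * or deduplicate the data, journal and apply the reference count increment
+ * for the new mapping, journal and apply the decrement for the old mapping,
+ * and finally update the block map.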
+ * + * @param dataVIO The DataVIO doing the write + **/ +void launchWriteDataVIO(DataVIO *dataVIO); + +/** + * Clean up a DataVIO which has finished processing a write. + * + * @param dataVIO The DataVIO to clean up + **/ +void cleanupWriteDataVIO(DataVIO *dataVIO); + +/** + * Continue a write by attempting to compress the data. This is a re-entry + * point to vioWrite used by hash locks. + * + * @param dataVIO The DataVIO to be compressed + **/ +void compressData(DataVIO *dataVIO); + +#endif /* VIO_WRITE_H */ diff --git a/source/vdo/base/volumeGeometry.c b/source/vdo/base/volumeGeometry.c new file mode 100644 index 0000000..32b2e5f --- /dev/null +++ b/source/vdo/base/volumeGeometry.c @@ -0,0 +1,564 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/volumeGeometry.c#10 $ + */ + +#include "volumeGeometry.h" + +#include "buffer.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" +#include "permassert.h" + +#include "constants.h" +#include "header.h" +#include "physicalLayer.h" +#include "releaseVersions.h" +#include "statusCodes.h" +#include "types.h" + +enum { + GEOMETRY_BLOCK_LOCATION = 0, + MAGIC_NUMBER_SIZE = 8, +}; + +typedef struct { + char magicNumber[MAGIC_NUMBER_SIZE]; + Header header; + VolumeGeometry geometry; + CRC32Checksum checksum; +} __attribute__((packed)) GeometryBlock; + +static const Header GEOMETRY_BLOCK_HEADER_4_0 = { + .id = GEOMETRY_BLOCK, + .version = { + .majorVersion = 4, + .minorVersion = 0, + }, + // Note: this size isn't just the payload size following the header, like it + // is everywhere else in VDO. + .size = sizeof(GeometryBlock), +}; + +static const byte MAGIC_NUMBER[MAGIC_NUMBER_SIZE + 1] = "dmvdo001"; + +static const ReleaseVersionNumber COMPATIBLE_RELEASE_VERSIONS[] = { + MAGNESIUM_RELEASE_VERSION_NUMBER, +}; + +/** + * Determine whether the supplied release version can be understood by + * the VDO code. + * + * @param version The release version number to check + * + * @return True if the given version can be loaded. + **/ +static inline bool isLoadableReleaseVersion(ReleaseVersionNumber version) +{ + if (version == CURRENT_RELEASE_VERSION_NUMBER) { + return true; + } + + for (unsigned int i = 0; i < COUNT_OF(COMPATIBLE_RELEASE_VERSIONS); i++) { + if (version == COMPATIBLE_RELEASE_VERSIONS[i]) { + return true; + } + } + + return false; +} + +/** + * Decode the on-disk representation of an index configuration from a buffer. 
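+ * The encoded form, matching encodeIndexConfig(), is the memory size and the
+ * checkpoint frequency as 32-bit little-endian values, followed by the
+ * sparse flag as a boolean.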
+ * + * @param buffer A buffer positioned at the start of the encoding + * @param config The structure to receive the decoded fields + * + * @return UDS_SUCCESS or an error + **/ +static int decodeIndexConfig(Buffer *buffer, IndexConfig *config) +{ + uint32_t mem; + int result = getUInt32LEFromBuffer(buffer, &mem); + if (result != VDO_SUCCESS) { + return result; + } + + uint32_t checkpointFrequency; + result = getUInt32LEFromBuffer(buffer, &checkpointFrequency); + if (result != VDO_SUCCESS) { + return result; + } + + bool sparse; + result = getBoolean(buffer, &sparse); + if (result != VDO_SUCCESS) { + return result; + } + + *config = (IndexConfig) { + .mem = mem, + .checkpointFrequency = checkpointFrequency, + .sparse = sparse, + }; + return VDO_SUCCESS; +} + +/** + * Encode the on-disk representation of an index configuration into a buffer. + * + * @param config The index configuration to encode + * @param buffer A buffer positioned at the start of the encoding + * + * @return UDS_SUCCESS or an error + **/ +static int encodeIndexConfig(const IndexConfig *config, Buffer *buffer) +{ + int result = putUInt32LEIntoBuffer(buffer, config->mem); + if (result != VDO_SUCCESS) { + return result; + } + + result = putUInt32LEIntoBuffer(buffer, config->checkpointFrequency); + if (result != VDO_SUCCESS) { + return result; + } + + return putBoolean(buffer, config->sparse); +} + +/** + * Decode the on-disk representation of a volume region from a buffer. + * + * @param buffer A buffer positioned at the start of the encoding + * @param region The structure to receive the decoded fields + * + * @return UDS_SUCCESS or an error + **/ +static int decodeVolumeRegion(Buffer *buffer, VolumeRegion *region) +{ + VolumeRegionID id; + int result = getUInt32LEFromBuffer(buffer, &id); + if (result != VDO_SUCCESS) { + return result; + } + + PhysicalBlockNumber startBlock; + result = getUInt64LEFromBuffer(buffer, &startBlock); + if (result != VDO_SUCCESS) { + return result; + } + + *region = (VolumeRegion) { + .id = id, + .startBlock = startBlock, + }; + return VDO_SUCCESS; +} + +/** + * Encode the on-disk representation of a volume region into a buffer. + * + * @param region The region to encode + * @param buffer A buffer positioned at the start of the encoding + * + * @return UDS_SUCCESS or an error + **/ +static int encodeVolumeRegion(const VolumeRegion *region, Buffer *buffer) +{ + int result = putUInt32LEIntoBuffer(buffer, region->id); + if (result != VDO_SUCCESS) { + return result; + } + + return putUInt64LEIntoBuffer(buffer, region->startBlock); +} + +/** + * Decode the on-disk representation of a volume geometry from a buffer. 
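+ * The encoded form, matching encodeVolumeGeometry(), is the release version
+ * (32-bit little-endian), the nonce (64-bit little-endian), the 16-byte
+ * UUID, one encoded VolumeRegion per region in ID order, and finally the
+ * encoded IndexConfig.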
+ * + * @param buffer A buffer positioned at the start of the encoding + * @param geometry The structure to receive the decoded fields + * + * @return UDS_SUCCESS or an error + **/ +static int decodeVolumeGeometry(Buffer *buffer, VolumeGeometry *geometry) +{ + ReleaseVersionNumber releaseVersion; + int result = getUInt32LEFromBuffer(buffer, &releaseVersion); + if (result != VDO_SUCCESS) { + return result; + } + + Nonce nonce; + result = getUInt64LEFromBuffer(buffer, &nonce); + if (result != VDO_SUCCESS) { + return result; + } + + geometry->releaseVersion = releaseVersion; + geometry->nonce = nonce; + + result = getBytesFromBuffer(buffer, sizeof(UUID), geometry->uuid); + if (result != VDO_SUCCESS) { + return result; + } + + for (VolumeRegionID id = 0; id < VOLUME_REGION_COUNT; id++) { + result = decodeVolumeRegion(buffer, &geometry->regions[id]); + if (result != VDO_SUCCESS) { + return result; + } + } + + return decodeIndexConfig(buffer, &geometry->indexConfig); +} + +/** + * Encode the on-disk representation of a volume geometry into a buffer. + * + * @param geometry The geometry to encode + * @param buffer A buffer positioned at the start of the encoding + * + * @return UDS_SUCCESS or an error + **/ +static int encodeVolumeGeometry(const VolumeGeometry *geometry, Buffer *buffer) +{ + int result = putUInt32LEIntoBuffer(buffer, geometry->releaseVersion); + if (result != VDO_SUCCESS) { + return result; + } + + result = putUInt64LEIntoBuffer(buffer, geometry->nonce); + if (result != VDO_SUCCESS) { + return result; + } + + result = putBytes(buffer, sizeof(UUID), geometry->uuid); + if (result != VDO_SUCCESS) { + return result; + } + + for (VolumeRegionID id = 0; id < VOLUME_REGION_COUNT; id++) { + result = encodeVolumeRegion(&geometry->regions[id], buffer); + if (result != VDO_SUCCESS) { + return result; + } + } + + return encodeIndexConfig(&geometry->indexConfig, buffer); +} + +/** + * Decode the on-disk representation of a geometry block, up to but not + * including the checksum, from a buffer. + * + * @param buffer A buffer positioned at the start of the block + * @param geometry The structure to receive the decoded volume geometry fields + * + * @return UDS_SUCCESS or an error + **/ +static int decodeGeometryBlock(Buffer *buffer, VolumeGeometry *geometry) +{ + if (!hasSameBytes(buffer, MAGIC_NUMBER, MAGIC_NUMBER_SIZE)) { + return VDO_BAD_MAGIC; + } + + int result = skipForward(buffer, MAGIC_NUMBER_SIZE); + if (result != VDO_SUCCESS) { + return result; + } + + Header header; + result = decodeHeader(buffer, &header); + if (result != VDO_SUCCESS) { + return result; + } + + result = validateHeader(&GEOMETRY_BLOCK_HEADER_4_0, &header, true, __func__); + if (result != VDO_SUCCESS) { + return result; + } + + result = decodeVolumeGeometry(buffer, geometry); + if (result != VDO_SUCCESS) { + return result; + } + + // Leave the CRC for the caller to decode and verify. + return ASSERT(header.size + == (uncompactedAmount(buffer) + sizeof(CRC32Checksum)), + "should have decoded up to the geometry checksum"); +} + +/** + * Encode the on-disk representation of a geometry block, up to but not + * including the checksum, into a buffer. 
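+ * The resulting layout matches the GeometryBlock structure: the 8-byte magic
+ * number "dmvdo001", the encoded header, the encoded volume geometry, and
+ * (appended afterwards by the caller) the CRC-32 checksum.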
+ * + * @param geometry The volume geometry to encode into the block + * @param buffer A buffer positioned at the start of the block + * + * @return UDS_SUCCESS or an error + **/ +static int encodeGeometryBlock(const VolumeGeometry *geometry, Buffer *buffer) +{ + int result = putBytes(buffer, MAGIC_NUMBER_SIZE, MAGIC_NUMBER); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeHeader(&GEOMETRY_BLOCK_HEADER_4_0, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + result = encodeVolumeGeometry(geometry, buffer); + if (result != VDO_SUCCESS) { + return result; + } + + // Leave the CRC for the caller to compute and encode. + return ASSERT(GEOMETRY_BLOCK_HEADER_4_0.size + == (contentLength(buffer) + sizeof(CRC32Checksum)), + "should have decoded up to the geometry checksum"); +} + +/** + * Allocate a block-size buffer to read the geometry from the physical layer, + * read the block, and return the buffer. + * + * @param [in] layer The physical layer containing the block to read + * @param [out] blockPtr A pointer to receive the allocated buffer + * + * @return VDO_SUCCESS or an error code + **/ +static int readGeometryBlock(PhysicalLayer *layer, byte **blockPtr) +{ + int result = ASSERT(layer->reader != NULL, "Layer must have a sync reader"); + if (result != VDO_SUCCESS) { + return result; + } + + char *block; + result = layer->allocateIOBuffer(layer, VDO_BLOCK_SIZE, "geometry block", + &block); + if (result != VDO_SUCCESS) { + return result; + } + + result = layer->reader(layer, GEOMETRY_BLOCK_LOCATION, 1, block, NULL); + if (result != VDO_SUCCESS) { + FREE(block); + return result; + } + + *blockPtr = (byte *) block; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int loadVolumeGeometry(PhysicalLayer *layer, VolumeGeometry *geometry) +{ + byte *block; + int result = readGeometryBlock(layer, &block); + if (result != VDO_SUCCESS) { + return result; + } + + Buffer *buffer; + result = wrapBuffer(block, VDO_BLOCK_SIZE, VDO_BLOCK_SIZE, &buffer); + if (result != VDO_SUCCESS) { + FREE(block); + return result; + } + + result = decodeGeometryBlock(buffer, geometry); + if (result != VDO_SUCCESS) { + freeBuffer(&buffer); + FREE(block); + return result; + } + + // Checksum everything decoded so far. + CRC32Checksum checksum = layer->updateCRC32(INITIAL_CHECKSUM, block, + uncompactedAmount(buffer)); + CRC32Checksum savedChecksum; + result = getUInt32LEFromBuffer(buffer, &savedChecksum); + if (result != VDO_SUCCESS) { + freeBuffer(&buffer); + FREE(block); + return result; + } + + // Finished all decoding. Everything that follows is validation code. + freeBuffer(&buffer); + FREE(block); + + if (!isLoadableReleaseVersion(geometry->releaseVersion)) { + return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, + "release version %d cannot be loaded", + geometry->releaseVersion); + } + + return ((checksum == savedChecksum) ? 
VDO_SUCCESS : VDO_CHECKSUM_MISMATCH); +} + +/************************************************************************/ +int computeIndexBlocks(IndexConfig *indexConfig, BlockCount *indexBlocksPtr) +{ + UdsConfiguration udsConfiguration = NULL; + int result = indexConfigToUdsConfiguration(indexConfig, &udsConfiguration); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "error creating index config"); + } + + uint64_t indexBytes; + result = udsComputeIndexSize(udsConfiguration, 0, &indexBytes); + udsFreeConfiguration(udsConfiguration); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "error computing index size"); + } + + BlockCount indexBlocks = indexBytes / VDO_BLOCK_SIZE; + if ((((uint64_t) indexBlocks) * VDO_BLOCK_SIZE) != indexBytes) { + return logErrorWithStringError(VDO_PARAMETER_MISMATCH, "index size must be" + " a multiple of block size %d", + VDO_BLOCK_SIZE); + } + + *indexBlocksPtr = indexBlocks; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int initializeVolumeGeometry(Nonce nonce, + UUID uuid, + IndexConfig *indexConfig, + VolumeGeometry *geometry) +{ + BlockCount indexSize = 0; + if (indexConfig != NULL) { + int result = computeIndexBlocks(indexConfig, &indexSize); + if (result != VDO_SUCCESS) { + return result; + } + } + + *geometry = (VolumeGeometry) { + .releaseVersion = CURRENT_RELEASE_VERSION_NUMBER, + .nonce = nonce, + .regions = { + [INDEX_REGION] = { + .id = INDEX_REGION, + .startBlock = 1, + }, + [DATA_REGION] = { + .id = DATA_REGION, + .startBlock = 1 + indexSize, + } + } + }; + memcpy(geometry->uuid, uuid, sizeof(UUID)); + if (indexSize > 0) { + memcpy(&geometry->indexConfig, indexConfig, sizeof(IndexConfig)); + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int clearVolumeGeometry(PhysicalLayer *layer) +{ + char *block; + int result = layer->allocateIOBuffer(layer, VDO_BLOCK_SIZE, "geometry block", + &block); + if (result != VDO_SUCCESS) { + return result; + } + + result = layer->writer(layer, GEOMETRY_BLOCK_LOCATION, 1, block, NULL); + FREE(block); + return result; +} + +/**********************************************************************/ +int writeVolumeGeometry(PhysicalLayer *layer, VolumeGeometry *geometry) +{ + char *block; + int result = layer->allocateIOBuffer(layer, VDO_BLOCK_SIZE, "geometry block", + &block); + if (result != VDO_SUCCESS) { + return result; + } + + Buffer *buffer; + result = wrapBuffer((byte *) block, VDO_BLOCK_SIZE, 0, &buffer); + if (result != VDO_SUCCESS) { + FREE(block); + return result; + } + + result = encodeGeometryBlock(geometry, buffer); + if (result != VDO_SUCCESS) { + freeBuffer(&buffer); + FREE(block); + return result; + } + + // Checksum everything encoded so far and then encode the checksum. + CRC32Checksum checksum = layer->updateCRC32(INITIAL_CHECKSUM, (byte *) block, + contentLength(buffer)); + result = putUInt32LEIntoBuffer(buffer, checksum); + if (result != VDO_SUCCESS) { + freeBuffer(&buffer); + FREE(block); + return result; + } + + // Write it. 
+ result = layer->writer(layer, GEOMETRY_BLOCK_LOCATION, 1, block, NULL); + freeBuffer(&buffer); + FREE(block); + return result; +} + +/************************************************************************/ +int indexConfigToUdsConfiguration(IndexConfig *indexConfig, + UdsConfiguration *udsConfigPtr) +{ + UdsConfiguration udsConfiguration; + int result = udsInitializeConfiguration(&udsConfiguration, indexConfig->mem); + if (result != UDS_SUCCESS) { + return logErrorWithStringError(result, "error initializing configuration"); + } + + udsConfigurationSetSparse(udsConfiguration, indexConfig->sparse); + + *udsConfigPtr = udsConfiguration; + return VDO_SUCCESS; +} + +/************************************************************************/ +void indexConfigToUdsParameters(IndexConfig *indexConfig, + struct uds_parameters *userParams) +{ + userParams->checkpoint_frequency = indexConfig->checkpointFrequency; +} diff --git a/source/vdo/base/volumeGeometry.h b/source/vdo/base/volumeGeometry.h new file mode 100644 index 0000000..c06cdde --- /dev/null +++ b/source/vdo/base/volumeGeometry.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/volumeGeometry.h#5 $ + */ + +#ifndef VOLUME_GEOMETRY_H +#define VOLUME_GEOMETRY_H + +#include "uds.h" + +#include "types.h" + +struct indexConfig { + uint32_t mem; + uint32_t checkpointFrequency; + bool sparse; +} __attribute__((packed)); + +typedef enum { + INDEX_REGION = 0, + DATA_REGION = 1, + VOLUME_REGION_COUNT, +} VolumeRegionID; + +typedef struct { + /** The ID of the region */ + VolumeRegionID id; + /** + * The absolute starting offset on the device. The region continues until + * the next region begins. + */ + PhysicalBlockNumber startBlock; +} __attribute__((packed)) VolumeRegion; + +/** A binary UUID is 16 bytes. */ +typedef unsigned char UUID[16]; + +typedef struct { + /** The release version number of this volume */ + ReleaseVersionNumber releaseVersion; + /** The nonce of this volume */ + Nonce nonce; + /** The UUID of this volume */ + UUID uuid; + /** The regions in ID order */ + VolumeRegion regions[VOLUME_REGION_COUNT]; + /** The index config */ + IndexConfig indexConfig; +} __attribute__((packed)) VolumeGeometry; + +/** + * Get the start of the index region from a geometry. + * + * @param geometry The geometry + * + * @return The start of the index region + **/ +__attribute__((warn_unused_result)) +static inline PhysicalBlockNumber getIndexRegionOffset(VolumeGeometry geometry) +{ + return geometry.regions[INDEX_REGION].startBlock; +} + +/** + * Get the start of the data region from a geometry. 
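+ * As laid out by initializeVolumeGeometry(), the data region starts
+ * immediately after the index region, so this offset is block 1 plus the
+ * size of the index.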
+ * + * @param geometry The geometry + * + * @return The start of the data region + **/ +__attribute__((warn_unused_result)) +static inline PhysicalBlockNumber getDataRegionOffset(VolumeGeometry geometry) +{ + return geometry.regions[DATA_REGION].startBlock; +} + +/** + * Get the size of the index region from a geometry. + * + * @param geometry The geometry + * + * @return the size of the index region + **/ +__attribute__((warn_unused_result)) +static inline PhysicalBlockNumber getIndexRegionSize(VolumeGeometry geometry) +{ + return getDataRegionOffset(geometry) - getIndexRegionOffset(geometry); +} + +/** + * Read the volume geometry from a layer. + * + * @param layer The layer to read and parse the geometry from + * @param geometry The geometry to be loaded + **/ +int loadVolumeGeometry(PhysicalLayer *layer, VolumeGeometry *geometry) +__attribute__((warn_unused_result)); + +/** + * Initialize a VolumeGeometry for a VDO. + * + * @param nonce The nonce for the VDO + * @param uuid The uuid for the VDO + * @param indexConfig The index config of the VDO. + * @param geometry The geometry being initialized + * + * @return VDO_SUCCESS or an error + **/ +int initializeVolumeGeometry(Nonce nonce, + UUID uuid, + IndexConfig *indexConfig, + VolumeGeometry *geometry) + __attribute__((warn_unused_result)); + +/** + * Zero out the geometry on a layer. + * + * @param layer The layer to clear + * + * @return VDO_SUCCESS or an error + **/ +int clearVolumeGeometry(PhysicalLayer *layer) + __attribute__((warn_unused_result)); + +/** + * Write a geometry block for a VDO. + * + * @param layer The layer on which to write. + * @param geometry The VolumeGeometry to be written + * + * @return VDO_SUCCESS or an error + **/ +int writeVolumeGeometry(PhysicalLayer *layer, VolumeGeometry *geometry) +__attribute__((warn_unused_result)); + +/** + * Convert a index config to a UDS configuration, which can be used by UDS. + * + * @param [in] indexConfig The index config to convert + * @param [out] udsConfigPtr A pointer to return the UDS configuration + * + * @return VDO_SUCCESS or an error + **/ +int indexConfigToUdsConfiguration(IndexConfig *indexConfig, + UdsConfiguration *udsConfigPtr) +__attribute__((warn_unused_result)); + +/** + * Modify the uds_parameters to match the requested index config. + * + * @param indexConfig The index config to convert + * @param userParams The uds_parameters to modify + **/ +void indexConfigToUdsParameters(IndexConfig *indexConfig, + struct uds_parameters *userParams); + +/** + * Compute the index size in blocks from the IndexConfig. + * + * @param [in] indexConfig The index config + * @param [out] indexBlocksPtr A pointer to return the index size in blocks + * + * @return VDO_SUCCESS or an error + **/ +int computeIndexBlocks(IndexConfig *indexConfig, BlockCount *indexBlocksPtr) +__attribute__((warn_unused_result)); + +/** + * Set load config fields from a volume geometry. + * + * @param [in] geometry The geometry to use + * @param [out] loadConfig The load config to set + **/ +static inline void setLoadConfigFromGeometry(VolumeGeometry *geometry, + VDOLoadConfig *loadConfig) +{ + loadConfig->firstBlockOffset = getDataRegionOffset(*geometry); + loadConfig->releaseVersion = geometry->releaseVersion; + loadConfig->nonce = geometry->nonce; +} + +#endif // VOLUME_GEOMETRY_H diff --git a/source/vdo/base/waitQueue.c b/source/vdo/base/waitQueue.c new file mode 100644 index 0000000..3d7f175 --- /dev/null +++ b/source/vdo/base/waitQueue.c @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/waitQueue.c#1 $ + */ + +#include "waitQueue.h" + +#include "permassert.h" + +#include "statusCodes.h" + +/**********************************************************************/ +int enqueueWaiter(WaitQueue *queue, Waiter *waiter) +{ + int result = ASSERT((waiter->nextWaiter == NULL), + "new waiter must not already be in a waiter queue"); + if (result != VDO_SUCCESS) { + return result; + } + + if (queue->lastWaiter == NULL) { + // The queue is empty, so form the initial circular list by self-linking + // the initial waiter. + waiter->nextWaiter = waiter; + } else { + // Splice the new waiter in at the end of the queue. + waiter->nextWaiter = queue->lastWaiter->nextWaiter; + queue->lastWaiter->nextWaiter = waiter; + } + // In both cases, the waiter we added to the ring becomes the last waiter. + queue->lastWaiter = waiter; + queue->queueLength += 1; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void transferAllWaiters(WaitQueue *fromQueue, WaitQueue *toQueue) +{ + // If the source queue is empty, there's nothing to do. + if (!hasWaiters(fromQueue)) { + return; + } + + if (hasWaiters(toQueue)) { + // Both queues are non-empty. Splice the two circular lists together by + // swapping the next (head) pointers in the list tails. + Waiter *fromHead = fromQueue->lastWaiter->nextWaiter; + Waiter *toHead = toQueue->lastWaiter->nextWaiter; + toQueue->lastWaiter->nextWaiter = fromHead; + fromQueue->lastWaiter->nextWaiter = toHead; + } + + toQueue->lastWaiter = fromQueue->lastWaiter; + toQueue->queueLength += fromQueue->queueLength; + initializeWaitQueue(fromQueue); +} + +/**********************************************************************/ +void notifyAllWaiters(WaitQueue *queue, + WaiterCallback *callback, + void *context) +{ + // Copy and empty the queue first, avoiding the possibility of an infinite + // loop if entries are returned to the queue by the callback function. + WaitQueue waiters; + initializeWaitQueue(&waiters); + transferAllWaiters(queue, &waiters); + + // Drain the copied queue, invoking the callback on every entry. + while (notifyNextWaiter(&waiters, callback, context)) { + // All the work is done by the loop condition. + } +} + +/**********************************************************************/ +Waiter *getFirstWaiter(const WaitQueue *queue) +{ + Waiter *lastWaiter = queue->lastWaiter; + if (lastWaiter == NULL) { + // There are no waiters, so we're done. + return NULL; + } + + // The queue is circular, so the last entry links to the head of the queue. 
+ return lastWaiter->nextWaiter; +} + +/**********************************************************************/ +int dequeueMatchingWaiters(WaitQueue *queue, + WaiterMatch *matchMethod, + void *matchContext, + WaitQueue *matchedQueue) +{ + WaitQueue matchedWaiters; + initializeWaitQueue(&matchedWaiters); + + WaitQueue iterationQueue; + initializeWaitQueue(&iterationQueue); + transferAllWaiters(queue, &iterationQueue); + while (hasWaiters(&iterationQueue)) { + Waiter *waiter = dequeueNextWaiter(&iterationQueue); + int result = VDO_SUCCESS; + if (!matchMethod(waiter, matchContext)) { + result = enqueueWaiter(queue, waiter); + } else { + result = enqueueWaiter(&matchedWaiters, waiter); + } + if (result != VDO_SUCCESS) { + transferAllWaiters(&matchedWaiters, queue); + transferAllWaiters(&iterationQueue, queue); + return result; + } + } + + transferAllWaiters(&matchedWaiters, matchedQueue); + return VDO_SUCCESS; +} + +/**********************************************************************/ +Waiter *dequeueNextWaiter(WaitQueue *queue) +{ + Waiter *firstWaiter = getFirstWaiter(queue); + if (firstWaiter == NULL) { + return NULL; + } + + Waiter *lastWaiter = queue->lastWaiter; + if (firstWaiter == lastWaiter) { + // The queue has a single entry, so just empty it out by nulling the tail. + queue->lastWaiter = NULL; + } else { + // The queue has more than one entry, so splice the first waiter out of + // the circular queue. + lastWaiter->nextWaiter = firstWaiter->nextWaiter; + } + + // The waiter is no longer in a wait queue. + firstWaiter->nextWaiter = NULL; + queue->queueLength -= 1; + return firstWaiter; +} + +/**********************************************************************/ +bool notifyNextWaiter(WaitQueue *queue, + WaiterCallback *callback, + void *context) +{ + Waiter *waiter = dequeueNextWaiter(queue); + if (waiter == NULL) { + return false; + } + + if (callback == NULL) { + callback = waiter->callback; + } + (*callback)(waiter, context); + return true; +} + +/**********************************************************************/ +const Waiter *getNextWaiter(const WaitQueue *queue, const Waiter *waiter) +{ + Waiter *firstWaiter = getFirstWaiter(queue); + if (waiter == NULL) { + return firstWaiter; + } + return ((waiter->nextWaiter != firstWaiter) ? waiter->nextWaiter : NULL); +} diff --git a/source/vdo/base/waitQueue.h b/source/vdo/base/waitQueue.h new file mode 100644 index 0000000..5eb754e --- /dev/null +++ b/source/vdo/base/waitQueue.h @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/waitQueue.h#1 $ + */ + +#ifndef WAIT_QUEUE_H +#define WAIT_QUEUE_H + +#include "common.h" + +/** + * A wait queue is a circular list of entries waiting to be notified of a + * change in a condition. 
Keeping a circular list allows the queue structure + * to simply be a pointer to the tail (newest) entry in the queue, supporting + * constant-time enqueue and dequeue operations. A null pointer is an empty + * queue. + * + * An empty queue: + * queue0.lastWaiter -> NULL + * + * A singleton queue: + * queue1.lastWaiter -> entry1 -> entry1 -> [...] + * + * A three-element queue: + * queue2.lastWaiter -> entry3 -> entry1 -> entry2 -> entry3 -> [...] + **/ + +typedef struct waiter Waiter; + +typedef struct { + /** The tail of the queue, the last (most recently added) entry */ + Waiter *lastWaiter; + /** The number of waiters currently in the queue */ + size_t queueLength; +} WaitQueue; + +/** + * Callback type for functions which will be called to resume processing of a + * waiter after it has been removed from its wait queue. + **/ +typedef void WaiterCallback(Waiter *waiter, void *context); + +/** + * Method type for Waiter matching methods. + * + * A WaiterMatch method returns false if the waiter does not match. + **/ +typedef bool WaiterMatch(Waiter *waiter, void *context); + +/** + * The queue entry structure for entries in a WaitQueue. + **/ +struct waiter { + /** + * The next waiter in the queue. If this entry is the last waiter, then this + * is actually a pointer back to the head of the queue. + **/ + struct waiter *nextWaiter; + + /** Optional waiter-specific callback to invoke when waking this waiter. */ + WaiterCallback *callback; +}; + +/** + * Check whether a Waiter is waiting. + * + * @param waiter The waiter to check + * + * @return true if the waiter is on some WaitQueue + **/ +static inline bool isWaiting(Waiter *waiter) +{ + return (waiter->nextWaiter != NULL); +} + +/** + * Initialize a wait queue. + * + * @param queue The queue to initialize + **/ +static inline void initializeWaitQueue(WaitQueue *queue) +{ + *queue = (WaitQueue) { + .lastWaiter = NULL, + .queueLength = 0, + }; +} + +/** + * Check whether a wait queue has any entries waiting in it. + * + * @param queue The queue to query + * + * @return true if there are any waiters in the queue + **/ +__attribute__((warn_unused_result)) +static inline bool hasWaiters(const WaitQueue *queue) +{ + return (queue->lastWaiter != NULL); +} + +/** + * Add a waiter to the tail end of a wait queue. The waiter must not already + * be waiting in a queue. + * + * @param queue The queue to which to add the waiter + * @param waiter The waiter to add to the queue + * + * @return VDO_SUCCESS or an error code + **/ +int enqueueWaiter(WaitQueue *queue, Waiter *waiter) + __attribute__((warn_unused_result)); + +/** + * Notify all the entries waiting in a queue to continue execution by invoking + * a callback function on each of them in turn. The queue is copied and + * emptied before invoking any callbacks, and only the waiters that were in + * the queue at the start of the call will be notified. + * + * @param queue The wait queue containing the waiters to notify + * @param callback The function to call to notify each waiter, or NULL + * to invoke the callback field registered in each waiter + * @param context The context to pass to the callback function + **/ +void notifyAllWaiters(WaitQueue *queue, + WaiterCallback *callback, + void *context); + +/** + * Notify the next entry waiting in a queue to continue execution by invoking + * a callback function on it after removing it from the queue. 
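+ * A minimal usage sketch (hypothetical caller; error handling abbreviated):
+ *
+ *   WaitQueue queue;
+ *   initializeWaitQueue(&queue);
+ *   waiter->callback = resumeWaiter;           // hypothetical WaiterCallback
+ *   if (enqueueWaiter(&queue, waiter) != VDO_SUCCESS) {
+ *     // handle the error
+ *   }
+ *   // Later, when the condition changes, wake the oldest waiter using the
+ *   // callback registered in the waiter itself:
+ *   notifyNextWaiter(&queue, NULL, context);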
+ * + * @param queue The wait queue containing the waiter to notify + * @param callback The function to call to notify the waiter, or NULL + * to invoke the callback field registered in the waiter + * @param context The context to pass to the callback function + * + * @return true if there was a waiter in the queue + **/ +bool notifyNextWaiter(WaitQueue *queue, + WaiterCallback *callback, + void *context); + +/** + * Transfer all waiters from one wait queue to a second queue, emptying the + * first queue. + * + * @param fromQueue The queue containing the waiters to move + * @param toQueue The queue that will receive the waiters from the + * the first queue + **/ +void transferAllWaiters(WaitQueue *fromQueue, WaitQueue *toQueue); + +/** + * Return the waiter that is at the head end of a wait queue. + * + * @param queue The queue from which to get the first waiter + * + * @return The first (oldest) waiter in the queue, or NULL if + * the queue is empty + **/ +Waiter *getFirstWaiter(const WaitQueue *queue); + +/** + * Remove all waiters that match based on the specified matching method and + * append them to a WaitQueue. + * + * @param queue The wait queue to process + * @param matchMethod The method to determine matching + * @param matchContext Contextual info for the match method + * @param matchedQueue A WaitQueue to store matches + * + * @return VDO_SUCCESS or an error code + **/ +int dequeueMatchingWaiters(WaitQueue *queue, + WaiterMatch *matchMethod, + void *matchContext, + WaitQueue *matchedQueue); + +/** + * Remove the first waiter from the head end of a wait queue. The caller will + * be responsible for waking the waiter by invoking the correct callback + * function to resume its execution. + * + * @param queue The wait queue from which to remove the first entry + * + * @return The first (oldest) waiter in the queue, or NULL if + * the queue is empty + **/ +Waiter *dequeueNextWaiter(WaitQueue *queue); + +/** + * Count the number of waiters in a wait queue. + * + * @param queue The wait queue to query + * + * @return the number of waiters in the queue + **/ +__attribute__((warn_unused_result)) +static inline size_t countWaiters(const WaitQueue *queue) +{ + return queue->queueLength; +} + +/** + * Get the waiter after this one, for debug iteration. + * + * @param queue The wait queue + * @param waiter A waiter + * + * @return the next waiter, or NULL + **/ +const Waiter *getNextWaiter(const WaitQueue *queue, const Waiter *waiter) + __attribute__((warn_unused_result)); + +#endif // WAIT_QUEUE_H diff --git a/source/vdo/kernel/batchProcessor.c b/source/vdo/kernel/batchProcessor.c new file mode 100644 index 0000000..5845960 --- /dev/null +++ b/source/vdo/kernel/batchProcessor.c @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/batchProcessor.c#2 $ + */ + +#include "batchProcessor.h" + +#include "memoryAlloc.h" + +#include "constants.h" + +#include "kernelLayer.h" + +/* + * On memory ordering: + * + * The producer thread does: enqueue item on queue (xchg, which is + * implicitly interlocked, then a store), memory barrier, then atomic + * cmpxchg of the state field. The x86 architecture spec says the + * xchg, store, lock-cmpxchg sequence cannot be reordered, but on + * architectures using load-linked and store-conditional for the + * cmpxchg, like AArch64, the LL can be reordered with the store, so + * we add a barrier. + * + * The consumer thread, when it is running out of work, does: read + * queue (find empty), set state, mfence, read queue again just to be + * sure. The set-state and read-queue cannot be reordered with respect + * to the mfence (but without the mfence, the read could be moved + * before the set). + * + * The xchg and mfence impose a total order across processors, and + * each processor sees the stores done by the other processor in the + * required order. If the xchg happens before the mfence, the + * consumer's "read queue again" operation will see the update. If the + * mfence happens first, the producer's "cmpxchg state" will see its + * updated value. + * + * These are the semantics implemented by memory set to WB (write-back + * caching) mode on x86-64. So, the simple analysis is that no wakeups + * should be missed. + * + * It's a little subtler with funnel queues, since one interrupted or + * delayed enqueue operation (see the commentary in funnelQueuePut) + * can cause another, concurrent enqueue operation to complete without + * actually making the entry visible to the consumer. In essence, one + * update makes no new work items visible to the consumer, and the + * other (when it eventually completes) makes two (or more) work items + * visible, and each one ensures that the consumer will process what + * it has made visible. + */ + +typedef enum batchProcessorState { + BATCH_PROCESSOR_IDLE, + BATCH_PROCESSOR_ENQUEUED, +} BatchProcessorState; + +struct batchProcessor { + spinlock_t consumerLock; + FunnelQueue *queue; + KvdoWorkItem workItem; + atomic_t state; + BatchProcessorCallback callback; + void *closure; + KernelLayer *layer; +}; + +static void scheduleBatchProcessing(BatchProcessor *batch); + +/** + * Apply the batch processing function to the accumulated set of + * objects. + * + * Runs in a "CPU queue". + * + * @param [in] item The work item embedded in the BatchProcessor + **/ +static void batchProcessorWork(KvdoWorkItem *item) +{ + BatchProcessor *batch = container_of(item, BatchProcessor, workItem); + spin_lock(&batch->consumerLock); + while (!isFunnelQueueEmpty(batch->queue)) { + batch->callback(batch, batch->closure); + } + atomic_set(&batch->state, BATCH_PROCESSOR_IDLE); + memoryFence(); + bool needReschedule = !isFunnelQueueEmpty(batch->queue); + spin_unlock(&batch->consumerLock); + if (needReschedule) { + scheduleBatchProcessing(batch); + } +} + +/** + * Ensure that the batch-processing function is scheduled to run. + * + * If we're the thread that switches the BatchProcessor state from + * idle to enqueued, we're the thread responsible for actually + * enqueueing it. If some other thread got there first, or it was + * already enqueued, it's not our problem. 
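+ * In short: a producer calls funnelQueuePut() and then tries to move the
+ * state from BATCH_PROCESSOR_IDLE to BATCH_PROCESSOR_ENQUEUED with a
+ * cmpxchg; only the thread that wins that transition enqueues the work
+ * item. The consumer (batchProcessorWork) sets the state back to idle,
+ * issues a fence, and re-checks the queue, rescheduling itself if more
+ * work arrived in that window.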
+ * + * @param [in] batch The BatchProcessor control data + **/ +static void scheduleBatchProcessing(BatchProcessor *batch) +{ + /* + * We want this to be very fast in the common cases. + * + * In testing on our "mgh" class machines (HP ProLiant DL380p Gen8, + * Intel Xeon E5-2690, 2.9GHz), it appears that under some + * conditions it's a little faster to use a memory fence and then + * read the "state" field, skipping the cmpxchg if the state is + * already set to BATCH_PROCESSOR_ENQUEUED. (Sometimes slightly + * faster still if we prefetch the state field first.) Note that the + * read requires the fence, otherwise it could be executed before + * the preceding store by the FunnelQueue code to the "next" + * pointer, which can, very rarely, result in failing to issue a + * wakeup when needed. + * + * However, the gain is small, and in testing on our older "harvard" + * class machines (Intel Xeon X5680, 3.33GHz) it was a clear win to + * skip all of that and go right for the cmpxchg. + * + * Of course, the tradeoffs may be sensitive to the particular work + * going on, cache pressure, etc. + */ + smp_mb(); + BatchProcessorState oldState + = atomic_cmpxchg(&batch->state, BATCH_PROCESSOR_IDLE, + BATCH_PROCESSOR_ENQUEUED); + bool doSchedule = (oldState == BATCH_PROCESSOR_IDLE); + if (doSchedule) { + enqueueCPUWorkQueue(batch->layer, &batch->workItem); + } +} + +/**********************************************************************/ +int makeBatchProcessor(KernelLayer *layer, + BatchProcessorCallback callback, + void *closure, + BatchProcessor **batchPtr) +{ + BatchProcessor *batch; + + int result = ALLOCATE(1, BatchProcessor, "batchProcessor", &batch); + if (result != UDS_SUCCESS) { + return result; + } + result = makeFunnelQueue(&batch->queue); + if (result != UDS_SUCCESS) { + FREE(batch); + return result; + } + + spin_lock_init(&batch->consumerLock); + setupWorkItem(&batch->workItem, batchProcessorWork, + (KvdoWorkFunction) callback, CPU_Q_ACTION_COMPLETE_KVIO); + atomic_set(&batch->state, BATCH_PROCESSOR_IDLE); + batch->callback = callback; + batch->closure = closure; + batch->layer = layer; + + *batchPtr = batch; + return UDS_SUCCESS; +} + +/**********************************************************************/ +void addToBatchProcessor(BatchProcessor *batch, KvdoWorkItem *item) +{ + funnelQueuePut(batch->queue, &item->workQueueEntryLink); + scheduleBatchProcessing(batch); +} + +/**********************************************************************/ +KvdoWorkItem *nextBatchItem(BatchProcessor *batch) +{ + FunnelQueueEntry *fqEntry = funnelQueuePoll(batch->queue); + if (fqEntry == NULL) { + return NULL; + } + + return container_of(fqEntry, KvdoWorkItem, workQueueEntryLink); +} + +/**********************************************************************/ +void condReschedBatchProcessor(BatchProcessor *batch) +{ + cond_resched_lock(&batch->consumerLock); +} + +/**********************************************************************/ +void freeBatchProcessor(BatchProcessor **batchPtr) +{ + BatchProcessor *batch = *batchPtr; + if (batch) { + memoryFence(); + BUG_ON(atomic_read(&batch->state) == BATCH_PROCESSOR_ENQUEUED); + freeFunnelQueue(batch->queue); + FREE(batch); + *batchPtr = NULL; + } +} diff --git a/source/vdo/kernel/batchProcessor.h b/source/vdo/kernel/batchProcessor.h new file mode 100644 index 0000000..5e348c6 --- /dev/null +++ b/source/vdo/kernel/batchProcessor.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/batchProcessor.h#2 $ + */ + +#ifndef BATCHPROCESSOR_H +#define BATCHPROCESSOR_H + +#include "kernelTypes.h" +#include "util/funnelQueue.h" + +/** + * Control data for managing collections of objects to be operated on + * by a specified function. May be used when the work function is + * lightweight enough or cache-contentious enough that it makes sense + * to try to accumulate multiple objects and operate on them all at + * once in one thread. + * + * The work function is run in one of the kernel layer's "CPU queues", + * and care is taken to ensure that only one invocation can be running + * or scheduled at any given time. It can loop calling nextBatchItem + * repeatedly until there are no more objects to operate on. It should + * also call condReschedBatchProcessor now and then, to play nicely + * with the OS scheduler. + * + * Objects to operate on are manipulated through a FunnelQueueEntry + * object which must be contained within them. + **/ +typedef struct batchProcessor BatchProcessor; + +typedef void (*BatchProcessorCallback)(BatchProcessor *batch, void *closure); + +/** + * Creates a batch-processor control structure. + * + * @param [in] layer The kernel layer data, used to enqueue work items + * @param [in] callback A function to process the accumulated objects + * @param [in] closure A private data pointer for use by the callback + * @param [out] batchPtr Where to store the pointer to the new object + * + * @return UDS_SUCCESS or an error code + **/ +int makeBatchProcessor(KernelLayer *layer, + BatchProcessorCallback callback, + void *closure, + BatchProcessor **batchPtr); + +/** + * Adds an object to the processing queue. + * + *
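+ * A rough usage sketch (hypothetical names; assumes each object embeds a
+ * KvdoWorkItem):
+ *
+ *   static void processBatch(BatchProcessor *batch, void *closure)
+ *   {
+ *     KvdoWorkItem *item;
+ *     while ((item = nextBatchItem(batch)) != NULL) {
+ *       // ... operate on the object containing item ...
+ *       condReschedBatchProcessor(batch);
+ *     }
+ *   }
+ *
+ *   // The processor is created once with makeBatchProcessor(layer,
+ *   // processBatch, closure, &batch); producers then hand objects in with:
+ *   addToBatchProcessor(batch, &object->workItem);
+ *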
If the callback function is not currently running or scheduled to be run, + * it gets queued up to run. + * + * @param [in] batch The batch-processor data + * @param [in] item The handle on the new object to add + **/ +void addToBatchProcessor(BatchProcessor *batch, KvdoWorkItem *item); + +/** + * Fetches the next object in the processing queue. + * + * @param [in] batch The batch-processor data + * + * @return An object pointer or NULL + **/ +KvdoWorkItem *nextBatchItem(BatchProcessor *batch) + __attribute__((warn_unused_result)); + +/** + * Free the batch-processor data and null out the pointer. + * + * @param [in,out] batchPtr Where the BatchProcessor pointer is stored + **/ +void freeBatchProcessor(BatchProcessor **batchPtr); + +/** + * Yield control to the scheduler if the kernel has indicated that + * other work needs to run on the current processor. + * + * The data structure is needed so that the spin lock can be + * (conditionally) released and re-acquired. + * + * @param [in] batch The batch-processor data + **/ +void condReschedBatchProcessor(BatchProcessor *batch); + +#endif // BATCHPROCESSOR_H diff --git a/source/vdo/kernel/bio.c b/source/vdo/kernel/bio.c new file mode 100644 index 0000000..a8e3a5e --- /dev/null +++ b/source/vdo/kernel/bio.c @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/bio.c#8 $ + */ + +#include "bio.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "numeric.h" + +#include "flush.h" +#include "recoveryJournal.h" + +#include "bioIterator.h" +#include "ioSubmitter.h" + +/** + * Gets the raw buffer from a biovec. 
+ * + * @param biovec The biovec in question + * + * @return the buffer + **/ +static char *getBufferForBiovec(struct bio_vec *biovec) +{ + return (page_address(biovec->bv_page) + biovec->bv_offset); +} + +/**********************************************************************/ +void bioCopyDataIn(BIO *bio, char *dataPtr) +{ + struct bio_vec *biovec; + for (BioIterator iter = createBioIterator(bio); + (biovec = getNextBiovec(&iter)) != NULL; + advanceBioIterator(&iter)) { + memcpy(dataPtr, getBufferForBiovec(biovec), biovec->bv_len); + dataPtr += biovec->bv_len; + } +} + +/**********************************************************************/ +void bioCopyDataOut(BIO *bio, char *dataPtr) +{ + struct bio_vec *biovec; + for (BioIterator iter = createBioIterator(bio); + (biovec = getNextBiovec(&iter)) != NULL; + advanceBioIterator(&iter)) { + memcpy(getBufferForBiovec(biovec), dataPtr, biovec->bv_len); + flush_dcache_page(biovec->bv_page); + dataPtr += biovec->bv_len; + } +} + +/**********************************************************************/ +void setBioOperation(BIO *bio, unsigned int operation) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + bio->bi_opf &= ~REQ_OP_MASK; + bio->bi_opf |= operation; +#else + + unsigned int OPERATION_MASK = WRITE | REQ_DISCARD | REQ_FLUSH; + + // Clear the relevant bits + bio->bi_rw &= ~OPERATION_MASK; + // Set the operation we care about + bio->bi_rw |= operation; +#endif +} + +/**********************************************************************/ +void freeBio(BIO *bio, KernelLayer *layer) +{ + bio_put(bio); +} + +/**********************************************************************/ +void countBios(AtomicBioStats *bioStats, BIO *bio) +{ + if (isWriteBio(bio)) { + atomic64_inc(&bioStats->write); + } else { + atomic64_inc(&bioStats->read); + } + if (isDiscardBio(bio)) { + atomic64_inc(&bioStats->discard); + } + if (isFlushBio(bio)) { + atomic64_inc(&bioStats->flush); + } + if (isFUABio(bio)) { + atomic64_inc(&bioStats->fua); + } +} + +/** + * The function determines whether a buffer contains all zeroes. + * + * @param buffer The buffer to check + * @param length The length of the buffer + * + * @return true is all zeroes, false otherwise + **/ +static inline bool isAllZeros(const char *buffer, unsigned int length) +{ + /* + * Handle expected common case of even the first word being nonzero, + * without getting into the more expensive (for one iteration) loop + * below. + */ + if (likely(length >= sizeof(uint64_t))) { + if (GET_UNALIGNED(uint64_t, buffer) != 0) { + return false; + } + + unsigned int wordCount = length / sizeof(uint64_t); + + // Unroll to process 64 bytes at a time + unsigned int chunkCount = wordCount / 8; + while (chunkCount-- > 0) { + uint64_t word0 = GET_UNALIGNED(uint64_t, buffer); + uint64_t word1 = GET_UNALIGNED(uint64_t, buffer + 1 * sizeof(uint64_t)); + uint64_t word2 = GET_UNALIGNED(uint64_t, buffer + 2 * sizeof(uint64_t)); + uint64_t word3 = GET_UNALIGNED(uint64_t, buffer + 3 * sizeof(uint64_t)); + uint64_t word4 = GET_UNALIGNED(uint64_t, buffer + 4 * sizeof(uint64_t)); + uint64_t word5 = GET_UNALIGNED(uint64_t, buffer + 5 * sizeof(uint64_t)); + uint64_t word6 = GET_UNALIGNED(uint64_t, buffer + 6 * sizeof(uint64_t)); + uint64_t word7 = GET_UNALIGNED(uint64_t, buffer + 7 * sizeof(uint64_t)); + uint64_t or = (word0 | word1 | word2 | word3 + | word4 | word5 | word6 | word7); + // Prevent compiler from using 8*(cmp;jne). 
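+ // The empty asm statement takes 'or' as an input, forcing the compiler to
+ // materialize the single combined value so the test below becomes one
+ // compare rather than eight separate compare-and-branch sequences.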
+ __asm__ __volatile__ ("" : : "g" (or)); + if (or != 0) { + return false; + } + buffer += 8 * sizeof(uint64_t); + } + wordCount %= 8; + + // Unroll to process 8 bytes at a time. + // (Is this still worthwhile?) + while (wordCount-- > 0) { + if (GET_UNALIGNED(uint64_t, buffer) != 0) { + return false; + } + buffer += sizeof(uint64_t); + } + length %= sizeof(uint64_t); + // Fall through to finish up anything left over. + } + + while (length-- > 0) { + if (*buffer++ != 0) { + return false; + } + } + return true; +} + +/**********************************************************************/ +bool bioIsZeroData(BIO *bio) +{ + struct bio_vec *biovec; + for (BioIterator iter = createBioIterator(bio); + (biovec = getNextBiovec(&iter)) != NULL; + advanceBioIterator(&iter)) { + if (!isAllZeros(getBufferForBiovec(biovec), biovec->bv_len)) { + return false; + } + } + return true; +} + +/**********************************************************************/ +void bioZeroData(BIO *bio) +{ + zero_fill_bio(bio); +} + +/**********************************************************************/ +static void setBioSize(BIO *bio, BlockSize bioSize) +{ +#ifdef USE_BI_ITER + bio->bi_iter.bi_size = bioSize; +#else + bio->bi_size = bioSize; +#endif +} + +/** + * Initialize a bio. + * + * @param bio The bio to initialize + * @param layer The layer to which it belongs. + **/ +static void initializeBio(BIO *bio, KernelLayer *layer) +{ + // Save off important info so it can be set back later + unsigned short vcnt = bio->bi_vcnt; + void *pvt = bio->bi_private; + bio_reset(bio); // Memsets large portion of bio. Reset all needed fields. + bio->bi_private = pvt; + bio->bi_vcnt = vcnt; + bio->bi_end_io = completeAsyncBio; + setBioSector(bio, (sector_t) -1); // Sector will be set later on. + setBioBlockDevice(bio, getKernelLayerBdev(layer)); +} + +/**********************************************************************/ +void resetBio(BIO *bio, KernelLayer *layer) +{ + initializeBio(bio, layer); + setBioSize(bio, VDO_BLOCK_SIZE); +} + +/**********************************************************************/ +int allocateBio(KernelLayer *layer, unsigned int bvecCount, BIO **bioPtr) +{ + BIO *bio = bio_alloc_bioset(GFP_NOIO, bvecCount, layer->bioset); + if (IS_ERR(bio)) { + logError("bio allocation failure %ld", PTR_ERR(bio)); + return PTR_ERR(bio); + } + + initializeBio(bio, layer); + + *bioPtr = bio; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int createBio(KernelLayer *layer, char *data, BIO **bioPtr) +{ + BIO *bio = NULL; + if (data == NULL) { + int result = allocateBio(layer, 0, &bio); + if (result != VDO_SUCCESS) { + return result; + } + + *bioPtr = bio; + return VDO_SUCCESS; + } + + unsigned int len = VDO_BLOCK_SIZE; + unsigned long kaddr = (unsigned long) data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + const int bvecCount = end - start; + + int result = allocateBio(layer, bvecCount, &bio); + if (result != VDO_SUCCESS) { + return result; + } + + int offset = offset_in_page(kaddr); + for (unsigned int i = 0; (i < bvecCount) && (len > 0); i++) { + unsigned int bytes = PAGE_SIZE - offset; + if (bytes > len) { + bytes = len; + } + + struct page *page + = is_vmalloc_addr(data) ? 
vmalloc_to_page(data) : virt_to_page(data); + int bytesAdded = bio_add_page(bio, page, bytes, offset); + if (bytesAdded != bytes) { + freeBio(bio, layer); + return logErrorWithStringError(VDO_BIO_CREATION_FAILED, + "Could only add %i bytes to bio", + bytesAdded); + + } + + data += bytes; + len -= bytes; + offset = 0; + } + + *bioPtr = bio; + return VDO_SUCCESS; +} + +/**********************************************************************/ +void prepareFlushBIO(BIO *bio, + void *context, + struct block_device *device, + bio_end_io_t *endIOCallback) +{ + clearBioOperationAndFlags(bio); + /* + * One would think we could use REQ_OP_FLUSH on new kernels, but some + * layers of the stack don't recognize that as a flush. So do it + * like blkdev_issue_flush() and make it a write+flush. + */ + setBioOperationWrite(bio); + setBioOperationFlagPreflush(bio); + bio->bi_end_io = endIOCallback; + bio->bi_private = context; + bio->bi_vcnt = 0; + setBioBlockDevice(bio, device); + setBioSize(bio, 0); + setBioSector(bio, 0); +} diff --git a/source/vdo/kernel/bio.h b/source/vdo/kernel/bio.h new file mode 100644 index 0000000..1ba8234 --- /dev/null +++ b/source/vdo/kernel/bio.h @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/bio.h#6 $ + */ + +#ifndef BIO_H +#define BIO_H + +#include +#include +#include + +#include "kernelTypes.h" + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) +#define USE_BI_ITER 1 +#endif + +/** + * Copy the bio data to a char array. + * + * @param bio The bio to copy the data from + * @param dataPtr The local array to copy the data to + **/ +void bioCopyDataIn(BIO *bio, char *dataPtr); + +/** + * Copy a char array to the bio data. + * + * @param bio The bio to copy the data to + * @param dataPtr The local array to copy the data from + **/ +void bioCopyDataOut(BIO *bio, char *dataPtr); + +/** + * Set the bi_rw or equivalent field of a bio to a particular data + * operation. Intended to be called only by setBioOperationRead() etc. 
+ * + * @param bio The bio to modify + * @param operation The operation to set it to + **/ +void setBioOperation(BIO *bio, unsigned int operation); + +/**********************************************************************/ +static inline void setBioOperationRead(BIO *bio) +{ + setBioOperation(bio, READ); +} + +/**********************************************************************/ +static inline void setBioOperationWrite(BIO *bio) +{ + setBioOperation(bio, WRITE); +} + +/**********************************************************************/ +static inline void clearBioOperationAndFlags(BIO *bio) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + bio->bi_opf = 0; +#else + bio->bi_rw = 0; +#endif +} + +/**********************************************************************/ +static inline void copyBioOperationAndFlags(BIO *to, BIO *from) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + to->bi_opf = from->bi_opf; +#else + to->bi_rw = from->bi_rw; +#endif +} + +/**********************************************************************/ +static inline void setBioOperationFlag(BIO *bio, unsigned int flag) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + bio->bi_opf |= flag; +#else + bio->bi_rw |= flag; +#endif +} + +/**********************************************************************/ +static inline void clearBioOperationFlag(BIO *bio, unsigned int flag) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + bio->bi_opf &= ~flag; +#else + bio->bi_rw &= ~flag; +#endif +} + +/**********************************************************************/ +static inline void setBioOperationFlagPreflush(BIO *bio) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + setBioOperationFlag(bio, REQ_PREFLUSH); +#else + // Preflushes and empty flushes are not currently distinguished. 
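+  // On these older kernels WRITE_FLUSH already bundles the write direction
+  // with the flush request bit, so the whole operation is replaced here
+  // rather than OR'ing a single flag into bi_rw.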
+ setBioOperation(bio, WRITE_FLUSH); +#endif +} + +/**********************************************************************/ +static inline void setBioOperationFlagSync(BIO *bio) +{ + setBioOperationFlag(bio, REQ_SYNC); +} + +/**********************************************************************/ +static inline void clearBioOperationFlagSync(BIO *bio) +{ + clearBioOperationFlag(bio, REQ_SYNC); +} + +/**********************************************************************/ +static inline void setBioOperationFlagFua(BIO *bio) +{ + setBioOperationFlag(bio, REQ_FUA); +} + +/**********************************************************************/ +static inline void clearBioOperationFlagFua(BIO *bio) +{ + clearBioOperationFlag(bio, REQ_FUA); +} + +/**********************************************************************/ +static inline bool isDiscardBio(BIO *bio) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + return (bio != NULL) && (bio_op(bio) == REQ_OP_DISCARD); +#else + return (bio != NULL) && ((bio->bi_rw & REQ_DISCARD) != 0); +#endif +} + +/**********************************************************************/ +static inline bool isFlushBio(BIO *bio) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + return (bio_op(bio) == REQ_OP_FLUSH) || ((bio->bi_opf & REQ_PREFLUSH) != 0); +#else + return (bio->bi_rw & REQ_FLUSH) != 0; +#endif +} + +/**********************************************************************/ +static inline bool isFUABio(BIO *bio) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + return (bio->bi_opf & REQ_FUA) != 0; +#else + return (bio->bi_rw & REQ_FUA) != 0; +#endif +} + +/**********************************************************************/ +static inline bool isReadBio(BIO *bio) +{ + return bio_data_dir(bio) == READ; +} + +/**********************************************************************/ +static inline bool isWriteBio(BIO *bio) +{ + return bio_data_dir(bio) == WRITE; +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) +/** + * Get the error from the bio. + * + * @param bio The bio + * + * @return the bio's error if any + **/ +static inline int getBioResult(BIO *bio) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,13,0) + return blk_status_to_errno(bio->bi_status); +#else + return bio->bi_error; +#endif +} +#endif // newer than 4.4 + +/** + * Set the block device for a bio. + * + * @param bio The bio to modify + * @param device The new block device for the bio + **/ +static inline void setBioBlockDevice(BIO *bio, struct block_device *device) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,14,0) + bio_set_dev(bio, device); +#else + bio->bi_bdev = device; +#endif +} + +/** + * Get a bio's size. + * + * @param bio The bio + * + * @return the bio's size + **/ +static inline unsigned int getBioSize(BIO *bio) +{ +#ifdef USE_BI_ITER + return bio->bi_iter.bi_size; +#else + return bio->bi_size; +#endif +} + +/** + * Set the bio's sector. + * + * @param bio The bio + * @param sector The sector + **/ +static inline void setBioSector(BIO *bio, sector_t sector) +{ +#ifdef USE_BI_ITER + bio->bi_iter.bi_sector = sector; +#else + bio->bi_sector = sector; +#endif +} + +/** + * Get the bio's sector. + * + * @param bio The bio + * + * @return the sector + **/ +static inline sector_t getBioSector(BIO *bio) +{ +#ifdef USE_BI_ITER + return bio->bi_iter.bi_sector; +#else + return bio->bi_sector; +#endif +} + +/** + * Tell the kernel we've completed processing of this bio. 
+ * + * @param bio The bio to complete + * @param error A system error code, or 0 for success + **/ +static inline void completeBio(BIO *bio, int error) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,13,0) + bio->bi_status = errno_to_blk_status(error); + bio_endio(bio); +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) + bio->bi_error = error; + bio_endio(bio); +#else + bio_endio(bio, error); +#endif +} + +/** + * Frees up a bio structure + * + * @param bio The bio to free + * @param layer The layer the bio was created in + **/ +void freeBio(BIO *bio, KernelLayer *layer); + +/** + * Count the statistics for the bios. This is used for calls into VDO and + * for calls out of VDO. + * + * @param bioStats Statistics structure to update + * @param bio The bio + **/ +void countBios(AtomicBioStats *bioStats, BIO *bio); + +/** + * Reset a bio so it can be used again. + * + * @param bio The bio to reset + * @param layer The physical layer + **/ +void resetBio(BIO *bio, KernelLayer *layer); + +/** + * Check to see whether a bio's data are all zeroes. + * + * @param bio The bio + * + * @return true if the bio's data are all zeroes + **/ +bool bioIsZeroData(BIO *bio); + +/** + * Set a bio's data to all zeroes. + * + * @param [in] bio The bio + **/ +void bioZeroData(BIO *bio); + +/** + * Create a new bio structure for kernel buffer storage. + * + * @param [in] layer The physical layer + * @param [in] data The buffer (can be NULL) + * @param [out] bioPtr A pointer to hold new bio + * + * @return VDO_SUCCESS or an error + **/ +int createBio(KernelLayer *layer, char *data, BIO **bioPtr); + +/** + * Prepare a BIO to issue a flush to the device below. + * + * @param bio The flush BIO + * @param context The context for the callback + * @param device The device to flush + * @param endIOCallback The function to call when the flush is complete + **/ +void prepareFlushBIO(BIO *bio, + void *context, + struct block_device *device, + bio_end_io_t *endIOCallback); + +/** + * Perform IO with a bio, waiting for completion and returning its result. + * The bio must already have its sector, block device, and operation set. + * + * @param bio The bio to do IO with + * + * @return The bio result + **/ +static inline int submitBioAndWait(BIO *bio) +{ + submit_bio_wait(bio); + return getBioResult(bio); +} + +#endif /* BIO_H */ diff --git a/source/vdo/kernel/bioIterator.h b/source/vdo/kernel/bioIterator.h new file mode 100644 index 0000000..7445261 --- /dev/null +++ b/source/vdo/kernel/bioIterator.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/bioIterator.h#1 $ + */ + +#ifndef BIO_ITERATOR_H +#define BIO_ITERATOR_H + +#include + +#include "bio.h" +#include "kernelTypes.h" + +typedef struct { + BIO *bio; +#ifdef USE_BI_ITER + struct bvec_iter iter; + // Needed so we can store the return value of bio_iter_iovec. + struct bio_vec temp; +#else + int index; +#endif +} BioIterator; + +/** + * Create an iterator over a bio's data. + * + * @param bio The bio to iterate over + * + * @return An iterator over a bio + **/ +static BioIterator createBioIterator(BIO *bio) +{ + BioIterator iterator = { + .bio = bio, +#ifdef USE_BI_ITER + .iter = bio->bi_iter, +#else + .index = bio->bi_idx, +#endif + }; + return iterator; +} + +/** + * Get the next biovec from the iterator, or NULL if there are no more. + * + * @param iterator The iterator from which to get data + * + * @return The next biovec from the iterator, or NULL. + **/ +static struct bio_vec *getNextBiovec(BioIterator *iterator) +{ + BIO *bio = iterator->bio; +#ifdef USE_BI_ITER + if (iterator->iter.bi_size == 0) { + return NULL; + } + + iterator->temp = bio_iter_iovec(bio, iterator->iter); + return &iterator->temp; +#else + if (iterator->index >= bio->bi_vcnt) { + return NULL; + } + return bio_iovec_idx(bio, iterator->index); +#endif +} + +/** + * Advance the iterator to the next biovec in the bio. + * + * @param [in,out] iterator The iterator to advance + **/ +static void advanceBioIterator(BioIterator *iterator) +{ +#ifdef USE_BI_ITER + bio_advance_iter(iterator->bio, &iterator->iter, iterator->temp.bv_len); +#else + iterator->index++; +#endif +} + +#endif /* BIO_ITERATOR_H */ diff --git a/source/vdo/kernel/bufferPool.c b/source/vdo/kernel/bufferPool.c new file mode 100644 index 0000000..9c950ca --- /dev/null +++ b/source/vdo/kernel/bufferPool.c @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/bufferPool.c#1 $ + */ + +#include "bufferPool.h" + +#include +#include + +#include "logger.h" +#include "memoryAlloc.h" + +#include "statusCodes.h" + +/* + * For list nodes on the free-object list, the data field describes + * the object available for reuse. + * + * For nodes on the "spare" list, the data field is meaningless; + * they're just nodes available for use when we need to add an object + * pointer to the freeObjectList. + * + * These are both "free lists", in a sense; don't get confused! 
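+ *
+ * Every BufferElement node lives on exactly one of the two lists:
+ * allocBufferFromPool() moves a node from freeObjectList to
+ * spareListNodes, and returning a buffer moves a spare node back, so the
+ * two list lengths always sum to the pool size.
+ *
+ * Typical use of the public interface (a minimal sketch; see bufferPool.h
+ * for the full contracts):
+ *
+ *   void *buffer;
+ *   if (allocBufferFromPool(pool, &buffer) == VDO_SUCCESS) {
+ *     // ... use the buffer ...
+ *     freeBufferToPool(pool, buffer);
+ *   }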
+ */ +typedef struct { + struct list_head list; // links in current list + void *data; // element data, if on free list +} BufferElement; + +struct bufferPool { + const char *name; // Pool name + void *data; // Associated pool data + spinlock_t lock; // Locks this object + unsigned int size; // Total number of buffers + struct list_head freeObjectList; // List of free buffers + struct list_head spareListNodes; // Unused list nodes + unsigned int numBusy; // Number of buffers in use + unsigned int maxBusy; // Maximum value of the above + BufferAllocateFunction *alloc; // Allocate function for buffer data + BufferFreeFunction *free; // Free function for buffer data + BufferDumpFunction *dump; // Dump function for buffer data + BufferElement *bhead; // Array of BufferElement structures + void **objects; +}; + +/*************************************************************************/ +int makeBufferPool(const char *poolName, + unsigned int size, + BufferAllocateFunction *allocateFunction, + BufferFreeFunction *freeFunction, + BufferDumpFunction *dumpFunction, + void *poolData, + BufferPool **poolPtr) +{ + BufferPool *pool; + + int result = ALLOCATE(1, BufferPool, "buffer pool", &pool); + if (result != VDO_SUCCESS) { + logError("buffer pool allocation failure %d", result); + return result; + } + + result = ALLOCATE(size, BufferElement, "buffer pool elements", &pool->bhead); + if (result != VDO_SUCCESS) { + logError("buffer element array allocation failure %d", result); + freeBufferPool(&pool); + return result; + } + + result = ALLOCATE(size, void *, "object pointers", &pool->objects); + if (result != VDO_SUCCESS) { + logError("buffer object array allocation failure %d", result); + freeBufferPool(&pool); + return result; + } + + pool->name = poolName; + pool->alloc = allocateFunction; + pool->free = freeFunction; + pool->dump = dumpFunction; + pool->data = poolData; + pool->size = size; + spin_lock_init(&pool->lock); + INIT_LIST_HEAD(&pool->freeObjectList); + INIT_LIST_HEAD(&pool->spareListNodes); + BufferElement *bh = pool->bhead; + for (int i = 0; i < pool->size; i++) { + result = pool->alloc(pool->data, &bh->data); + if (result != VDO_SUCCESS) { + logError("verify buffer data allocation failure %d", result); + freeBufferPool(&pool); + return result; + } + pool->objects[i] = bh->data; + list_add(&bh->list, &pool->freeObjectList); + bh++; + } + pool->numBusy = pool->maxBusy = 0; + + *poolPtr = pool; + return VDO_SUCCESS; +} + +/*************************************************************************/ +void freeBufferPool(BufferPool **poolPtr) +{ + BufferPool *pool = *poolPtr; + if (pool == NULL) { + return; + } + + ASSERT_LOG_ONLY((pool->numBusy == 0), "freeing busy buffer pool, numBusy=%d", + pool->numBusy); + if (pool->objects != NULL) { + for (int i = 0; i < pool->size; i++) { + if (pool->objects[i] != NULL) { + pool->free(pool->data, pool->objects[i]); + } + } + FREE(pool->objects); + } + FREE(pool->bhead); + FREE(pool); + *poolPtr = NULL; +} + +/*************************************************************************/ +static bool inFreeList(BufferPool *pool, void *data) +{ + struct list_head *node; + list_for_each(node, &pool->freeObjectList) { + if (container_of(node, BufferElement, list)->data == data) { + return true; + } + } + return false; +} + +/*************************************************************************/ +void dumpBufferPool(BufferPool *pool, bool dumpElements) +{ + // In order that syslog can empty its buffer, sleep after 35 elements for + // 4ms (till the second 
clock tick). These numbers chosen in October + // 2012 running on an lfarm. + enum { ELEMENTS_PER_BATCH = 35 }; + enum { SLEEP_FOR_SYSLOG = 4 }; + + if (pool == NULL) { + return; + } + spin_lock(&pool->lock); + logInfo("%s: %u of %u busy (max %u)", pool->name, pool->numBusy, pool->size, + pool->maxBusy); + if (dumpElements && (pool->dump != NULL)) { + int dumped = 0; + for (int i = 0; i < pool->size; i++) { + if (!inFreeList(pool, pool->objects[i])) { + pool->dump(pool->data, pool->objects[i]); + if (++dumped >= ELEMENTS_PER_BATCH) { + spin_unlock(&pool->lock); + dumped = 0; + msleep(SLEEP_FOR_SYSLOG); + spin_lock(&pool->lock); + } + } + } + } + spin_unlock(&pool->lock); +} + +/*************************************************************************/ +int allocBufferFromPool(BufferPool *pool, void **dataPtr) +{ + if (pool == NULL) { + return UDS_INVALID_ARGUMENT; + } + + spin_lock(&pool->lock); + if (unlikely(list_empty(&pool->freeObjectList))) { + spin_unlock(&pool->lock); + logDebug("no free buffers"); + return -ENOMEM; + } + + BufferElement *bh = list_first_entry(&pool->freeObjectList, BufferElement, + list); + list_move(&bh->list, &pool->spareListNodes); + pool->numBusy++; + if (pool->numBusy > pool->maxBusy) { + pool->maxBusy = pool->numBusy; + } + *dataPtr = bh->data; + spin_unlock(&pool->lock); + return VDO_SUCCESS; + +} + +/*************************************************************************/ +static bool freeBufferToPoolInternal(BufferPool *pool, void *data) +{ + if (unlikely(list_empty(&pool->spareListNodes))) { + return false; + } + BufferElement *bh = list_first_entry(&pool->spareListNodes, BufferElement, + list); + list_move(&bh->list, &pool->freeObjectList); + bh->data = data; + pool->numBusy--; + return true; +} + +/*************************************************************************/ +void freeBufferToPool(BufferPool *pool, void *data) +{ + spin_lock(&pool->lock); + bool success = freeBufferToPoolInternal(pool, data); + spin_unlock(&pool->lock); + if (!success) { + logDebug("trying to add to free list when already full"); + } +} + +/*************************************************************************/ +void freeBuffersToPool(BufferPool *pool, void **data, int count) +{ + spin_lock(&pool->lock); + bool success = true; + for (int i = 0; (i < count) && success; i++) { + success = freeBufferToPoolInternal(pool, data[i]); + } + spin_unlock(&pool->lock); + if (!success) { + logDebug("trying to add to free list when already full"); + } +} diff --git a/source/vdo/kernel/bufferPool.h b/source/vdo/kernel/bufferPool.h new file mode 100644 index 0000000..9c505c9 --- /dev/null +++ b/source/vdo/kernel/bufferPool.h @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/bufferPool.h#1 $ + */ +#ifndef BUFFERPOOL_H +#define BUFFERPOOL_H + +/* + * We need bug.h because in 3.10, kernel.h (indirectly) defines + * ARRAY_SIZE as a macro which (indirectly and conditionally) uses + * BUILD_BUG_ON_ZERO, which is defined in bug.h, which is *not* + * included. In earlier versions like 3.2 it Just Worked. + */ +#include +#include +#include + +typedef struct bufferPool BufferPool; + +typedef int BufferAllocateFunction(void *poolData, void **dataPtr); +typedef void BufferFreeFunction(void *poolData, void *data); +typedef void BufferDumpFunction(void *poolData, void *data); + +/** + * Creates a generic pool of buffer data. The elements in the pool are + * allocated up front and placed on a free list, which manages the + * reuse of the individual buffers in the pool. + * + * @param [in] poolName Name of the pool + * @param [in] size The number of elements to create for this pool + * @param [in] allocateFunction The function to call to create the actual data + * for each element + * @param [in] freeFunction The function to call to free the actual data + * for each element + * @param [in] dumpFunction The function to call to dump the actual data + * for each element into the log + * @param [in] poolData A pointer to the pool's associated data + * @param [out] poolPtr A pointer to hold the pool that was created + * + * @return a success or error code + */ +int makeBufferPool(const char *poolName, + unsigned int size, + BufferAllocateFunction *allocateFunction, + BufferFreeFunction *freeFunction, + BufferDumpFunction *dumpFunction, + void *poolData, + BufferPool **poolPtr) + __attribute__((warn_unused_result)); + +/** + * Free a buffer pool and null out the reference to it. This will free + * all the elements of the pool as well. + * + * @param [in] poolPtr The reference to the pool to free + **/ +void freeBufferPool(BufferPool **poolPtr); + +/** + * Dump a buffer pool to the log. + * + * @param [in] pool The buffer pool to allocate from + * @param [in] dumpElements True for complete output, or false for a + * one-line summary + **/ +void dumpBufferPool(BufferPool *pool, bool dumpElements); + +/** + * Acquires a free buffer from the free list of the pool and + * returns it's associated data. + * + * @param [in] pool The buffer pool to allocate from + * @param [out] dataPtr A pointer to hold the buffer data + * + * @return a success or error code + */ +int allocBufferFromPool(BufferPool *pool, void **dataPtr) + __attribute__((warn_unused_result)); + +/** + * Returns a buffer to the free list of a pool + * + * @param [in] pool The buffer pool to return the buffer to + * @param [in] data The buffer data to return + */ +void freeBufferToPool(BufferPool *pool, void *data); + +/** + * Returns a set of buffers to the free list of a pool + * + * @param [in] pool The buffer pool to return the buffer to + * @param [in] data The buffer data to return + * @param [in] count Number of entries in the data array + */ +void freeBuffersToPool(BufferPool *pool, void **data, int count); + +/** + * Control structure for freeing (releasing back to the pool) pointers + * in batches. + * + * Since the objects stored in a buffer pool are completely opaque, + * some external data structure is needed to manage a collection of + * them. This is a simple helper for doing that, since we're freeing + * batches of objects in a couple different places. 
Within the pool + * itself there's a pair of linked lists, but getting at them requires + * the locking that we're trying to minimize. + * + * We collect pointers until the array is full or until there are no + * more available, and we call freeBuffersToPool to release a batch + * all at once. + **/ +typedef struct freeBufferPointers { + BufferPool *pool; + int index; + void *pointers[30]; // size is arbitrary +} FreeBufferPointers; + +/** + * Initialize the control structure for batching buffer pointers to be + * released to their pool. + * + * @param [out] fbp The (caller-allocated) control structure + * @param [in] pool The buffer pool to return objects to. + **/ +static inline void initFreeBufferPointers(FreeBufferPointers *fbp, + BufferPool *pool) +{ + fbp->index = 0; + fbp->pool = pool; +} + +/** + * Release any buffers left in the collection. + * + * @param [in] fbp The control structure + **/ +static inline void freeBufferPointers(FreeBufferPointers *fbp) +{ + freeBuffersToPool(fbp->pool, fbp->pointers, fbp->index); + fbp->index = 0; +} + +/** + * Add another buffer pointer to the collection, and if we're full, + * release the whole batch to the pool. + * + * @param [in] fbp The control structure + * @param [in] pointer The buffer pointer to release + **/ +static inline void addFreeBufferPointer(FreeBufferPointers *fbp, + void *pointer) +{ + fbp->pointers[fbp->index] = pointer; + fbp->index++; + if (fbp->index == ARRAY_SIZE(fbp->pointers)) { + freeBufferPointers(fbp); + } +} + +#endif /* BUFFERPOOL_H */ diff --git a/source/vdo/kernel/dataKVIO.c b/source/vdo/kernel/dataKVIO.c new file mode 100644 index 0000000..ba9c8e8 --- /dev/null +++ b/source/vdo/kernel/dataKVIO.c @@ -0,0 +1,1192 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dataKVIO.c#18 $ + */ + +#include "dataKVIO.h" + + +#include "logger.h" +#include "memoryAlloc.h" +#include "murmur/MurmurHash3.h" + +#include "dataVIO.h" +#include "compressedBlock.h" +#include "hashLock.h" +#include "lz4.h" + +#include "bio.h" +#include "dedupeIndex.h" +#include "kvdoFlush.h" +#include "kvio.h" +#include "ioSubmitter.h" +#include "vdoCommon.h" +#include "verify.h" + +static void dumpPooledDataKVIO(void *poolData, void *data); + +enum { + WRITE_PROTECT_FREE_POOL = 0, + WP_DATA_KVIO_SIZE = (sizeof(DataKVIO) + PAGE_SIZE - 1 + - ((sizeof(DataKVIO) + PAGE_SIZE - 1) + % PAGE_SIZE)) +}; + +/** + * Alter the write-access permission to a page of memory, so that + * objects in the free pool may no longer be modified. + * + * To do: Deny read access as well. 
+ * + * @param address The starting address to protect, which must be on a + * page boundary + * @param byteCount The number of bytes to protect, which must be a multiple + * of the page size + * @param mode The write protection mode (true means read-only) + **/ +static __always_inline void +setWriteProtect(void *address, + size_t byteCount, + bool mode __attribute__((unused))) +{ + BUG_ON((((long) address) % PAGE_SIZE) != 0); + BUG_ON((byteCount % PAGE_SIZE) != 0); + BUG(); // only works in internal code, sorry +} + +/**********************************************************************/ +static void maybeLogDataKVIOTrace(DataKVIO *dataKVIO) +{ + if (dataKVIO->kvio.layer->traceLogging) { + logKvioTrace(&dataKVIO->kvio); + } +} + +/** + * First tracing hook for VIO completion. + * + * If the SystemTap script vdotrace.stp is in use, it does stage 1 of + * its processing here. We must not call addTraceRecord between the + * two tap functions. + * + * @param dataKVIO The VIO we're finishing up + **/ +static void kvioCompletionTap1(DataKVIO *dataKVIO) +{ + /* + * Ensure that dataKVIO doesn't get optimized out, even under inline + * expansion. Also, make sure the compiler has to emit debug info + * for baseTraceLocation, which some of our SystemTap scripts will + * use here. + * + * First, make it look as though all memory could be clobbered; then + * require that a value be read into a register. That'll force at + * least one instruction to exist (so SystemTap can hook in) where + * dataKVIO is live. We use a field that the caller would've + * accessed recently anyway, so it may be cached. + */ + barrier(); + __asm__ __volatile__("" + : + : "g" (dataKVIO), "g" (baseTraceLocation), + "r" (dataKVIO->kvio.layer)); +} + +/** + * Second tracing hook for VIO completion. + * + * The SystemTap script vdotrace.stp splits its VIO-completion work + * into two stages, to reduce lock contention for script variables. + * Hence, it needs two hooks in the code. + * + * @param dataKVIO The VIO we're finishing up + **/ +static void kvioCompletionTap2(DataKVIO *dataKVIO) +{ + // Hack to ensure variable doesn't get optimized out. 
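+  // As in kvioCompletionTap1() above, the barrier and the empty asm below
+  // keep dataKVIO and the layer pointer live at this point so a SystemTap
+  // probe can still read them.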
+ barrier(); + __asm__ __volatile__("" : : "g" (dataKVIO), "r" (dataKVIO->kvio.layer)); +} + +/**********************************************************************/ +static void kvdoAcknowledgeDataKVIO(DataKVIO *dataKVIO) +{ + KernelLayer *layer = dataKVIO->kvio.layer; + ExternalIORequest *externalIORequest = &dataKVIO->externalIORequest; + BIO *bio = externalIORequest->bio; + if (bio == NULL) { + return; + } + + externalIORequest->bio = NULL; + + int error + = mapToSystemError(dataVIOAsCompletion(&dataKVIO->dataVIO)->result); + bio->bi_end_io = externalIORequest->endIO; + bio->bi_private = externalIORequest->private; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + bio->bi_opf = externalIORequest->rw; +#else + bio->bi_rw = externalIORequest->rw; +#endif + + countBios(&layer->biosAcknowledged, bio); + if (dataKVIO->isPartial) { + countBios(&layer->biosAcknowledgedPartial, bio); + } + + + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); + completeBio(bio, error); +} + +/**********************************************************************/ +static noinline void cleanDataKVIO(DataKVIO *dataKVIO, FreeBufferPointers *fbp) +{ + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); + kvdoAcknowledgeDataKVIO(dataKVIO); + + KVIO *kvio = dataKVIOAsKVIO(dataKVIO); + kvio->bio = NULL; + + if (unlikely(kvio->vio->trace != NULL)) { + maybeLogDataKVIOTrace(dataKVIO); + kvioCompletionTap1(dataKVIO); + kvioCompletionTap2(dataKVIO); + freeTraceToPool(kvio->layer, kvio->vio->trace); + } + + addFreeBufferPointer(fbp, dataKVIO); +} + +/**********************************************************************/ +void returnDataKVIOBatchToPool(BatchProcessor *batch, void *closure) +{ + KernelLayer *layer = closure; + uint32_t count = 0; + ASSERT_LOG_ONLY(batch != NULL, "batch not null"); + ASSERT_LOG_ONLY(layer != NULL, "layer not null"); + + FreeBufferPointers fbp; + initFreeBufferPointers(&fbp, layer->dataKVIOPool); + + KvdoWorkItem *item; + while ((item = nextBatchItem(batch)) != NULL) { + cleanDataKVIO(workItemAsDataKVIO(item), &fbp); + condReschedBatchProcessor(batch); + count++; + } + + if (fbp.index > 0) { + freeBufferPointers(&fbp); + } + + completeManyRequests(layer, count); +} + +/**********************************************************************/ +static void kvdoAcknowledgeThenCompleteDataKVIO(KvdoWorkItem *item) +{ + DataKVIO *dataKVIO = workItemAsDataKVIO(item); + kvdoAcknowledgeDataKVIO(dataKVIO); + addToBatchProcessor(dataKVIO->kvio.layer->dataKVIOReleaser, item); +} + +/**********************************************************************/ +void kvdoCompleteDataKVIO(VDOCompletion *completion) +{ + DataKVIO *dataKVIO = dataVIOAsDataKVIO(asDataVIO(completion)); + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); + + KernelLayer *layer = getLayerFromDataKVIO(dataKVIO); + if (useBioAckQueue(layer) && USE_BIO_ACK_QUEUE_FOR_READ + && (dataKVIO->externalIORequest.bio != NULL)) { + launchDataKVIOOnBIOAckQueue(dataKVIO, kvdoAcknowledgeThenCompleteDataKVIO, + NULL, BIO_ACK_Q_ACTION_ACK); + } else { + addToBatchProcessor(layer->dataKVIOReleaser, + workItemFromDataKVIO(dataKVIO)); + } +} + +/** + * Copy the uncompressed data from a compressed block read into the user + * bio which requested the read. 
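+ *
+ * Handles three cases: a read-modify-write copies the data into the
+ * DataKVIO's dataBlock buffer for the coming write phase, a partial read
+ * defers the copy to the partial-read callback, and a full-block read
+ * copies the data straight into the user bio and acknowledges it.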
+ * + * @param workItem The DataKVIO which requested the read + **/ +static void copyReadBlockData(KvdoWorkItem *workItem) +{ + DataKVIO *dataKVIO = workItemAsDataKVIO(workItem); + + // For a read-modify-write, copy the data into the dataBlock buffer so it + // will be set up for the write phase. + if (isReadModifyWriteVIO(dataKVIO->kvio.vio)) { + bioCopyDataOut(getBIOFromDataKVIO(dataKVIO), dataKVIO->readBlock.data); + kvdoEnqueueDataVIOCallback(dataKVIO); + return; + } + + // For a partial read, the callback will copy the requested data from the + // read block. + if (dataKVIO->isPartial) { + kvdoEnqueueDataVIOCallback(dataKVIO); + return; + } + + // For a full block read, copy the data to the bio and acknowledge. + bioCopyDataOut(getBIOFromDataKVIO(dataKVIO), dataKVIO->readBlock.data); + kvdoAcknowledgeDataVIO(&dataKVIO->dataVIO); +} + +/** + * Finish reading data for a compressed block. + * + * @param dataKVIO The DataKVIO which requested the read + **/ +static void readDataKVIOReadBlockCallback(DataKVIO *dataKVIO) +{ + if (dataKVIO->readBlock.status != VDO_SUCCESS) { + setCompletionResult(dataVIOAsCompletion(&dataKVIO->dataVIO), + dataKVIO->readBlock.status); + kvdoEnqueueDataVIOCallback(dataKVIO); + return; + } + + launchDataKVIOOnCPUQueue(dataKVIO, copyReadBlockData, NULL, + CPU_Q_ACTION_COMPRESS_BLOCK); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) +/** + * Complete and reset a bio that was supplied by the user and then used for a + * read (so that we can complete it with the user's callback). + * + * @param bio The bio to complete + **/ +static void resetUserBio(BIO *bio) +#else +/** + * Complete and reset a bio that was supplied by the user and then used for a + * read (so that we can complete it with the user's callback). + * + * @param bio The bio to complete + * @param error Possible error from underlying block device + **/ +static void resetUserBio(BIO *bio, int error) +#endif +{ +#if ((LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0)) \ + && (LINUX_VERSION_CODE < KERNEL_VERSION(4,2,0))) + // This is a user bio, and the device just called bio_endio() on it, so + // we need to re-increment bi_remaining so we too can call bio_endio(). + atomic_inc(&bio->bi_remaining); +#endif + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) + completeAsyncBio(bio); +#else + completeAsyncBio(bio, error); +#endif +} + +/** + * Uncompress the data that's just been read and then call back the requesting + * DataKVIO. + * + * @param workItem The DataKVIO requesting the data + **/ +static void uncompressReadBlock(KvdoWorkItem *workItem) +{ + DataKVIO *dataKVIO = workItemAsDataKVIO(workItem); + ReadBlock *readBlock = &dataKVIO->readBlock; + BlockSize blockSize = VDO_BLOCK_SIZE; + + // The DataKVIO's scratch block will be used to contain the + // uncompressed data. 
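+  // Locate this VIO's fragment within the compressed block, decompress it
+  // into the scratch block, and on success point readBlock->data at the
+  // decompressed copy before invoking the read-block callback.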
+ uint16_t fragmentOffset, fragmentSize; + char *compressedData = readBlock->data; + int result = getCompressedBlockFragment(readBlock->mappingState, + compressedData, blockSize, + &fragmentOffset, + &fragmentSize); + if (result != VDO_SUCCESS) { + logDebug("%s: frag err %d", __func__, result); + readBlock->status = result; + readBlock->callback(dataKVIO); + return; + } + + char *fragment = compressedData + fragmentOffset; + int size = LZ4_uncompress_unknownOutputSize(fragment, dataKVIO->scratchBlock, + fragmentSize, blockSize); + if (size == blockSize) { + readBlock->data = dataKVIO->scratchBlock; + } else { + logDebug("%s: lz4 error", __func__); + readBlock->status = VDO_INVALID_FRAGMENT; + } + + readBlock->callback(dataKVIO); +} + +/** + * Now that we have gotten the data from storage, uncompress the data if + * necessary and then call back the requesting DataKVIO. + * + * @param dataKVIO The DataKVIO requesting the data + * @param result The result of the read operation + **/ +static void completeRead(DataKVIO *dataKVIO, int result) +{ + ReadBlock *readBlock = &dataKVIO->readBlock; + readBlock->status = result; + + if ((result == VDO_SUCCESS) && isCompressed(readBlock->mappingState)) { + launchDataKVIOOnCPUQueue(dataKVIO, uncompressReadBlock, NULL, + CPU_Q_ACTION_COMPRESS_BLOCK); + return; + } + + readBlock->callback(dataKVIO); +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) +/** + * Callback for a bio doing a read. + * + * @param bio The bio + */ +static void readBioCallback(BIO *bio) +#else +/** + * Callback for a bio doing a read. + * + * @param bio The bio + * @param result The result of the read operation + */ +static void readBioCallback(BIO *bio, int result) +#endif +{ + KVIO *kvio = (KVIO *) bio->bi_private; + DataKVIO *dataKVIO = kvioAsDataKVIO(kvio); + dataKVIO->readBlock.data = dataKVIO->readBlock.buffer; + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); + countCompletedBios(bio); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) + completeRead(dataKVIO, getBioResult(bio)); +#else + completeRead(dataKVIO, result); +#endif +} + +/**********************************************************************/ +void kvdoReadBlock(DataVIO *dataVIO, + PhysicalBlockNumber location, + BlockMappingState mappingState, + BioQAction action, + DataKVIOCallback callback) +{ + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + + DataKVIO *dataKVIO = dataVIOAsDataKVIO(dataVIO); + ReadBlock *readBlock = &dataKVIO->readBlock; + KernelLayer *layer = getLayerFromDataKVIO(dataKVIO); + + readBlock->callback = callback; + readBlock->status = VDO_SUCCESS; + readBlock->mappingState = mappingState; + + BUG_ON(getBIOFromDataKVIO(dataKVIO)->bi_private != &dataKVIO->kvio); + // Read the data directly from the device using the read bio. 
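+  // The pooled read bio is reset, aimed at the requested physical block,
+  // marked as a read, and handed to submitBio() with the caller's queue
+  // action.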
+ BIO *bio = readBlock->bio; + resetBio(bio, layer); + setBioSector(bio, blockToSector(layer, location)); + setBioOperationRead(bio); + bio->bi_end_io = readBioCallback; + submitBio(bio, action); +} + +/**********************************************************************/ +void kvdoReadDataVIO(DataVIO *dataVIO) +{ + ASSERT_LOG_ONLY(!isWriteVIO(dataVIOAsVIO(dataVIO)), + "operation set correctly for data read"); + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION("$F;io=readData")); + + if (isCompressed(dataVIO->mapped.state)) { + kvdoReadBlock(dataVIO, dataVIO->mapped.pbn, dataVIO->mapped.state, + BIO_Q_ACTION_COMPRESSED_DATA, readDataKVIOReadBlockCallback); + return; + } + + KVIO *kvio = dataVIOAsKVIO(dataVIO); + BIO *bio = kvio->bio; + bio->bi_end_io = resetUserBio; + setBioSector(bio, blockToSector(kvio->layer, dataVIO->mapped.pbn)); + submitBio(bio, BIO_Q_ACTION_DATA); +} + +/**********************************************************************/ +static void kvdoAcknowledgeDataKVIOThenContinue(KvdoWorkItem *item) +{ + DataKVIO *dataKVIO = workItemAsDataKVIO(item); + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); + kvdoAcknowledgeDataKVIO(dataKVIO); + // Even if we're not using bio-ack threads, we may be in the wrong + // base-code thread. + kvdoEnqueueDataVIOCallback(dataKVIO); +} + +/**********************************************************************/ +void kvdoAcknowledgeDataVIO(DataVIO *dataVIO) +{ + DataKVIO *dataKVIO = dataVIOAsDataKVIO(dataVIO); + KernelLayer *layer = getLayerFromDataKVIO(dataKVIO); + + // If the remaining discard work is not completely processed by this VIO, + // don't acknowledge it yet. + if (isDiscardBio(dataKVIO->externalIORequest.bio) + && (dataKVIO->remainingDiscard + > (VDO_BLOCK_SIZE - dataKVIO->offset))) { + invokeCallback(dataVIOAsCompletion(dataVIO)); + return; + } + + // We've finished with the KVIO; acknowledge completion of the bio to the + // kernel. 
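+  // If a dedicated bio-ack work queue is configured, hand the
+  // acknowledgment off to it; otherwise acknowledge on the current thread
+  // and continue from there.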
+ if (useBioAckQueue(layer)) { + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + launchDataKVIOOnBIOAckQueue(dataKVIO, kvdoAcknowledgeDataKVIOThenContinue, + NULL, BIO_ACK_Q_ACTION_ACK); + } else { + kvdoAcknowledgeDataKVIOThenContinue(workItemFromDataKVIO(dataKVIO)); + } +} + +/**********************************************************************/ +void kvdoWriteDataVIO(DataVIO *dataVIO) +{ + ASSERT_LOG_ONLY(isWriteVIO(dataVIOAsVIO(dataVIO)), + "kvdoWriteDataVIO() called on write DataVIO"); + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION("$F;io=writeData;j=normal")); + + KVIO *kvio = dataVIOAsKVIO(dataVIO); + BIO *bio = kvio->bio; + setBioOperationWrite(bio); + setBioSector(bio, blockToSector(kvio->layer, dataVIO->newMapped.pbn)); + submitBio(bio, BIO_Q_ACTION_DATA); +} + +/**********************************************************************/ +void kvdoModifyWriteDataVIO(DataVIO *dataVIO) +{ + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + DataKVIO *dataKVIO = dataVIOAsDataKVIO(dataVIO); + BIO *bio = dataKVIO->externalIORequest.bio; + KernelLayer *layer = getLayerFromDataKVIO(dataKVIO); + resetBio(dataKVIO->dataBlockBio, layer); + + if (!isDiscardBio(bio)) { + bioCopyDataIn(bio, dataKVIO->dataBlock + dataKVIO->offset); + } else { + memset(dataKVIO->dataBlock + dataKVIO->offset, '\0', + min(dataKVIO->remainingDiscard, + (DiscardSize) (VDO_BLOCK_SIZE - dataKVIO->offset))); + } + + dataVIO->isZeroBlock = bioIsZeroData(dataKVIO->dataBlockBio); + dataKVIO->dataBlockBio->bi_private = &dataKVIO->kvio; + copyBioOperationAndFlags(dataKVIO->dataBlockBio, bio); + // Make the bio a write, not (potentially) a discard. + setBioOperationWrite(dataKVIO->dataBlockBio); +} + +/**********************************************************************/ +void kvdoZeroDataVIO(DataVIO *dataVIO) +{ + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION("zeroDataVIO;io=readData")); + bioZeroData(dataVIOAsKVIO(dataVIO)->bio); +} + +/**********************************************************************/ +void kvdoCopyDataVIO(DataVIO *source, DataVIO *destination) +{ + dataVIOAddTraceRecord(destination, THIS_LOCATION(NULL)); + bioCopyDataOut(dataVIOAsKVIO(destination)->bio, + dataVIOAsDataKVIO(source)->dataBlock); +} + +/**********************************************************************/ +static void kvdoCompressWork(KvdoWorkItem *item) +{ + DataKVIO *dataKVIO = workItemAsDataKVIO(item); + KernelLayer *layer = getLayerFromDataKVIO(dataKVIO); + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); + + char *context = getWorkQueuePrivateData(); + if (unlikely(context == NULL)) { + uint32_t index = atomicAdd32(&layer->compressionContextIndex, 1) - 1; + BUG_ON(index >= layer->deviceConfig->threadCounts.cpuThreads); + context = layer->compressionContext[index]; + setWorkQueuePrivateData(context); + } + + int size = LZ4_compress_ctx_limitedOutput(context, dataKVIO->dataBlock, + dataKVIO->scratchBlock, + VDO_BLOCK_SIZE, + VDO_BLOCK_SIZE); + DataVIO *dataVIO = &dataKVIO->dataVIO; + if (size > 0) { + // The scratch block will be used to contain the compressed data. + dataVIO->compression.data = dataKVIO->scratchBlock; + dataVIO->compression.size = size; + } else { + // Use block size plus one as an indicator for uncompressible data. 
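+    // (No valid compressed fragment can be larger than a full block, so a
+    // size of VDO_BLOCK_SIZE + 1 can never be mistaken for real output.)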
+ dataVIO->compression.size = VDO_BLOCK_SIZE + 1; + } + + kvdoEnqueueDataVIOCallback(dataKVIO); +} + +/**********************************************************************/ +void kvdoCompressDataVIO(DataVIO *dataVIO) +{ + dataVIOAddTraceRecord(dataVIO, + THIS_LOCATION("compressDataVIO;" + "io=compress;cb=compress")); + + /* + * If the orignal bio was a discard, but we got this far because the discard + * was a partial one (r/m/w), and it is part of a larger discard, we cannot + * compress this VIO. We need to make sure the VIO completes ASAP. + */ + DataKVIO *dataKVIO = dataVIOAsDataKVIO(dataVIO); + if (isDiscardBio(dataKVIO->externalIORequest.bio) + && (dataKVIO->remainingDiscard > 0)) { + dataVIO->compression.size = VDO_BLOCK_SIZE + 1; + kvdoEnqueueDataVIOCallback(dataKVIO); + return; + } + + launchDataKVIOOnCPUQueue(dataKVIO, kvdoCompressWork, NULL, + CPU_Q_ACTION_COMPRESS_BLOCK); +} + +/** + * Construct a DataKVIO. + * + * @param [in] layer The physical layer + * @param [in] bio The bio to associate with this DataKVIO + * @param [out] dataKVIOPtr A pointer to hold the new DataKVIO + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int makeDataKVIO(KernelLayer *layer, BIO *bio, DataKVIO **dataKVIOPtr) +{ + DataKVIO *dataKVIO; + int result = allocBufferFromPool(layer->dataKVIOPool, (void **) &dataKVIO); + if (result != VDO_SUCCESS) { + return logErrorWithStringError(result, "data kvio allocation failure"); + } + + if (WRITE_PROTECT_FREE_POOL) { + setWriteProtect(dataKVIO, WP_DATA_KVIO_SIZE, false); + } + + KVIO *kvio = &dataKVIO->kvio; + kvio->vio = dataVIOAsVIO(&dataKVIO->dataVIO); + memset(&kvio->enqueueable, 0, sizeof(KvdoEnqueueable)); + memset(&dataKVIO->dedupeContext.pendingList, 0, sizeof(struct list_head)); + memset(&dataKVIO->dataVIO, 0, sizeof(DataVIO)); + kvio->bioToSubmit = NULL; + bio_list_init(&kvio->biosMerged); + + // The dataBlock is only needed for writes and some partial reads. + if (isWriteBio(bio) || (getBioSize(bio) < VDO_BLOCK_SIZE)) { + resetBio(dataKVIO->dataBlockBio, layer); + } + + initializeKVIO(kvio, layer, VIO_TYPE_DATA, VIO_PRIORITY_DATA, NULL, bio); + *dataKVIOPtr = dataKVIO; + return VDO_SUCCESS; +} + +/** + * Creates a new DataVIO structure. A DataVIO represents a single logical + * block of data. It is what most VDO operations work with. This function also + * creates a wrapping DataKVIO structure that is used when we want to + * physically read or write the data associated with the DataVIO. + * + * @param [in] layer The physical layer + * @param [in] bio The BIO from the request the new DataKVIO will + * service + * @param [in] arrivalTime The arrival time of the BIO + * @param [out] dataKVIOPtr A pointer to hold the new DataKVIO + * + * @return VDO_SUCCESS or an error + **/ +static int kvdoCreateKVIOFromBio(KernelLayer *layer, + BIO *bio, + Jiffies arrivalTime, + DataKVIO **dataKVIOPtr) +{ + ExternalIORequest externalIORequest = { + .bio = bio, + .private = bio->bi_private, + .endIO = bio->bi_end_io, +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + .rw = bio->bi_opf, +#else + .rw = bio->bi_rw, +#endif + }; + + // We will handle FUA at the end of the request (after we restore the + // bi_rw field from externalIORequest.rw). 
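+  // Clearing REQ_FUA on the incoming bio keeps it from leaking into the
+  // I/O VDO issues on the bio's behalf; the original flags were saved in
+  // externalIORequest.rw just above and are restored at acknowledgment.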
+ clearBioOperationFlagFua(bio); + + DataKVIO *dataKVIO = NULL; + int result = makeDataKVIO(layer, bio, &dataKVIO); + if (result != VDO_SUCCESS) { + return result; + } + + dataKVIO->externalIORequest = externalIORequest; + dataKVIO->offset = sectorToBlockOffset(layer, getBioSector(bio)); + dataKVIO->isPartial = ((getBioSize(bio) < VDO_BLOCK_SIZE) + || (dataKVIO->offset != 0)); + + if (dataKVIO->isPartial) { + countBios(&layer->biosInPartial, bio); + } else { + /* + * Note that we unconditionally fill in the dataBlock array for + * non-read operations. There are places like kvdoCopyVIO that may + * look at kvio->dataBlock for a zero block (and maybe for + * discards?). We could skip filling in dataBlock for such cases, + * but only once we're sure all such places are fixed to check the + * isZeroBlock flag first. + */ + if (isDiscardBio(bio)) { + /* + * This is a discard/trim operation. This is treated much like the zero + * block, but we keep different stats and distinguish it in the block + * map. + */ + memset(dataKVIO->dataBlock, 0, VDO_BLOCK_SIZE); + } else if (bio_data_dir(bio) == WRITE) { + dataKVIO->dataVIO.isZeroBlock = bioIsZeroData(bio); + // Copy the bio data to a char array so that we can continue to use + // the data after we acknowledge the bio. + bioCopyDataIn(bio, dataKVIO->dataBlock); + } + } + + if (dataKVIO->isPartial || isWriteBio(bio)) { + /* + * dataKVIO->bio will point at kvio->dataBlockBio for all writes and + * partial block I/O so the rest of the kernel code doesn't need to + * make a decision as to what to use. + */ + dataKVIO->dataBlockBio->bi_private = &dataKVIO->kvio; + if (dataKVIO->isPartial && isWriteBio(bio)) { + clearBioOperationAndFlags(dataKVIO->dataBlockBio); + setBioOperationRead(dataKVIO->dataBlockBio); + } else { + copyBioOperationAndFlags(dataKVIO->dataBlockBio, bio); + } + dataKVIOAsKVIO(dataKVIO)->bio = dataKVIO->dataBlockBio; + dataKVIO->readBlock.data = dataKVIO->dataBlock; + } + + setBioBlockDevice(bio, getKernelLayerBdev(layer)); + bio->bi_end_io = completeAsyncBio; + *dataKVIOPtr = dataKVIO; + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void launchDataKVIOWork(KvdoWorkItem *item) +{ + runCallback(vioAsCompletion(workItemAsKVIO(item)->vio)); +} + +/** + * Continue discard processing for requests that span multiple physical blocks. + * If all have been processed the KVIO is completed. If we have already seen + * an error, we skip the rest of the discard and fail immediately. + * + *
Invoked in a request-queue thread after the discard of a block has + * completed. + * + * @param completion A completion representing the discard KVIO + **/ +static void kvdoContinueDiscardKVIO(VDOCompletion *completion) +{ + DataVIO *dataVIO = asDataVIO(completion); + DataKVIO *dataKVIO = dataVIOAsDataKVIO(dataVIO); + KernelLayer *layer = getLayerFromDataKVIO(dataKVIO); + dataKVIO->remainingDiscard + -= min(dataKVIO->remainingDiscard, + (DiscardSize) (VDO_BLOCK_SIZE - dataKVIO->offset)); + if ((completion->result != VDO_SUCCESS) + || (dataKVIO->remainingDiscard == 0)) { + if (dataKVIO->hasDiscardPermit) { + limiterRelease(&layer->discardLimiter); + dataKVIO->hasDiscardPermit = false; + } + kvdoCompleteDataKVIO(completion); + return; + } + + BIO *bio = getBIOFromDataKVIO(dataKVIO); + resetBio(bio, layer); + dataKVIO->isPartial = (dataKVIO->remainingDiscard < VDO_BLOCK_SIZE); + dataKVIO->offset = 0; + + VIOOperation operation; + if (dataKVIO->isPartial) { + operation = VIO_READ_MODIFY_WRITE; + setBioOperationRead(bio); + } else { + operation = VIO_WRITE; + } + + if (requestorSetFUA(dataKVIO)) { + operation |= VIO_FLUSH_AFTER; + } + + prepareDataVIO(dataVIO, dataVIO->logical.lbn + 1, operation, + !dataKVIO->isPartial, kvdoContinueDiscardKVIO); + enqueueDataKVIO(dataKVIO, launchDataKVIOWork, completion->callback, + REQ_Q_ACTION_MAP_BIO); +} + +/** + * Finish a partial read. + * + * @param completion The partial read KVIO + **/ +static void kvdoCompletePartialRead(VDOCompletion *completion) +{ + DataKVIO *dataKVIO = dataVIOAsDataKVIO(asDataVIO(completion)); + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); + + bioCopyDataOut(dataKVIO->externalIORequest.bio, + dataKVIO->readBlock.data + dataKVIO->offset); + kvdoCompleteDataKVIO(completion); + return; +} + +/**********************************************************************/ +int kvdoLaunchDataKVIOFromBio(KernelLayer *layer, + BIO *bio, + uint64_t arrivalTime, + bool hasDiscardPermit) +{ + + DataKVIO *dataKVIO = NULL; + int result = kvdoCreateKVIOFromBio(layer, bio, arrivalTime, &dataKVIO); + if (unlikely(result != VDO_SUCCESS)) { + logInfo("%s: KVIO allocation failure", __func__); + if (hasDiscardPermit) { + limiterRelease(&layer->discardLimiter); + } + limiterRelease(&layer->requestLimiter); + return mapToSystemError(result); + } + + /* + * Discards behave very differently than other requests when coming + * in from device-mapper. We have to be able to handle any size discards + * and with various sector offsets within a block. 
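+   * A discard is processed one VDO block at a time: kvdoContinueDiscardKVIO()
+   * re-launches the VIO for each successive logical block, using a
+   * read-modify-write for any partial block at either end and a plain
+   * trim/write for fully covered blocks.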
+ */ + KVIO *kvio = &dataKVIO->kvio; + VDOAction *callback = kvdoCompleteDataKVIO; + VIOOperation operation = VIO_WRITE; + bool isTrim = false; + if (isDiscardBio(bio)) { + dataKVIO->hasDiscardPermit = hasDiscardPermit; + dataKVIO->remainingDiscard = getBioSize(bio); + callback = kvdoContinueDiscardKVIO; + if (dataKVIO->isPartial) { + operation = VIO_READ_MODIFY_WRITE; + } else { + isTrim = true; + } + } else if (dataKVIO->isPartial) { + if (bio_data_dir(bio) == READ) { + callback = kvdoCompletePartialRead; + operation = VIO_READ; + } else { + operation = VIO_READ_MODIFY_WRITE; + } + } else if (bio_data_dir(bio) == READ) { + operation = VIO_READ; + } + + if (requestorSetFUA(dataKVIO)) { + operation |= VIO_FLUSH_AFTER; + } + + LogicalBlockNumber lbn + = sectorToBlock(layer, getBioSector(bio) - layer->startingSectorOffset); + prepareDataVIO(&dataKVIO->dataVIO, lbn, operation, isTrim, callback); + enqueueKVIO(kvio, launchDataKVIOWork, vioAsCompletion(kvio->vio)->callback, + REQ_Q_ACTION_MAP_BIO); + return VDO_SUCCESS; +} + +/** + * Hash a DataKVIO and set its chunk name. + * + * @param item The DataKVIO to be hashed + **/ +static void kvdoHashDataWork(KvdoWorkItem *item) +{ + DataKVIO *dataKVIO = workItemAsDataKVIO(item); + DataVIO *dataVIO = &dataKVIO->dataVIO; + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + + MurmurHash3_x64_128(dataKVIO->dataBlock, VDO_BLOCK_SIZE, 0x62ea60be, + &dataVIO->chunkName); + dataKVIO->dedupeContext.chunkName = &dataVIO->chunkName; + + kvdoEnqueueDataVIOCallback(dataKVIO); +} + +/**********************************************************************/ +void kvdoHashDataVIO(DataVIO *dataVIO) +{ + dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); + launchDataKVIOOnCPUQueue(dataVIOAsDataKVIO(dataVIO), kvdoHashDataWork, NULL, + CPU_Q_ACTION_HASH_BLOCK); +} + +/**********************************************************************/ +void kvdoCheckForDuplication(DataVIO *dataVIO) +{ + dataVIOAddTraceRecord(dataVIO, + THIS_LOCATION("checkForDuplication;dup=post")); + ASSERT_LOG_ONLY(!dataVIO->isZeroBlock, + "zero block not checked for duplication"); + ASSERT_LOG_ONLY(dataVIO->newMapped.state != MAPPING_STATE_UNMAPPED, + "discard not checked for duplication"); + + DataKVIO *dataKVIO = dataVIOAsDataKVIO(dataVIO); + if (hasAllocation(dataVIO)) { + postDedupeAdvice(dataKVIO); + } else { + // This block has not actually been written (presumably because we are + // full), so attempt to dedupe without posting bogus advice. + queryDedupeAdvice(dataKVIO); + } +} + +/**********************************************************************/ +void kvdoUpdateDedupeAdvice(DataVIO *dataVIO) +{ + updateDedupeAdvice(dataVIOAsDataKVIO(dataVIO)); +} + +/** + * Implements BufferFreeFunction. + **/ +static void freePooledDataKVIO(void *poolData, void *data) +{ + if (data == NULL) { + return; + } + + DataKVIO *dataKVIO = (DataKVIO *) data; + KernelLayer *layer = (KernelLayer *) poolData; + if (WRITE_PROTECT_FREE_POOL) { + setWriteProtect(dataKVIO, WP_DATA_KVIO_SIZE, false); + } + + if (dataKVIO->dataBlockBio != NULL) { + freeBio(dataKVIO->dataBlockBio, layer); + } + + if (dataKVIO->readBlock.bio != NULL) { + freeBio(dataKVIO->readBlock.bio, layer); + } + + FREE(dataKVIO->readBlock.buffer); + FREE(dataKVIO->dataBlock); + FREE(dataKVIO->scratchBlock); + FREE(dataKVIO); +} + +/** + * Allocate a DataKVIO. This function is the internals of makePooledDataKVIO(). 
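+ * It allocates the DataKVIO itself, its data block, read buffer, and
+ * scratch block, plus the bios wrapping the data block and read buffer;
+ * any partially built DataKVIO is torn down with freePooledDataKVIO().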
+ * + * @param [in] layer The layer in which the DataKVIO will operate + * @param [out] dataKVIOPtr A pointer to hold the newly allocated DataKVIO + * + * @return VDO_SUCCESS or an error + **/ +static int allocatePooledDataKVIO(KernelLayer *layer, DataKVIO **dataKVIOPtr) +{ + DataKVIO *dataKVIO; + int result; + if (WRITE_PROTECT_FREE_POOL) { + STATIC_ASSERT(WP_DATA_KVIO_SIZE >= sizeof(DataKVIO)); + result = allocateMemory(WP_DATA_KVIO_SIZE, 0, __func__, &dataKVIO); + if (result == VDO_SUCCESS) { + BUG_ON((((size_t) dataKVIO) & (PAGE_SIZE - 1)) != 0); + } + } else { + result = ALLOCATE(1, DataKVIO, __func__, &dataKVIO); + } + + if (result != VDO_SUCCESS) { + return logErrorWithStringError(result, "DataKVIO allocation failure"); + } + + STATIC_ASSERT(VDO_BLOCK_SIZE <= PAGE_SIZE); + result = allocateMemory(VDO_BLOCK_SIZE, 0, "kvio data", + &dataKVIO->dataBlock); + if (result != VDO_SUCCESS) { + freePooledDataKVIO(layer, dataKVIO); + return logErrorWithStringError(result, "DataKVIO data allocation failure"); + } + + result = createBio(layer, dataKVIO->dataBlock, &dataKVIO->dataBlockBio); + if (result != VDO_SUCCESS) { + freePooledDataKVIO(layer, dataKVIO); + return logErrorWithStringError(result, + "DataKVIO data bio allocation failure"); + } + + result = allocateMemory(VDO_BLOCK_SIZE, 0, "kvio read buffer", + &dataKVIO->readBlock.buffer); + if (result != VDO_SUCCESS) { + freePooledDataKVIO(layer, dataKVIO); + return logErrorWithStringError(result, + "DataKVIO read allocation failure"); + } + + result = createBio(layer, dataKVIO->readBlock.buffer, + &dataKVIO->readBlock.bio); + if (result != VDO_SUCCESS) { + freePooledDataKVIO(layer, dataKVIO); + return logErrorWithStringError(result, + "DataKVIO read bio allocation failure"); + } + + dataKVIO->readBlock.bio->bi_private = &dataKVIO->kvio; + + result = allocateMemory(VDO_BLOCK_SIZE, 0, "kvio scratch", + &dataKVIO->scratchBlock); + if (result != VDO_SUCCESS) { + freePooledDataKVIO(layer, dataKVIO); + return logErrorWithStringError(result, + "DataKVIO scratch allocation failure"); + } + + *dataKVIOPtr = dataKVIO; + return VDO_SUCCESS; +} + +/** + * Implements BufferAllocateFunction. + **/ +static int makePooledDataKVIO(void *poolData, void **dataPtr) +{ + DataKVIO *dataKVIO = NULL; + int result = allocatePooledDataKVIO((KernelLayer *) poolData, &dataKVIO); + if (result != VDO_SUCCESS) { + freePooledDataKVIO(poolData, dataKVIO); + return result; + } + + *dataPtr = dataKVIO; + return VDO_SUCCESS; +} + +/** + * Dump out the waiters on each DataVIO in the DataVIO buffer pool. + * + * @param queue The queue to check (logical or physical) + * @param waitOn The label to print for queue (logical or physical) + **/ +static void dumpVIOWaiters(WaitQueue *queue, char *waitOn) +{ + Waiter *first = getFirstWaiter(queue); + if (first == NULL) { + return; + } + + DataVIO *dataVIO = waiterAsDataVIO(first); + logInfo(" %s is locked. Waited on by: VIO %" PRIptr " pbn %" PRIu64 + " lbn %llu d-pbn %llu lastOp %s", + waitOn, dataVIO, getDataVIOAllocation(dataVIO), + dataVIO->logical.lbn, dataVIO->duplicate.pbn, + getOperationName(dataVIO)); + + Waiter *waiter; + for (waiter = first->nextWaiter; + waiter != first; + waiter = waiter->nextWaiter) { + dataVIO = waiterAsDataVIO(waiter); + logInfo(" ... 
and : VIO %" PRIptr " pbn %llu lbn %" + PRIu64 " d-pbn %llu lastOp %s", + dataVIO, getDataVIOAllocation(dataVIO), dataVIO->logical.lbn, + dataVIO->duplicate.pbn, getOperationName(dataVIO)); + } +} + +/** + * Encode various attributes of a VIO as a string of one-character flags for + * dump logging. This encoding is for logging brevity: + * + * R => VIO completion result not VDO_SUCCESS + * W => VIO is on a wait queue + * D => VIO is a duplicate + * + *
The common case of no flags set will result in an empty, null-terminated + * buffer. If any flags are encoded, the first character in the string will be + * a space character. + * + * @param dataVIO The VIO to encode + * @param buffer The buffer to receive a null-terminated string of encoded + * flag character + **/ +static void encodeVIODumpFlags(DataVIO *dataVIO, char buffer[8]) +{ + char *pFlag = buffer; + *pFlag++ = ' '; + if (dataVIOAsCompletion(dataVIO)->result != VDO_SUCCESS) { + *pFlag++ = 'R'; + } + if (dataVIOAsAllocatingVIO(dataVIO)->waiter.nextWaiter != NULL) { + *pFlag++ = 'W'; + } + if (dataVIO->isDuplicate) { + *pFlag++ = 'D'; + } + if (pFlag == &buffer[1]) { + // No flags, so remove the blank space. + pFlag = buffer; + } + *pFlag = '\0'; +} + +/** + * Dump out info on a DataKVIO from the DataKVIO pool. + * + *
Implements BufferDumpFunction. + * + * @param poolData The pool data + * @param data The DataKVIO to dump + **/ +static void dumpPooledDataKVIO(void *poolData __attribute__((unused)), + void *data) +{ + DataKVIO *dataKVIO = (DataKVIO *) data; + DataVIO *dataVIO = &dataKVIO->dataVIO; + + /* + * This just needs to be big enough to hold a queue (thread) name + * and a function name (plus a separator character and NUL). The + * latter is limited only by taste. + * + * In making this static, we're assuming only one "dump" will run at + * a time. If more than one does run, the log output will be garbled + * anyway. + */ + static char vioWorkItemDumpBuffer[100 + MAX_QUEUE_NAME_LEN]; + /* + * We're likely to be logging a couple thousand of these lines, and + * in some circumstances syslogd may have trouble keeping up, so + * keep it BRIEF rather than user-friendly. + */ + dumpWorkItemToBuffer(&dataKVIO->kvio.enqueueable.workItem, + vioWorkItemDumpBuffer, sizeof(vioWorkItemDumpBuffer)); + // Another static buffer... + // log10(256) = 2.408+, round up: + enum { DECIMAL_DIGITS_PER_UINT64_T = (int) (1 + 2.41 * sizeof(uint64_t)) }; + static char vioBlockNumberDumpBuffer[sizeof("P L D") + + 3 * DECIMAL_DIGITS_PER_UINT64_T]; + if (dataVIO->isDuplicate) { + snprintf(vioBlockNumberDumpBuffer, sizeof(vioBlockNumberDumpBuffer), + "P%llu L%llu D%llu", + getDataVIOAllocation(dataVIO), dataVIO->logical.lbn, + dataVIO->duplicate.pbn); + } else if (hasAllocation(dataVIO)) { + snprintf(vioBlockNumberDumpBuffer, sizeof(vioBlockNumberDumpBuffer), + "P%llu L%llu", + getDataVIOAllocation(dataVIO), dataVIO->logical.lbn); + } else { + snprintf(vioBlockNumberDumpBuffer, sizeof(vioBlockNumberDumpBuffer), + "L%llu", + dataVIO->logical.lbn); + } + + static char vioFlushGenerationBuffer[sizeof(" FG") + + DECIMAL_DIGITS_PER_UINT64_T] = ""; + if (dataVIO->flushGeneration != 0) { + snprintf(vioFlushGenerationBuffer, sizeof(vioFlushGenerationBuffer), + " FG%llu", dataVIO->flushGeneration); + } + + // Encode VIO attributes as a string of one-character flags, usually empty. 
+ static char flagsDumpBuffer[8]; + encodeVIODumpFlags(dataVIO, flagsDumpBuffer); + + logInfo(" kvio %" PRIptr " %s%s %s %s%s", + dataKVIO, vioBlockNumberDumpBuffer, vioFlushGenerationBuffer, + getOperationName(dataVIO), vioWorkItemDumpBuffer, flagsDumpBuffer); + // might want info on: wantAlbireoAnswer / operation / status + // might want info on: bio / bioToSubmit / biosMerged + + dumpVIOWaiters(&dataVIO->logical.waiters, "lbn"); + + // might want to dump more info from VIO here +} + +/**********************************************************************/ +int makeDataKVIOBufferPool(KernelLayer *layer, + uint32_t poolSize, + BufferPool **bufferPoolPtr) +{ + return makeBufferPool("DataKVIO Pool", poolSize, + makePooledDataKVIO, freePooledDataKVIO, + dumpPooledDataKVIO, layer, bufferPoolPtr); +} + +/**********************************************************************/ +DataLocation getDedupeAdvice(const DedupeContext *context) +{ + DataKVIO *dataKVIO = container_of(context, DataKVIO, dedupeContext); + return (DataLocation) { + .state = dataKVIO->dataVIO.newMapped.state, + .pbn = dataKVIO->dataVIO.newMapped.pbn, + }; +} + +/**********************************************************************/ +void setDedupeAdvice(DedupeContext *context, const DataLocation *advice) +{ + DataKVIO *dataKVIO = container_of(context, DataKVIO, dedupeContext); + receiveDedupeAdvice(&dataKVIO->dataVIO, advice); +} diff --git a/source/vdo/kernel/dataKVIO.h b/source/vdo/kernel/dataKVIO.h new file mode 100644 index 0000000..c3989f4 --- /dev/null +++ b/source/vdo/kernel/dataKVIO.h @@ -0,0 +1,468 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dataKVIO.h#5 $ + */ + +#ifndef DATA_KVIO_H +#define DATA_KVIO_H + +#include "dataVIO.h" +#include "kvio.h" +#include "uds-block.h" + +typedef struct { + /* + * The BIO which was received from the device mapper to initiate an I/O + * request. This field will be non-NULL only until the request is + * acknowledged. + */ + BIO *bio; + // Cached copies of fields from the bio which will need to be reset after + // we're done. + void *private; + void *endIO; + // This is a copy of the bi_rw field of the BIO which sadly is not just + // a boolean read-write flag, but also includes other flag bits. + unsigned long rw; +} ExternalIORequest; + +/* Dedupe support */ +struct dedupeContext { + UdsRequest udsRequest; + struct list_head pendingList; + Jiffies submissionTime; + Atomic32 requestState; + int status; + bool isPending; + /** Hash of the associated VIO (NULL if not calculated) */ + const UdsChunkName *chunkName; +}; + +typedef struct { + /** + * A pointer to a block that holds the data from the last read operation. 
+ **/ + char *data; + /** + * Temporary storage for doing reads from the underlying device. + **/ + char *buffer; + /** + * A bio structure wrapping the buffer. + **/ + BIO *bio; + /** + * Callback to invoke after completing the read I/O operation. + **/ + DataKVIOCallback callback; + /** + * Mapping state passed to kvdoReadBlock(), used to determine whether + * the data must be uncompressed. + **/ + BlockMappingState mappingState; + /** + * The result code of the read attempt. + **/ + int status; +} ReadBlock; + +struct dataKVIO { + /* The embedded base code's DataVIO */ + DataVIO dataVIO; + /* The embedded KVIO */ + KVIO kvio; + /* The BIO from the request which is being serviced by this KVIO. */ + ExternalIORequest externalIORequest; + /* Dedupe */ + DedupeContext dedupeContext; + /* Read cache */ + ReadBlock readBlock; + /* partial block support */ + BlockSize offset; + bool isPartial; + /* discard support */ + bool hasDiscardPermit; + DiscardSize remainingDiscard; + /** + * A copy of user data written, so we can do additional processing + * (dedupe, compression) after acknowledging the I/O operation and + * thus losing access to the original data. + * + * Also used as buffer space for read-modify-write cycles when + * emulating smaller-than-blockSize I/O operations. + **/ + char *dataBlock; + /** A bio structure describing the #dataBlock buffer. */ + BIO *dataBlockBio; + /** A block used as output during compression or uncompression. */ + char *scratchBlock; +}; + +/** + * Convert a KVIO to a DataKVIO. + * + * @param kvio The KVIO to convert + * + * @return The KVIO as a DataKVIO + **/ +static inline DataKVIO *kvioAsDataKVIO(KVIO *kvio) +{ + ASSERT_LOG_ONLY(isData(kvio), "KVIO is a DataKVIO"); + return container_of(kvio, DataKVIO, kvio); +} + +/** + * Convert a DataKVIO to a KVIO. + * + * @param dataKVIO The DataKVIO to convert + * + * @return The DataKVIO as a KVIO + **/ +static inline KVIO *dataKVIOAsKVIO(DataKVIO *dataKVIO) +{ + return &dataKVIO->kvio; +} + +/** + * Returns a pointer to the DataKVIO wrapping a DataVIO. + * + * @param dataVIO the DataVIO + * + * @return the DataKVIO + **/ +static inline DataKVIO *dataVIOAsDataKVIO(DataVIO *dataVIO) +{ + return container_of(dataVIO, DataKVIO, dataVIO); +} + +/** + * Returns a pointer to the KVIO associated with a DataVIO. + * + * @param dataVIO the DataVIO + * + * @return the KVIO + **/ +static inline KVIO *dataVIOAsKVIO(DataVIO *dataVIO) +{ + return dataKVIOAsKVIO(dataVIOAsDataKVIO(dataVIO)); +} + +/** + * Returns a pointer to the DataKVIO wrapping a work item. + * + * @param item the work item + * + * @return the DataKVIO + **/ +static inline DataKVIO *workItemAsDataKVIO(KvdoWorkItem *item) +{ + return kvioAsDataKVIO(workItemAsKVIO(item)); +} + +/** + * Get the WorkItem from a DataKVIO. + * + * @param dataKVIO The DataKVIO + * + * @return the DataKVIO's work item + **/ +static inline KvdoWorkItem *workItemFromDataKVIO(DataKVIO *dataKVIO) +{ + return &dataKVIOAsKVIO(dataKVIO)->enqueueable.workItem; +} + +/** + * Get the BIO from a DataKVIO. + * + * @param dataKVIO The DataKVIO from which to get the BIO + * + * @return The DataKVIO's BIO + **/ +static inline BIO *getBIOFromDataKVIO(DataKVIO *dataKVIO) +{ + return dataKVIOAsKVIO(dataKVIO)->bio; +} + +/** + * Get the KernelLayer from a DataKVIO. 
+ * + * @param dataKVIO The DataKVIO from which to get the KernelLayer + * + * @return The DataKVIO's KernelLayer + **/ +static inline KernelLayer *getLayerFromDataKVIO(DataKVIO *dataKVIO) +{ + return dataKVIOAsKVIO(dataKVIO)->layer; +} + +/** + * Set up and enqueue a DataKVIO's work item to be processed in the base code + * context. + * + * @param dataKVIO The DataKVIO with the work item to be run + * @param work The function pointer to execute + * @param statsFunction A function pointer to record for stats, or NULL + * @param action Action code, mapping to a relative priority + **/ +static inline void enqueueDataKVIO(DataKVIO *dataKVIO, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action) +{ + enqueueKVIO(dataKVIOAsKVIO(dataKVIO), work, statsFunction, action); +} + +/** + * Enqueue a DataKVIO on a work queue. + * + * @param queue The queue + * @param dataKVIO The DataKVIO + **/ +static inline void enqueueDataKVIOWork(KvdoWorkQueue *queue, + DataKVIO *dataKVIO) +{ + enqueueKVIOWork(queue, dataKVIOAsKVIO(dataKVIO)); +} + +/** + * Add a trace record for the current source location. + * + * @param dataKVIO The DataKVIO structure to be updated + * @param location The source-location descriptor to be recorded + **/ +static inline void dataKVIOAddTraceRecord(DataKVIO *dataKVIO, + TraceLocation location) +{ + dataVIOAddTraceRecord(&dataKVIO->dataVIO, location); +} + +/** + * Set up and enqueue a DataKVIO on the CPU queue. + * + * @param dataKVIO The DataKVIO to set up + * @param work The function pointer to execute + * @param statsFunction A function pointer to record for stats, or NULL + * @param action Action code, mapping to a relative priority + **/ +static inline void launchDataKVIOOnCPUQueue(DataKVIO *dataKVIO, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action) +{ + KVIO *kvio = dataKVIOAsKVIO(dataKVIO); + launchKVIO(kvio, work, statsFunction, action, kvio->layer->cpuQueue); +} + +/** + * Set up and enqueue a DataKVIO on the BIO Ack queue. + * + * @param dataKVIO The DataKVIO to set up + * @param work The function pointer to execute + * @param statsFunction A function pointer to record for stats, or NULL + * @param action Action code, mapping to a relative priority + **/ +static inline void launchDataKVIOOnBIOAckQueue(DataKVIO *dataKVIO, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action) +{ + KVIO *kvio = dataKVIOAsKVIO(dataKVIO); + launchKVIO(kvio, work, statsFunction, action, kvio->layer->bioAckQueue); +} + +/** + * Move a DataKVIO back to the base threads. + * + * @param dataKVIO The DataKVIO to enqueue + **/ +static inline void kvdoEnqueueDataVIOCallback(DataKVIO *dataKVIO) +{ + kvdoEnqueueVIOCallback(dataKVIOAsKVIO(dataKVIO)); +} + +/** + * Check whether the external request bio had FUA set. + * + * @param dataKVIO The DataKVIO to check + * + * @return true if the external request bio had FUA set + **/ +static inline bool requestorSetFUA(DataKVIO *dataKVIO) +{ + return ((dataKVIO->externalIORequest.rw & REQ_FUA) == REQ_FUA); +} + +/** + * Associate a KVIO with a BIO passed in from the block layer, and start + * processing the KVIO. + * + * If setting up a KVIO fails, a message is logged, and the limiter permits + * (request and maybe discard) released, but the caller is responsible for + * disposing of the bio. 
+ * + * @param layer The physical layer + * @param bio The bio for which to create KVIO + * @param arrivalTime The time (in jiffies) when the external request + * entered the device mapbio function + * @param hasDiscardPermit Whether we got a permit from the discardLimiter + * of the kernel layer + * + * @return VDO_SUCCESS or a system error code + **/ +int kvdoLaunchDataKVIOFromBio(KernelLayer *layer, + BIO *bio, + Jiffies arrivalTime, + bool hasDiscardPermit) + __attribute__((warn_unused_result)); + +/** + * Return a batch of DataKVIOs to the pool. + * + *
Implements BatchProcessorCallback. + * + * @param batch The batch processor + * @param closure The kernal layer + **/ +void returnDataKVIOBatchToPool(BatchProcessor *batch, void *closure); + +/** + * Implements DataVIOZeroer. + * + * @param dataVIO The DataVIO to zero + **/ +void kvdoZeroDataVIO(DataVIO *dataVIO); + +/** + * Implements DataCopier. + * + * @param source The DataVIO to copy from + * @param destination The DataVIO to copy to + **/ +void kvdoCopyDataVIO(DataVIO *source, DataVIO *destination); + +/** + * Fetch the data for a block from storage. The fetched data will be + * uncompressed when the callback is called, and the result of the read + * operation will be stored in the ReadBlock's status field. On success, + * the data will be in the ReadBlock's data pointer. + * + * @param dataVIO The DataVIO to read a block in for + * @param location The physical block number to read from + * @param mappingState The mapping state of the block to read + * @param action The bio queue action + * @param callback The function to call when the read is done + **/ +void kvdoReadBlock(DataVIO *dataVIO, + PhysicalBlockNumber location, + BlockMappingState mappingState, + BioQAction action, + DataKVIOCallback callback); + +/** + * Implements DataReader. + * + * @param dataVIO The DataVIO to read + **/ +void kvdoReadDataVIO(DataVIO *dataVIO); + +/** + * Implements DataWriter. + * + * @param dataVIO The DataVIO to write + **/ +void kvdoWriteDataVIO(DataVIO *dataVIO); + +/** + * Implements DataModifier. + * + * @param dataVIO The DataVIO to modify + **/ +void kvdoModifyWriteDataVIO(DataVIO *dataVIO); + +/** + * Implements DataHasher. + * + * @param dataVIO The DataVIO to hash + **/ +void kvdoHashDataVIO(DataVIO *dataVIO); + +/** + * Implements DuplicationChecker. + * + * @param dataVIO The DataVIO containing the block to check + **/ +void kvdoCheckForDuplication(DataVIO *dataVIO); + +/** + * Implements DataAcknowledger. + * + * @param dataVIO The DataVIO to acknowledge + **/ +void kvdoAcknowledgeDataVIO(DataVIO *dataVIO); + +/** + * Implements DataCompressor. + * + * @param dataVIO The DataVIO to compress + **/ +void kvdoCompressDataVIO(DataVIO *dataVIO); + +/** + * Implements AlbireoUpdater. + * + * @param dataVIO The DataVIO which needs to change the entry for its data + **/ +void kvdoUpdateDedupeAdvice(DataVIO *dataVIO); + +/** + * Allocate a buffer pool of DataKVIOs. + * + * @param [in] layer The layer in which the DataKVIOs will operate + * @param [in] poolSize The number of DataKVIOs in the pool + * @param [out] bufferPoolPtr A pointer to hold the new buffer pool + * + * @return VDO_SUCCESS or an error + **/ +int makeDataKVIOBufferPool(KernelLayer *layer, + uint32_t poolSize, + BufferPool **bufferPoolPtr) + __attribute__((warn_unused_result)); + +/** + * Get the state needed to generate UDS metadata from the DataKVIO + * associated with a DedupeContext. + * + * @param context The DedupeContext + * + * @return the advice to store in the UDS index + **/ +DataLocation getDedupeAdvice(const DedupeContext *context) + __attribute__((warn_unused_result)); + +/** + * Set the result of a dedupe query for the DataKVIO associated with a + * DedupeContext. 
+ * + * @param context The context receiving advice + * @param advice A data location at which the chunk named in the context + * might be stored (will be NULL if no advice was found) + **/ +void setDedupeAdvice(DedupeContext *context, const DataLocation *advice); + +#endif /* DATA_KVIO_H */ diff --git a/source/vdo/kernel/deadlockQueue.c b/source/vdo/kernel/deadlockQueue.c new file mode 100644 index 0000000..2350b35 --- /dev/null +++ b/source/vdo/kernel/deadlockQueue.c @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/deadlockQueue.c#1 $ + */ + +#include "deadlockQueue.h" + +/**********************************************************************/ +void initializeDeadlockQueue(DeadlockQueue *queue) +{ + spin_lock_init(&queue->lock); + bio_list_init(&queue->list); +} + +/**********************************************************************/ +void addToDeadlockQueue(DeadlockQueue *queue, BIO *bio, Jiffies arrivalTime) +{ + spin_lock(&queue->lock); + if (bio_list_empty(&queue->list)) { + /* + * If we get more than one pending at once, this will be inaccurate for + * some of them. Oh well. If we've gotten here, we're trying to avoid a + * deadlock; stats are a secondary concern. + */ + queue->arrivalTime = arrivalTime; + } + bio_list_add(&queue->list, bio); + spin_unlock(&queue->lock); +} diff --git a/source/vdo/kernel/deadlockQueue.h b/source/vdo/kernel/deadlockQueue.h new file mode 100644 index 0000000..85e0b46 --- /dev/null +++ b/source/vdo/kernel/deadlockQueue.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/deadlockQueue.h#1 $ + */ + +#ifndef DEADLOCK_QUEUE_H +#define DEADLOCK_QUEUE_H + +#include + +#include "bio.h" + +/** + * A holding space for incoming bios if we're not able to block until VIOs + * become available to process them. + **/ +typedef struct deadlockQueue { + /* Protection for the other fields. */ + spinlock_t lock; + /* List of bios we had to accept but don't have VIOs for. 
*/ + struct bio_list list; + /* + * Arrival time to use for statistics tracking for the above bios, since we + * haven't the space to store individual arrival times for each. + */ + Jiffies arrivalTime; +} DeadlockQueue; + +/** + * Initialize the DeadlockQueue structure. + * + * @param queue The structure to initialize + **/ +void initializeDeadlockQueue(DeadlockQueue *queue); + +/** + * Add an incoming bio to the list of saved-up bios we're not ready to start + * processing yet. + * + * This excess buffering on top of what the caller implements is generally a + * bad idea, and should be used only when necessary, such as to avoid a + * possible deadlock situation. + * + * @param queue The incoming-bio queue structure + * @param bio The new incoming bio to save + * @param arrivalTime The arrival time of this new bio + **/ +void addToDeadlockQueue(DeadlockQueue *queue, BIO *bio, Jiffies arrivalTime); + +/** + * Pull an incoming bio off the queue. + * + * The arrival time returned may be incorrect if multiple bios were saved, as + * there is no per-bio storage used, only one saved arrival time for the whole + * queue. + * + * @param [in] queue The incoming-bio queue + * @param [out] arrivalTime The arrival time to use for this bio + * + * @return a BIO pointer, or NULL if none were queued + **/ +static inline BIO *pollDeadlockQueue(DeadlockQueue *queue, + Jiffies *arrivalTime) +{ + spin_lock(&queue->lock); + BIO *bio = bio_list_pop(&queue->list); + if (unlikely(bio != NULL)) { + *arrivalTime = queue->arrivalTime; + } + spin_unlock(&queue->lock); + return bio; +} + +#endif // DEADLOCK_QUEUE_H diff --git a/source/vdo/kernel/dedupeIndex.c b/source/vdo/kernel/dedupeIndex.c new file mode 100644 index 0000000..811cd93 --- /dev/null +++ b/source/vdo/kernel/dedupeIndex.c @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dedupeIndex.c#1 $ + */ + +#include "dedupeIndex.h" + +#include "numeric.h" + +#include "udsIndex.h" + +// These times are in milliseconds +unsigned int albireoTimeoutInterval = 5000; +unsigned int minAlbireoTimerInterval = 100; + +// These times are in jiffies +Jiffies albireoTimeoutJiffies = 0; +static Jiffies minAlbireoTimerJiffies = 0; + +/**********************************************************************/ +Jiffies getAlbireoTimeout(Jiffies startJiffies) +{ + return maxULong(startJiffies + albireoTimeoutJiffies, + jiffies + minAlbireoTimerJiffies); +} + +/**********************************************************************/ +void setAlbireoTimeoutInterval(unsigned int value) +{ + // Arbitrary maximum value is two minutes + if (value > 120000) { + value = 120000; + } + // Arbitrary minimum value is 2 jiffies + Jiffies albJiffies = msecs_to_jiffies(value); + if (albJiffies < 2) { + albJiffies = 2; + value = jiffies_to_msecs(albJiffies); + } + albireoTimeoutInterval = value; + albireoTimeoutJiffies = albJiffies; +} + +/**********************************************************************/ +void setMinAlbireoTimerInterval(unsigned int value) +{ + // Arbitrary maximum value is one second + if (value > 1000) { + value = 1000; + } + + // Arbitrary minimum value is 2 jiffies + Jiffies minJiffies = msecs_to_jiffies(value); + if (minJiffies < 2) { + minJiffies = 2; + value = jiffies_to_msecs(minJiffies); + } + + minAlbireoTimerInterval = value; + minAlbireoTimerJiffies = minJiffies; +} + +/**********************************************************************/ +int makeDedupeIndex(DedupeIndex **indexPtr, KernelLayer *layer) +{ + if (albireoTimeoutJiffies == 0) { + setAlbireoTimeoutInterval(albireoTimeoutInterval); + } + + if (minAlbireoTimerJiffies == 0) { + setMinAlbireoTimerInterval(minAlbireoTimerInterval); + } + + return makeUDSIndex(layer, indexPtr); +} diff --git a/source/vdo/kernel/dedupeIndex.h b/source/vdo/kernel/dedupeIndex.h new file mode 100644 index 0000000..31d7631 --- /dev/null +++ b/source/vdo/kernel/dedupeIndex.h @@ -0,0 +1,372 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dedupeIndex.h#5 $ + */ + +#ifndef DEDUPE_INDEX_H +#define DEDUPE_INDEX_H + +#include "dataKVIO.h" + +struct dedupeIndex { + + /** + * Do the dedupe section of dmsetup message vdo0 0 dump ... + * + * @param index The dedupe index + * @param showQueue true to dump a dedupe work queue + **/ + void (*dump)(DedupeIndex *index, bool showQueue); + + /** + * Free a dedupe index. The "finish" method must have been called + * first. 
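+   * A typical teardown sequence implied by this contract is
+   * finishDedupeIndex(index) followed by freeDedupeIndex(&index); see the
+   * inline wrapper functions later in this header.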
+ * + * @param index The dedupe index + **/ + void (*free)(DedupeIndex *index); + + /** + * Get the name of the deduplication state + * + * @param index The dedupe index + * + * @return the dedupe state name + **/ + const char *(*getDedupeStateName)(DedupeIndex *index); + + /** + * Get the index statistics + * + * @param index The dedupe index + * @param stats The index statistics + **/ + void (*getStatistics)(DedupeIndex *index, IndexStatistics *stats); + + /** + * Process a dmsetup message directed to the index. + * + * @param index The dedupe index + * @param name The message name + * + * @return 0 or an error code + **/ + int (*message)(DedupeIndex *index, const char *name); + + /** + * Look up the chunkname of the DataKVIO. If found, return the PBN + * previously associated with the name. If not found, associate the + * new PBN with the name. + * + * @param dataKVIO The DataKVIO + **/ + void (*post)(DataKVIO *dataKVIO); + + /** + * Look up the chunkname of the DataKVIO. If found, return the PBN + * previously associated with the name. If not found, do nothing. + * + * @param dataKVIO The DataKVIO + **/ + void (*query)(DataKVIO *dataKVIO); + + /** + * Start the dedupe index. + * + * @param index The dedupe index + * @param createFlag If true, create a new index without first attempting + * to load an existing index + **/ + void (*start)(DedupeIndex *index, bool createFlag); + + /** + * Stop the dedupe index. May be called by any thread, but will wait for + * the shutdown to be completed. + * + * @param index The dedupe index + **/ + void (*stop)(DedupeIndex *index); + + /** + * Suspend the dedupe index. If there are any outstanding index + * requests, wait for them to finish. If the index is doing any + * asynchronous writing, wait for the I/O to complete. If the index + * is not open yet and we are doing a rebuild of the master index, + * pause the rebuild so that it can be resumed later. May be called + * from any thread. + * + * @param index The dedupe index + * @param saveFlag True if we should save the index + **/ + void (*suspend)(DedupeIndex *index, bool saveFlag); + + /** + * Resume a suspended dedupe index. May be called from any thread. + * + * @param index The dedupe index + **/ + void (*resume)(DedupeIndex *index); + + /** + * Finish the dedupe index; shuts it down for good and prepares to + * free resources. After this point, no more requests may be sent to + * it. + * + * @param index The dedupe index + **/ + void (*finish)(DedupeIndex *index); + + /** + * Look up the chunkname of the DataKVIO and associate the new PBN with the + * name. + * + * @param dataKVIO The DataKVIO + **/ + void (*update)(DataKVIO *dataKVIO); +}; + +/** + * Make a dedupe index + * + * @param indexPtr dedupe index returned here + * @param layer the kernel layer + * + * @return VDO_SUCCESS or an error code + **/ +int makeDedupeIndex(DedupeIndex **indexPtr, KernelLayer *layer) + __attribute__((warn_unused_result)); + + +/** + * Do the dedupe section of dmsetup message vdo0 0 dump ... 
+ * + * @param index The dedupe index + * @param showQueue true to dump a dedupe work queue + **/ +static inline void dumpDedupeIndex(DedupeIndex *index, bool showQueue) +{ + index->dump(index, showQueue); +} + +/** + * Free the dedupe index + * + * @param index The dedupe index + **/ +static inline void freeDedupeIndex(DedupeIndex **index) +{ + if (*index != NULL) { + (*index)->free(*index); + *index = NULL; + } +} + +/** + * Get the name of the deduplication state + * + * @param index The dedupe index + * + * @return the dedupe state name + **/ +static inline const char *getDedupeStateName(DedupeIndex *index) +{ + return index->getDedupeStateName(index); +} + +/** + * Get the index statistics + * + * @param index The dedupe index + * @param stats The index statistics + **/ +static inline void getIndexStatistics(DedupeIndex *index, + IndexStatistics *stats) +{ + return index->getStatistics(index, stats); +} + +/** + * Return from a dedupe operation by invoking the callback function + * + * @param dataKVIO The DataKVIO + **/ +static inline void invokeDedupeCallback(DataKVIO *dataKVIO) +{ + + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION("$F($dup);cb=dedupe($dup)")); + kvdoEnqueueDataVIOCallback(dataKVIO); +} + +/** + * Process a dmsetup message directed to the index. + * + * @param index The dedupe index + * @param name The message name + * + * @return 0 or an error code + **/ +static inline int messageDedupeIndex(DedupeIndex *index, const char *name) +{ + return index->message(index, name); +} + +/** + * Look up the chunkname of the DataKVIO and identify duplicated chunks. + * + * @param dataKVIO The DataKVIO. These fields are used: + * dedupeContext.chunkName is the chunk name. + * The advice to offer to the index will be obtained + * via getDedupeAdvice(). The advice found in the index + * (or NULL if none) will be returned via setDedupeAdvice(). + * dedupeContext.status is set to the return status code of + * any asynchronous index processing. + **/ +static inline void postDedupeAdvice(DataKVIO *dataKVIO) +{ + KernelLayer *layer = dataKVIOAsKVIO(dataKVIO)->layer; + layer->dedupeIndex->post(dataKVIO); +} + +/** + * Look up the chunkname of the DataKVIO and identify duplicated chunks. + * + * @param dataKVIO The DataKVIO. These fields are used: + * dedupeContext.chunkName is the chunk name. + * The advice found in the index (or NULL if none) will + * be returned via setDedupeAdvice(). + * dedupeContext.status is set to the return status code of + * any asynchronous index processing. + **/ +static inline void queryDedupeAdvice(DataKVIO *dataKVIO) +{ + KernelLayer *layer = dataKVIOAsKVIO(dataKVIO)->layer; + layer->dedupeIndex->query(dataKVIO); +} + +/** + * Start the dedupe index. + * + * @param index The dedupe index + * @param createFlag If true, create a new index without first attempting + * to load an existing index + **/ +static inline void startDedupeIndex(DedupeIndex *index, bool createFlag) +{ + index->start(index, createFlag); +} + +/** + * Stop the dedupe index. May be called by any thread, but will wait for + * the shutdown to be completed. + * + * @param index The dedupe index + **/ +static inline void stopDedupeIndex(DedupeIndex *index) +{ + return index->stop(index); +} + +/** + * Suspend the dedupe index. If there are any outstanding index + * requests, wait for them to finish. If the index is doing any + * asynchronous writing, wait for the I/O to complete. 
If the index is + * not open yet and we are doing a rebuild of the master index, pause + * the rebuild so that it can be resumed later. May be called from any + * thread. + * + * @param index The dedupe index + * @param saveFlag True if we should save the index + **/ +static inline void suspendDedupeIndex(DedupeIndex *index, bool saveFlag) +{ + index->suspend(index, saveFlag); +} + +/** + * Resume a suspended dedupe index. May be called from any thread. + * + * @param index The dedupe index + **/ +static inline void resumeDedupeIndex(DedupeIndex *index) +{ + index->resume(index); +} + +/** + * Finish the dedupe index. + * + * @param index The dedupe index + **/ +static inline void finishDedupeIndex(DedupeIndex *index) +{ + return index->finish(index); +} + +/** + * Look up the chunkname of the DataKVIO and associate the new PBN with the + * name. + * + * @param dataKVIO The DataKVIO. These fields are used: + * dedupeContext.chunkName is the chunk name. + * The advice to offer to the index will be obtained + * via getDedupeAdvice(). dedupeContext.status is set to the + * return status code of any asynchronous index processing. + **/ +static inline void updateDedupeAdvice(DataKVIO *dataKVIO) +{ + KernelLayer *layer = dataKVIOAsKVIO(dataKVIO)->layer; + layer->dedupeIndex->update(dataKVIO); +} + +// Interval (in milliseconds or jiffies) from submission until switching to +// fast path and skipping Albireo. +extern unsigned int albireoTimeoutInterval; +extern Jiffies albireoTimeoutJiffies; + +// Minimum time interval (in milliseconds) between timer invocations to +// check for requests waiting for Albireo that should now time out. +extern unsigned int minAlbireoTimerInterval; + +/** + * Calculate the actual end of a timer, taking into account the absolute + * start time and the present time. + * + * @param startJiffies The absolute start time, in jiffies + * + * @return the absolute end time for the timer, in jiffies + **/ +Jiffies getAlbireoTimeout(Jiffies startJiffies); + +/** + * Set the interval from submission until switching to fast path and + * skipping Albireo. + * + * @param value The number of milliseconds + **/ +void setAlbireoTimeoutInterval(unsigned int value); + +/** + * Set the minimum time interval between timer invocations to check for + * requests waiting for Albireo that should now time out. + * + * @param value The number of milliseconds + **/ +void setMinAlbireoTimerInterval(unsigned int value); + +#endif /* DEDUPE_INDEX_H */ diff --git a/source/vdo/kernel/deviceConfig.c b/source/vdo/kernel/deviceConfig.c new file mode 100644 index 0000000..08e864c --- /dev/null +++ b/source/vdo/kernel/deviceConfig.c @@ -0,0 +1,769 @@ +/** + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/deviceConfig.c#14 $ + */ + +#include "deviceConfig.h" + +#include + +#include "logger.h" +#include "memoryAlloc.h" +#include "stringUtils.h" + +#include "kernelLayer.h" +#include "vdoStringUtils.h" + +#include "constants.h" + +enum { + // If we bump this, update the arrays below + TABLE_VERSION = 2, + // Limits used when parsing thread-count config spec strings + BIO_ROTATION_INTERVAL_LIMIT = 1024, + LOGICAL_THREAD_COUNT_LIMIT = 60, + PHYSICAL_THREAD_COUNT_LIMIT = 16, + THREAD_COUNT_LIMIT = 100, + // XXX The bio-submission queue configuration defaults are temporarily + // still being defined here until the new runtime-based thread + // configuration has been fully implemented for managed VDO devices. + + // How many bio submission work queues to use + DEFAULT_NUM_BIO_SUBMIT_QUEUES = 4, + // How often to rotate between bio submission work queues + DEFAULT_BIO_SUBMIT_QUEUE_ROTATE_INTERVAL = 64, +}; + +// arrays for handling different table versions +static const uint8_t REQUIRED_ARGC[] = {10, 12, 9}; +static const uint8_t POOL_NAME_ARG_INDEX[] = {8, 10, 8}; + +/** + * Decide the version number from argv. + * + * @param [in] argc The number of table values + * @param [in] argv The array of table values + * @param [out] errorPtr A pointer to return a error string in + * @param [out] versionPtr A pointer to return the version + * + * @return VDO_SUCCESS or an error code + **/ +static int getVersionNumber(int argc, + char **argv, + char **errorPtr, + TableVersion *versionPtr) +{ + // version, if it exists, is in a form of V + if (sscanf(argv[0], "V%u", versionPtr) == 1) { + if (*versionPtr < 1 || *versionPtr > TABLE_VERSION) { + *errorPtr = "Unknown version number detected"; + return VDO_BAD_CONFIGURATION; + } + } else { + // V0 actually has no version number in the table string + *versionPtr = 0; + } + + // V0 and V1 have no optional parameters. There will always be + // a parameter for thread config, even if its a "." to show + // its an empty list. + if (*versionPtr <= 1) { + if (argc != REQUIRED_ARGC[*versionPtr]) { + *errorPtr = "Incorrect number of arguments for version"; + return VDO_BAD_CONFIGURATION; + } + } else if (argc < REQUIRED_ARGC[*versionPtr]) { + *errorPtr = "Incorrect number of arguments for version"; + return VDO_BAD_CONFIGURATION; + } + + if (*versionPtr != TABLE_VERSION) { + logWarning("Detected version mismatch between kernel module and tools " + " kernel: %d, tool: %d", TABLE_VERSION, *versionPtr); + logWarning("Please consider upgrading management tools to match kernel."); + } + return VDO_SUCCESS; +} + +/**********************************************************************/ +int getPoolNameFromArgv(int argc, + char **argv, + char **errorPtr, + char **poolNamePtr) +{ + TableVersion version; + int result = getVersionNumber(argc, argv, errorPtr, &version); + if (result != VDO_SUCCESS) { + return result; + } + *poolNamePtr = argv[POOL_NAME_ARG_INDEX[version]]; + return VDO_SUCCESS; +} + +/** + * Resolve the config with write policy, physical size, and other unspecified + * fields based on the device, if needed. 
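+ * For example, when the table specifies the "auto" write policy, a device
+ * whose request queue advertises flush support resolves to async, while one
+ * without flush support resolves to sync, as implemented below.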
+ * + * @param [in,out] config The config possibly missing values + * @param [in] verbose Whether to log about the underlying device + **/ +static void resolveConfigWithDevice(DeviceConfig *config, + bool verbose) +{ + struct dm_dev *dev = config->ownedDevice; + struct request_queue *requestQueue = bdev_get_queue(dev->bdev); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,7,0) + bool flushSupported + = ((requestQueue->queue_flags & (1ULL << QUEUE_FLAG_WC)) != 0); + bool fuaSupported + = ((requestQueue->queue_flags & (1ULL << QUEUE_FLAG_FUA)) != 0); +#else + bool flushSupported = ((requestQueue->flush_flags & REQ_FLUSH) == REQ_FLUSH); + bool fuaSupported = ((requestQueue->flush_flags & REQ_FUA) == REQ_FUA); +#endif + if (verbose) { + logInfo("underlying device, REQ_FLUSH: %s, REQ_FUA: %s", + (flushSupported ? "supported" : "not supported"), + (fuaSupported ? "supported" : "not supported")); + } else { + // We should probably always log, but need to make sure that makes sense + // before changing behavior. + } + + if (config->writePolicy == WRITE_POLICY_AUTO) { + config->writePolicy + = (flushSupported ? WRITE_POLICY_ASYNC : WRITE_POLICY_SYNC); + logInfo("Using write policy %s automatically.", + getConfigWritePolicyString(config)); + } else { + logInfo("Using write policy %s.", getConfigWritePolicyString(config)); + } + + if (flushSupported && (config->writePolicy == WRITE_POLICY_SYNC)) { + logWarning("WARNING: Running in sync mode atop a device supporting flushes" + " is dangerous!"); + } + + if (config->version == 0) { + uint64_t deviceSize = i_size_read(dev->bdev->bd_inode); + config->physicalBlocks = deviceSize / VDO_BLOCK_SIZE; + } +} + +/** + * Parse a two-valued option into a bool. + * + * @param [in] boolStr The string value to convert to a bool + * @param [in] trueStr The string value which should be converted to true + * @param [in] falseStr The string value which should be converted to false + * @param [out] boolPtr A pointer to return the bool value in + * + * @return VDO_SUCCESS or an error if boolStr is neither trueStr nor falseStr + **/ +__attribute__((warn_unused_result)) +static inline int parseBool(const char *boolStr, + const char *trueStr, + const char *falseStr, + bool *boolPtr) +{ + bool value = false; + if (strcmp(boolStr, trueStr) == 0) { + value = true; + } else if (strcmp(boolStr, falseStr) == 0) { + value = false; + } else { + return VDO_BAD_CONFIGURATION; + } + + *boolPtr = value; + return VDO_SUCCESS; +} + +/** + * Process one component of a thread parameter configuration string and + * update the configuration data structure. + * + * If the thread count requested is invalid, a message is logged and + * -EINVAL returned. If the thread name is unknown, a message is logged + * but no error is returned. 
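+ * For example (illustrative values), a "logical" count of 4 stores 4 in
+ * config->logicalZones, a "bioRotationInterval" count of 0 is rejected with
+ * -EINVAL, and an unrecognized name such as "widgets" is only logged and
+ * then treated as a successful no-op.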
+ * + * @param threadParamType The type of thread specified + * @param count The thread count requested + * @param config The configuration data structure to update + * + * @return VDO_SUCCESS or -EINVAL + **/ +static int processOneThreadConfigSpec(const char *threadParamType, + unsigned int count, + ThreadCountConfig *config) +{ + // Handle limited thread parameters + if (strcmp(threadParamType, "bioRotationInterval") == 0) { + if (count == 0) { + logError("thread config string error:" + " 'bioRotationInterval' of at least 1 is required"); + return -EINVAL; + } else if (count > BIO_ROTATION_INTERVAL_LIMIT) { + logError("thread config string error:" + " 'bioRotationInterval' cannot be higher than %d", + BIO_ROTATION_INTERVAL_LIMIT); + return -EINVAL; + } + config->bioRotationInterval = count; + return VDO_SUCCESS; + } else if (strcmp(threadParamType, "logical") == 0) { + if (count > LOGICAL_THREAD_COUNT_LIMIT) { + logError("thread config string error: at most %d 'logical' threads" + " are allowed", + LOGICAL_THREAD_COUNT_LIMIT); + return -EINVAL; + } + config->logicalZones = count; + return VDO_SUCCESS; + } else if (strcmp(threadParamType, "physical") == 0) { + if (count > PHYSICAL_THREAD_COUNT_LIMIT) { + logError("thread config string error: at most %d 'physical' threads" + " are allowed", + PHYSICAL_THREAD_COUNT_LIMIT); + return -EINVAL; + } + config->physicalZones = count; + return VDO_SUCCESS; + } else { + // Handle other thread count parameters + if (count > THREAD_COUNT_LIMIT) { + logError("thread config string error: at most %d '%s' threads" + " are allowed", + THREAD_COUNT_LIMIT, threadParamType); + return -EINVAL; + } + + if (strcmp(threadParamType, "hash") == 0) { + config->hashZones = count; + return VDO_SUCCESS; + } else if (strcmp(threadParamType, "cpu") == 0) { + if (count == 0) { + logError("thread config string error:" + " at least one 'cpu' thread required"); + return -EINVAL; + } + config->cpuThreads = count; + return VDO_SUCCESS; + } else if (strcmp(threadParamType, "ack") == 0) { + config->bioAckThreads = count; + return VDO_SUCCESS; + } else if (strcmp(threadParamType, "bio") == 0) { + if (count == 0) { + logError("thread config string error:" + " at least one 'bio' thread required"); + return -EINVAL; + } + config->bioThreads = count; + return VDO_SUCCESS; + } + } + + // Don't fail, just log. This will handle version mismatches between + // user mode tools and kernel. + logInfo("unknown thread parameter type \"%s\"", threadParamType); + return VDO_SUCCESS; +} + +/** + * Parse one component of a thread parameter configuration string and + * update the configuration data structure. 
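+ * For example (illustrative input), the spec "cpu=2" is split at '=' into
+ * "cpu" and "2", the count is converted with stringToUInt(), and the pair is
+ * handed to processOneThreadConfigSpec(); a spec containing no '=' or more
+ * than one '=' is rejected with -EINVAL.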
+ * + * @param spec The thread parameter specification string + * @param config The configuration data to be updated + **/ +static int parseOneThreadConfigSpec(const char *spec, + ThreadCountConfig *config) +{ + char **fields; + int result = splitString(spec, '=', &fields); + if (result != UDS_SUCCESS) { + return result; + } + if ((fields[0] == NULL) || (fields[1] == NULL) || (fields[2] != NULL)) { + logError("thread config string error:" + " expected thread parameter assignment, saw \"%s\"", + spec); + freeStringArray(fields); + return -EINVAL; + } + + unsigned int count; + result = stringToUInt(fields[1], &count); + if (result != UDS_SUCCESS) { + logError("thread config string error: integer value needed, found \"%s\"", + fields[1]); + freeStringArray(fields); + return result; + } + + result = processOneThreadConfigSpec(fields[0], count, config); + freeStringArray(fields); + return result; +} + +/** + * Parse the configuration string passed and update the specified + * counts and other parameters of various types of threads to be created. + * + * The configuration string should contain one or more comma-separated specs + * of the form "typename=number"; the supported type names are "cpu", "ack", + * "bio", "bioRotationInterval", "logical", "physical", and "hash". + * + * If an error occurs during parsing of a single key/value pair, we deem + * it serious enough to stop further parsing. + * + * This function can't set the "reason" value the caller wants to pass + * back, because we'd want to format it to say which field was + * invalid, and we can't allocate the "reason" strings dynamically. So + * if an error occurs, we'll log the details and pass back an error. + * + * @param string Thread parameter configuration string + * @param config The thread configuration data to update + * + * @return VDO_SUCCESS or -EINVAL or -ENOMEM + **/ +static int parseThreadConfigString(const char *string, + ThreadCountConfig *config) +{ + int result = VDO_SUCCESS; + + char **specs; + if (strcmp(".", string) != 0) { + result = splitString(string, ',', &specs); + if (result != UDS_SUCCESS) { + return result; + } + for (unsigned int i = 0; specs[i] != NULL; i++) { + result = parseOneThreadConfigSpec(specs[i], config); + if (result != VDO_SUCCESS) { + break; + } + } + freeStringArray(specs); + } + return result; +} + +/** + * Process one component of an optional parameter string and + * update the configuration data structure. + * + * If the value requested is invalid, a message is logged and + * -EINVAL returned. If the key is unknown, a message is logged + * but no error is returned. 
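+ * For example (illustrative value), the key "maxDiscard" with a value of
+ * 1024 sets config->maxDiscardBlocks to 1024; a value of 0, or one larger
+ * than UINT_MAX / VDO_BLOCK_SIZE, is rejected with -EINVAL. Any other key
+ * falls through to processOneThreadConfigSpec().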
+ * + * @param key The optional parameter key name + * @param value The optional parameter value + * @param config The configuration data structure to update + * + * @return VDO_SUCCESS or -EINVAL + **/ +static int processOneKeyValuePair(const char *key, + unsigned int value, + DeviceConfig *config) +{ + // Non thread optional parameters + if (strcmp(key, "maxDiscard") == 0) { + if (value == 0) { + logError("optional parameter error:" + " at least one max discard block required"); + return -EINVAL; + } + // Max discard sectors in blkdev_issue_discard is UINT_MAX >> 9 + if (value > (UINT_MAX / VDO_BLOCK_SIZE)) { + logError("optional parameter error: at most %d max discard" + " blocks are allowed", UINT_MAX / VDO_BLOCK_SIZE); + return -EINVAL; + } + config->maxDiscardBlocks = value; + return VDO_SUCCESS; + } + // Handles unknown key names + return processOneThreadConfigSpec(key, value, &config->threadCounts); +} + +/** + * Parse one key/value pair and update the configuration + * data structure. + * + * @param key The optional key name + * @param value The optional value + * @param config The configuration data to be updated + * + * @return VDO_SUCCESS or error + **/ +static int parseOneKeyValuePair(const char *key, + const char *value, + DeviceConfig *config) +{ + if (strcmp(key, "deduplication") == 0) { + return parseBool(value, "on", "off", &config->deduplication); + } + + // The remaining arguments must have integral values. + unsigned int count; + int result = stringToUInt(value, &count); + if (result != UDS_SUCCESS) { + logError("optional config string error: integer value needed, found \"%s\"", + value); + return result; + } + return processOneKeyValuePair(key, count, config); +} + +/** + * Parse all key/value pairs from a list of arguments. + * + * If an error occurs during parsing of a single key/value pair, we deem + * it serious enough to stop further parsing. + * + * This function can't set the "reason" value the caller wants to pass + * back, because we'd want to format it to say which field was + * invalid, and we can't allocate the "reason" strings dynamically. So + * if an error occurs, we'll log the details and return the error. + * + * @param argc The total number of arguments in list + * @param argv The list of key/value pairs + * @param config The device configuration data to update + * + * @return VDO_SUCCESS or error + **/ +static int parseKeyValuePairs(int argc, + char **argv, + DeviceConfig *config) +{ + int result = VDO_SUCCESS; + while (argc) { + result = parseOneKeyValuePair(argv[0], argv[1], config); + if (result != VDO_SUCCESS) { + break; + } + + argc -= 2; + argv += 2; + } + + return result; +} + +/** + * Parse the configuration string passed in for optional arguments. + * + * For V0/V1 configurations, there will only be one optional parameter; + * the thread configuration. The configuration string should contain + * one or more comma-separated specs of the form "typename=number"; the + * supported type names are "cpu", "ack", "bio", "bioRotationInterval", + * "logical", "physical", and "hash". + * + * For V2 configurations and beyond, there could be any number of + * arguments. They should contain one or more key/value pairs + * separated by a space. 
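+ * For example (illustrative values), a V0/V1 table ends with a single spec
+ * string such as "cpu=2,ack=1,bio=4" (or "." for an empty list), while a V2
+ * or later table may end with space-separated pairs such as
+ * "maxDiscard 1024 deduplication on".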
+ * + * @param argSet The structure holding the arguments to parse + * @param errorPtr Pointer to a buffer to hold the error string + * @param config Pointer to device configuration data to update + * + * @return VDO_SUCCESS or error + */ +int parseOptionalArguments(struct dm_arg_set *argSet, + char **errorPtr, + DeviceConfig *config) +{ + int result = VDO_SUCCESS; + + if (config->version == 0 || config->version == 1) { + result = parseThreadConfigString(argSet->argv[0], + &config->threadCounts); + if (result != VDO_SUCCESS) { + *errorPtr = "Invalid thread-count configuration"; + return VDO_BAD_CONFIGURATION; + } + } else { + if ((argSet->argc % 2) != 0) { + *errorPtr = "Odd number of optional arguments given but they" + " should be pairs"; + return VDO_BAD_CONFIGURATION; + } + result = parseKeyValuePairs(argSet->argc, argSet->argv, config); + if (result != VDO_SUCCESS) { + *errorPtr = "Invalid optional argument configuration"; + return VDO_BAD_CONFIGURATION; + } + } + return result; +} + +/** + * Handle a parsing error. + * + * @param configPtr A pointer to the config to free + * @param errorPtr A place to store a constant string about the error + * @param errorStr A constant string to store in errorPtr + **/ +static void handleParseError(DeviceConfig **configPtr, + char **errorPtr, + char *errorStr) +{ + freeDeviceConfig(configPtr); + *errorPtr = errorStr; +} + +/**********************************************************************/ +int parseDeviceConfig(int argc, + char **argv, + struct dm_target *ti, + bool verbose, + DeviceConfig **configPtr) +{ + char **errorPtr = &ti->error; + DeviceConfig *config = NULL; + int result = ALLOCATE(1, DeviceConfig, "DeviceConfig", &config); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Could not allocate config structure"); + return VDO_BAD_CONFIGURATION; + } + + config->owningTarget = ti; + initializeRing(&config->configNode); + + // Save the original string. + result = joinStrings(argv, argc, ' ', &config->originalString); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Could not populate string"); + return VDO_BAD_CONFIGURATION; + } + + // Set defaults. + // + // XXX Defaults for bioThreads and bioRotationInterval are currently defined + // using the old configuration scheme of constants. These values are relied + // upon for performance testing on MGH machines currently. + // This should be replaced with the normally used testing defaults being + // defined in the file-based thread-configuration settings. The values used + // as defaults internally should really be those needed for VDO in its + // default shipped-product state. + config->threadCounts = (ThreadCountConfig) { + .bioAckThreads = 1, + .bioThreads = DEFAULT_NUM_BIO_SUBMIT_QUEUES, + .bioRotationInterval = DEFAULT_BIO_SUBMIT_QUEUE_ROTATE_INTERVAL, + .cpuThreads = 1, + .logicalZones = 0, + .physicalZones = 0, + .hashZones = 0, + }; + config->maxDiscardBlocks = 1; + config->deduplication = true; + + struct dm_arg_set argSet; + + argSet.argc = argc; + argSet.argv = argv; + + result = getVersionNumber(argc, argv, errorPtr, &config->version); + if (result != VDO_SUCCESS) { + // getVersionNumber sets errorPtr itself. + handleParseError(&config, errorPtr, *errorPtr); + return result; + } + // Move the arg pointer forward only if the argument was there. 
+ if (config->version >= 1) { + dm_shift_arg(&argSet); + } + + result = duplicateString(dm_shift_arg(&argSet), "parent device name", + &config->parentDeviceName); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Could not copy parent device name"); + return VDO_BAD_CONFIGURATION; + } + + // Get the physical blocks, if known. + if (config->version >= 1) { + result = kstrtoull(dm_shift_arg(&argSet), 10, &config->physicalBlocks); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Invalid physical block count"); + return VDO_BAD_CONFIGURATION; + } + } + + // Get the logical block size and validate + bool enable512e; + result = parseBool(dm_shift_arg(&argSet), "512", "4096", &enable512e); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Invalid logical block size"); + return VDO_BAD_CONFIGURATION; + } + config->logicalBlockSize = (enable512e ? 512 : 4096); + + // Skip past the two no longer used read cache options. + if (config->version <= 1) { + dm_consume_args(&argSet, 2); + } + + // Get the page cache size. + result = stringToUInt(dm_shift_arg(&argSet), &config->cacheSize); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Invalid block map page cache size"); + return VDO_BAD_CONFIGURATION; + } + + // Get the block map era length. + result = stringToUInt(dm_shift_arg(&argSet), &config->blockMapMaximumAge); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Invalid block map maximum age"); + return VDO_BAD_CONFIGURATION; + } + + // Get the MD RAID5 optimization mode and validate + result = parseBool(dm_shift_arg(&argSet), "on", "off", + &config->mdRaid5ModeEnabled); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Invalid MD RAID5 mode"); + return VDO_BAD_CONFIGURATION; + } + + // Get the write policy and validate. + if (strcmp(argSet.argv[0], "async") == 0) { + config->writePolicy = WRITE_POLICY_ASYNC; + } else if (strcmp(argSet.argv[0], "async-unsafe") == 0) { + config->writePolicy = WRITE_POLICY_ASYNC_UNSAFE; + } else if (strcmp(argSet.argv[0], "sync") == 0) { + config->writePolicy = WRITE_POLICY_SYNC; + } else if (strcmp(argSet.argv[0], "auto") == 0) { + config->writePolicy = WRITE_POLICY_AUTO; + } else { + handleParseError(&config, errorPtr, "Invalid write policy"); + return VDO_BAD_CONFIGURATION; + } + dm_shift_arg(&argSet); + + // Make sure the enum to get the pool name from argv directly is still in + // sync with the parsing of the table line. + if (&argSet.argv[0] != &argv[POOL_NAME_ARG_INDEX[config->version]]) { + handleParseError(&config, errorPtr, "Pool name not in expected location"); + return VDO_BAD_CONFIGURATION; + } + + // Get the address where the albserver is running. Check for validation + // is done in dedupe.c code during startKernelLayer call + result = duplicateString(dm_shift_arg(&argSet), "pool name", + &config->poolName); + if (result != VDO_SUCCESS) { + handleParseError(&config, errorPtr, "Could not copy pool name"); + return VDO_BAD_CONFIGURATION; + } + + // Get the optional arguments and validate. + result = parseOptionalArguments(&argSet, errorPtr, config); + if (result != VDO_SUCCESS) { + // parseOptionalArguments sets errorPtr itself. + handleParseError(&config, errorPtr, *errorPtr); + return result; + } + + /* + * Logical, physical, and hash zone counts can all be zero; then we get one + * thread doing everything, our older configuration. If any zone count is + * non-zero, the others must be as well. 
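+   * For example, "logical 4 physical 2 hash 1" is accepted, while "logical 4"
+   * alone (leaving the physical and hash counts at zero) fails the check below.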
+ */ + if (((config->threadCounts.logicalZones == 0) + != (config->threadCounts.physicalZones == 0)) + || ((config->threadCounts.physicalZones == 0) + != (config->threadCounts.hashZones == 0)) + ) { + handleParseError(&config, errorPtr, + "Logical, physical, and hash zones counts must all be" + " zero or all non-zero"); + return VDO_BAD_CONFIGURATION; + } + + result = dm_get_device(ti, config->parentDeviceName, + dm_table_get_mode(ti->table), &config->ownedDevice); + if (result != 0) { + logError("couldn't open device \"%s\": error %d", + config->parentDeviceName, result); + handleParseError(&config, errorPtr, "Unable to open storage device"); + return VDO_BAD_CONFIGURATION; + } + + resolveConfigWithDevice(config, verbose); + + *configPtr = config; + return result; +} + +/**********************************************************************/ +void freeDeviceConfig(DeviceConfig **configPtr) +{ + if (configPtr == NULL) { + return; + } + + DeviceConfig *config = *configPtr; + if (config == NULL) { + *configPtr = NULL; + return; + } + + if (config->ownedDevice != NULL) { + dm_put_device(config->owningTarget, config->ownedDevice); + } + + FREE(config->poolName); + FREE(config->parentDeviceName); + FREE(config->originalString); + + // Reduce the chance a use-after-free (as in BZ 1669960) happens to work. + memset(config, 0, sizeof(*config)); + + FREE(config); + *configPtr = NULL; +} + +/**********************************************************************/ +const char *getConfigWritePolicyString(DeviceConfig *config) +{ + switch (config->writePolicy) { + case WRITE_POLICY_AUTO: + return "auto"; + case WRITE_POLICY_ASYNC: + return "async"; + case WRITE_POLICY_ASYNC_UNSAFE: + return "async-unsafe"; + case WRITE_POLICY_SYNC: + return "sync"; + default: + return "unknown"; + } +} + +/**********************************************************************/ +void setDeviceConfigLayer(DeviceConfig *config, KernelLayer *layer) +{ + unspliceRingNode(&config->configNode); + if (layer != NULL) { + pushRingNode(&layer->deviceConfigRing, &config->configNode); + } + config->layer = layer; +} diff --git a/source/vdo/kernel/deviceConfig.h b/source/vdo/kernel/deviceConfig.h new file mode 100644 index 0000000..36199dd --- /dev/null +++ b/source/vdo/kernel/deviceConfig.h @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/deviceConfig.h#11 $ + */ +#ifndef DEVICE_CONFIG_H +#define DEVICE_CONFIG_H + +#include + +#include "ringNode.h" + +#include "kernelTypes.h" + +// This structure is memcmp'd for equality. Keep it +// packed and don't add any fields that are not +// properly set in both extant and parsed configs. 
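+// (Without the packed attribute, uninitialized padding bytes could make two
+// otherwise identical configurations compare unequal under memcmp.)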
+typedef struct { + int bioAckThreads; + int bioThreads; + int bioRotationInterval; + int cpuThreads; + int logicalZones; + int physicalZones; + int hashZones; +} __attribute__((packed)) ThreadCountConfig; + +typedef uint32_t TableVersion; + +typedef struct { + struct dm_target *owningTarget; + struct dm_dev *ownedDevice; + KernelLayer *layer; + /** All configs referencing a layer are kept on a ring in the layer */ + RingNode configNode; + char *originalString; + TableVersion version; + char *parentDeviceName; + BlockCount physicalBlocks; + unsigned int logicalBlockSize; + WritePolicy writePolicy; + unsigned int cacheSize; + unsigned int blockMapMaximumAge; + bool mdRaid5ModeEnabled; + bool deduplication; + char *poolName; + ThreadCountConfig threadCounts; + BlockCount maxDiscardBlocks; +} DeviceConfig; + +/** + * Convert a RingNode to the DeviceConfig that contains it. + * + * @param node The RingNode to convert + * + * @return The DeviceConfig wrapping the RingNode + **/ +static inline DeviceConfig *asDeviceConfig(RingNode *node) +{ + if (node == NULL) { + return NULL; + } + return (DeviceConfig *) ((byte *) node - offsetof(DeviceConfig, configNode)); +} + +/** + * Grab a pointer to the pool name out of argv. + * + * @param [in] argc The number of table values + * @param [in] argv The array of table values + * @param [out] errorPtr A pointer to return a error string in + * @param [out] poolNamePtr A pointer to return the pool name + * + * @return VDO_SUCCESS or an error code + **/ +int getPoolNameFromArgv(int argc, + char **argv, + char **errorPtr, + char **poolNamePtr) + __attribute__((warn_unused_result)); + +/** + * Convert the dmsetup table into a DeviceConfig. + * + * @param [in] argc The number of table values + * @param [in] argv The array of table values + * @param [in] ti The target structure for this table + * @param [in] verbose Whether to log about the underlying device + * @param [out] configPtr A pointer to return the allocated config + * + * @return VDO_SUCCESS or an error code + **/ +int parseDeviceConfig(int argc, + char **argv, + struct dm_target *ti, + bool verbose, + DeviceConfig **configPtr) + __attribute__((warn_unused_result)); + +/** + * Free a device config created by parseDeviceConfig(). + * + * @param configPtr The pointer holding the config, which will be nulled + **/ +void freeDeviceConfig(DeviceConfig **configPtr); + +/** + * Get the text describing the write policy. + * + * @param config The device config + * + * @returns a pointer to a string describing the write policy + **/ +const char *getConfigWritePolicyString(DeviceConfig *config) + __attribute__((warn_unused_result)); + +/** + * Acquire or release a reference from the config to a kernel layer. + * + * @param config The config in question + * @param layer The kernel layer in question + **/ +void setDeviceConfigLayer(DeviceConfig *config, KernelLayer *layer); + +#endif // DEVICE_CONFIG_H diff --git a/source/vdo/kernel/deviceRegistry.c b/source/vdo/kernel/deviceRegistry.c new file mode 100644 index 0000000..13764b4 --- /dev/null +++ b/source/vdo/kernel/deviceRegistry.c @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/deviceRegistry.c#3 $ + */ + +#include "deviceRegistry.h" + +#include +#include +#include + +#include "memoryAlloc.h" + +/* + * We don't expect this set to ever get really large, so a linked list + * is adequate. We can use a PointerMap if we need to later. + */ +typedef struct { + struct list_head links; + rwlock_t lock; +} DeviceRegistry; + +typedef struct { + struct list_head links; + KernelLayer *layer; +} RegisteredDevice; + +static DeviceRegistry registry; + +/**********************************************************************/ +void initializeDeviceRegistryOnce(void) +{ + INIT_LIST_HEAD(®istry.links); + rwlock_init(®istry.lock); +} + +/** + * Implements LayerFilter. + **/ +static bool layerIsEqual(KernelLayer *layer, void *context) +{ + return ((void *) layer == context); +} + +/** + * Find a layer in the registry if it exists there. Must be called holding + * the lock. + * + * @param filter The filter function to apply to devices + * @param context A bit of context to provide the filter. + * + * @return the layer object found, if any + **/ +__attribute__((warn_unused_result)) +static KernelLayer *filterLayersLocked(LayerFilter *filter, void *context) +{ + RegisteredDevice *device; + list_for_each_entry(device, ®istry.links, links) { + if (filter(device->layer, context)) { + return device->layer; + } + } + return NULL; +} + +/**********************************************************************/ +int addLayerToDeviceRegistry(KernelLayer *layer) +{ + RegisteredDevice *newDevice; + int result = ALLOCATE(1, RegisteredDevice, __func__, &newDevice); + if (result != VDO_SUCCESS) { + return result; + } + + INIT_LIST_HEAD(&newDevice->links); + newDevice->layer = layer; + + write_lock(®istry.lock); + KernelLayer *oldLayer = filterLayersLocked(layerIsEqual, layer); + result = ASSERT(oldLayer == NULL, "Layer not already registered"); + if (result == VDO_SUCCESS) { + list_add_tail(&newDevice->links, ®istry.links); + } + write_unlock(®istry.lock); + + return result; +} + +/**********************************************************************/ +void removeLayerFromDeviceRegistry(KernelLayer *layer) +{ + write_lock(®istry.lock); + RegisteredDevice *device = NULL; + list_for_each_entry(device, ®istry.links, links) { + if (device->layer == layer) { + list_del_init(&device->links); + FREE(device); + break; + } + } + write_unlock(®istry.lock); +} + +/**********************************************************************/ +KernelLayer *findLayerMatching(LayerFilter *filter, void *context) +{ + read_lock(®istry.lock); + KernelLayer *layer = filterLayersLocked(filter, context); + read_unlock(®istry.lock); + return layer; +} diff --git a/source/vdo/kernel/deviceRegistry.h b/source/vdo/kernel/deviceRegistry.h new file mode 100644 index 0000000..94c1635 --- /dev/null +++ b/source/vdo/kernel/deviceRegistry.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/deviceRegistry.h#2 $ + */ + +#ifndef DEVICE_REGISTRY_H +#define DEVICE_REGISTRY_H + +#include "kernelTypes.h" + +/** + * Initialize the necessary structures for the device registry. + **/ +void initializeDeviceRegistryOnce(void); + +/** + * Add a layer to the device registry. The layer must not already exist in the + * registry. + * + * @param layer The layer to add + * + * @return VDO_SUCCESS or an error + **/ +int addLayerToDeviceRegistry(KernelLayer *layer) + __attribute__((warn_unused_result)); + +/** + * Remove a layer from the device registry. + * + * @param layer The layer to remove + **/ +void removeLayerFromDeviceRegistry(KernelLayer *layer); + +/** + * Find and return the first (if any) layer matching a given filter function. + * + * @param filter The filter function to apply to layers + * @param context A bit of context to provide the filter. + **/ +KernelLayer *findLayerMatching(LayerFilter *filter, void *context); + +#endif // DEVICE_REGISTRY_H diff --git a/source/vdo/kernel/dmvdo.c b/source/vdo/kernel/dmvdo.c new file mode 100644 index 0000000..a6c7b98 --- /dev/null +++ b/source/vdo/kernel/dmvdo.c @@ -0,0 +1,889 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dmvdo.c#42 $ + */ + +#include "dmvdo.h" + +#include + +#include "logger.h" +#include "memoryAlloc.h" + +#include "constants.h" +#include "ringNode.h" +#include "threadConfig.h" +#include "vdo.h" + +#include "dedupeIndex.h" +#include "deviceRegistry.h" +#include "dump.h" +#include "instanceNumber.h" +#include "ioSubmitter.h" +#include "kernelLayer.h" +#include "kvdoFlush.h" +#include "memoryUsage.h" +#include "statusProcfs.h" +#include "stringUtils.h" +#include "sysfs.h" +#include "threadDevice.h" +#include "threadRegistry.h" + +struct kvdoDevice kvdoDevice; // global driver state (poorly named) + +/* + * Pre kernel version 4.3, we use the functionality in blkdev_issue_discard + * and the value in max_discard_sectors to split large discards into smaller + * ones. 4.3 to 4.18 kernels have removed the code in blkdev_issue_discard + * and so in place of that, we use the code in device mapper itself to + * split the discards. Unfortunately, it uses the same value to split large + * discards as it does to split large data bios. + * + * In kernel version 4.18, support for splitting discards was added + * back into blkdev_issue_discard. Since this mode of splitting + * (based on max_discard_sectors) is preferable to splitting always + * on 4k, we are turning off the device mapper splitting from 4.18 + * on. + */ +#define HAS_NO_BLKDEV_SPLIT LINUX_VERSION_CODE >= KERNEL_VERSION(4,3,0) \ + && LINUX_VERSION_CODE < KERNEL_VERSION(4,18,0) + +/**********************************************************************/ + +/** + * Get the kernel layer associated with a dm target structure. + * + * @param ti The dm target structure + * + * @return The kernel layer, or NULL. + **/ +static KernelLayer *getKernelLayerForTarget(struct dm_target *ti) +{ + return ((DeviceConfig *) ti->private)->layer; +} + +/** + * Begin VDO processing of a bio. This is called by the device mapper + * through the "map" function, and has resulted from a call to either + * submit_bio or generic_make_request. + * + * @param ti The dm_target. We only need the "private" member to give + * us the KernelLayer. + * @param bio The bio. + * + * @return One of these values: + * + * negative A negative value is an error code. + * Usually -EIO. + * + * DM_MAPIO_SUBMITTED VDO will take care of this I/O, either + * processing it completely and calling + * bio_endio, or forwarding it onward by + * calling generic_make_request. + * + * DM_MAPIO_REMAPPED VDO has modified the bio and the device + * mapper will immediately forward the bio + * onward using generic_make_request. + * + * DM_MAPIO_REQUEUE We do not use this. It is used by device + * mapper devices to defer an I/O request + * during suspend/resume processing. + **/ +static int vdoMapBio(struct dm_target *ti, BIO *bio) +{ + KernelLayer *layer = getKernelLayerForTarget(ti); + return kvdoMapBio(layer, bio); +} + +/**********************************************************************/ +static void vdoIoHints(struct dm_target *ti, struct queue_limits *limits) +{ + KernelLayer *layer = getKernelLayerForTarget(ti); + + limits->logical_block_size = layer->deviceConfig->logicalBlockSize; + limits->physical_block_size = VDO_BLOCK_SIZE; + + // The minimum io size for random io + blk_limits_io_min(limits, VDO_BLOCK_SIZE); + // The optimal io size for streamed/sequential io + blk_limits_io_opt(limits, VDO_BLOCK_SIZE); + + /* + * Sets the maximum discard size that will be passed into VDO. 
This value + * comes from a table line value passed in during dmsetup create. + * + * The value 1024 is the largest usable value on HD systems. A 2048 sector + * discard on a busy HD system takes 31 seconds. We should use a value no + * higher than 1024, which takes 15 to 16 seconds on a busy HD system. + * + * But using large values results in 120 second blocked task warnings in + * /var/log/kern.log. In order to avoid these warnings, we choose to use the + * smallest reasonable value. See VDO-3062 and VDO-3087. + * + * We allow setting of the value for max_discard_sectors even in situations + * where we only split on 4k (see comments for HAS_NO_BLKDEV_SPLIT) as the + * value is still used in other code, like sysfs display of queue limits and + * most especially in dm-thin to determine whether to pass down discards. + */ + limits->max_discard_sectors + = layer->deviceConfig->maxDiscardBlocks * VDO_SECTORS_PER_BLOCK; + + limits->discard_granularity = VDO_BLOCK_SIZE; +#if LINUX_VERSION_CODE < KERNEL_VERSION(4,11,0) + limits->discard_zeroes_data = 1; +#endif +} + +/**********************************************************************/ +static int vdoIterateDevices(struct dm_target *ti, + iterate_devices_callout_fn fn, + void *data) +{ + KernelLayer *layer = getKernelLayerForTarget(ti); + sector_t len = blockToSector(layer, layer->deviceConfig->physicalBlocks); + + return fn(ti, layer->deviceConfig->ownedDevice, 0, len, data); +} + +/* + * Status line is: + * + * + */ + +/**********************************************************************/ +static void vdoStatus(struct dm_target *ti, + status_type_t status_type, + unsigned int status_flags, + char *result, + unsigned int maxlen) +{ + KernelLayer *layer = getKernelLayerForTarget(ti); + char nameBuffer[BDEVNAME_SIZE]; + // N.B.: The DMEMIT macro uses the variables named "sz", "result", "maxlen". + int sz = 0; + + switch (status_type) { + case STATUSTYPE_INFO: + // Report info for dmsetup status + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + VDOStatistics *stats = &layer->vdoStatsStorage; + DMEMIT("/dev/%s %s %s %s %s %llu %llu", + bdevname(getKernelLayerBdev(layer), nameBuffer), + stats->mode, + stats->inRecoveryMode ? "recovering" : "-", + getDedupeStateName(layer->dedupeIndex), + getKVDOCompressing(&layer->kvdo) ? "online" : "offline", + stats->dataBlocksUsed + stats->overheadBlocksUsed, + stats->physicalBlocks); + mutex_unlock(&layer->statsMutex); + break; + + case STATUSTYPE_TABLE: + // Report the string actually specified in the beginning. + DMEMIT("%s", ((DeviceConfig *) ti->private)->originalString); + break; + } + +// spin_unlock_irqrestore(&layer->lock, flags); +} + + +/** + * Get the size of the underlying device, in blocks. 
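+ *
+ * (As a worked example, assuming the usual 4 KB VDO_BLOCK_SIZE, a 10 GiB
+ * backing device reports 10737418240 / 4096 = 2621440 blocks.)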
+ * + * @param [in] layer The layer + * + * @return The size in blocks + **/ +static BlockCount getUnderlyingDeviceBlockCount(KernelLayer *layer) +{ + uint64_t physicalSize = i_size_read(getKernelLayerBdev(layer)->bd_inode); + return physicalSize / VDO_BLOCK_SIZE; +} + +/**********************************************************************/ +static int vdoPrepareToGrowLogical(KernelLayer *layer, char *sizeString) +{ + BlockCount logicalCount; + if (sscanf(sizeString, "%llu", &logicalCount) != 1) { + logWarning("Logical block count \"%s\" is not a number", sizeString); + return -EINVAL; + } + + if (logicalCount > MAXIMUM_LOGICAL_BLOCKS) { + logWarning("Logical block count \"%llu\" exceeds the maximum (%" + PRIu64 ")", logicalCount, MAXIMUM_LOGICAL_BLOCKS); + return -EINVAL; + } + + return prepareToResizeLogical(layer, logicalCount); +} + +/** + * Process a dmsetup message now that we know no other message is being + * processed. + * + * @param layer The layer to which the message was sent + * @param argc The argument count of the message + * @param argv The arguments to the message + * + * @return -EINVAL if the message is unrecognized or the result of processing + * the message + **/ +__attribute__((warn_unused_result)) +static int processVDOMessageLocked(KernelLayer *layer, + unsigned int argc, + char **argv) +{ + // Messages with variable numbers of arguments. + if (strncasecmp(argv[0], "x-", 2) == 0) { + int result = performKVDOExtendedCommand(&layer->kvdo, argc, argv); + if (result == VDO_UNKNOWN_COMMAND) { + logWarning("unknown extended command '%s' to dmsetup message", argv[0]); + result = -EINVAL; + } + + return result; + } + + // Messages with fixed numbers of arguments. + switch (argc) { + case 1: + if (strcasecmp(argv[0], "sync-dedupe") == 0) { + waitForNoRequestsActive(layer); + return 0; + } + + if (strcasecmp(argv[0], "trace-on") == 0) { + logInfo("Tracing on"); + layer->traceLogging = true; + return 0; + } + + if (strcasecmp(argv[0], "trace-off") == 0) { + logInfo("Tracing off"); + layer->traceLogging = false; + return 0; + } + + if (strcasecmp(argv[0], "prepareToGrowPhysical") == 0) { + return prepareToResizePhysical(layer, + getUnderlyingDeviceBlockCount(layer)); + } + + if (strcasecmp(argv[0], "growPhysical") == 0) { + // The actual growPhysical will happen when the device is resumed. + + if (layer->deviceConfig->version != 0) { + // XXX Uncomment this branch when new VDO manager is updated to not + // send this message. + + // Old style message on new style table is unexpected; it means the + // user started the VDO with new manager and is growing with old. + // logInfo("Mismatch between growPhysical method and table version."); + // return -EINVAL; + } else { + layer->deviceConfig->physicalBlocks + = getUnderlyingDeviceBlockCount(layer); + } + return 0; + } + + break; + + case 2: + if (strcasecmp(argv[0], "compression") == 0) { + if (strcasecmp(argv[1], "on") == 0) { + setKVDOCompressing(&layer->kvdo, true); + return 0; + } + + if (strcasecmp(argv[1], "off") == 0) { + setKVDOCompressing(&layer->kvdo, false); + return 0; + } + + logWarning("invalid argument '%s' to dmsetup compression message", + argv[1]); + return -EINVAL; + } + + if (strcasecmp(argv[0], "prepareToGrowLogical") == 0) { + return vdoPrepareToGrowLogical(layer, argv[1]); + } + + break; + + + default: + break; + } + + logWarning("unrecognized dmsetup message '%s' received", argv[0]); + return -EINVAL; +} + +/** + * Process a dmsetup message. If the message is a dump, just do it. 
Otherwise, + * check that no other message is being processed, and only proceed if so. + * + * @param layer The layer to which the message was sent + * @param argc The argument count of the message + * @param argv The arguments to the message + * + * @return -EBUSY if another message is being processed or the result of + * processsing the message + **/ +__attribute__((warn_unused_result)) +static int processVDOMessage(KernelLayer *layer, + unsigned int argc, + char **argv) +{ + /* + * All messages which may be processed in parallel with other messages should + * be handled here before the atomic check below. Messages which should be + * exclusive should be processed in processVDOMessageLocked(). + */ + + // Dump messages should always be processed + if (strcasecmp(argv[0], "dump") == 0) { + return vdoDump(layer, argc, argv, "dmsetup message"); + } + + if (argc == 1) { + if (strcasecmp(argv[0], "dump-on-shutdown") == 0) { + layer->dumpOnShutdown = true; + return 0; + } + + // Index messages should always be processed + if ((strcasecmp(argv[0], "index-close") == 0) + || (strcasecmp(argv[0], "index-create") == 0) + || (strcasecmp(argv[0], "index-disable") == 0) + || (strcasecmp(argv[0], "index-enable") == 0)) { + return messageDedupeIndex(layer->dedupeIndex, argv[0]); + } + + // XXX - the "connect" messages are misnamed for the kernel index. These + // messages should go away when all callers have been fixed to use + // "index-enable" or "index-disable". + if (strcasecmp(argv[0], "reconnect") == 0) { + return messageDedupeIndex(layer->dedupeIndex, "index-enable"); + } + + if (strcasecmp(argv[0], "connect") == 0) { + return messageDedupeIndex(layer->dedupeIndex, "index-enable"); + } + + if (strcasecmp(argv[0], "disconnect") == 0) { + return messageDedupeIndex(layer->dedupeIndex, "index-disable"); + } + } + + if (!compareAndSwapBool(&layer->processingMessage, false, true)) { + return -EBUSY; + } + + int result = processVDOMessageLocked(layer, argc, argv); + atomicStoreBool(&layer->processingMessage, false); + return result; +} + +/**********************************************************************/ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,17,0) +static int vdoMessage(struct dm_target *ti, + unsigned int argc, + char **argv, + char *resultBuffer, + unsigned int maxlen) +#else +static int vdoMessage(struct dm_target *ti, unsigned int argc, char **argv) +#endif +{ + if (argc == 0) { + logWarning("unspecified dmsetup message"); + return -EINVAL; + } + + KernelLayer *layer = getKernelLayerForTarget(ti); + RegisteredThread allocatingThread, instanceThread; + registerAllocatingThread(&allocatingThread, NULL); + registerThreadDevice(&instanceThread, layer); + int result = processVDOMessage(layer, argc, argv); + unregisterThreadDeviceID(); + unregisterAllocatingThread(); + return mapToSystemError(result); +} + +/** + * Configure the dm_target with our capabilities. + * + * @param ti The device mapper target representing our device + * @param layer The kernel layer to get the write policy from + **/ +static void configureTargetCapabilities(struct dm_target *ti, + KernelLayer *layer) +{ + ti->discards_supported = 1; + + /** + * This may appear to indicate we don't support flushes in sync mode. + * However, dm will set up the request queue to accept flushes if any + * device in the stack accepts flushes. Hence if the device under VDO + * accepts flushes, we will receive flushes. 
+ **/ + ti->flush_supported = shouldProcessFlush(layer); + ti->num_discard_bios = 1; + ti->num_flush_bios = 1; + + // If this value changes, please make sure to update the + // value for maxDiscardSectors accordingly. + BUG_ON(dm_set_target_max_io_len(ti, VDO_SECTORS_PER_BLOCK) != 0); + +/* + * Please see comments above where the macro is defined. + */ +#if HAS_NO_BLKDEV_SPLIT + ti->split_discard_bios = 1; +#endif +} + +/** + * Handle a vdoInitialize failure, freeing all appropriate structures. + * + * @param ti The device mapper target representing our device + * @param threadConfig The thread config (possibly NULL) + * @param layer The kernel layer (possibly NULL) + * @param instance The instance number to be released + * @param why The reason for failure + **/ +static void cleanupInitialize(struct dm_target *ti, + ThreadConfig *threadConfig, + KernelLayer *layer, + unsigned int instance, + char *why) +{ + if (threadConfig != NULL) { + freeThreadConfig(&threadConfig); + } + if (layer != NULL) { + // This releases the instance number too. + freeKernelLayer(layer); + } else { + // With no KernelLayer taking ownership we have to release explicitly. + releaseKVDOInstance(instance); + } + + ti->error = why; +} + +/** + * Initializes a single VDO instance and loads the data from disk + * + * @param ti The device mapper target representing our device + * @param instance The device instantiation counter + * @param config The parsed config for the instance + * + * @return VDO_SUCCESS or an error code + * + **/ +static int vdoInitialize(struct dm_target *ti, + unsigned int instance, + DeviceConfig *config) +{ + logInfo("loading device '%s'", config->poolName); + + uint64_t blockSize = VDO_BLOCK_SIZE; + uint64_t logicalSize = to_bytes(ti->len); + BlockCount logicalBlocks = logicalSize / blockSize; + + logDebug("Logical block size = %llu", + (uint64_t) config->logicalBlockSize); + logDebug("Logical blocks = %llu", logicalBlocks); + logDebug("Physical block size = %llu", (uint64_t) blockSize); + logDebug("Physical blocks = %llu", config->physicalBlocks); + logDebug("Block map cache blocks = %u", config->cacheSize); + logDebug("Block map maximum age = %u", config->blockMapMaximumAge); + logDebug("MD RAID5 mode = %s", (config->mdRaid5ModeEnabled + ? "on" : "off")); + logDebug("Write policy = %s", getConfigWritePolicyString(config)); + logDebug("Deduplication = %s", (config->deduplication + ? "on" : "off")); + + // The threadConfig will be copied by the VDO if it's successfully + // created. + VDOLoadConfig loadConfig = { + .cacheSize = config->cacheSize, + .threadConfig = NULL, + .writePolicy = config->writePolicy, + .maximumAge = config->blockMapMaximumAge, + }; + + char *failureReason; + KernelLayer *layer; + int result = makeKernelLayer(ti->begin, instance, config, + &kvdoDevice.kobj, &loadConfig.threadConfig, + &failureReason, &layer); + if (result != VDO_SUCCESS) { + logError("Could not create kernel physical layer. (VDO error %d," + " message %s)", result, failureReason); + cleanupInitialize(ti, loadConfig.threadConfig, NULL, instance, + failureReason); + return result; + } + + // Now that we have read the geometry, we can finish setting up the + // VDOLoadConfig. 
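+  // (layer->geometry was presumably filled in while makeKernelLayer() was
+  // building the layer above.)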
+ setLoadConfigFromGeometry(&layer->geometry, &loadConfig); + + if (config->cacheSize < (2 * MAXIMUM_USER_VIOS + * loadConfig.threadConfig->logicalZoneCount)) { + logWarning("Insufficient block map cache for logical zones"); + cleanupInitialize(ti, loadConfig.threadConfig, layer, instance, + "Insufficient block map cache for logical zones"); + return VDO_BAD_CONFIGURATION; + } + + // Henceforth it is the kernel layer's responsibility to clean up the + // ThreadConfig. + result = preloadKernelLayer(layer, &loadConfig, &failureReason); + if (result != VDO_SUCCESS) { + logError("Could not start kernel physical layer. (VDO error %d," + " message %s)", result, failureReason); + cleanupInitialize(ti, NULL, layer, instance, failureReason); + return result; + } + + setDeviceConfigLayer(config, layer); + setKernelLayerActiveConfig(layer, config); + ti->private = config; + configureTargetCapabilities(ti, layer); + return VDO_SUCCESS; +} + +/**********************************************************************/ +static int vdoCtr(struct dm_target *ti, unsigned int argc, char **argv) +{ + int result = VDO_SUCCESS; + + RegisteredThread allocatingThread; + registerAllocatingThread(&allocatingThread, NULL); + + const char *deviceName = dm_device_name(dm_table_get_md(ti->table)); + KernelLayer *oldLayer = findLayerMatching(layerIsNamed, (void *)deviceName); + unsigned int instance; + if (oldLayer == NULL) { + result = allocateKVDOInstance(&instance); + if (result != VDO_SUCCESS) { + unregisterAllocatingThread(); + return -ENOMEM; + } + } else { + instance = oldLayer->instance; + } + + RegisteredThread instanceThread; + registerThreadDeviceID(&instanceThread, &instance); + + bool verbose = (oldLayer == NULL); + DeviceConfig *config = NULL; + result = parseDeviceConfig(argc, argv, ti, verbose, &config); + if (result != VDO_SUCCESS) { + unregisterThreadDeviceID(); + unregisterAllocatingThread(); + if (oldLayer == NULL) { + releaseKVDOInstance(instance); + } + return -EINVAL; + } + + // Is there already a device of this name? + if (oldLayer != NULL) { + /* + * To preserve backward compatibility with old VDO Managers, we need to + * allow this to happen when either suspended or not. We could assert + * that if the config is version 0, we are suspended, and if not, we + * are not, but we can't do that till new VDO Manager does the right + * order. + */ + logInfo("preparing to modify device '%s'", config->poolName); + result = prepareToModifyKernelLayer(oldLayer, config, &ti->error); + if (result != VDO_SUCCESS) { + result = mapToSystemError(result); + freeDeviceConfig(&config); + } else { + setDeviceConfigLayer(config, oldLayer); + ti->private = config; + configureTargetCapabilities(ti, oldLayer); + } + unregisterThreadDeviceID(); + unregisterAllocatingThread(); + return result; + } + + result = vdoInitialize(ti, instance, config); + if (result != VDO_SUCCESS) { + // vdoInitialize calls into various VDO routines, so map error + result = mapToSystemError(result); + freeDeviceConfig(&config); + } + + unregisterThreadDeviceID(); + unregisterAllocatingThread(); + return result; +} + +/**********************************************************************/ +static void vdoDtr(struct dm_target *ti) +{ + DeviceConfig *config = ti->private; + KernelLayer *layer = config->layer; + + setDeviceConfigLayer(config, NULL); + + if (isRingEmpty(&layer->deviceConfigRing)) { + // This was the last config referencing the layer. Free it. 
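+    // (Several configs can reference one layer during a table reload, since
+    // the new config joins the ring before the old table's config is destroyed.)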
+ unsigned int instance = layer->instance; + RegisteredThread allocatingThread, instanceThread; + registerThreadDeviceID(&instanceThread, &instance); + registerAllocatingThread(&allocatingThread, NULL); + + waitForNoRequestsActive(layer); + logInfo("stopping device '%s'", config->poolName); + + if (layer->dumpOnShutdown) { + vdoDumpAll(layer, "device shutdown"); + } + + freeKernelLayer(layer); + logInfo("device '%s' stopped", config->poolName); + unregisterThreadDeviceID(); + unregisterAllocatingThread(); + } else if (config == layer->deviceConfig) { + // The layer still references this config. Give it a reference to a + // config that isn't being destroyed. + layer->deviceConfig = asDeviceConfig(layer->deviceConfigRing.next); + } + + freeDeviceConfig(&config); + ti->private = NULL; +} + +/**********************************************************************/ +static void vdoPresuspend(struct dm_target *ti) +{ + KernelLayer *layer = getKernelLayerForTarget(ti); + RegisteredThread instanceThread; + registerThreadDevice(&instanceThread, layer); + if (dm_noflush_suspending(ti)) { + layer->noFlushSuspend = true; + } + unregisterThreadDeviceID(); +} + +/**********************************************************************/ +static void vdoPostsuspend(struct dm_target *ti) +{ + KernelLayer *layer = getKernelLayerForTarget(ti); + RegisteredThread instanceThread; + registerThreadDevice(&instanceThread, layer); + const char *poolName = layer->deviceConfig->poolName; + logInfo("suspending device '%s'", poolName); + int result = suspendKernelLayer(layer); + if (result == VDO_SUCCESS) { + logInfo("device '%s' suspended", poolName); + } else { + logError("suspend of device '%s' failed with error: %d", poolName, result); + } + layer->noFlushSuspend = false; + unregisterThreadDeviceID(); +} + +/**********************************************************************/ +static int vdoPreresume(struct dm_target *ti) +{ + KernelLayer *layer = getKernelLayerForTarget(ti); + DeviceConfig *config = ti->private; + RegisteredThread instanceThread; + + BlockCount backingBlocks = getUnderlyingDeviceBlockCount(layer); + if (backingBlocks < config->physicalBlocks) { + logError("resume of device '%s' failed: backing device has %" PRIu64 + " blocks but VDO physical size is %llu blocks", + config->poolName, backingBlocks, config->physicalBlocks); + return -EINVAL; + } + + registerThreadDevice(&instanceThread, layer); + + if (getKernelLayerState(layer) == LAYER_STARTING) { + // This is the first time this device has been resumed, so run it. + logInfo("starting device '%s'", config->poolName); + char *failureReason; + int result = startKernelLayer(layer, &failureReason); + if (result != VDO_SUCCESS) { + logError("Could not run kernel physical layer. (VDO error %d," + " message %s)", result, failureReason); + setKVDOReadOnly(&layer->kvdo, result); + unregisterThreadDeviceID(); + return mapToSystemError(result); + } + + logInfo("device '%s' started", config->poolName); + } + + logInfo("resuming device '%s'", config->poolName); + + // This is a noop if nothing has changed, and by calling it every time + // we capture old-style growPhysicals, which change the config in place. 
+ int result = modifyKernelLayer(layer, config); + if (result != VDO_SUCCESS) { + logErrorWithStringError(result, "Commit of modifications to device '%s'" + " failed", config->poolName); + setKernelLayerActiveConfig(layer, config); + setKVDOReadOnly(&layer->kvdo, result); + } else { + setKernelLayerActiveConfig(layer, config); + result = resumeKernelLayer(layer); + if (result != VDO_SUCCESS) { + logError("resume of device '%s' failed with error: %d", + layer->deviceConfig->poolName, result); + } + } + unregisterThreadDeviceID(); + return mapToSystemError(result); +} + +/**********************************************************************/ +static void vdoResume(struct dm_target *ti) +{ + KernelLayer *layer = getKernelLayerForTarget(ti); + RegisteredThread instanceThread; + registerThreadDevice(&instanceThread, layer); + logInfo("device '%s' resumed", layer->deviceConfig->poolName); + unregisterThreadDeviceID(); +} + +/* + * If anything changes that affects how user tools will interact + * with vdo, update the version number and make sure + * documentation about the change is complete so tools can + * properly update their management code. + */ +static struct target_type vdoTargetBio = { + .features = DM_TARGET_SINGLETON, + .name = "vdo", + .version = {6, 2, 3}, + .module = THIS_MODULE, + .ctr = vdoCtr, + .dtr = vdoDtr, + .io_hints = vdoIoHints, + .iterate_devices = vdoIterateDevices, + .map = vdoMapBio, + .message = vdoMessage, + .status = vdoStatus, + .presuspend = vdoPresuspend, + .postsuspend = vdoPostsuspend, + .preresume = vdoPreresume, + .resume = vdoResume, +}; + +static bool dmRegistered = false; +static bool sysfsInitialized = false; + +/**********************************************************************/ +static void vdoDestroy(void) +{ + logDebug("in %s", __func__); + + kvdoDevice.status = SHUTTING_DOWN; + + if (sysfsInitialized) { + vdoPutSysfs(&kvdoDevice.kobj); + } + vdoDestroyProcfs(); + + kvdoDevice.status = UNINITIALIZED; + + if (dmRegistered) { + dm_unregister_target(&vdoTargetBio); + } + + cleanUpInstanceNumberTracking(); + + logInfo("unloaded version %s", CURRENT_VERSION); +} + +/**********************************************************************/ +static int __init vdoInit(void) +{ + int result = 0; + + initializeThreadDeviceRegistry(); + initializeStandardErrorBlocks(); + initializeDeviceRegistryOnce(); + logInfo("loaded version %s", CURRENT_VERSION); + + result = dm_register_target(&vdoTargetBio); + if (result < 0) { + logError("dm_register_target failed %d", result); + vdoDestroy(); + return result; + } + dmRegistered = true; + + kvdoDevice.status = UNINITIALIZED; + + vdoInitProcfs(); + /* + * Set up global sysfs stuff + */ + result = vdoInitSysfs(&kvdoDevice.kobj); + if (result < 0) { + logError("sysfs initialization failed %d", result); + vdoDestroy(); + // vdoInitSysfs only returns system error codes + return result; + } + sysfsInitialized = true; + + initWorkQueueOnce(); + initializeTraceLoggingOnce(); + initKernelVDOOnce(); + initializeInstanceNumberTracking(); + + kvdoDevice.status = READY; + return result; +} + +/**********************************************************************/ +static void __exit vdoExit(void) +{ + vdoDestroy(); +} + +module_init(vdoInit); +module_exit(vdoExit); + +MODULE_DESCRIPTION(DM_NAME " target for transparent deduplication"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); +MODULE_VERSION(CURRENT_VERSION); diff --git a/source/vdo/kernel/dmvdo.h b/source/vdo/kernel/dmvdo.h new file mode 100644 index 
0000000..a71e39d --- /dev/null +++ b/source/vdo/kernel/dmvdo.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dmvdo.h#2 $ + */ + +#ifndef DMVDO_H +#define DMVDO_H + +#include +#include +#include + +#include "kernelLayer.h" + +typedef enum { + UNINITIALIZED = 0, + READY, + SHUTTING_DOWN, +} KVDOStatus; + +/* + * The internal representation of our device. + */ +struct kvdoDevice { + KVDOStatus status; + struct kobject kobj; +}; + +extern struct kvdoDevice kvdoDevice; + +#endif /* DMVDO_H */ diff --git a/source/vdo/kernel/dump.c b/source/vdo/kernel/dump.c new file mode 100644 index 0000000..b9b02e2 --- /dev/null +++ b/source/vdo/kernel/dump.c @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dump.c#2 $ + */ + +#include "dump.h" + +#include + +#include "memoryAlloc.h" +#include "typeDefs.h" + +#include "constants.h" +#include "vdo.h" + +#include "dedupeIndex.h" +#include "histogram.h" +#include "ioSubmitter.h" +#include "logger.h" + +enum dumpOptions { + // WorkQueues + SHOW_ALBIREO_QUEUE, + SHOW_BIO_ACK_QUEUE, + SHOW_BIO_QUEUE, + SHOW_CPU_QUEUES, + SHOW_REQUEST_QUEUE, + // MemoryPools + SHOW_VIO_POOL, + // Others + SHOW_VDO_STATUS, + // This one means an option overrides the "default" choices, instead + // of altering them. 
+ SKIP_DEFAULT +}; + +enum dumpOptionFlags { + // WorkQueues + FLAG_SHOW_ALBIREO_QUEUE = (1 << SHOW_ALBIREO_QUEUE), + FLAG_SHOW_BIO_ACK_QUEUE = (1 << SHOW_BIO_ACK_QUEUE), + FLAG_SHOW_BIO_QUEUE = (1 << SHOW_BIO_QUEUE), + FLAG_SHOW_CPU_QUEUES = (1 << SHOW_CPU_QUEUES), + FLAG_SHOW_REQUEST_QUEUE = (1 << SHOW_REQUEST_QUEUE), + // MemoryPools + FLAG_SHOW_VIO_POOL = (1 << SHOW_VIO_POOL), + // Others + FLAG_SHOW_VDO_STATUS = (1 << SHOW_VDO_STATUS), + // Special + FLAG_SKIP_DEFAULT = (1 << SKIP_DEFAULT) + }; + +enum { + FLAGS_ALL_POOLS = (FLAG_SHOW_VIO_POOL), + FLAGS_ALL_QUEUES = (FLAG_SHOW_REQUEST_QUEUE + | FLAG_SHOW_ALBIREO_QUEUE + | FLAG_SHOW_BIO_ACK_QUEUE + | FLAG_SHOW_BIO_QUEUE + | FLAG_SHOW_CPU_QUEUES), + FLAGS_ALL_THREADS = (FLAGS_ALL_QUEUES), + DEFAULT_DUMP_FLAGS = (FLAGS_ALL_THREADS | FLAG_SHOW_VDO_STATUS) +}; + +/**********************************************************************/ +static inline bool isArgString(const char *arg, const char *thisOption) +{ + // device-mapper convention seems to be case-independent options + return strncasecmp(arg, thisOption, strlen(thisOption)) == 0; +} + +/**********************************************************************/ +static void doDump(KernelLayer *layer, + unsigned int dumpOptionsRequested, + const char *why) +{ + logInfo("%s dump triggered via %s", THIS_MODULE->name, why); + // XXX Add in number of outstanding requests being processed by vdo + uint32_t active, maximum; + getLimiterValuesAtomically(&layer->requestLimiter, &active, &maximum); + int64_t outstanding = atomic64_read(&layer->biosSubmitted) + - atomic64_read(&layer->biosCompleted); + logInfo("%" PRIu32 " device requests outstanding (max %" PRIu32 "), " + "%" PRId64 " bio requests outstanding, poolName '%s'", + active, maximum, outstanding, layer->deviceConfig->poolName); + if ((dumpOptionsRequested & FLAG_SHOW_REQUEST_QUEUE) != 0) { + dumpKVDOWorkQueue(&layer->kvdo); + } + if ((dumpOptionsRequested & FLAG_SHOW_BIO_QUEUE) != 0) { + dumpBioWorkQueue(layer->ioSubmitter); + } + if (useBioAckQueue(layer) + && ((dumpOptionsRequested & FLAG_SHOW_BIO_ACK_QUEUE) != 0)) { + dumpWorkQueue(layer->bioAckQueue); + } + if ((dumpOptionsRequested & FLAG_SHOW_CPU_QUEUES) != 0) { + dumpWorkQueue(layer->cpuQueue); + } + dumpDedupeIndex(layer->dedupeIndex, + (dumpOptionsRequested & FLAG_SHOW_ALBIREO_QUEUE) != 0); + dumpBufferPool(layer->dataKVIOPool, + (dumpOptionsRequested & FLAG_SHOW_VIO_POOL) != 0); + if ((dumpOptionsRequested & FLAG_SHOW_VDO_STATUS) != 0) { + // Options should become more fine-grained when we have more to + // display here. + dumpKVDOStatus(&layer->kvdo); + } + reportMemoryUsage(); + logInfo("end of %s dump", THIS_MODULE->name); +} + +/**********************************************************************/ +static int parseDumpOptions(unsigned int argc, + char * const *argv, + unsigned int *dumpOptionsRequestedPtr) +{ + unsigned int dumpOptionsRequested = 0; + + static const struct { + const char *name; + unsigned int flags; + } optionNames[] = { + // Should "albireo" mean sending queue + receiving thread + outstanding? 
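+    // (Option names are matched case-insensitively; for example the message
+    // "dmsetup message <vdo-device> 0 dump bioq viopool" would select only
+    // the bio queue and VIO pool views.)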
+ { "dedupe", FLAG_SKIP_DEFAULT | FLAG_SHOW_ALBIREO_QUEUE }, + { "dedupeq", FLAG_SKIP_DEFAULT | FLAG_SHOW_ALBIREO_QUEUE }, + { "kvdodedupeq", FLAG_SKIP_DEFAULT | FLAG_SHOW_ALBIREO_QUEUE }, + { "bioack", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_ACK_QUEUE }, + { "kvdobioackq", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_ACK_QUEUE }, + { "bioackq", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_ACK_QUEUE }, + { "bio", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_QUEUE }, + { "kvdobioq", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_QUEUE }, + { "bioq", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_QUEUE }, + { "cpu", FLAG_SKIP_DEFAULT | FLAG_SHOW_CPU_QUEUES }, + { "kvdocpuq", FLAG_SKIP_DEFAULT | FLAG_SHOW_CPU_QUEUES }, + { "cpuq", FLAG_SKIP_DEFAULT | FLAG_SHOW_CPU_QUEUES }, + { "request", FLAG_SKIP_DEFAULT | FLAG_SHOW_REQUEST_QUEUE }, + { "kvdoreqq", FLAG_SKIP_DEFAULT | FLAG_SHOW_REQUEST_QUEUE }, + { "reqq", FLAG_SKIP_DEFAULT | FLAG_SHOW_REQUEST_QUEUE }, + { "viopool", FLAG_SKIP_DEFAULT | FLAG_SHOW_VIO_POOL }, + { "vdo", FLAG_SKIP_DEFAULT | FLAG_SHOW_VDO_STATUS }, + + { "pools", FLAG_SKIP_DEFAULT | FLAGS_ALL_POOLS }, + { "queues", FLAG_SKIP_DEFAULT | FLAGS_ALL_QUEUES }, + { "threads", FLAG_SKIP_DEFAULT | FLAGS_ALL_THREADS }, + { "default", FLAG_SKIP_DEFAULT | DEFAULT_DUMP_FLAGS }, + { "all", ~0 }, + }; + + bool optionsOkay = true; + for (int i = 1; i < argc; i++) { + int j; + for (j = 0; j < COUNT_OF(optionNames); j++) { + if (isArgString(argv[i], optionNames[j].name)) { + dumpOptionsRequested |= optionNames[j].flags; + break; + } + } + if (j == COUNT_OF(optionNames)) { + logWarning("dump option name '%s' unknown", argv[i]); + optionsOkay = false; + } + } + if (!optionsOkay) { + return -EINVAL; + } + if ((dumpOptionsRequested & FLAG_SKIP_DEFAULT) == 0) { + dumpOptionsRequested |= DEFAULT_DUMP_FLAGS; + } + *dumpOptionsRequestedPtr = dumpOptionsRequested; + return 0; +} + +/**********************************************************************/ +int vdoDump(KernelLayer *layer, + unsigned int argc, + char * const *argv, + const char *why) +{ + unsigned int dumpOptionsRequested = 0; + int result = parseDumpOptions(argc, argv, &dumpOptionsRequested); + if (result != 0) { + return result; + } + doDump(layer, dumpOptionsRequested, why); + return 0; +} + +/**********************************************************************/ +void vdoDumpAll(KernelLayer *layer, const char *why) +{ + doDump(layer, ~0, why); +} diff --git a/source/vdo/kernel/dump.h b/source/vdo/kernel/dump.h new file mode 100644 index 0000000..5187d4f --- /dev/null +++ b/source/vdo/kernel/dump.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dump.h#1 $ + */ + +#ifndef DUMP_H +#define DUMP_H + +#include "kernelLayer.h" + +/** + * Dump internal state and/or statistics to the kernel log, as + * specified by zero or more string arguments. + * + * @param layer The kernel layer + * @param argc Number of arguments + * @param argv The argument list + * @param why Reason for doing the dump + **/ +int vdoDump(KernelLayer *layer, + unsigned int argc, + char * const *argv, + const char *why); + +/** + * Dump lots of internal state and statistics to the kernel log. + * Identical to "dump all", without each caller needing to set up the + * argument list. + * + * @param layer The kernel layer + * @param why Reason for doing the dump + **/ +void vdoDumpAll(KernelLayer *layer, const char *why); + +#endif // DUMP_H diff --git a/source/vdo/kernel/errors.c b/source/vdo/kernel/errors.c new file mode 100644 index 0000000..dc9303e --- /dev/null +++ b/source/vdo/kernel/errors.c @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/errors.c#2 $ + */ + +#include "errors.h" + +#include +#include +#include + +#include "permassert.h" +#include "statusCodes.h" + +static const struct errorInfo errorList[] = { + { "UDS_UNINITIALIZED", "UDS library is not initialized" }, + { "UDS_SHUTTINGDOWN", "UDS library is shutting down" }, + { "UDS_EMODULE_LOAD", "Could not load modules" }, + { "UDS_ENOTHREADS", "Could not create a new thread" }, + { "UDS_NOCONTEXT", "Could not find the requested library context" }, + { "UDS_DISABLED", "UDS library context is disabled" }, + { "UDS_CORRUPT_FILE", "Corrupt file" }, + { "UDS_UNKNOWN_ERROR", "Unknown error" }, + { "UDS_GRID_NO_SERVERS", "No servers in grid configuration" }, + { "UDS_GRID_CONFIG_INCONSISTENT", "Grid configuration inconsistent" }, + { "UDS_UNSUPPORTED_VERSION", "Unsupported version" }, + { "UDS_NO_INDEXSESSION", "Index session not known" }, + { "UDS_CORRUPT_DATA", "Index data in memory is corrupt" }, + { "UDS_SHORT_READ", "Could not read requested number of bytes" }, + { "UDS_AI_ERROR", "Network address and service translation error" }, + { "UDS_RESOURCE_LIMIT_EXCEEDED", "Internal resource limits exceeded" }, + { "UDS_WRONG_CONTEXT_TYPE", "Context type mismatch" }, + { "UDS_BLOCK_ADDRESS_REQUIRED", "A block address is required" }, + { "UDS_CHUNK_DATA_REQUIRED", "Block data is required" }, + { "UDS_CHUNK_NAME_REQUIRED", "A chunk name is required" }, + { "UDS_CONF_PTR_REQUIRED", "A configuration pointer is required" }, + { "UDS_INDEX_STATS_PTR_REQUIRED", "An index stats pointer is required" }, + { "UDS_CONTEXT_STATS_PTR_REQUIRED", "A context stats pointer is required" }, + { "UDS_CONTEXT_PTR_REQUIRED", "A context pointer is required" }, + { "UDS_FILEID_REQUIRED", "A file ID is required" }, + { "UDS_STREAM_REQUIRED", "A stream is required" }, + { "UDS_STREAMID_REQUIRED", "A stream ID is required" }, + { "UDS_STREAM_PTR_REQUIRED", "A stream pointer is required" }, + { "UDS_INVALID_MEMORY_SIZE", + "Configured memory too small or unsupported size" }, + { "UDS_INVALID_METADATA_SIZE", "Invalid metadata size" }, + { "UDS_INDEX_NAME_REQUIRED", "An index name is required" }, + { "UDS_CONF_REQUIRED", "A configuration is required" }, + { "UDS_BAD_FILE_DESCRIPTOR", "Bad file descriptor" }, + { "UDS_INDEX_EXISTS", "Index already exists" }, + { "UDS_REQUESTS_OUT_OF_RANGE", "Maximum request value out of range" }, + { "UDS_BAD_NAMESPACE", "Bad namespace" }, + { "UDS_MIGRATOR_MISMATCH", + "Migrator arguments do not match reader arguments" }, + { "UDS_NO_INDEX", "No index found" }, + { "UDS_BAD_CHECKPOINT_FREQUENCY", "Checkpoint frequency out of range" }, + { "UDS_WRONG_INDEX_CONFIG", "Wrong type of index configuration" }, + { "UDS_INDEX_PATH_NOT_DIR", "Index path does not point to a directory" }, + { "UDS_ALREADY_OPEN", "Open invoked on already opened connection" }, + { "UDS_CALLBACK_ALREADY_REGISTERED", "Callback already registered" }, + { "UDS_INDEX_PATH_TOO_LONG", "Index path too long" }, + { "UDS_END_OF_FILE", "Unexpected end of file" }, + { "UDS_INDEX_NOT_SAVED_CLEANLY", "Index not saved cleanly" }, +}; + +static const struct errorInfo internalErrorList[] = { + { "UDS_PROTOCOL_ERROR", "Client/server protocol error" }, + { "UDS_OVERFLOW", "Index overflow" }, + { "UDS_FILLDONE", "Fill phase done" }, + { "UDS_INVALID_ARGUMENT", "Invalid argument passed to internal routine" }, + { "UDS_BAD_STATE", "UDS data structures are in an invalid state" }, + { "UDS_DUPLICATE_NAME", + "Attempt to enter the same name into a delta index twice" }, + 
{ "UDS_UNEXPECTED_RESULT", "Unexpected result from internal routine" }, + { "UDS_INJECTED_ERROR", "Injected error" }, + { "UDS_ASSERTION_FAILED", "Assertion failed" }, + { "UDS_UNSCANNABLE", "Unscannable" }, + { "UDS_QUEUED", "Request queued" }, + { "UDS_QUEUE_ALREADY_CONNECTED", "Queue already connected" }, + { "UDS_BAD_FILL_PHASE", "Fill phase not supported" }, + { "UDS_BUFFER_ERROR", "Buffer error" }, + { "UDS_CONNECTION_LOST", "Lost connection to peer" }, + { "UDS_TIMEOUT", "A time out has occurred" }, + { "UDS_NO_DIRECTORY", "Expected directory is missing" }, + { "UDS_CHECKPOINT_INCOMPLETE", "Checkpoint not completed" }, + { "UDS_INVALID_RUN_ID", "Invalid albGenTest server run ID" }, + { "UDS_RUN_CANCELED", "albGenTest server run canceled" }, + { "UDS_ALREADY_REGISTERED", "error range already registered" }, +}; + +/** Error attributes - or into top half of error code */ +enum { + UDS_UNRECOVERABLE = (1 << 17) +}; + +typedef struct errorBlock { + const char *name; + int base; + int last; + int max; + const ErrorInfo *infos; +} ErrorBlock; + +enum { + MAX_ERROR_BLOCKS = 6 // needed for testing +}; + +static struct errorInformation { + int allocated; + int count; + ErrorBlock blocks[MAX_ERROR_BLOCKS]; +} registeredErrors; + +/**********************************************************************/ +void initializeStandardErrorBlocks(void) +{ + registeredErrors.allocated = MAX_ERROR_BLOCKS; + registeredErrors.count = 0; + + + registeredErrors.blocks[registeredErrors.count++] = (ErrorBlock) { + .name = "UDS Error", + .base = UDS_ERROR_CODE_BASE, + .last = UDS_ERROR_CODE_LAST, + .max = UDS_ERROR_CODE_BLOCK_END, + .infos = errorList, + }; + + registeredErrors.blocks[registeredErrors.count++] = (ErrorBlock) { + .name = "UDS Internal Error", + .base = UDS_INTERNAL_ERROR_CODE_BASE, + .last = UDS_INTERNAL_ERROR_CODE_LAST, + .max = UDS_INTERNAL_ERROR_CODE_BLOCK_END, + .infos = internalErrorList, + }; + + registeredErrors.blocks[registeredErrors.count++] = (ErrorBlock) { + .name = THIS_MODULE->name, + .base = VDO_BLOCK_START, + .last = VDO_STATUS_CODE_LAST, + .max = VDO_BLOCK_END, + .infos = vdoStatusList, + }; +} + +/** + * Fetch the error info (if any) for the error number. 
+ * + * @param errnum the error number + * @param infoPtr the place to store the info for this error (if known), + * otherwise set to NULL + * + * @return the name of the error block (if known), NULL otherwise + **/ +static const char *getErrorInfo(int errnum, const ErrorInfo **infoPtr) +{ + for (ErrorBlock *block = registeredErrors.blocks; + block < registeredErrors.blocks + registeredErrors.count; + ++block) { + if ((errnum >= block->base) && (errnum < block->last)) { + if (infoPtr != NULL) { + *infoPtr = block->infos + (errnum - block->base); + } + return block->name; + } else if ((errnum >= block->last) && (errnum < block->max)) { + if (infoPtr != NULL) { + *infoPtr = NULL; + } + return block->name; + } + } + if (infoPtr != NULL) { + *infoPtr = NULL; + } + return NULL; +} + +/*****************************************************************************/ +const char *stringError(int errnum, char *buf, size_t buflen) +{ + if (buf == NULL) { + return NULL; + } + + const ErrorInfo *info = NULL; + const char *blockName = getErrorInfo(errnum, &info); + + if (blockName != NULL) { + if (info != NULL) { + snprintf(buf, buflen, "%s: %s", blockName, info->message); + } else { + snprintf(buf, buflen, "Unknown %s %d", blockName, errnum); + } + } else { + snprintf(buf, buflen, "System error %d", errnum); + } + return buf; +} + +/*****************************************************************************/ +const char *stringErrorName(int errnum, char *buf, size_t buflen) +{ + const ErrorInfo *info = NULL; + const char *blockName = getErrorInfo(errnum, &info); + + if (blockName != NULL) { + if (info != NULL) { + snprintf(buf, buflen, "%s: %s", blockName, info->name); + } else { + snprintf(buf, buflen, "Unknown %s %d", blockName, errnum); + } + } else { + snprintf(buf, buflen, "System error %d", errnum); + } + return buf; +} + +/*****************************************************************************/ +int makeUnrecoverable(int resultCode) +{ + return ((resultCode == UDS_SUCCESS) + ? 
resultCode + : (resultCode | UDS_UNRECOVERABLE)); +} + +/*****************************************************************************/ +int sansUnrecoverable(int resultCode) +{ + return resultCode & ~UDS_UNRECOVERABLE; +} + +/*****************************************************************************/ +bool isUnrecoverable(int resultCode) +{ + return (bool)(resultCode & UDS_UNRECOVERABLE); +} + +/*****************************************************************************/ +int registerErrorBlock(const char *blockName, + int firstError, + int lastReservedError, + const ErrorInfo *infos, + size_t infoSize) +{ + int result = ASSERT(firstError < lastReservedError, + "bad error block range"); + if (result != UDS_SUCCESS) { + return result; + } + + if (registeredErrors.count == registeredErrors.allocated) { + // could reallocate and grow, but should never happen + return UDS_OVERFLOW; + } + + for (ErrorBlock *block = registeredErrors.blocks; + block < registeredErrors.blocks + registeredErrors.count; + ++block) { + if (strcmp(blockName, block->name) == 0) { + return UDS_DUPLICATE_NAME; + } + // check for overlap in error ranges + if ((firstError < block->max) && (lastReservedError > block->base)) { + return UDS_ALREADY_REGISTERED; + } + } + + registeredErrors.blocks[registeredErrors.count++] = (ErrorBlock) { + .name = blockName, + .base = firstError, + .last = firstError + (infoSize / sizeof(ErrorInfo)), + .max = lastReservedError, + .infos = infos + }; + + return UDS_SUCCESS; +} diff --git a/source/vdo/kernel/errors.h b/source/vdo/kernel/errors.h new file mode 100644 index 0000000..acfb777 --- /dev/null +++ b/source/vdo/kernel/errors.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/errors.h#1 $ + */ + +#ifndef ERRORS_H +#define ERRORS_H + +#include +#include "uds-error.h" + +enum udsInternalErrorCodes { + /** Used as a base value for reporting internal errors */ + UDS_INTERNAL_ERROR_CODE_BASE = 66560, + /** Client/server protocol framing error */ + UDS_PROTOCOL_ERROR = UDS_INTERNAL_ERROR_CODE_BASE + 0, + /** Index overflow */ + UDS_OVERFLOW = UDS_INTERNAL_ERROR_CODE_BASE + 1, + /** Fill phase done (intended for albfill only) */ + UDS_FILLDONE = UDS_INTERNAL_ERROR_CODE_BASE + 2, + /** Invalid argument passed to internal routine */ + UDS_INVALID_ARGUMENT = UDS_INTERNAL_ERROR_CODE_BASE + 3, + /** UDS data structures are in an invalid state */ + UDS_BAD_STATE = UDS_INTERNAL_ERROR_CODE_BASE + 4, + /** Attempt to enter the same name into an internal structure twice */ + UDS_DUPLICATE_NAME = UDS_INTERNAL_ERROR_CODE_BASE + 5, + /** An internal protocol violation between system components */ + UDS_UNEXPECTED_RESULT = UDS_INTERNAL_ERROR_CODE_BASE + 6, + /** An error created by test case processing */ + UDS_INJECTED_ERROR = UDS_INTERNAL_ERROR_CODE_BASE + 7, + /** An assertion failed */ + UDS_ASSERTION_FAILED = UDS_INTERNAL_ERROR_CODE_BASE + 8, + /** A file or stream is not scannable with the current scanner */ + UDS_UNSCANNABLE = UDS_INTERNAL_ERROR_CODE_BASE + 9, + /** Not an actual error, but reporting that the result will be delayed */ + UDS_QUEUED = UDS_INTERNAL_ERROR_CODE_BASE + 10, + /** Queue already connected */ + UDS_QUEUE_ALREADY_CONNECTED = UDS_INTERNAL_ERROR_CODE_BASE + 11, + /** Fill phase not supported */ + UDS_BAD_FILL_PHASE = UDS_INTERNAL_ERROR_CODE_BASE + 12, + /** A problem has occurred with a Buffer */ + UDS_BUFFER_ERROR = UDS_INTERNAL_ERROR_CODE_BASE + 13, + /** A network connection was lost */ + UDS_CONNECTION_LOST = UDS_INTERNAL_ERROR_CODE_BASE + 14, + /** A time out has occurred */ + UDS_TIMEOUT = UDS_INTERNAL_ERROR_CODE_BASE + 15, + /** No directory was found where one was expected */ + UDS_NO_DIRECTORY = UDS_INTERNAL_ERROR_CODE_BASE + 16, + /** Checkpoint not completed */ + UDS_CHECKPOINT_INCOMPLETE = UDS_INTERNAL_ERROR_CODE_BASE + 17, + /** Invalid albGenTest server run ID */ + UDS_INVALID_RUN_ID = UDS_INTERNAL_ERROR_CODE_BASE + 18, + /** albGenTest server run canceled */ + UDS_RUN_CANCELED = UDS_INTERNAL_ERROR_CODE_BASE + 19, + /** this error range has already been registered */ + UDS_ALREADY_REGISTERED = UDS_INTERNAL_ERROR_CODE_BASE + 20, + /** One more than the last UDS_INTERNAL error code */ + UDS_INTERNAL_ERROR_CODE_LAST, + /** One more than the last error this block will ever use */ + UDS_INTERNAL_ERROR_CODE_BLOCK_END = UDS_INTERNAL_ERROR_CODE_BASE + 440 +}; + +enum { + ERRBUF_SIZE = 128 // default size for buffer passed to stringError +}; + +const char *stringError(int errnum, char *buf, size_t buflen); +const char *stringErrorName(int errnum, char *buf, size_t buflen); + +int makeUnrecoverable(int resultCode) __attribute__((warn_unused_result)); +bool isUnrecoverable(int resultCode) __attribute__((warn_unused_result)); +int sansUnrecoverable(int resultCode) __attribute__((warn_unused_result)); + +typedef struct errorInfo { + const char *name; + const char *message; +} ErrorInfo; + +/** + * Initialize UDS error code blocks. + * + * @note Must be called once, before any of the other routines in this + * file. + **/ +void initializeStandardErrorBlocks(void); + +/** + * Register an error code block for stringError and stringErrorName. 
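+ *
+ * A minimal registration sketch (the block name, base constant, and entries
+ * below are hypothetical, shown only to illustrate the calling convention):
+ *
+ *   static const ErrorInfo myErrors[] = {
+ *     { "MY_ERROR_ONE", "First error in my block" },
+ *     { "MY_ERROR_TWO", "Second error in my block" },
+ *   };
+ *   registerErrorBlock("My Module", MY_ERROR_BASE, MY_ERROR_BASE + 100,
+ *                      myErrors, sizeof(myErrors));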
+ * + * @param blockName the name of the block of error codes + * @param firstError the first error code in the block + * @param lastReservedError one past the highest possible error in the block + * @param infos a pointer to the error info array for the block + * @param infoSize the size of the error info array, which determines + * the last actual error for which information is + * available + * + * @return a success or error code, particularly UDS_DUPLICATE_NAME if the + * block name is already present, or UDS_ALREADY_REGISTERED if a + * block with the specified error code is present + **/ +int registerErrorBlock(const char *blockName, + int firstError, + int lastReservedError, + const ErrorInfo *infos, + size_t infoSize); + +#endif /* ERRORS_H */ diff --git a/source/vdo/kernel/histogram.c b/source/vdo/kernel/histogram.c new file mode 100644 index 0000000..0e1a6ae --- /dev/null +++ b/source/vdo/kernel/histogram.c @@ -0,0 +1,665 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/histogram.c#2 $ + */ + +#include + +#include "memoryAlloc.h" +#include "typeDefs.h" + +#include "histogram.h" +#include "logger.h" +#include "numUtils.h" + +/* + * Set NO_BUCKETS to streamline the histogram code by reducing it to + * tracking just minimum, maximum, mean, etc. Only one bucket counter + * (the final one for "bigger" values) will be used, no range checking + * is needed to find the right bucket, and no histogram will be + * reported. With newer compilers, the histogram output code will be + * optimized out. + */ +enum { + NO_BUCKETS = 1 +}; + +/* + * Support histogramming in the VDO code. + * + * This is not a complete and general histogram package. It follows the XP + * practice of implementing the "customer" requirements, and no more. We can + * support other requirements after we know what they are. + * + * The code was originally borrowed from Albireo, and includes both linear and + * logarithmic histograms. VDO only uses the logarithmic histograms. + * + * All samples are uint64_t values. + * + * A unit conversion option is supported internally to allow sample values to + * be supplied in "jiffies" and results to be reported via /sys in + * milliseconds. Depending on the system configuration, this could mean a + * factor of four (a bucket for values of 1 jiffy is reported as 4-7 + * milliseconds). In theory it could be a non-integer ratio (including less + * than one), but as the x86-64 platforms we've encountered appear to use 1 or + * 4 milliseconds per jiffy, we don't support non-integer values yet. + * + * All internal processing uses the values as passed to enterHistogramSample. + * Conversions only affect the values seen or input through the /sys interface, + * including possibly rounding a "limit" value entered. 
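+ *
+ * For example, on a kernel with HZ == 250, jiffies_to_msecs(1) == 4, so the
+ * conversion factor is 4: the bucket holding samples of exactly 1 jiffy is
+ * reported as covering 4 * 1 = 4 through 4 * 2 - 1 = 7 milliseconds, which
+ * is how the lower and upper bounds are computed when the histogram is
+ * printed.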
+ */ + +struct histogram { + // These fields are ordered so that enterHistogramSample touches + // only the first cache line. + atomic64_t *counters; // Counter for each bucket + uint64_t limit; // We want to know how many samples are larger + atomic64_t sum; // Sum of all the samples + atomic64_t count; // Number of samples + atomic64_t minimum; // Minimum value + atomic64_t maximum; // Maximum value + atomic64_t unacceptable; // Number of samples that exceed the limit + int numBuckets; // The number of buckets + bool logFlag; // True if the y scale should be logarithmic + // These fields are used only when reporting results. + const char *label; // Histogram label + const char *countedItems; // Name for things being counted + const char *metric; // Term for value used to divide into buckets + const char *sampleUnits; // Unit for measuring metric; NULL for count + unsigned int conversionFactor; // Converts input units to reporting units + struct kobject kobj; +}; + +/* + * Fixed table defining the top value for each bucket of a logarithmic + * histogram. We arbitrarily limit the histogram to 12 orders of magnitude. + */ +enum { MAX_LOG_SIZE = 12 }; +static const uint64_t bottomValue[1 + 10 * MAX_LOG_SIZE] = { + // 0 to 10 - The first 10 buckets are linear + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + // 10 to 100 - From this point on, the Nth entry of the table is + // floor(exp10((double)N/10.0)). + 12, 15, 19, 25, 31, 39, 50, 63, 79, 100, + // 100 to 1K + 125, 158, 199, 251, 316, 398, 501, 630, 794, 1000, + // 1K to 10K + 1258, 1584, 1995, 2511, 3162, 3981, 5011, 6309, 7943, 10000, + // 10K to 100K + 12589, 15848, 19952, 25118, 31622, 39810, 50118, 63095, 79432, 100000, + // 100K to 1M + 125892, 158489, 199526, 251188, 316227, + 398107, 501187, 630957, 794328, 1000000, + // 1M to 10M + 1258925, 1584893, 1995262, 2511886, 3162277, + 3981071, 5011872, 6309573, 7943282, 10000000, + // 10M to 100M + 12589254, 15848931, 19952623, 25118864, 31622776, + 39810717, 50118723, 63095734, 79432823, 100000000, + // 100M to 1G + 125892541, 158489319, 199526231, 251188643, 316227766, + 398107170, 501187233, 630957344, 794328234, 1000000000, + // 1G to 10G + 1258925411L, 1584893192L, 1995262314L, 2511886431L, 3162277660L, + 3981071705L, 5011872336L, 6309573444L, 7943282347L, 10000000000L, + // 10G to 100G + 12589254117L, 15848931924L, 19952623149L, 25118864315L, 31622776601L, + 39810717055L, 50118723362L, 63095734448L, 79432823472L, 100000000000L, + // 100G to 1T + 125892541179L, 158489319246L, 199526231496L, 251188643150L, 316227766016L, + 398107170553L, 501187233627L, 630957344480L, 794328234724L, 1000000000000L, +}; + +/***********************************************************************/ +static unsigned int divideRoundingToNearest(uint64_t number, uint64_t divisor) +{ + number += divisor / 2; + return number / divisor; +} + +/***********************************************************************/ +static int maxBucket(Histogram *h) +{ + int max = h->numBuckets; + while ((max >= 0) && (atomic64_read(&h->counters[max]) == 0)) { + max--; + } + // max == -1 means that there were no samples + return max; +} + +/***********************************************************************/ + +typedef struct { + struct attribute attr; + ssize_t (*show)(Histogram *h, char *buf); + ssize_t (*store)(Histogram *h, const char *buf, size_t length); +} HistogramAttribute; + +/***********************************************************************/ +static void histogramKobjRelease(struct kobject *kobj) +{ + Histogram *h = 
container_of(kobj, Histogram, kobj); + FREE(h->counters); + FREE(h); +} + +/***********************************************************************/ +static ssize_t histogramShow(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + HistogramAttribute *ha = container_of(attr, HistogramAttribute, attr); + if (ha->show == NULL) { + return -EINVAL; + } + Histogram *h = container_of(kobj, Histogram, kobj); + return ha->show(h, buf); +} + +/***********************************************************************/ +static ssize_t histogramStore(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t length) +{ + HistogramAttribute *ha = container_of(attr, HistogramAttribute, attr); + if (ha->show == NULL) { + return -EINVAL; + } + Histogram *h = container_of(kobj, Histogram, kobj); + return ha->store(h, buf, length); +} + +/***********************************************************************/ +static ssize_t histogramShowCount(Histogram *h, char *buf) +{ + int64_t count = atomic64_read(&h->count); + return sprintf(buf, "%" PRId64 "\n", count); +} + +/***********************************************************************/ +static ssize_t histogramShowHistogram(Histogram *h, char *buffer) +{ + /* + * We're given one page in which to write. The caller logs a complaint if we + * report that we've written too much, so we'll truncate to PAGE_SIZE-1. + */ + size_t bufferSize = PAGE_SIZE; + bool bars = true; + ssize_t length = 0; + int max = maxBucket(h); + // If max is -1, we'll fall through to reporting the total of zero. + + enum { BAR_SIZE = 50 }; + char bar[BAR_SIZE + 2]; + bar[0] = ' '; + memset(bar + 1, '=', BAR_SIZE); + bar[BAR_SIZE + 1] = '\0'; + + uint64_t total = 0; + for (int i = 0; i <= max; i++) { + total += atomic64_read(&h->counters[i]); + } + + length += snprintf(buffer, bufferSize, "%s Histogram - number of %s by %s", + h->label, h->countedItems, h->metric); + if (length >= (bufferSize - 1)) { + return bufferSize - 1; + } + if (h->sampleUnits != NULL) { + length += snprintf(buffer + length, bufferSize - length, " (%s)", + h->sampleUnits); + if (length >= (bufferSize - 1)) { + return bufferSize - 1; + } + } + length += snprintf(buffer + length, bufferSize - length, "\n"); + if (length >= (bufferSize - 1)) { + return bufferSize - 1; + } + for (int i = 0; i <= max; i++) { + uint64_t value = atomic64_read(&h->counters[i]); + + unsigned int barLength; + if (bars && (total != 0)) { + // +1 for the space at the beginning + barLength = (divideRoundingToNearest(value * BAR_SIZE, total) + 1); + if (barLength == 1) { + // Don't bother printing just the initial space. 
+ barLength = 0; + } + } else { + // 0 means skip the space and the bar + barLength = 0; + } + + if (h->logFlag) { + if (i == h->numBuckets) { + length += snprintf(buffer + length, bufferSize - length, "%-16s", + "Bigger"); + } else { + unsigned int lower = h->conversionFactor * bottomValue[i]; + unsigned int upper = h->conversionFactor * bottomValue[i + 1] - 1; + length += snprintf(buffer + length, bufferSize - length, "%6u - %7u", + lower, upper); + } + } else { + if (i == h->numBuckets) { + length += snprintf(buffer + length, bufferSize - length, "%6s", + "Bigger"); + } else { + length += snprintf(buffer + length, bufferSize - length, "%6d", i); + } + } + if (length >= (bufferSize - 1)) { + return bufferSize - 1; + } + length += snprintf(buffer + length, bufferSize - length, + " : %12llu%.*s\n", value, barLength, bar); + if (length >= (bufferSize - 1)) { + return bufferSize - 1; + } + } + + length += snprintf(buffer + length, bufferSize - length, + "total %llu\n", total); + return minSizeT(bufferSize - 1, length); +} + +/***********************************************************************/ +static ssize_t histogramShowMaximum(Histogram *h, char *buf) +{ + // Maximum is initialized to 0. + unsigned long value = atomic64_read(&h->maximum); + return sprintf(buf, "%lu\n", h->conversionFactor * value); +} + +/***********************************************************************/ +static ssize_t histogramShowMinimum(Histogram *h, char *buf) +{ + // Minimum is initialized to -1. + unsigned long value = ((atomic64_read(&h->count) > 0) + ? atomic64_read(&h->minimum) + : 0); + return sprintf(buf, "%lu\n", h->conversionFactor * value); +} + +/***********************************************************************/ +static ssize_t histogramShowLimit(Histogram *h, char *buf) +{ + // Display the limit in the reporting units + return sprintf(buf, "%u\n", (unsigned int)(h->conversionFactor * h->limit)); +} + +/***********************************************************************/ +static ssize_t histogramStoreLimit(Histogram *h, + const char *buf, + size_t length) +{ + unsigned int value; + if ((length > 12) || (sscanf(buf, "%u", &value) != 1)) { + return -EINVAL; + } + /* + * Convert input from reporting units (e.g., milliseconds) to internal + * recording units (e.g., jiffies). + * + * computeBucketCount could also be called "divideRoundingUp". 
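+   *
+   * For instance, with a conversion factor of 4 (jiffies reported as
+   * milliseconds), writing "100" to this attribute stores
+   * computeBucketCount(100, 4) == 25 jiffies internally; writing "99" rounds
+   * up to the same 25-jiffy limit.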
+ */ + h->limit = computeBucketCount(value, h->conversionFactor); + atomic64_set(&h->unacceptable, 0); + return length; +} + +/***********************************************************************/ +static ssize_t histogramShowMean(Histogram *h, char *buf) +{ + uint64_t count = atomic64_read(&h->count); + if (count == 0) { + return sprintf(buf, "0/0\n"); + } + // Compute mean, scaled up by 1000, in reporting units + unsigned long sumTimes1000InReportingUnits + = h->conversionFactor * atomic64_read(&h->sum) * 1000; + unsigned int meanTimes1000 + = divideRoundingToNearest(sumTimes1000InReportingUnits, count); + // Print mean with fractional part + return sprintf(buf, "%u.%03u\n", meanTimes1000 / 1000, + meanTimes1000 % 1000); +} + +/***********************************************************************/ +static ssize_t histogramShowUnacceptable(Histogram *h, char *buf) +{ + int64_t count = atomic64_read(&h->unacceptable); + return sprintf(buf, "%" PRId64 "\n", count); +} + +/***********************************************************************/ +static ssize_t histogramShowLabel(Histogram *h, char *buf) +{ + return sprintf(buf, "%s\n", h->label); +} + +/***********************************************************************/ +static ssize_t histogramShowUnit(Histogram *h, char *buf) +{ + if (h->sampleUnits != NULL) { + return sprintf(buf, "%s\n", h->sampleUnits); + } else { + *buf = 0; + return 0; + } +} + +/***********************************************************************/ + +static struct sysfs_ops histogramSysfsOps = { + .show = histogramShow, + .store = histogramStore, +}; + +static HistogramAttribute countAttribute = { + .attr = { .name = "count", .mode = 0444, }, + .show = histogramShowCount, +}; + +static HistogramAttribute histogramAttribute = { + .attr = { .name = "histogram", .mode = 0444, }, + .show = histogramShowHistogram, +}; + +static HistogramAttribute labelAttribute = { + .attr = { .name = "label", .mode = 0444, }, + .show = histogramShowLabel, +}; + +static HistogramAttribute maximumAttribute = { + .attr = { .name = "maximum", .mode = 0444, }, + .show = histogramShowMaximum, +}; + +static HistogramAttribute minimumAttribute = { + .attr = { .name = "minimum", .mode = 0444, }, + .show = histogramShowMinimum, +}; + +static HistogramAttribute limitAttribute = { + .attr = { .name = "limit", .mode = 0644, }, + .show = histogramShowLimit, + .store = histogramStoreLimit, +}; + +static HistogramAttribute meanAttribute = { + .attr = { .name = "mean", .mode = 0444, }, + .show = histogramShowMean, +}; + +static HistogramAttribute unacceptableAttribute = { + .attr = { .name = "unacceptable", .mode = 0444, }, + .show = histogramShowUnacceptable, +}; + +static HistogramAttribute unitAttribute = { + .attr = { .name = "unit", .mode = 0444, }, + .show = histogramShowUnit, +}; + +// "Real" histogram plotting. 
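+
+/*
+ * Illustrative usage sketch (editor's example; the identifiers here are
+ * hypothetical and nothing in this file refers to them): a caller creates a
+ * jiffies-based latency histogram under a parent kobject and records one
+ * sample per completed request.
+ *
+ *   Histogram *latencies
+ *     = makeLogarithmicJiffiesHistogram(parentKobject, "example_latencies",
+ *                                       "Example Latency", "requests",
+ *                                       "latency", 5);
+ *   ...
+ *   enterHistogramSample(latencies, jiffies - requestStartJiffies);
+ *
+ * The histogram's sysfs directory then exposes the attribute files listed
+ * just below: count, histogram, label, limit, maximum, mean, minimum,
+ * unacceptable, and unit.
+ */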
+static struct attribute *histogramAttributes[] = { + &countAttribute.attr, + &histogramAttribute.attr, + &labelAttribute.attr, + &limitAttribute.attr, + &maximumAttribute.attr, + &meanAttribute.attr, + &minimumAttribute.attr, + &unacceptableAttribute.attr, + &unitAttribute.attr, + NULL, +}; + +static struct kobj_type histogramKobjType = { + .release = histogramKobjRelease, + .sysfs_ops = &histogramSysfsOps, + .default_attrs = histogramAttributes, +}; + +static struct attribute *bucketlessHistogramAttributes[] = { + &countAttribute.attr, + &labelAttribute.attr, + &maximumAttribute.attr, + &meanAttribute.attr, + &minimumAttribute.attr, + &unitAttribute.attr, + NULL, +}; + +static struct kobj_type bucketlessHistogramKobjType = { + .release = histogramKobjRelease, + .sysfs_ops = &histogramSysfsOps, + .default_attrs = bucketlessHistogramAttributes, +}; + +/***********************************************************************/ +static Histogram *makeHistogram(struct kobject *parent, + const char *name, + const char *label, + const char *countedItems, + const char *metric, + const char *sampleUnits, + int numBuckets, + unsigned long conversionFactor, + bool logFlag) +{ + Histogram *h; + if (ALLOCATE(1, Histogram, "histogram", &h) != UDS_SUCCESS) { + return NULL; + } + + if (NO_BUCKETS) { + numBuckets = 0; // plus 1 for "bigger" bucket + } + + if (numBuckets <= 10) { + /* + * The first buckets in a "logarithmic" histogram are still + * linear, but the bucket-search mechanism is a wee bit slower + * than for linear, so change the type. + */ + logFlag = false; + } + + h->label = label; + h->countedItems = countedItems; + h->metric = metric; + h->sampleUnits = sampleUnits; + h->logFlag = logFlag; + h->numBuckets = numBuckets; + h->conversionFactor = conversionFactor; + atomic64_set(&h->minimum, -1UL); + + if (ALLOCATE(h->numBuckets + 1, atomic64_t, "histogram counters", + &h->counters) != UDS_SUCCESS) { + histogramKobjRelease(&h->kobj); + return NULL; + } + + kobject_init(&h->kobj, + ((numBuckets > 0) + ? &histogramKobjType + : &bucketlessHistogramKobjType)); + if (kobject_add(&h->kobj, parent, name) != 0) { + histogramKobjRelease(&h->kobj); + return NULL; + } + return h; +} + +/***********************************************************************/ +Histogram *makeLinearHistogram(struct kobject *parent, + const char *name, + const char *initLabel, + const char *countedItems, + const char *metric, + const char *sampleUnits, + int size) +{ + return makeHistogram(parent, name, initLabel, countedItems, + metric, sampleUnits, size, 1, false); +} + + +/** + * Intermediate routine for creating logarithmic histograms. + * + * Limits the histogram size, and computes the bucket count from the + * orders-of-magnitude count. + * + * @param parent The parent kobject. + * @param name The short name of the histogram. This label is + * used for the sysfs node. + * @param initLabel The label for the sampled data. This label is used + * when we plot the data. + * @param countedItems A name (plural) for the things being counted. + * @param metric The measure being used to divide samples into + * buckets. + * @param sampleUnits The units (plural) for the metric, or NULL if it's + * a simple counter. + * @param logSize The number of buckets. There are buckets for a + * range of sizes up to 10^logSize, and an extra + * bucket for larger samples. + * @param conversionFactor Unit conversion factor for reporting. 
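+ *
+ * For example, a logSize of 5 yields 10 * 5 = 50 ranged buckets covering
+ * samples up to 10^5, plus the extra "Bigger" bucket; a logSize above
+ * MAX_LOG_SIZE (12) is clamped to 12.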
+ * + * @return the histogram + **/ +static Histogram * +makeLogarithmicHistogramWithConversionFactor(struct kobject *parent, + const char *name, + const char *initLabel, + const char *countedItems, + const char *metric, + const char *sampleUnits, + int logSize, + uint64_t conversionFactor) +{ + if (logSize > MAX_LOG_SIZE) { + logSize = MAX_LOG_SIZE; + } + return makeHistogram(parent, name, + initLabel, countedItems, metric, sampleUnits, + 10 * logSize, conversionFactor, true); +} + +/***********************************************************************/ +Histogram *makeLogarithmicHistogram(struct kobject *parent, + const char *name, + const char *initLabel, + const char *countedItems, + const char *metric, + const char *sampleUnits, + int logSize) +{ + return makeLogarithmicHistogramWithConversionFactor(parent, name, initLabel, + countedItems, + metric, sampleUnits, + logSize, 1); +} + +/***********************************************************************/ +Histogram *makeLogarithmicJiffiesHistogram(struct kobject *parent, + const char *name, + const char *initLabel, + const char *countedItems, + const char *metric, + int logSize) +{ + /* + * If these fail, we have a jiffy duration that is not an integral number of + * milliseconds, and the unit conversion code needs updating. + */ + STATIC_ASSERT(HZ <= MSEC_PER_SEC); + STATIC_ASSERT((MSEC_PER_SEC % HZ) == 0); + return makeLogarithmicHistogramWithConversionFactor(parent, name, initLabel, + countedItems, + metric, "milliseconds", + logSize, + jiffies_to_msecs(1)); +} + +/***********************************************************************/ +void enterHistogramSample(Histogram *h, uint64_t sample) +{ + int bucket; + if (h->logFlag) { + int lo = 0; + int hi = h->numBuckets; + while (lo < hi) { + int middle = (lo + hi) / 2; + if (sample < bottomValue[middle + 1]) { + hi = middle; + } else { + lo = middle + 1; + } + } + bucket = lo; + } else { + bucket = sample < h->numBuckets ? sample : h->numBuckets; + } + atomic64_inc(&h->counters[bucket]); + atomic64_inc(&h->count); + atomic64_add(sample, &h->sum); + if ((h->limit > 0) && (sample > h->limit)) { + atomic64_inc(&h->unacceptable); + } + + /* + * Theoretically this could loop a lot; in practice it should rarely + * do more than a single read, with no memory barrier, from a cache + * line we've already referenced above. + */ + uint64_t oldMaximum = atomic64_read(&h->maximum); + while (oldMaximum < sample) { + uint64_t readValue = atomic64_cmpxchg(&h->maximum, oldMaximum, sample); + if (readValue == oldMaximum) { + break; + } + oldMaximum = readValue; + } + + uint64_t oldMinimum = atomic64_read(&h->minimum); + while (oldMinimum > sample) { + uint64_t readValue = atomic64_cmpxchg(&h->minimum, oldMinimum, sample); + if (readValue == oldMinimum) { + break; + } + oldMinimum = readValue; + } +} + +/***********************************************************************/ +void freeHistogram(Histogram **hp) +{ + if (*hp != NULL) { + Histogram *h = *hp; + kobject_put(&h->kobj); + *hp = NULL; + } +} diff --git a/source/vdo/kernel/histogram.h b/source/vdo/kernel/histogram.h new file mode 100644 index 0000000..a177e0a --- /dev/null +++ b/source/vdo/kernel/histogram.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/histogram.h#1 $ + */ + +#ifndef HISTOGRAM_H +#define HISTOGRAM_H + +#include + +typedef struct histogram Histogram; + +/** + * Allocate and initialize a histogram that uses linearly sized buckets. + * + * The histogram label reported via /sys is constructed from several of the + * values passed here; it will be something like "Init Label Histogram - number + * of countedItems grouped by metric (sampleUnits)", e.g., "Flush Forwarding + * Histogram - number of flushes grouped by latency (milliseconds)". Thus + * countedItems and sampleUnits should be plural. + * + * The sampleUnits string will also be reported separately via another /sys + * entry to aid in programmatic processing of the results, so the strings used + * should be consistent (e.g., always "milliseconds" and not "ms" for + * milliseconds). + * + * @param parent The parent kobject. + * @param name The short name of the histogram. This label is used + * for the sysfs node. + * @param initLabel The label for the sampled data. This label is used + * when we plot the data. + * @param countedItems A name (plural) for the things being counted. + * @param metric The measure being used to divide samples into buckets. + * @param sampleUnits The unit (plural) for the metric, or NULL if it's a + * simple counter. + * @param size The number of buckets. There are buckets for every + * value from 0 up to size (but not including) size. + * There is an extra bucket for larger samples. + * + * @return the histogram + **/ +Histogram *makeLinearHistogram(struct kobject *parent, + const char *name, + const char *initLabel, + const char *countedItems, + const char *metric, + const char *sampleUnits, + int size); + +/** + * Allocate and initialize a histogram that uses logarithmically sized + * buckets. + * + * @param parent The parent kobject. + * @param name The short name of the histogram. This label is used + * for the sysfs node. + * @param initLabel The label for the sampled data. This label is used + * when we plot the data. + * @param countedItems A name (plural) for the things being counted. + * @param metric The measure being used to divide samples into buckets. + * @param sampleUnits The unit (plural) for the metric, or NULL if it's a + * simple counter. + * @param logSize The number of buckets. There are buckets for a range + * of sizes up to 10^logSize, and an extra bucket for + * larger samples. + * + * @return the histogram + **/ +Histogram *makeLogarithmicHistogram(struct kobject *parent, + const char *name, + const char *initLabel, + const char *countedItems, + const char *metric, + const char *sampleUnits, + int logSize); + +/** + * Allocate and initialize a histogram that uses logarithmically sized + * buckets. Values are entered that count in jiffies, and they are + * reported in milliseconds. + * + * @param parent The parent kobject. + * @param name The short name of the histogram. This label is used + * for the sysfs node. + * @param initLabel The label for the sampled data. 
This label is used
+ *                       when we plot the data.
+ * @param countedItems   A name (plural) for the things being counted.
+ * @param metric         The measure being used to divide samples into buckets.
+ * @param logSize        The number of buckets.  There are buckets for a range
+ *                       of sizes up to 10^logSize, and an extra bucket for
+ *                       larger samples.
+ *
+ * @return the histogram
+ **/
+Histogram *makeLogarithmicJiffiesHistogram(struct kobject *parent,
+                                           const char     *name,
+                                           const char     *initLabel,
+                                           const char     *countedItems,
+                                           const char     *metric,
+                                           int             logSize);
+
+/**
+ * Enter a sample into a histogram
+ *
+ * @param h       The histogram
+ * @param sample  The sample
+ **/
+void enterHistogramSample(Histogram *h, uint64_t sample);
+
+/**
+ * Free a histogram and null out the reference to it.
+ *
+ * @param hp    The reference to the histogram.
+ **/
+void freeHistogram(Histogram **hp);
+
+#endif /* HISTOGRAM_H */
diff --git a/source/vdo/kernel/instanceNumber.c b/source/vdo/kernel/instanceNumber.c
new file mode 100644
index 0000000..178fd92
--- /dev/null
+++ b/source/vdo/kernel/instanceNumber.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/instanceNumber.c#1 $
+ */
+
+#include "instanceNumber.h"
+
+#include
+#include
+
+#include "memoryAlloc.h"
+#include "numUtils.h"
+#include "permassert.h"
+
+/*
+ * Track in-use instance numbers using a flat bit array.
+ *
+ * O(n) run time isn't ideal, but if we have 1000 VDO devices in use
+ * simultaneously we still only need to scan 16 words, so it's not
+ * likely to be a big deal compared to other resource usage.
+ */
+
+enum {
+  /**
+   * This minimum size for the bit array creates a numbering space of 0-999,
+   * which allows successive starts of the same volume to have different
+   * instance numbers in any reasonably-sized test.  Changing instances on
+   * restart allows vdoMonReport to detect that the ephemeral stats have reset
+   * to zero.
+   **/
+  BIT_COUNT_MINIMUM = 1000,
+  /** Grow the bit array by this many bits when needed */
+  BIT_COUNT_INCREMENT = 100,
+};
+
+static struct mutex instanceNumberLock;
+static unsigned int bitCount;
+static unsigned long *words;
+static unsigned int instanceCount;
+static unsigned int nextInstance;
+
+/**
+ * Return the number of bytes needed to store a bit array of the specified
+ * capacity in an array of unsigned longs.
+ *
+ * @param bitCount  The number of bits the array must hold
+ *
+ * @return the number of bytes needed for the array representation
+ **/
+static size_t getBitArraySize(unsigned int bitCount)
+{
+  // Round up to a multiple of the word size and convert to a byte count.
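+  // For example, on a 64-bit machine holding 1000 bits takes
+  // computeBucketCount(1000, 64) == 16 unsigned longs, i.e. 128 bytes.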
+ return (computeBucketCount(bitCount, BITS_PER_LONG) * sizeof(unsigned long)); +} + +/** + * Re-allocate the bitmap word array so there will more instance numbers that + * can be allocated. Since the array is initially NULL, this also initializes + * the array the first time we allocate an instance number. + * + * @return UDS_SUCCESS or an error code from the allocation + **/ +static int growBitArray(void) +{ + unsigned int newCount = maxUInt(bitCount + BIT_COUNT_INCREMENT, + BIT_COUNT_MINIMUM); + unsigned long *newWords; + int result = reallocateMemory(words, + getBitArraySize(bitCount), + getBitArraySize(newCount), + "instance number bit array", + &newWords); + if (result != UDS_SUCCESS) { + return result; + } + + bitCount = newCount; + words = newWords; + return UDS_SUCCESS; +} + +/**********************************************************************/ +static int allocateKVDOInstanceLocked(unsigned int *instancePtr) +{ + // If there are no unallocated instances, grow the bit array. + if (instanceCount >= bitCount) { + int result = growBitArray(); + if (result != UDS_SUCCESS) { + return result; + } + } + + // There must be a zero bit somewhere now. Find it, starting just after the + // last instance allocated. + unsigned int instance = find_next_zero_bit(words, bitCount, nextInstance); + if (instance >= bitCount) { + // Nothing free after nextInstance, so wrap around to instance zero. + instance = find_first_zero_bit(words, bitCount); + int result = ASSERT(instance < bitCount, "impossibly, no zero bit found"); + if (result != UDS_SUCCESS) { + return result; + } + } + + __set_bit(instance, words); + instanceCount += 1; + nextInstance = instance + 1; + *instancePtr = instance; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int allocateKVDOInstance(unsigned int *instancePtr) +{ + mutex_lock(&instanceNumberLock); + int result = allocateKVDOInstanceLocked(instancePtr); + mutex_unlock(&instanceNumberLock); + return result; +} + +/**********************************************************************/ +void releaseKVDOInstance(unsigned int instance) +{ + mutex_lock(&instanceNumberLock); + if (instance >= bitCount) { + ASSERT_LOG_ONLY(false, "instance number %u must be less than bit count %u", + instance, bitCount); + } else if (test_bit(instance, words) == 0) { + ASSERT_LOG_ONLY(false, "instance number %u must be allocated", instance); + } else { + __clear_bit(instance, words); + instanceCount -= 1; + } + mutex_unlock(&instanceNumberLock); +} + +/**********************************************************************/ +void initializeInstanceNumberTracking(void) +{ + mutex_init(&instanceNumberLock); +} + +/**********************************************************************/ +void cleanUpInstanceNumberTracking(void) +{ + ASSERT_LOG_ONLY(instanceCount == 0, + "should have no instance numbers still in use, but have %u", + instanceCount); + FREE(words); + words = NULL; + bitCount = 0; + instanceCount = 0; + nextInstance = 0; + mutex_destroy(&instanceNumberLock); +} diff --git a/source/vdo/kernel/instanceNumber.h b/source/vdo/kernel/instanceNumber.h new file mode 100644 index 0000000..6d96bad --- /dev/null +++ b/source/vdo/kernel/instanceNumber.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/instanceNumber.h#1 $ + */ + +/** + * Allocate an instance number. + * + * @param [out] instancePtr An integer to hold the allocated instance number + * + * @result UDS_SUCCESS or an error code + **/ +int allocateKVDOInstance(unsigned int *instancePtr); + +/** + * Release an instance number previously allocated. + * + * @param instance The instance number to release + **/ +void releaseKVDOInstance(unsigned int instance); + +/** + * Initialize the instance-number tracking data structures. + **/ +void initializeInstanceNumberTracking(void); + +/** + * Free up the instance-number tracking data structures. + **/ +void cleanUpInstanceNumberTracking(void); diff --git a/source/vdo/kernel/ioSubmitter.c b/source/vdo/kernel/ioSubmitter.c new file mode 100644 index 0000000..036bf25 --- /dev/null +++ b/source/vdo/kernel/ioSubmitter.c @@ -0,0 +1,668 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/ioSubmitter.c#8 $ + */ + +#include "ioSubmitter.h" + +#include + +#include "memoryAlloc.h" + +#include "bio.h" +#include "dataKVIO.h" +#include "kernelLayer.h" +#include "logger.h" + +enum { + /* + * Whether to use bio merging code. + * + * Merging I/O requests in the request queue below us is helpful for + * many devices, and VDO does a good job sometimes of shuffling up + * the I/O order (too much for some simple I/O schedulers to sort + * out) as we deal with dedupe advice etc. The bio map tracks the + * yet-to-be-submitted I/O requests by block number so that we can + * collect together and submit sequential I/O operations that should + * be easy to merge. (So we don't actually *merge* them here, we + * just arrange them so that merging can happen.) + * + * For some devices, merging may not help, and we may want to turn + * off this code and save compute/spinlock cycles. 
+ */ + USE_BIOMAP = 1, +}; + +/* + * Submission of bio operations to the underlying storage device will + * go through a separate work queue thread (or more than one) to + * prevent blocking in other threads if the storage device has a full + * queue. The plug structure allows that thread to do better batching + * of requests to make the I/O more efficient. + * + * When multiple worker threads are used, a thread is chosen for a + * I/O operation submission based on the PBN, so a given PBN will + * consistently wind up on the same thread. Flush operations are + * assigned round-robin. + * + * The map (protected by the mutex) collects pending I/O operations so + * that the worker thread can reorder them to try to encourage I/O + * request merging in the request queue underneath. + */ +typedef struct bioQueueData { + KvdoWorkQueue *queue; + struct blk_plug plug; + IntMap *map; + struct mutex lock; + unsigned int queueNumber; +} BioQueueData; + +struct ioSubmitter { + unsigned int numBioQueuesUsed; + unsigned int bioQueueRotationInterval; + unsigned int bioQueueRotor; + BioQueueData bioQueueData[]; +}; + +/**********************************************************************/ +static void startBioQueue(void *ptr) +{ + BioQueueData *bioQueueData = (BioQueueData *)ptr; + blk_start_plug(&bioQueueData->plug); +} + +/**********************************************************************/ +static void finishBioQueue(void *ptr) +{ + BioQueueData *bioQueueData = (BioQueueData *)ptr; + blk_finish_plug(&bioQueueData->plug); +} + +static const KvdoWorkQueueType bioQueueType = { + .start = startBioQueue, + .finish = finishBioQueue, + .actionTable = { + { .name = "bio_compressed_data", + .code = BIO_Q_ACTION_COMPRESSED_DATA, + .priority = 0 }, + { .name = "bio_data", + .code = BIO_Q_ACTION_DATA, + .priority = 0 }, + { .name = "bio_flush", + .code = BIO_Q_ACTION_FLUSH, + .priority = 2 }, + { .name = "bio_high", + .code = BIO_Q_ACTION_HIGH, + .priority = 2 }, + { .name = "bio_metadata", + .code = BIO_Q_ACTION_METADATA, + .priority = 1 }, + { .name = "bio_readcache", + .code = BIO_Q_ACTION_READCACHE, + .priority = 0 }, + { .name = "bio_verify", + .code = BIO_Q_ACTION_VERIFY, + .priority = 1 }, + }, +}; + +/** + * Check that we're running normally (i.e., not in an + * interrupt-servicing context) in an IOSubmitter bio thread. + **/ +static void assertRunningInBioQueue(void) +{ + ASSERT_LOG_ONLY(!in_interrupt(), "not in interrupt context"); + ASSERT_LOG_ONLY(strnstr(current->comm, "bioQ", TASK_COMM_LEN) != NULL, + "running in bio submission work queue thread"); +} + +/** + * Returns the BioQueueData pointer associated with the current thread. + * Results are undefined if called from any other thread. + * + * @return the BioQueueData pointer + **/ +static inline BioQueueData *getCurrentBioQueueData(void) +{ + BioQueueData *bioQueueData = (BioQueueData *) getWorkQueuePrivateData(); + // Does it look like a bio queue thread? + BUG_ON(bioQueueData == NULL); + BUG_ON(bioQueueData->queue != getCurrentWorkQueue()); + return bioQueueData; +} + +/**********************************************************************/ +static inline IOSubmitter *bioQueueToSubmitter(BioQueueData *bioQueue) +{ + BioQueueData *firstBioQueue = bioQueue - bioQueue->queueNumber; + IOSubmitter *submitter = container_of(firstBioQueue, IOSubmitter, + bioQueueData[0]); + return submitter; +} + +/** + * Return the bio thread number handling the specified physical block + * number. 
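+ *
+ * For example (illustrative values), with numBioQueuesUsed == 4 and
+ * bioQueueRotationInterval == 64, PBNs 0-63 map to queue 0, PBNs 64-127 to
+ * queue 1, PBNs 128-191 to queue 2, PBNs 192-255 to queue 3, and PBNs
+ * 256-319 wrap back to queue 0.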
+ * + * @param ioSubmitter The I/O submitter data + * @param pbn The physical block number + * + * @return read cache zone number + **/ +static unsigned int bioQueueNumberForPBN(IOSubmitter *ioSubmitter, + PhysicalBlockNumber pbn) +{ + unsigned int bioQueueIndex + = ((pbn + % (ioSubmitter->numBioQueuesUsed + * ioSubmitter->bioQueueRotationInterval)) + / ioSubmitter->bioQueueRotationInterval); + + return bioQueueIndex; +} + +/** + * Check that we're running normally (i.e., not in an + * interrupt-servicing context) in an IOSubmitter bio thread. Also + * require that the thread we're running on is the correct one for the + * supplied physical block number. + * + * @param pbn The PBN that should have been used in thread selection + **/ +static void assertRunningInBioQueueForPBN(PhysicalBlockNumber pbn) +{ + assertRunningInBioQueue(); + + BioQueueData *thisQueue = getCurrentBioQueueData(); + IOSubmitter *submitter = bioQueueToSubmitter(thisQueue); + unsigned int computedQueueNumber = bioQueueNumberForPBN(submitter, pbn); + ASSERT_LOG_ONLY(thisQueue->queueNumber == computedQueueNumber, + "running in correct bio queue (%u vs %u) for PBN %llu", + thisQueue->queueNumber, computedQueueNumber, pbn); +} + +/** + * Increments appropriate counters for bio completions + * + * @param kvio the kvio associated with the bio + * @param bio the bio to count + */ +static void countAllBiosCompleted(KVIO *kvio, BIO *bio) +{ + KernelLayer *layer = kvio->layer; + if (isData(kvio)) { + countBios(&layer->biosOutCompleted, bio); + return; + } + + countBios(&layer->biosMetaCompleted, bio); + if (kvio->vio->type == VIO_TYPE_RECOVERY_JOURNAL) { + countBios(&layer->biosJournalCompleted, bio); + } else if (kvio->vio->type == VIO_TYPE_BLOCK_MAP) { + countBios(&layer->biosPageCacheCompleted, bio); + } +} + +/**********************************************************************/ +void countCompletedBios(BIO *bio) +{ + KVIO *kvio = (KVIO *)bio->bi_private; + KernelLayer *layer = kvio->layer; + atomic64_inc(&layer->biosCompleted); + countAllBiosCompleted(kvio, bio); +} + +/**********************************************************************/ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) +void completeAsyncBio(BIO *bio) +#else +void completeAsyncBio(BIO *bio, int error) +#endif +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) + int error = getBioResult(bio); +#endif + KVIO *kvio = (KVIO *) bio->bi_private; + kvioAddTraceRecord(kvio, THIS_LOCATION("$F($io);cb=io($io)")); + countCompletedBios(bio); + if ((error == 0) && isData(kvio) && isReadVIO(kvio->vio)) { + DataKVIO *dataKVIO = kvioAsDataKVIO(kvio); + if (!isCompressed(dataKVIO->dataVIO.mapped.state) + && !dataKVIO->isPartial) { + kvdoAcknowledgeDataVIO(&dataKVIO->dataVIO); + return; + } + } + kvdoContinueKvio(kvio, error); +} + +/** + * Determines which bio counter to use + * + * @param kvio the kvio associated with the bio + * @param bio the bio to count + */ +static void countAllBios(KVIO *kvio, BIO *bio) +{ + KernelLayer *layer = kvio->layer; + if (isData(kvio)) { + countBios(&layer->biosOut, bio); + return; + } + + countBios(&layer->biosMeta, bio); + if (kvio->vio->type == VIO_TYPE_RECOVERY_JOURNAL) { + countBios(&layer->biosJournal, bio); + } else if (kvio->vio->type == VIO_TYPE_BLOCK_MAP) { + countBios(&layer->biosPageCache, bio); + } +} + +/** + * Update stats and tracing info, then submit the supplied bio to the + * OS for processing. 
+ * + * @param kvio The KVIO associated with the bio + * @param bio The bio to submit to the OS + * @param location Call site location for tracing + **/ +static void sendBioToDevice(KVIO *kvio, BIO *bio, TraceLocation location) +{ + assertRunningInBioQueueForPBN(kvio->vio->physical); + + atomic64_inc(&kvio->layer->biosSubmitted); + countAllBios(kvio, bio); + kvioAddTraceRecord(kvio, location); + bio->bi_next = NULL; + generic_make_request(bio); +} + +/** + * Submits a bio to the underlying block device. May block if the + * device is busy. + * + * For metadata or if USE_BIOMAP is disabled, kvio->bioToSubmit holds + * the BIO pointer to submit to the target device. For normal + * data when USE_BIOMAP is enabled, kvio->biosMerged is the list of + * all bios collected together in this group; all of them get + * submitted. In both cases, the bi_end_io callback is invoked when + * each I/O operation completes. + * + * @param item The work item in the KVIO "owning" either the bio to + * submit, or the head of the bio_list to be submitted. + **/ +static void processBioMap(KvdoWorkItem *item) +{ + assertRunningInBioQueue(); + KVIO *kvio = workItemAsKVIO(item); + /* + * XXX Make these paths more regular: Should bi_bdev be set here, or + * in the caller, or in the callback function? Should we call + * finishBioQueue for the biomap case on old kernels? + */ + if (USE_BIOMAP && isData(kvio)) { + // We need to make sure to do two things here: + // 1. Use each bio's kvio when submitting. Any other kvio is not safe + // 2. Detach the bio list from the kvio before submitting, because it + // could get reused/free'd up before all bios are submitted. + BioQueueData *bioQueueData = getWorkQueuePrivateData(); + BIO *bio = NULL; + mutex_lock(&bioQueueData->lock); + if (!bio_list_empty(&kvio->biosMerged)) { + intMapRemove(bioQueueData->map, getBioSector(kvio->biosMerged.head)); + intMapRemove(bioQueueData->map, getBioSector(kvio->biosMerged.tail)); + } + bio = kvio->biosMerged.head; + bio_list_init(&kvio->biosMerged); + mutex_unlock(&bioQueueData->lock); + // Somewhere in the list we'll be submitting the current "kvio", + // so drop our handle on it now. + kvio = NULL; + + while (bio != NULL) { + KVIO *kvioBio = bio->bi_private; + BIO *next = bio->bi_next; + bio->bi_next = NULL; + setBioBlockDevice(bio, getKernelLayerBdev(kvioBio->layer)); + sendBioToDevice(kvioBio, bio, THIS_LOCATION("$F($io)")); + bio = next; + } + } else { + sendBioToDevice(kvio, kvio->bioToSubmit, THIS_LOCATION("$F($io)")); + } +} + +/** + * This function will attempt to find an already queued bio that the current + * bio can be merged with. There are two types of merging possible, forward + * and backward, which are distinguished by a flag that uses kernel + * elevator terminology. 
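+ *
+ * For example, with 4 KB blocks (8 sectors per block), a data bio starting
+ * at sector 80 looks for an entry keyed at sector 72 for an
+ * ELEVATOR_BACK_MERGE (our bio is appended behind that kvio's merged list)
+ * or at sector 88 for an ELEVATOR_FRONT_MERGE (our bio is prepended ahead
+ * of it).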
+ * + * @param map The bio map to use for merging + * @param kvio The kvio we want to merge + * @param mergeType The type of merging we want to try + * + * @return the kvio to merge to, NULL if no merging is possible + */ +static KVIO *getMergeableLocked(IntMap *map, + KVIO *kvio, + unsigned int mergeType) +{ + BIO *bio = kvio->bioToSubmit; + sector_t mergeSector = getBioSector(bio); + switch (mergeType) { + case ELEVATOR_BACK_MERGE: + mergeSector -= VDO_SECTORS_PER_BLOCK; + break; + case ELEVATOR_FRONT_MERGE: + mergeSector += VDO_SECTORS_PER_BLOCK; + break; + } + + KVIO *kvioMerge = intMapGet(map, mergeSector); + + if (kvioMerge != NULL) { + if (!areWorkItemActionsEqual(&kvio->enqueueable.workItem, + &kvioMerge->enqueueable.workItem)) { + return NULL; + } else if (bio_data_dir(bio) != bio_data_dir(kvioMerge->bioToSubmit)) { + return NULL; + } else if (bio_list_empty(&kvioMerge->biosMerged)) { + return NULL; + } else { + switch (mergeType) { + case ELEVATOR_BACK_MERGE: + if (getBioSector(kvioMerge->biosMerged.tail) != mergeSector) { + return NULL; + } + break; + case ELEVATOR_FRONT_MERGE: + if (getBioSector(kvioMerge->biosMerged.head) != mergeSector) { + return NULL; + } + break; + } + } + } + + return kvioMerge; +} + +/**********************************************************************/ +static inline unsigned int advanceBioRotor(IOSubmitter *bioData) +{ + unsigned int index = bioData->bioQueueRotor++ + % (bioData->numBioQueuesUsed + * bioData->bioQueueRotationInterval); + index /= bioData->bioQueueRotationInterval; + return index; +} + +/**********************************************************************/ +static bool tryBioMapMerge(BioQueueData *bioQueueData, KVIO *kvio, BIO *bio) +{ + bool merged = false; + + mutex_lock(&bioQueueData->lock); + KVIO *prevKvio = getMergeableLocked(bioQueueData->map, kvio, + ELEVATOR_BACK_MERGE); + KVIO *nextKvio = getMergeableLocked(bioQueueData->map, kvio, + ELEVATOR_FRONT_MERGE); + if (prevKvio == nextKvio) { + nextKvio = NULL; + } + int result; + if ((prevKvio == NULL) && (nextKvio == NULL)) { + // no merge. just add to bioQueue + result = intMapPut(bioQueueData->map, getBioSector(bio), kvio, true, NULL); + // We don't care about failure of intMapPut in this case. + result = result; + mutex_unlock(&bioQueueData->lock); + } else { + if (nextKvio == NULL) { + // Only prev. merge to prev's tail + intMapRemove(bioQueueData->map, getBioSector(prevKvio->biosMerged.tail)); + bio_list_merge(&prevKvio->biosMerged, &kvio->biosMerged); + result = intMapPut(bioQueueData->map, + getBioSector(prevKvio->biosMerged.head), + prevKvio, true, NULL); + result = intMapPut(bioQueueData->map, + getBioSector(prevKvio->biosMerged.tail), + prevKvio, true, NULL); + } else { + // Only next. merge to next's head + // + // Handle "next merge" and "gap fill" cases the same way so as to + // reorder bios in a way that's compatible with using funnel queues + // in work queues. This avoids removing an existing work item. + intMapRemove(bioQueueData->map, getBioSector(nextKvio->biosMerged.head)); + bio_list_merge_head(&nextKvio->biosMerged, &kvio->biosMerged); + result = intMapPut(bioQueueData->map, + getBioSector(nextKvio->biosMerged.head), + nextKvio, true, NULL); + result = intMapPut(bioQueueData->map, + getBioSector(nextKvio->biosMerged.tail), + nextKvio, true, NULL); + } + + // We don't care about failure of intMapPut in this case. 
+ result = result; + mutex_unlock(&bioQueueData->lock); + merged = true; + } + return merged; +} + +/**********************************************************************/ +static BioQueueData *bioQueueDataForPBN(IOSubmitter *ioSubmitter, + PhysicalBlockNumber pbn) +{ + unsigned int bioQueueIndex = bioQueueNumberForPBN(ioSubmitter, pbn); + return &ioSubmitter->bioQueueData[bioQueueIndex]; +} + +/**********************************************************************/ +void submitBio(BIO *bio, BioQAction action) +{ + KVIO *kvio = bio->bi_private; + kvio->bioToSubmit = bio; + setupKVIOWork(kvio, processBioMap, (KvdoWorkFunction) bio->bi_end_io, + action); + + KernelLayer *layer = kvio->layer; + BioQueueData *bioQueueData + = bioQueueDataForPBN(layer->ioSubmitter, kvio->vio->physical); + + kvioAddTraceRecord(kvio, THIS_LOCATION("$F($io)")); + + bio->bi_next = NULL; + bio_list_init(&kvio->biosMerged); + bio_list_add(&kvio->biosMerged, bio); + + /* + * Enabling of MD RAID5 mode optimizes performance for MD RAID5 storage + * configurations. It clears the bits for sync I/O RW flags on data block + * bios and sets the bits for sync I/O RW flags on all journal-related + * bios. + * + * This increases the frequency of full-stripe writes by altering flags of + * submitted bios. For workloads with write requests this increases the + * likelihood that the MD RAID5 device will update a full stripe instead of + * a partial stripe, thereby avoiding making read requests to the underlying + * physical storage for purposes of parity chunk calculations. + * + * Setting the sync-flag on journal-related bios is expected to reduce + * latency on journal updates submitted to an MD RAID5 device. + */ + if (layer->deviceConfig->mdRaid5ModeEnabled) { + if (isData(kvio)) { + // Clear the bits for sync I/O RW flags on data block bios. + clearBioOperationFlagSync(bio); + } else if ((kvio->vio->type == VIO_TYPE_RECOVERY_JOURNAL) + || (kvio->vio->type == VIO_TYPE_SLAB_JOURNAL)) { + // Set the bits for sync I/O RW flags on all journal-related and + // slab-journal-related bios. + setBioOperationFlagSync(bio); + } + } + + /* + * Try to use the bio map to submit this bio earlier if we're already sending + * IO for an adjacent block. If we can't use an existing pending bio, enqueue + * an operation to run in a bio submission thread appropriate to the + * indicated physical block number. 
+ */ + + bool merged = false; + if (USE_BIOMAP && isData(kvio)) { + merged = tryBioMapMerge(bioQueueData, kvio, bio); + } + if (!merged) { + enqueueKVIOWork(bioQueueData->queue, kvio); + } +} + +/**********************************************************************/ +static int initializeBioQueue(BioQueueData *bioQueueData, + const char *threadNamePrefix, + const char *queueName, + unsigned int queueNumber, + KernelLayer *layer) +{ +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,38) + bioQueueData->bdev = layer->dev->bdev; +#endif + bioQueueData->queueNumber = queueNumber; + + return makeWorkQueue(threadNamePrefix, queueName, &layer->wqDirectory, + layer, bioQueueData, &bioQueueType, 1, + &bioQueueData->queue); +} + +/**********************************************************************/ +int makeIOSubmitter(const char *threadNamePrefix, + unsigned int threadCount, + unsigned int rotationInterval, + unsigned int maxRequestsActive, + KernelLayer *layer, + IOSubmitter **ioSubmitterPtr) +{ + IOSubmitter *ioSubmitter; + int result = ALLOCATE_EXTENDED(IOSubmitter, + threadCount, + BioQueueData, + "bio submission data", + &ioSubmitter); + if (result != UDS_SUCCESS) { + return result; + } + + // Setup for each bio-submission work queue + char queueName[MAX_QUEUE_NAME_LEN]; + ioSubmitter->bioQueueRotationInterval = rotationInterval; + for (unsigned int i=0; i < threadCount; i++) { + BioQueueData *bioQueueData = &ioSubmitter->bioQueueData[i]; + snprintf(queueName, sizeof(queueName), "bioQ%u", i); + + if (USE_BIOMAP) { + mutex_init(&bioQueueData->lock); + /* + * One I/O operation per request, but both first & last sector numbers. + * + * If requests are assigned to threads round-robin, they should + * be distributed quite evenly. But if they're assigned based on + * PBN, things can sometimes be very uneven. So for now, we'll + * assume that all requests *may* wind up on one thread, and + * thus all in the same map. + */ + result = makeIntMap(maxRequestsActive * 2, 0, &bioQueueData->map); + if (result != 0) { + // Clean up the partially initialized bio-queue entirely and + // indicate that initialization failed. + logError("bio map initialization failed %d", result); + cleanupIOSubmitter(ioSubmitter); + freeIOSubmitter(ioSubmitter); + return result; + } + } + + result = initializeBioQueue(bioQueueData, + threadNamePrefix, + queueName, + i, + layer); + if (result != VDO_SUCCESS) { + // Clean up the partially initialized bio-queue entirely and + // indicate that initialization failed. 
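makeIOSubmitter() stores the rotation interval consumed by advanceBioRotor() earlier in this file: the rotor hands out that many consecutive work items to one bio queue before moving on to the next, wrapping around all queues. A small standalone illustration of that arithmetic follows; the function and variable names are mine, not part of the patch.

#include <stdio.h>

/*
 * With num_queues queues and a rotation interval of 'interval', each queue
 * receives 'interval' consecutive enqueues before the rotor advances.
 */
unsigned int next_queue(unsigned int *rotor,
                        unsigned int num_queues,
                        unsigned int interval)
{
  unsigned int index = (*rotor)++ % (num_queues * interval);
  return index / interval;
}

int main(void)
{
  unsigned int rotor = 0;
  for (int i = 0; i < 12; i++) {
    /* Prints: 0 0 0 0 1 1 1 1 2 2 2 2 */
    printf("%u ", next_queue(&rotor, 3, 4));
  }
  printf("\n");
  return 0;
}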
+ if (USE_BIOMAP) { + freeIntMap(&ioSubmitter->bioQueueData[i].map); + } + logError("bio queue initialization failed %d", result); + cleanupIOSubmitter(ioSubmitter); + freeIOSubmitter(ioSubmitter); + return result; + } + + ioSubmitter->numBioQueuesUsed++; + } + + *ioSubmitterPtr = ioSubmitter; + + return VDO_SUCCESS; +} + +/**********************************************************************/ +void cleanupIOSubmitter(IOSubmitter *ioSubmitter) +{ + for (int i=ioSubmitter->numBioQueuesUsed - 1; i >= 0; i--) { + finishWorkQueue(ioSubmitter->bioQueueData[i].queue); + } +} + +/**********************************************************************/ +void freeIOSubmitter(IOSubmitter *ioSubmitter) +{ + for (int i = ioSubmitter->numBioQueuesUsed - 1; i >= 0; i--) { + ioSubmitter->numBioQueuesUsed--; + freeWorkQueue(&ioSubmitter->bioQueueData[i].queue); + if (USE_BIOMAP) { + freeIntMap(&ioSubmitter->bioQueueData[i].map); + } + } + FREE(ioSubmitter); +} + +/**********************************************************************/ +void dumpBioWorkQueue(IOSubmitter *ioSubmitter) +{ + for (int i=0; i < ioSubmitter->numBioQueuesUsed; i++) { + dumpWorkQueue(ioSubmitter->bioQueueData[i].queue); + } +} + + +/**********************************************************************/ +void enqueueBioWorkItem(IOSubmitter *ioSubmitter, KvdoWorkItem *workItem) +{ + unsigned int bioQueueIndex = advanceBioRotor(ioSubmitter); + enqueueWorkQueue(ioSubmitter->bioQueueData[bioQueueIndex].queue, + workItem); +} + diff --git a/source/vdo/kernel/ioSubmitter.h b/source/vdo/kernel/ioSubmitter.h new file mode 100644 index 0000000..c4fb5ce --- /dev/null +++ b/source/vdo/kernel/ioSubmitter.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/ioSubmitter.h#4 $ + */ + +#ifndef IOSUBMITTER_H +#define IOSUBMITTER_H + +#include + +#include "kernelLayer.h" +#include "kvio.h" + +/** + * Does all the appropriate accounting for bio completions + * + * @param bio the bio to count + **/ +void countCompletedBios(BIO *bio); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) +/** + * Completes a bio relating to a kvio, causing the completion callback + * to be invoked. + * + * This is used as the bi_end_io function for most of the bios created + * within VDO and submitted to the storage device. Exceptions are the + * flush code and the read-block code, both of which need to regain + * control in the kernel layer after the I/O is completed. + * + * @param bio The bio to complete + **/ +void completeAsyncBio(BIO *bio); +#else +/** + * Completes a bio relating to a kvio, causing the completion callback + * to be invoked. + * + * This is used as the bi_end_io function for most of the bios created + * within VDO and submitted to the storage device. 
Exceptions are the + * flush code and the read-block code, both of which need to regain + * control in the kernel layer after the I/O is completed. + * + * @param bio The bio to complete + * @param error Possible error from underlying block device + **/ +void completeAsyncBio(BIO *bio, int error); +#endif + +/** + * Create a IOSubmitter structure for a new physical layer. + * + * @param [in] threadNamePrefix The per-device prefix to use in process names + * @param [in] threadCount Number of bio-submission threads to set up + * @param [in] rotationInterval Interval to use when rotating between + * bio-submission threads when enqueuing work + * items + * @param [in] maxRequestsActive Number of bios for merge tracking + * @param [in] layer The kernel layer + * @param [out] ioSubmitter Pointer to the new data structure + * + * @return VDO_SUCCESS or an error + **/ +int makeIOSubmitter(const char *threadNamePrefix, + unsigned int threadCount, + unsigned int rotationInterval, + unsigned int maxRequestsActive, + KernelLayer *layer, + IOSubmitter **ioSubmitter); + +/** + * Tear down the IOSubmitter fields as needed for a physical layer. + * + * @param [in] ioSubmitter The I/O submitter data to tear down + **/ +void cleanupIOSubmitter(IOSubmitter *ioSubmitter); + +/** + * Free the IOSubmitter fields and structure as needed for a + * physical layer. This must be called after + * cleanupIOSubmitter(). It is used to release resources late in + * the shutdown process to avoid or reduce the chance of race + * conditions. + * + * @param [in] ioSubmitter The I/O submitter data to destroy + **/ +void freeIOSubmitter(IOSubmitter *ioSubmitter); + +/** + * Dump info to the kernel log about the work queue used by the + * physical layer. For debugging only. + * + * @param [in] ioSubmitter The I/O submitter data + **/ +void dumpBioWorkQueue(IOSubmitter *ioSubmitter); + + +/** + * Enqueue a work item to run in the work queue(s) used for bio + * submissions from the physical layer. + * + * Outside of IOSubmitter, used only for finishing processing of empty + * flush bios by sending them to the storage device. + * + * @param ioSubmitter The I/O submitter data to update + * @param workItem The new work item to run + **/ +void enqueueBioWorkItem(IOSubmitter *ioSubmitter, KvdoWorkItem *workItem); + +/** + * Submit bio but don't block. + * + * Submits the bio to a helper work queue which sits in a loop + * submitting bios. The worker thread may block if the target device + * is busy, which is why we don't want to do the submission in the + * original calling thread. + * + * The bi_private field of the bio must point to a KVIO associated + * with the operation. The bi_end_io callback is invoked when the I/O + * operation completes. + * + * @param bio the block I/O operation descriptor to submit + * @param action the action code specifying the priority for the operation + **/ +void submitBio(BIO *bio, BioQAction action); + +#endif // IOSUBMITTER_H diff --git a/source/vdo/kernel/kernelLayer.c b/source/vdo/kernel/kernelLayer.c new file mode 100644 index 0000000..8d4d4ed --- /dev/null +++ b/source/vdo/kernel/kernelLayer.c @@ -0,0 +1,1409 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelLayer.c#38 $ + */ + +#include "kernelLayer.h" + +#include +#include +#include +#include +#include + +#include "logger.h" +#include "memoryAlloc.h" +#include "murmur/MurmurHash3.h" + +#include "lz4.h" +#include "releaseVersions.h" +#include "volumeGeometry.h" +#include "statistics.h" +#include "vdo.h" + +#include "bio.h" +#include "dataKVIO.h" +#include "dedupeIndex.h" +#include "deviceConfig.h" +#include "deviceRegistry.h" +#include "instanceNumber.h" +#include "ioSubmitter.h" +#include "kvdoFlush.h" +#include "kvio.h" +#include "poolSysfs.h" +#include "statusProcfs.h" +#include "stringUtils.h" +#include "verify.h" + +enum { + DEDUPE_TIMEOUT_REPORT_INTERVAL = 1000, +}; + +static const KvdoWorkQueueType bioAckQType = { + .actionTable = { + { .name = "bio_ack", + .code = BIO_ACK_Q_ACTION_ACK, + .priority = 0 }, + }, +}; + +static const KvdoWorkQueueType cpuQType = { + .actionTable = { + { .name = "cpu_complete_kvio", + .code = CPU_Q_ACTION_COMPLETE_KVIO, + .priority = 0 }, + { .name = "cpu_compress_block", + .code = CPU_Q_ACTION_COMPRESS_BLOCK, + .priority = 0 }, + { .name = "cpu_hash_block", + .code = CPU_Q_ACTION_HASH_BLOCK, + .priority = 0 }, + { .name = "cpu_event_reporter", + .code = CPU_Q_ACTION_EVENT_REPORTER, + .priority = 0 }, + }, +}; + +// 2000 is half the number of entries currently in our page cache, +// to allow for each in-progress operation to update two pages. +int defaultMaxRequestsActive = 2000; + +/**********************************************************************/ +static CRC32Checksum kvdoUpdateCRC32(CRC32Checksum crc, + const byte *buffer, + size_t length) +{ + /* + * The kernel's CRC 32 implementation does not do pre- and post- + * conditioning, so do it ourselves. + */ + return crc32(crc ^ 0xffffffff, buffer, length) ^ 0xffffffff; +} + +/**********************************************************************/ +static BlockCount kvdoGetBlockCount(PhysicalLayer *header) +{ + return asKernelLayer(header)->deviceConfig->physicalBlocks; +} + +/**********************************************************************/ +bool layerIsNamed(KernelLayer *layer, void *context) +{ + struct dm_target *ti = layer->deviceConfig->owningTarget; + const char *deviceName = dm_device_name(dm_table_get_md(ti->table)); + return (strcmp(deviceName, (const char *) context) == 0); +} + +/** + * Implements LayerFilter. 
+ **/ +static bool layerUsesDevice(KernelLayer *layer, void *context) +{ + DeviceConfig *config = context; + return (layer->deviceConfig->ownedDevice->bdev->bd_dev + == config->ownedDevice->bdev->bd_dev); +} + +int mapToSystemError(int error) +{ + // 0 is success, negative a system error code + if (likely(error <= 0)) { + return error; + } + if (error < 1024) { + // errno macro used without negating - may be a minor bug + return -error; + } + // VDO or UDS error + char errorName[80], errorMessage[ERRBUF_SIZE]; + switch (sansUnrecoverable(error)) { + case VDO_NO_SPACE: + return -ENOSPC; + case VDO_READ_ONLY: + return -EIO; + default: + logInfo("%s: mapping internal status code %d (%s: %s) to EIO", + __func__, error, + stringErrorName(error, errorName, sizeof(errorName)), + stringError(error, errorMessage, sizeof(errorMessage))); + return -EIO; + } +} + +/**********************************************************************/ +static void setKernelLayerState(KernelLayer *layer, KernelLayerState newState) +{ + atomicStore32(&layer->state, newState); +} + +/**********************************************************************/ +void waitForNoRequestsActive(KernelLayer *layer) +{ + // Do nothing if there are no requests active. This check is not necessary + // for correctness but does reduce log message traffic. + if (limiterIsIdle(&layer->requestLimiter)) { + return; + } + + // We have to make sure to flush the packer before waiting. We do this + // by turning off compression, which also means no new entries coming in + // while waiting will end up in the packer. + bool wasCompressing = setKVDOCompressing(&layer->kvdo, false); + // Now wait for there to be no active requests + limiterWaitForIdle(&layer->requestLimiter); + // Reset the compression state after all requests are done + if (wasCompressing) { + setKVDOCompressing(&layer->kvdo, true); + } +} + +/** + * Start processing a new data KVIO based on the supplied bio, but from within + * a VDO thread context, when we're not allowed to block. Using this path at + * all suggests a bug or erroneous usage, but we special-case it to avoid a + * deadlock that can apparently result. Message will be logged to alert the + * administrator that something has gone wrong, while we attempt to continue + * processing other requests. + * + * If a request permit can be acquired immediately, kvdoLaunchDataKVIOFromBio + * will be called. (If the bio is a discard operation, a permit from the + * discard limiter will be requested but the call will be made with or without + * it.) If the request permit is not available, the bio will be saved on a list + * to be launched later. Either way, this function will not block, and will + * take responsibility for processing the bio. + * + * @param layer The kernel layer + * @param bio The bio to launch + * @param arrivalTime The arrival time of the bio + * + * @return DM_MAPIO_SUBMITTED or a system error code + **/ +static int launchDataKVIOFromVDOThread(KernelLayer *layer, + BIO *bio, + Jiffies arrivalTime) +{ + logWarning("kvdoMapBio called from within a VDO thread!"); + /* + * We're not yet entirely sure what circumstances are causing this situation + * in [ESC-638], but it does appear to be happening and causing VDO to + * deadlock. 
+ * + * Somehow kvdoMapBio is being called from generic_make_request which is + * being called from the VDO code to pass a flush on down to the underlying + * storage system; we've got 2000 requests in progress, so we have to wait + * for one to complete, but none can complete while the bio thread is blocked + * from passing more I/O requests down. Near as we can tell, the flush bio + * should always have gotten updated to point to the storage system, so we + * shouldn't be calling back into VDO unless something's gotten messed up + * somewhere. + * + * To side-step this case, if the limiter says we're busy *and* we're running + * on one of VDO's own threads, we'll drop the I/O request in a special queue + * for processing as soon as KVIOs become free. + * + * We don't want to do this in general because it leads to unbounded + * buffering, arbitrarily high latencies, inability to push back in a way the + * caller can take advantage of, etc. If someone wants huge amounts of + * buffering on top of VDO, they're welcome to access it through the kernel + * page cache or roll their own. + */ + if (!limiterPoll(&layer->requestLimiter)) { + addToDeadlockQueue(&layer->deadlockQueue, bio, arrivalTime); + logWarning("queued an I/O request to avoid deadlock!"); + + return DM_MAPIO_SUBMITTED; + } + + bool hasDiscardPermit + = (isDiscardBio(bio) && limiterPoll(&layer->discardLimiter)); + int result = kvdoLaunchDataKVIOFromBio(layer, bio, arrivalTime, + hasDiscardPermit); + // Succeed or fail, kvdoLaunchDataKVIOFromBio owns the permit(s) now. + if (result != VDO_SUCCESS) { + return result; + } + + return DM_MAPIO_SUBMITTED; +} + +/**********************************************************************/ +int kvdoMapBio(KernelLayer *layer, BIO *bio) +{ + Jiffies arrivalTime = jiffies; + KernelLayerState state = getKernelLayerState(layer); + ASSERT_LOG_ONLY(state == LAYER_RUNNING, + "kvdoMapBio should not be called while in state %d", state); + + // Count all incoming bios. + countBios(&layer->biosIn, bio); + + // Handle empty bios. Empty flush bios are not associated with a VIO. + if (isFlushBio(bio)) { + if (ASSERT(getBioSize(bio) == 0, "Flush bio is size 0") != VDO_SUCCESS) { + // We expect flushes to be of size 0. + return -EINVAL; + } + if (shouldProcessFlush(layer)) { + launchKVDOFlush(layer, bio); + return DM_MAPIO_SUBMITTED; + } else { + // We're not acknowledging this bio now, but we'll never touch it + // again, so this is the last chance to account for it. + countBios(&layer->biosAcknowledged, bio); + atomic64_inc(&layer->flushOut); + setBioBlockDevice(bio, getKernelLayerBdev(layer)); + return DM_MAPIO_REMAPPED; + } + } + + if (ASSERT(getBioSize(bio) != 0, "Data bio is not size 0") != VDO_SUCCESS) { + // We expect non-flushes to be non-zero in size. + return -EINVAL; + } + + if (isDiscardBio(bio) && isReadBio(bio)) { + // Read and Discard should never occur together + return -EIO; + } + + KvdoWorkQueue *currentWorkQueue = getCurrentWorkQueue(); + if ((currentWorkQueue != NULL) + && (layer == getWorkQueueOwner(currentWorkQueue))) { + /* + * This prohibits sleeping during I/O submission to VDO from its own + * thread. 
+ */ + return launchDataKVIOFromVDOThread(layer, bio, arrivalTime); + } + bool hasDiscardPermit = false; + if (isDiscardBio(bio)) { + limiterWaitForOneFree(&layer->discardLimiter); + hasDiscardPermit = true; + } + limiterWaitForOneFree(&layer->requestLimiter); + + int result = kvdoLaunchDataKVIOFromBio(layer, bio, arrivalTime, + hasDiscardPermit); + // Succeed or fail, kvdoLaunchDataKVIOFromBio owns the permit(s) now. + if (result != VDO_SUCCESS) { + return result; + } + + return DM_MAPIO_SUBMITTED; +} + +/**********************************************************************/ +struct block_device *getKernelLayerBdev(const KernelLayer *layer) +{ + return layer->deviceConfig->ownedDevice->bdev; +} + +/**********************************************************************/ +void completeManyRequests(KernelLayer *layer, uint32_t count) +{ + // If we had to buffer some requests to avoid deadlock, release them now. + while (count > 0) { + Jiffies arrivalTime = 0; + BIO *bio = pollDeadlockQueue(&layer->deadlockQueue, &arrivalTime); + if (likely(bio == NULL)) { + break; + } + + bool hasDiscardPermit + = (isDiscardBio(bio) && limiterPoll(&layer->discardLimiter)); + int result = kvdoLaunchDataKVIOFromBio(layer, bio, arrivalTime, + hasDiscardPermit); + if (result != VDO_SUCCESS) { + completeBio(bio, result); + } + // Succeed or fail, kvdoLaunchDataKVIOFromBio owns the permit(s) now. + count--; + } + // Notify the limiter, so it can wake any blocked processes. + if (count > 0) { + limiterReleaseMany(&layer->requestLimiter, count); + } +} + +/**********************************************************************/ +static void reportEvents(PeriodicEventReporter *reporter) +{ + atomic_set(&reporter->workItemQueued, 0); + uint64_t newValue = atomic64_read(&reporter->value); + uint64_t difference = newValue - reporter->lastReportedValue; + if (difference != 0) { + logDebug(reporter->format, difference); + reporter->lastReportedValue = newValue; + } +} + +/**********************************************************************/ +static void reportEventsWork(KvdoWorkItem *item) +{ + PeriodicEventReporter *reporter = container_of(item, PeriodicEventReporter, + workItem); + reportEvents(reporter); +} + +/**********************************************************************/ +static void initPeriodicEventReporter(PeriodicEventReporter *reporter, + const char *format, + unsigned long reportingInterval, + KernelLayer *layer) +{ + setupWorkItem(&reporter->workItem, reportEventsWork, NULL, + CPU_Q_ACTION_EVENT_REPORTER); + reporter->format = format; + reporter->reportingInterval = msecs_to_jiffies(reportingInterval); + reporter->layer = layer; +} + +/**********************************************************************/ +static void addEventCount(PeriodicEventReporter *reporter, unsigned int count) +{ + if (count > 0) { + atomic64_add(count, &reporter->value); + int oldWorkItemQueued = atomic_xchg(&reporter->workItemQueued, 1); + if (oldWorkItemQueued == 0) { + enqueueWorkQueueDelayed(reporter->layer->cpuQueue, + &reporter->workItem, + jiffies + reporter->reportingInterval); + } + } +} + +/**********************************************************************/ +static void stopPeriodicEventReporter(PeriodicEventReporter *reporter) +{ + reportEvents(reporter); +} + +/**********************************************************************/ +void kvdoReportDedupeTimeout(KernelLayer *layer, unsigned int expiredCount) +{ + addEventCount(&layer->albireoTimeoutReporter, expiredCount); +} + 
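The periodic event reporter above batches dedupe-timeout counts and defers the actual logging to a delayed work item, using an atomic exchange on workItemQueued so the item is queued at most once no matter how many threads report events concurrently. Below is a minimal userspace model of that test-and-set pattern, assuming C11 atomics; the names are illustrative and do not come from the VDO sources.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct example_reporter {
  atomic_uint_fast64_t value;     /* running event count */
  uint64_t             last_reported;
  atomic_int           work_queued;  /* 0 = idle, 1 = a report job is pending */
};

/* Returns true only for the caller that should schedule the single report job. */
bool example_add_events(struct example_reporter *r, unsigned int count)
{
  if (count == 0) {
    return false;
  }
  atomic_fetch_add(&r->value, count);
  /* Test-and-set: only the thread that flips 0 -> 1 schedules the job. */
  return atomic_exchange(&r->work_queued, 1) == 0;
}

/* The report job clears the flag first, then logs the delta, mirroring
 * the order used by reportEvents() above. */
void example_report(struct example_reporter *r)
{
  atomic_store(&r->work_queued, 0);
  uint64_t now = atomic_load(&r->value);
  if (now != r->last_reported) {
    /* log (now - r->last_reported) new events here */
    r->last_reported = now;
  }
}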
+/**********************************************************************/ +static int kvdoCreateEnqueueable(VDOCompletion *completion) +{ + KvdoEnqueueable *kvdoEnqueueable; + int result = ALLOCATE(1, KvdoEnqueueable, "kvdoEnqueueable", + &kvdoEnqueueable); + if (result != VDO_SUCCESS) { + logError("kvdoEnqueueable allocation failure %d", result); + return result; + } + kvdoEnqueueable->enqueueable.completion = completion; + completion->enqueueable = &kvdoEnqueueable->enqueueable; + return VDO_SUCCESS; +} + +/**********************************************************************/ +static void kvdoDestroyEnqueueable(Enqueueable **enqueueablePtr) +{ + Enqueueable *enqueueable = *enqueueablePtr; + if (enqueueable != NULL) { + KvdoEnqueueable *kvdoEnqueueable + = container_of(enqueueable, KvdoEnqueueable, enqueueable); + FREE(kvdoEnqueueable); + *enqueueablePtr = NULL; + } +} + +/** + * Implements BufferAllocator. + **/ +static int kvdoAllocateIOBuffer(PhysicalLayer *layer __attribute__((unused)), + size_t bytes, + const char *why, + char **bufferPtr) +{ + return ALLOCATE(bytes, char, why, bufferPtr); +} + +/** + * Implements ExtentReader. Exists only for the geometry block; is unset after + * it is read. + **/ +static int kvdoSynchronousRead(PhysicalLayer *layer, + PhysicalBlockNumber startBlock, + size_t blockCount, + char *buffer, + size_t *blocksRead) +{ + if (blockCount != 1) { + return VDO_NOT_IMPLEMENTED; + } + + KernelLayer *kernelLayer = asKernelLayer(layer); + + BIO *bio; + int result = createBio(kernelLayer, buffer, &bio); + if (result != VDO_SUCCESS) { + return result; + } + setBioBlockDevice(bio, getKernelLayerBdev(kernelLayer)); + setBioSector(bio, blockToSector(kernelLayer, startBlock)); + setBioOperationRead(bio); + result = submitBioAndWait(bio); + if (result != 0) { + logErrorWithStringError(result, "synchronous read failed"); + result = -EIO; + } + freeBio(bio, kernelLayer); + + if (result != VDO_SUCCESS) { + return result; + } + if (blocksRead != NULL) { + *blocksRead = blockCount; + } + return VDO_SUCCESS; +} + +/** + * Implements VIODestructor. + **/ +static void kvdoFreeVIO(VIO **vioPtr) +{ + VIO *vio = *vioPtr; + if (vio == NULL) { + return; + } + + BUG_ON(isDataVIO(vio)); + + if (isCompressedWriteVIO(vio)) { + CompressedWriteKVIO *compressedWriteKVIO + = allocatingVIOAsCompressedWriteKVIO(vioAsAllocatingVIO(vio)); + freeCompressedWriteKVIO(&compressedWriteKVIO); + } else { + MetadataKVIO *metadataKVIO = vioAsMetadataKVIO(vio); + freeMetadataKVIO(&metadataKVIO); + } + + *vioPtr = NULL; +} + +/**********************************************************************/ +static WritePolicy kvdoGetWritePolicy(PhysicalLayer *common) +{ + KernelLayer *layer = asKernelLayer(common); + return getKVDOWritePolicy(&layer->kvdo); +} + +/** + * Function that is called when a synchronous operation is completed. We let + * the waiting thread know it can continue. + * + *
Implements OperationComplete. + * + * @param common The kernel layer + **/ +static void kvdoCompleteSyncOperation(PhysicalLayer *common) +{ + KernelLayer *layer = asKernelLayer(common); + complete(&layer->callbackSync); +} + +/** + * Wait for a synchronous operation to complete. + * + *
Implements OperationWaiter. + * + * @param common The kernel layer + **/ +static void waitForSyncOperation(PhysicalLayer *common) +{ + KernelLayer *layer = asKernelLayer(common); + // Using the "interruptible" interface means that Linux will not log a + // message when we wait for more than 120 seconds. + while (wait_for_completion_interruptible(&layer->callbackSync) != 0) { + // However, if we get a signal in a user-mode process, we could + // spin... + msleep(1); + } +} + +/** + * Make the bio set for allocating new bios. + * + * @param layer The kernel layer + * + * @returns VDO_SUCCESS if bio set created, error code otherwise + **/ +static int makeDedupeBioSet(KernelLayer *layer) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,18,0) + int result = ALLOCATE(1, struct bio_set, "bio set", &layer->bioset); + if (result != VDO_SUCCESS) { + return result; + } + + result = bioset_init(layer->bioset, 0, 0, BIOSET_NEED_BVECS); + if (result != 0) { + return result; + } +#else +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,13,0) + layer->bioset = bioset_create(0, 0, BIOSET_NEED_BVECS); +#else + layer->bioset = bioset_create(0, 0); +#endif + if (layer->bioset == NULL) { + return -ENOMEM; + } +#endif + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int makeKernelLayer(uint64_t startingSector, + unsigned int instance, + DeviceConfig *config, + struct kobject *parentKobject, + ThreadConfig **threadConfigPointer, + char **reason, + KernelLayer **layerPtr) +{ + // VDO-3769 - Set a generic reason so we don't ever return garbage. + *reason = "Unspecified error"; + + KernelLayer *oldLayer = findLayerMatching(layerUsesDevice, config); + if (oldLayer != NULL) { + logError("Existing layer named %s already uses device %s", + oldLayer->deviceConfig->poolName, + oldLayer->deviceConfig->parentDeviceName); + *reason = "Cannot share storage device with already-running VDO"; + return VDO_BAD_CONFIGURATION; + } + + /* + * Part 1 - Allocate the kernel layer, its essential parts, and setup up the + * sysfs node. These must come first so that the sysfs node works correctly + * through the freeing of the kernel layer. After this part you must use + * freeKernelLayer. + */ + KernelLayer *layer; + int result = ALLOCATE(1, KernelLayer, "VDO configuration", &layer); + if (result != UDS_SUCCESS) { + *reason = "Cannot allocate VDO configuration"; + return result; + } + + // Allow the base VDO to allocate buffers and construct or destroy + // enqueuables as part of its allocation. + layer->common.allocateIOBuffer = kvdoAllocateIOBuffer; + layer->common.createEnqueueable = kvdoCreateEnqueueable; + layer->common.destroyEnqueueable = kvdoDestroyEnqueueable; + + result = allocateVDO(&layer->common, &layer->kvdo.vdo); + if (result != VDO_SUCCESS) { + *reason = "Cannot allocate VDO"; + FREE(layer); + return result; + } + + // After this point, calling kobject_put on kobj will decrement its + // reference count, and when the count goes to 0 the KernelLayer will + // be freed. 
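The constructor below proceeds in numbered parts and records how far it has gotten via setKernelLayerState(); freeKernelLayer(), further down in this file, then unwinds with a fall-through switch on that recorded state so a partially built layer is torn down exactly as far as it was built. A compact standalone sketch of that staged-teardown idiom, with purely hypothetical stage names:

#include <stdio.h>

/* Hypothetical stages, loosely mirroring the LAYER_* states set as each
 * part of initialization completes. */
enum example_stage {
  STAGE_NOTHING_DONE,
  STAGE_SIMPLE_DONE,
  STAGE_POOLS_DONE,
  STAGE_QUEUES_DONE,
};

/* Switch on how far setup got and fall through, so each later stage also
 * releases everything acquired by the earlier stages. */
void example_teardown(enum example_stage stage)
{
  switch (stage) {
  case STAGE_QUEUES_DONE:
    puts("free queues");
    /* fall through */
  case STAGE_POOLS_DONE:
    puts("free pools");
    /* fall through */
  case STAGE_SIMPLE_DONE:
    puts("free simple state");
    /* fall through */
  case STAGE_NOTHING_DONE:
    break;
  }
}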
+ kobject_init(&layer->kobj, &kernelLayerKobjType); + result = kobject_add(&layer->kobj, parentKobject, config->poolName); + if (result != 0) { + *reason = "Cannot add sysfs node"; + kobject_put(&layer->kobj); + return result; + } + kobject_init(&layer->wqDirectory, &workQueueDirectoryKobjType); + result = kobject_add(&layer->wqDirectory, &layer->kobj, "work_queues"); + if (result != 0) { + *reason = "Cannot add sysfs node"; + kobject_put(&layer->wqDirectory); + kobject_put(&layer->kobj); + return result; + } + + /* + * Part 2 - Do all the simple initialization. These initializations have no + * order dependencies and can be done in any order, but freeKernelLayer() + * cannot be called until all the simple layer properties are set. + * + * The KernelLayer structure starts as all zeros. Pointer initializations + * consist of replacing a NULL pointer with a non-NULL pointer, which can be + * easily undone by freeing all of the non-NULL pointers (using the proper + * free routine). + */ + setKernelLayerState(layer, LAYER_SIMPLE_THINGS_INITIALIZED); + + initializeDeadlockQueue(&layer->deadlockQueue); + + int requestLimit = defaultMaxRequestsActive; + initializeLimiter(&layer->requestLimiter, requestLimit); + initializeLimiter(&layer->discardLimiter, requestLimit * 3 / 4); + + layer->allocationsAllowed = true; + layer->instance = instance; + layer->deviceConfig = config; + layer->startingSectorOffset = startingSector; + initializeRing(&layer->deviceConfigRing); + + layer->common.updateCRC32 = kvdoUpdateCRC32; + layer->common.getBlockCount = kvdoGetBlockCount; + layer->common.getWritePolicy = kvdoGetWritePolicy; + layer->common.createMetadataVIO = kvdoCreateMetadataVIO; + layer->common.createCompressedWriteVIO = kvdoCreateCompressedWriteVIO; + layer->common.freeVIO = kvdoFreeVIO; + layer->common.completeFlush = kvdoCompleteFlush; + layer->common.enqueue = kvdoEnqueue; + layer->common.waitForAdminOperation = waitForSyncOperation; + layer->common.completeAdminOperation = kvdoCompleteSyncOperation; + layer->common.getCurrentThreadID = kvdoGetCurrentThreadID; + layer->common.zeroDataVIO = kvdoZeroDataVIO; + layer->common.compareDataVIOs = kvdoCompareDataVIOs; + layer->common.copyData = kvdoCopyDataVIO; + layer->common.readData = kvdoReadDataVIO; + layer->common.writeData = kvdoWriteDataVIO; + layer->common.writeCompressedBlock = kvdoWriteCompressedBlock; + layer->common.readMetadata = kvdoSubmitMetadataVIO; + layer->common.writeMetadata = kvdoSubmitMetadataVIO; + layer->common.applyPartialWrite = kvdoModifyWriteDataVIO; + layer->common.flush = kvdoFlushVIO; + layer->common.hashData = kvdoHashDataVIO; + layer->common.checkForDuplication = kvdoCheckForDuplication; + layer->common.verifyDuplication = kvdoVerifyDuplication; + layer->common.acknowledgeDataVIO = kvdoAcknowledgeDataVIO; + layer->common.compressDataVIO = kvdoCompressDataVIO; + layer->common.updateAlbireo = kvdoUpdateDedupeAdvice; + + spin_lock_init(&layer->flushLock); + mutex_init(&layer->statsMutex); + bio_list_init(&layer->waitingFlushes); + + result = addLayerToDeviceRegistry(layer); + if (result != VDO_SUCCESS) { + *reason = "Cannot add layer to device registry"; + freeKernelLayer(layer); + return result; + } + + snprintf(layer->threadNamePrefix, sizeof(layer->threadNamePrefix), "%s%u", + THIS_MODULE->name, instance); + + result = makeThreadConfig(config->threadCounts.logicalZones, + config->threadCounts.physicalZones, + config->threadCounts.hashZones, + threadConfigPointer); + if (result != VDO_SUCCESS) { + *reason = "Cannot create 
thread configuration"; + freeKernelLayer(layer); + return result; + } + + logInfo("zones: %d logical, %d physical, %d hash; base threads: %d", + config->threadCounts.logicalZones, + config->threadCounts.physicalZones, + config->threadCounts.hashZones, + (*threadConfigPointer)->baseThreadCount); + + result = makeBatchProcessor(layer, returnDataKVIOBatchToPool, layer, + &layer->dataKVIOReleaser); + if (result != UDS_SUCCESS) { + *reason = "Cannot allocate KVIO-freeing batch processor"; + freeKernelLayer(layer); + return result; + } + + // Spare KVDOFlush, so that we will always have at least one available + result = makeKVDOFlush(&layer->spareKVDOFlush); + if (result != UDS_SUCCESS) { + *reason = "Cannot allocate KVDOFlush record"; + freeKernelLayer(layer); + return result; + } + + // BIO pool (needed before the geometry block) + result = makeDedupeBioSet(layer); + if (result != VDO_SUCCESS) { + *reason = "Cannot allocate dedupe bioset"; + freeKernelLayer(layer); + return result; + } + + // Read the geometry block so we know how to set up the index. Allow it to + // do synchronous reads. + layer->common.reader = kvdoSynchronousRead; + result = loadVolumeGeometry(&layer->common, &layer->geometry); + layer->common.reader = NULL; + if (result != VDO_SUCCESS) { + *reason = "Could not load geometry block"; + freeKernelLayer(layer); + return result; + } + + // Albireo Timeout Reporter + initPeriodicEventReporter(&layer->albireoTimeoutReporter, + "Albireo timeout on %llu requests", + DEDUPE_TIMEOUT_REPORT_INTERVAL, layer); + + // Dedupe Index + BUG_ON(layer->threadNamePrefix[0] == '\0'); + result = makeDedupeIndex(&layer->dedupeIndex, layer); + if (result != UDS_SUCCESS) { + *reason = "Cannot initialize dedupe index"; + freeKernelLayer(layer); + return result; + } + + // Compression context storage + result = ALLOCATE(config->threadCounts.cpuThreads, char *, "LZ4 context", + &layer->compressionContext); + if (result != VDO_SUCCESS) { + *reason = "cannot allocate LZ4 context"; + freeKernelLayer(layer); + return result; + } + for (int i = 0; i < config->threadCounts.cpuThreads; i++) { + result = ALLOCATE(LZ4_context_size(), char, "LZ4 context", + &layer->compressionContext[i]); + if (result != VDO_SUCCESS) { + *reason = "cannot allocate LZ4 context"; + freeKernelLayer(layer); + return result; + } + } + + + /* + * Part 3 - Do initializations that depend upon other previous + * initializations, but have no order dependencies at freeing time. + * Order dependencies for initialization are identified using BUG_ON. + */ + setKernelLayerState(layer, LAYER_BUFFER_POOLS_INITIALIZED); + + // Trace pool + BUG_ON(layer->requestLimiter.limit <= 0); + result = traceKernelLayerInit(layer); + if (result != VDO_SUCCESS) { + *reason = "Cannot initialize trace data"; + freeKernelLayer(layer); + return result; + } + + // KVIO and VIO pool + BUG_ON(layer->deviceConfig->logicalBlockSize <= 0); + BUG_ON(layer->requestLimiter.limit <= 0); + BUG_ON(layer->bioset == NULL); + BUG_ON(layer->deviceConfig->ownedDevice == NULL); + result = makeDataKVIOBufferPool(layer, layer->requestLimiter.limit, + &layer->dataKVIOPool); + if (result != VDO_SUCCESS) { + *reason = "Cannot allocate vio data"; + freeKernelLayer(layer); + return result; + } + + /* + * Part 4 - Do initializations that depend upon other previous + * initialization, that may have order dependencies at freeing time. + * These are mostly starting up the workqueue threads. 
+ */ + + // Base-code thread, etc + result = initializeKVDO(&layer->kvdo, *threadConfigPointer, reason); + if (result != VDO_SUCCESS) { + freeKernelLayer(layer); + return result; + } + + setKernelLayerState(layer, LAYER_REQUEST_QUEUE_INITIALIZED); + + // Bio queue + result = makeIOSubmitter(layer->threadNamePrefix, + config->threadCounts.bioThreads, + config->threadCounts.bioRotationInterval, + layer->requestLimiter.limit, + layer, + &layer->ioSubmitter); + if (result != VDO_SUCCESS) { + // If initialization of the bio-queues failed, they are cleaned + // up already, so just free the rest of the kernel layer. + freeKernelLayer(layer); + *reason = "bio submission initialization failed"; + return result; + } + setKernelLayerState(layer, LAYER_BIO_DATA_INITIALIZED); + + // Bio ack queue + if (useBioAckQueue(layer)) { + result = makeWorkQueue(layer->threadNamePrefix, "ackQ", + &layer->wqDirectory, layer, layer, &bioAckQType, + config->threadCounts.bioAckThreads, + &layer->bioAckQueue); + if (result != VDO_SUCCESS) { + *reason = "bio ack queue initialization failed"; + freeKernelLayer(layer); + return result; + } + } + + setKernelLayerState(layer, LAYER_BIO_ACK_QUEUE_INITIALIZED); + + // CPU Queues + result = makeWorkQueue(layer->threadNamePrefix, "cpuQ", &layer->wqDirectory, + layer, NULL, &cpuQType, + config->threadCounts.cpuThreads, &layer->cpuQueue); + if (result != VDO_SUCCESS) { + *reason = "Albireo CPU queue initialization failed"; + freeKernelLayer(layer); + return result; + } + + setKernelLayerState(layer, LAYER_CPU_QUEUE_INITIALIZED); + + *layerPtr = layer; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int prepareToModifyKernelLayer(KernelLayer *layer, + DeviceConfig *config, + char **errorPtr) +{ + DeviceConfig *extantConfig = layer->deviceConfig; + if (config->owningTarget->begin != extantConfig->owningTarget->begin) { + *errorPtr = "Starting sector cannot change"; + return VDO_PARAMETER_MISMATCH; + } + + if (strcmp(config->parentDeviceName, extantConfig->parentDeviceName) != 0) { + *errorPtr = "Underlying device cannot change"; + return VDO_PARAMETER_MISMATCH; + } + + if (config->logicalBlockSize != extantConfig->logicalBlockSize) { + *errorPtr = "Logical block size cannot change"; + return VDO_PARAMETER_MISMATCH; + } + + if (config->cacheSize != extantConfig->cacheSize) { + *errorPtr = "Block map cache size cannot change"; + return VDO_PARAMETER_MISMATCH; + } + + if (config->blockMapMaximumAge != extantConfig->blockMapMaximumAge) { + *errorPtr = "Block map maximum age cannot change"; + return VDO_PARAMETER_MISMATCH; + } + + if (config->mdRaid5ModeEnabled != extantConfig->mdRaid5ModeEnabled) { + *errorPtr = "mdRaid5Mode cannot change"; + return VDO_PARAMETER_MISMATCH; + } + + if (memcmp(&config->threadCounts, &extantConfig->threadCounts, + sizeof(ThreadCountConfig)) != 0) { + *errorPtr = "Thread configuration cannot change"; + return VDO_PARAMETER_MISMATCH; + } + + // Below here are the actions to take when a non-immutable property changes. + + if (config->writePolicy != extantConfig->writePolicy) { + // Nothing needs doing right now for a write policy change. 
+ } + + if (config->owningTarget->len != extantConfig->owningTarget->len) { + size_t logicalBytes = to_bytes(config->owningTarget->len); + if ((logicalBytes % VDO_BLOCK_SIZE) != 0) { + *errorPtr = "Logical size must be a multiple of 4096"; + return VDO_PARAMETER_MISMATCH; + } + + int result = prepareToResizeLogical(layer, logicalBytes / VDO_BLOCK_SIZE); + if (result != VDO_SUCCESS) { + *errorPtr = "Device prepareToGrowLogical failed"; + return result; + } + } + + if (config->physicalBlocks != extantConfig->physicalBlocks) { + int result = prepareToResizePhysical(layer, config->physicalBlocks); + if (result != VDO_SUCCESS) { + if (result == VDO_TOO_MANY_SLABS) { + *errorPtr = "Device prepareToGrowPhysical failed (specified physical" + " size too big based on formatted slab size)"; + } else { + *errorPtr = "Device prepareToGrowPhysical failed"; + } + return result; + } + } + + return VDO_SUCCESS; +} + +/********************************************************************** + * Modify the pool name of the device. + * + * @param layer The kernel layer + * @param oldName The old pool name + * @param newName The new pool name + * + * @return VDO_SUCCESS or an error + * + */ +int modifyPoolName(KernelLayer *layer, char *oldName, char *newName) +{ + // We use pool name for sysfs and procfs. Rename them accordingly + logInfo("Modify pool name from %s to %s", oldName, newName); + + void *procfsPrivate; + int result = vdoCreateProcfsEntry(layer, newName, &procfsPrivate); + if (result != VDO_SUCCESS) { + return result; + } + + result = kobject_rename(&layer->kobj, newName); + if (result != 0) { + vdoDestroyProcfsEntry(newName, procfsPrivate); + return result; + } + + void *tmpProcfs = layer->procfsPrivate; + layer->procfsPrivate = procfsPrivate; + + vdoDestroyProcfsEntry(oldName, tmpProcfs); + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int modifyKernelLayer(KernelLayer *layer, + DeviceConfig *config) +{ + KernelLayerState state = getKernelLayerState(layer); + if (state == LAYER_RUNNING) { + return VDO_SUCCESS; + } else if (state != LAYER_SUSPENDED) { + logError("pre-resume invoked while in unexpected kernel layer state %d", + state); + return -EINVAL; + } + + setKernelLayerState(layer, LAYER_RESUMING); + + DeviceConfig *extantConfig = layer->deviceConfig; + + // A failure here is unrecoverable. So there is no problem if it happens. + + if (config->writePolicy != extantConfig->writePolicy) { + /* + * Ordinarily, when going from async to sync, we must flush any metadata + * written. However, because the underlying storage must have gone into + * sync mode before we suspend VDO, and suspending VDO concludes by + * issuing a flush, all metadata written before the suspend is flushed + * by the suspend and all metadata between the suspend and the write + * policy change is written to synchronous storage. + */ + logInfo("Modifying device '%s' write policy from %s to %s", + config->poolName, getConfigWritePolicyString(extantConfig), + getConfigWritePolicyString(config)); + setWritePolicy(layer->kvdo.vdo, config->writePolicy); + } + + if (config->owningTarget->len != extantConfig->owningTarget->len) { + size_t logicalBytes = to_bytes(config->owningTarget->len); + int result = resizeLogical(layer, logicalBytes / VDO_BLOCK_SIZE); + if (result != VDO_SUCCESS) { + return result; + } + } + + // Grow physical if the version is 0, so we can't tell if we + // got an old-style growPhysical command, or if size changed. 
+ if ((config->physicalBlocks != extantConfig->physicalBlocks) + || (config->version == 0)) { + int result = resizePhysical(layer, config->physicalBlocks); + if (result != VDO_SUCCESS) { + return result; + } + } + + if (strcmp(config->poolName, extantConfig->poolName) != 0) { + logInfo("Modifying device '%s' pool name from %s to %s", + config->poolName, extantConfig->poolName, config->poolName); + int result = modifyPoolName(layer, extantConfig->poolName, + config->poolName); + if (result != VDO_SUCCESS) { + return result; + } + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +void freeKernelLayer(KernelLayer *layer) +{ + // This is not the cleanest implementation, but given the current timing + // uncertainties in the shutdown process for work queues, we need to + // store information to enable a late-in-process deallocation of + // funnel-queue data structures in work queues. + bool usedBioAckQueue = false; + bool usedCpuQueue = false; + bool usedKVDO = false; + bool releaseInstance = false; + + KernelLayerState state = getKernelLayerState(layer); + switch (state) { + case LAYER_STOPPING: + logError("re-entered freeKernelLayer while stopping"); + break; + + case LAYER_RUNNING: + suspendKernelLayer(layer); + // fall through + + case LAYER_STARTING: + case LAYER_RESUMING: + case LAYER_SUSPENDED: + stopKernelLayer(layer); + // fall through + + case LAYER_STOPPED: + case LAYER_CPU_QUEUE_INITIALIZED: + finishWorkQueue(layer->cpuQueue); + usedCpuQueue = true; + releaseInstance = true; + // fall through + + case LAYER_BIO_ACK_QUEUE_INITIALIZED: + if (useBioAckQueue(layer)) { + finishWorkQueue(layer->bioAckQueue); + usedBioAckQueue = true; + } + // fall through + + case LAYER_BIO_DATA_INITIALIZED: + cleanupIOSubmitter(layer->ioSubmitter); + // fall through + + case LAYER_REQUEST_QUEUE_INITIALIZED: + finishKVDO(&layer->kvdo); + usedKVDO = true; + // fall through + + case LAYER_BUFFER_POOLS_INITIALIZED: + freeBufferPool(&layer->dataKVIOPool); + freeBufferPool(&layer->traceBufferPool); + // fall through + + case LAYER_SIMPLE_THINGS_INITIALIZED: + if (layer->compressionContext != NULL) { + for (int i = 0; i < layer->deviceConfig->threadCounts.cpuThreads; i++) { + FREE(layer->compressionContext[i]); + } + FREE(layer->compressionContext); + } + if (layer->dedupeIndex != NULL) { + finishDedupeIndex(layer->dedupeIndex); + } + FREE(layer->spareKVDOFlush); + layer->spareKVDOFlush = NULL; + freeBatchProcessor(&layer->dataKVIOReleaser); + removeLayerFromDeviceRegistry(layer); + break; + + default: + logError("Unknown Kernel Layer state: %d", state); + } + + // Late deallocation of resources in work queues. + if (usedCpuQueue) { + freeWorkQueue(&layer->cpuQueue); + } + if (usedBioAckQueue) { + freeWorkQueue(&layer->bioAckQueue); + } + if (layer->ioSubmitter) { + freeIOSubmitter(layer->ioSubmitter); + } + if (usedKVDO) { + destroyKVDO(&layer->kvdo); + } + if (layer->bioset != NULL) { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,18,0) + bioset_exit(layer->bioset); + FREE(layer->bioset); +#else + bioset_free(layer->bioset); +#endif + layer->bioset = NULL; + } + + freeDedupeIndex(&layer->dedupeIndex); + + stopPeriodicEventReporter(&layer->albireoTimeoutReporter); + if (releaseInstance) { + releaseKVDOInstance(layer->instance); + } + + // The call to kobject_put on the kobj sysfs node will decrement its + // reference count; when the count goes to zero the VDO object and + // the kernel layer object will be freed as a side effect. 
+ kobject_put(&layer->wqDirectory); + kobject_put(&layer->kobj); +} + +/**********************************************************************/ +static void poolStatsRelease(struct kobject *kobj) +{ + KernelLayer *layer = container_of(kobj, KernelLayer, statsDirectory); + complete(&layer->statsShutdown); +} + +/**********************************************************************/ +int preloadKernelLayer(KernelLayer *layer, + const VDOLoadConfig *loadConfig, + char **reason) +{ + if (getKernelLayerState(layer) != LAYER_CPU_QUEUE_INITIALIZED) { + *reason = "preloadKernelLayer() may only be invoked after initialization"; + return UDS_BAD_STATE; + } + + setKernelLayerState(layer, LAYER_STARTING); + int result = preloadKVDO(&layer->kvdo, &layer->common, loadConfig, + layer->vioTraceRecording, reason); + if (result != VDO_SUCCESS) { + stopKernelLayer(layer); + return result; + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int startKernelLayer(KernelLayer *layer, char **reason) +{ + if (getKernelLayerState(layer) != LAYER_STARTING) { + *reason = "Cannot start kernel from non-starting state"; + stopKernelLayer(layer); + return UDS_BAD_STATE; + } + + int result = startKVDO(&layer->kvdo, &layer->common, reason); + if (result != VDO_SUCCESS) { + stopKernelLayer(layer); + return result; + } + + setKernelLayerState(layer, LAYER_RUNNING); + static struct kobj_type statsDirectoryKobjType = { + .release = poolStatsRelease, + .sysfs_ops = &poolStatsSysfsOps, + .default_attrs = poolStatsAttrs, + }; + kobject_init(&layer->statsDirectory, &statsDirectoryKobjType); + result = kobject_add(&layer->statsDirectory, &layer->kobj, "statistics"); + if (result != 0) { + *reason = "Cannot add sysfs statistics node"; + stopKernelLayer(layer); + return result; + } + layer->statsAdded = true; + + if (layer->deviceConfig->deduplication) { + // Don't try to load or rebuild the index first (and log scary error + // messages) if this is known to be a newly-formatted volume. + startDedupeIndex(layer->dedupeIndex, wasNew(layer->kvdo.vdo)); + } + + result = vdoCreateProcfsEntry(layer, layer->deviceConfig->poolName, + &layer->procfsPrivate); + if (result != VDO_SUCCESS) { + *reason = "Could not create proc filesystem entry"; + stopKernelLayer(layer); + return result; + } + + layer->allocationsAllowed = false; + + return VDO_SUCCESS; +} + +/**********************************************************************/ +void stopKernelLayer(KernelLayer *layer) +{ + layer->allocationsAllowed = true; + + // Stop services that need to gather VDO statistics from the worker threads. + if (layer->statsAdded) { + layer->statsAdded = false; + init_completion(&layer->statsShutdown); + kobject_put(&layer->statsDirectory); + wait_for_completion(&layer->statsShutdown); + } + vdoDestroyProcfsEntry(layer->deviceConfig->poolName, layer->procfsPrivate); + + switch (getKernelLayerState(layer)) { + case LAYER_RUNNING: + suspendKernelLayer(layer); + // fall through + + case LAYER_SUSPENDED: + setKernelLayerState(layer, LAYER_STOPPING); + stopDedupeIndex(layer->dedupeIndex); + // fall through + + case LAYER_STOPPING: + case LAYER_STOPPED: + default: + setKernelLayerState(layer, LAYER_STOPPED); + } +} + +/**********************************************************************/ +int suspendKernelLayer(KernelLayer *layer) +{ + // It's important to note any error here does not actually stop device-mapper + // from suspending the device. All this work is done post suspend. 
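stopKernelLayer() above retires the statistics kobject by dropping its reference and then blocking on a completion that the kobject's release callback, poolStatsRelease(), signals once the last reference is gone; this guarantees no sysfs reader is still using the statistics storage when teardown continues. A rough userspace model of that handshake follows, using C11 atomics in place of the kobject reference count and a flag plus busy-wait in place of a struct completion; every name here is illustrative.

#include <stdatomic.h>
#include <stdbool.h>

struct example_stats_obj {
  atomic_int  refcount;   /* stands in for the kobject reference count */
  atomic_bool released;   /* stands in for the statsShutdown completion */
};

/* Runs when the last reference is dropped, like poolStatsRelease(). */
void example_release(struct example_stats_obj *obj)
{
  atomic_store(&obj->released, true);   /* complete(&layer->statsShutdown) */
}

void example_put(struct example_stats_obj *obj)
{
  if (atomic_fetch_sub(&obj->refcount, 1) == 1) {
    example_release(obj);
  }
}

/* The shutdown path drops its own reference, then waits for the release
 * callback before continuing, like wait_for_completion() above. */
void example_shutdown(struct example_stats_obj *obj)
{
  example_put(obj);
  while (!atomic_load(&obj->released)) {
    /* spin; the kernel code sleeps on a completion instead */
  }
}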
+ KernelLayerState state = getKernelLayerState(layer); + if (state == LAYER_SUSPENDED) { + return VDO_SUCCESS; + } + if (state != LAYER_RUNNING) { + logError("Suspend invoked while in unexpected kernel layer state %d", + state); + return -EINVAL; + } + + /* + * Attempt to flush all I/O before completing post suspend work. This is + * needed so that changing write policy upon resume is safe. Also, we think + * a suspended device is expected to have persisted all data written before + * the suspend, even if it hasn't been flushed yet. + */ + waitForNoRequestsActive(layer); + int result = synchronousFlush(layer); + if (result != VDO_SUCCESS) { + setKVDOReadOnly(&layer->kvdo, result); + } + + /* + * Suspend the VDO, writing out all dirty metadata if the no-flush flag + * was not set on the dmsetup suspend call. This will ensure that we don't + * have cause to write while suspended [VDO-4402]. + */ + int suspendResult = suspendKVDO(&layer->kvdo); + if (result == VDO_SUCCESS) { + result = suspendResult; + } + + suspendDedupeIndex(layer->dedupeIndex, !layer->noFlushSuspend); + setKernelLayerState(layer, LAYER_SUSPENDED); + return result; +} + +/**********************************************************************/ +int resumeKernelLayer(KernelLayer *layer) +{ + if (getKernelLayerState(layer) == LAYER_RUNNING) { + return VDO_SUCCESS; + } + + resumeDedupeIndex(layer->dedupeIndex); + int result = resumeKVDO(&layer->kvdo); + if (result != VDO_SUCCESS) { + return result; + } + + setKernelLayerState(layer, LAYER_RUNNING); + return VDO_SUCCESS; +} + +/***********************************************************************/ +int prepareToResizePhysical(KernelLayer *layer, BlockCount physicalCount) +{ + logInfo("Preparing to resize physical to %llu", physicalCount); + // Allocations are allowed and permissible through this non-VDO thread, + // since IO triggered by this allocation to VDO can finish just fine. + int result = kvdoPrepareToGrowPhysical(&layer->kvdo, physicalCount); + if (result != VDO_SUCCESS) { + // kvdoPrepareToGrowPhysical logs errors. + if (result == VDO_PARAMETER_MISMATCH) { + // If we don't trap this case, mapToSystemError() will remap it to -EIO, + // which is misleading and ahistorical. + return -EINVAL; + } else { + return result; + } + } + + logInfo("Done preparing to resize physical"); + return VDO_SUCCESS; +} + +/***********************************************************************/ +int resizePhysical(KernelLayer *layer, BlockCount physicalCount) +{ + // We must not mark the layer as allowing allocations when it is suspended + // lest an allocation attempt block on writing IO to the suspended VDO. + int result = kvdoResizePhysical(&layer->kvdo, physicalCount); + if (result != VDO_SUCCESS) { + // kvdoResizePhysical logs errors + return result; + } + return VDO_SUCCESS; +} + +/***********************************************************************/ +int prepareToResizeLogical(KernelLayer *layer, BlockCount logicalCount) +{ + logInfo("Preparing to resize logical to %llu", logicalCount); + // Allocations are allowed and permissible through this non-VDO thread, + // since IO triggered by this allocation to VDO can finish just fine. 
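The logical block counts handed to these resize helpers are derived earlier (in prepareToModifyKernelLayer and modifyKernelLayer) from the device-mapper target length, which is expressed in 512-byte sectors and must describe a whole number of 4096-byte VDO blocks. A standalone sketch of that conversion and check, with illustrative constant and function names:

#include <stdbool.h>
#include <stdint.h>

#define EXAMPLE_VDO_BLOCK_SIZE 4096u
#define EXAMPLE_SECTOR_SIZE     512u   /* standard kernel sector */

/*
 * Convert a device-mapper target length (in 512-byte sectors) to VDO
 * blocks, rejecting lengths that are not a whole number of blocks,
 * as the "must be a multiple of 4096" check above does.
 */
bool sectors_to_vdo_blocks(uint64_t sectors, uint64_t *blocks)
{
  uint64_t bytes = sectors * EXAMPLE_SECTOR_SIZE;   /* to_bytes() equivalent */
  if ((bytes % EXAMPLE_VDO_BLOCK_SIZE) != 0) {
    return false;
  }
  *blocks = bytes / EXAMPLE_VDO_BLOCK_SIZE;
  return true;
}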
+ int result = kvdoPrepareToGrowLogical(&layer->kvdo, logicalCount); + if (result != VDO_SUCCESS) { + // kvdoPrepareToGrowLogical logs errors + return result; + } + + logInfo("Done preparing to resize logical"); + return VDO_SUCCESS; +} + +/***********************************************************************/ +int resizeLogical(KernelLayer *layer, BlockCount logicalCount) +{ + logInfo("Resizing logical to %llu", logicalCount); + // We must not mark the layer as allowing allocations when it is suspended + // lest an allocation attempt block on writing IO to the suspended VDO. + int result = kvdoResizeLogical(&layer->kvdo, logicalCount); + if (result != VDO_SUCCESS) { + // kvdoResizeLogical logs errors + return result; + } + + logInfo("Logical blocks now %llu", logicalCount); + return VDO_SUCCESS; +} + diff --git a/source/vdo/kernel/kernelLayer.h b/source/vdo/kernel/kernelLayer.h new file mode 100644 index 0000000..4e0bf8c --- /dev/null +++ b/source/vdo/kernel/kernelLayer.h @@ -0,0 +1,583 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelLayer.h#18 $ + */ + +#ifndef KERNELLAYER_H +#define KERNELLAYER_H + +#include + +#include "atomic.h" +#include "constants.h" +#include "flush.h" +#include "intMap.h" +#include "physicalLayer.h" +#include "ringNode.h" +#include "volumeGeometry.h" +#include "waitQueue.h" + +#include "batchProcessor.h" +#include "bufferPool.h" +#include "deadlockQueue.h" +#include "deviceConfig.h" +#include "histogram.h" +#include "kernelStatistics.h" +#include "kernelTypes.h" +#include "kernelVDO.h" +#include "ktrace.h" +#include "limiter.h" +#include "statistics.h" +#include "workQueue.h" + +enum { + VDO_SECTORS_PER_BLOCK = (VDO_BLOCK_SIZE >> SECTOR_SHIFT) +}; + +typedef enum { + LAYER_SIMPLE_THINGS_INITIALIZED, + LAYER_BUFFER_POOLS_INITIALIZED, + LAYER_REQUEST_QUEUE_INITIALIZED, + LAYER_CPU_QUEUE_INITIALIZED, + LAYER_BIO_ACK_QUEUE_INITIALIZED, + LAYER_BIO_DATA_INITIALIZED, + LAYER_STARTING, + LAYER_RUNNING, + LAYER_SUSPENDED, + LAYER_STOPPING, + LAYER_STOPPED, + LAYER_RESUMING, +} KernelLayerState; + +/* Keep BIO statistics atomically */ +struct atomicBioStats { + atomic64_t read; // Number of not REQ_WRITE bios + atomic64_t write; // Number of REQ_WRITE bios + atomic64_t discard; // Number of REQ_DISCARD bios + atomic64_t flush; // Number of REQ_FLUSH bios + atomic64_t fua; // Number of REQ_FUA bios +}; + +// Data managing the reporting of Albireo timeouts +typedef struct periodicEventReporter { + uint64_t lastReportedValue; + const char *format; + atomic64_t value; + Jiffies reportingInterval; // jiffies + /* + * Just an approximation. 
If nonzero, then either the work item has + * been queued to run, or some other thread currently has + * responsibility for enqueueing it, or the reporter function is + * running but hasn't looked at the current value yet. + * + * If this is set, don't set the timer again, because we don't want + * the work item queued twice. Use an atomic xchg or cmpxchg to + * test-and-set it, and an atomic store to clear it. + */ + atomic_t workItemQueued; + KvdoWorkItem workItem; + KernelLayer *layer; +} PeriodicEventReporter; + +static inline uint64_t getEventCount(PeriodicEventReporter *reporter) +{ + return atomic64_read(&reporter->value); +} + +/** + * The VDO representation of the target device + **/ +struct kernelLayer { + PhysicalLayer common; + // Layer specific info + DeviceConfig *deviceConfig; + /** A ring of all DeviceConfigs referencing this layer */ + RingNode deviceConfigRing; + char threadNamePrefix[MAX_QUEUE_NAME_LEN]; + struct kobject kobj; + struct kobject wqDirectory; + struct kobject statsDirectory; + /** + * A counter value to attach to thread names and log messages to + * identify the individual device. + **/ + unsigned int instance; + /** Contains the current KernelLayerState, which rarely changes */ + Atomic32 state; + bool noFlushSuspend; + bool allocationsAllowed; + AtomicBool processingMessage; + /** Limit the number of requests that are being processed. */ + Limiter requestLimiter; + Limiter discardLimiter; + KVDO kvdo; + /** Incoming bios we've had to buffer to avoid deadlock. */ + DeadlockQueue deadlockQueue; + // for REQ_FLUSH processing + struct bio_list waitingFlushes; + KVDOFlush *spareKVDOFlush; + spinlock_t flushLock; + Jiffies flushArrivalTime; + /** + * Bio submission manager used for sending bios to the storage + * device. + **/ + IOSubmitter *ioSubmitter; + /** + * Work queue (possibly with multiple threads) for miscellaneous + * CPU-intensive, non-blocking work. + **/ + KvdoWorkQueue *cpuQueue; + /** N blobs of context data for LZ4 code, one per CPU thread. */ + char **compressionContext; + Atomic32 compressionContextIndex; + /** Optional work queue for calling bio_endio. */ + KvdoWorkQueue *bioAckQueue; + /** Underlying block device info. */ + uint64_t startingSectorOffset; + VolumeGeometry geometry; + // Memory allocation + BufferPool *dataKVIOPool; + struct bio_set *bioset; + // Albireo specific info + DedupeIndex *dedupeIndex; + // Statistics + atomic64_t biosSubmitted; + atomic64_t biosCompleted; + atomic64_t dedupeContextBusy; + atomic64_t flushOut; + AtomicBioStats biosIn; + AtomicBioStats biosInPartial; + AtomicBioStats biosOut; + AtomicBioStats biosOutCompleted; + AtomicBioStats biosAcknowledged; + AtomicBioStats biosAcknowledgedPartial; + AtomicBioStats biosMeta; + AtomicBioStats biosMetaCompleted; + AtomicBioStats biosJournal; + AtomicBioStats biosPageCache; + AtomicBioStats biosJournalCompleted; + AtomicBioStats biosPageCacheCompleted; + // for reporting Albireo timeouts + PeriodicEventReporter albireoTimeoutReporter; + // Debugging + /* Whether to dump VDO state on shutdown */ + bool dumpOnShutdown; + /** + * Whether we should collect tracing info. (Actually, this controls + * allocations; non-null record pointers cause recording.) + **/ + bool vioTraceRecording; + SampleCounter traceSampleCounter; + /* Should we log tracing info? */ + bool traceLogging; + /* Storage for trace data. */ + BufferPool *traceBufferPool; + /* Private storage for procfs. 
*/ + void *procfsPrivate; + /* For returning batches of DataKVIOs to their pool */ + BatchProcessor *dataKVIOReleaser; + + // Administrative operations + /* The object used to wait for administrative operations to complete */ + struct completion callbackSync; + + // Statistics reporting + /* Protects the *statsStorage structs */ + struct mutex statsMutex; + /* Used when shutting down the sysfs statistics */ + struct completion statsShutdown;; + /* true if sysfs statistics directory is set up */ + bool statsAdded; + /* Used to gather statistics without allocating memory */ + VDOStatistics vdoStatsStorage; + KernelStatistics kernelStatsStorage; +}; + +typedef enum bioQAction { + BIO_Q_ACTION_COMPRESSED_DATA, + BIO_Q_ACTION_DATA, + BIO_Q_ACTION_FLUSH, + BIO_Q_ACTION_HIGH, + BIO_Q_ACTION_METADATA, + BIO_Q_ACTION_READCACHE, + BIO_Q_ACTION_VERIFY +} BioQAction; + +typedef enum cpuQAction { + CPU_Q_ACTION_COMPLETE_KVIO, + CPU_Q_ACTION_COMPRESS_BLOCK, + CPU_Q_ACTION_EVENT_REPORTER, + CPU_Q_ACTION_HASH_BLOCK, +} CPUQAction; + +typedef enum bioAckQAction { + BIO_ACK_Q_ACTION_ACK, +} BioAckQAction; + +typedef void (*DedupeShutdownCallbackFunction)(KernelLayer *layer); + +/* + * Wrapper for the Enqueueable object, to associate it with a kernel + * layer work item. + */ +typedef struct kvdoEnqueueable { + KvdoWorkItem workItem; + Enqueueable enqueueable; +} KvdoEnqueueable; + +/** + * Implements LayerFilter. + **/ +bool layerIsNamed(KernelLayer *layer, void *context) + __attribute__((warn_unused_result)); + +/** + * Creates a kernel specific physical layer to be used by VDO + * + * @param startingSector The sector offset of our table entry in the + * DM device + * @param instance Device instantiation counter + * @param parentKobject The parent sysfs node + * @param config The device configuration + * @param threadConfigPointer Where to store the new threadConfig handle + * @param reason The reason for any failure during this call + * @param layerPtr A pointer to hold the created layer + * + * @return VDO_SUCCESS or an error + **/ +int makeKernelLayer(uint64_t startingSector, + unsigned int instance, + DeviceConfig *config, + struct kobject *parentKobject, + ThreadConfig **threadConfigPointer, + char **reason, + KernelLayer **layerPtr) + __attribute__((warn_unused_result)); + +/** + * Prepare to modify a kernel layer. + * + * @param layer The layer to modify + * @param config The new device configuration + * @param errorPtr A pointer to store the reason for any failure + * + * @return VDO_SUCCESS or an error + **/ +int prepareToModifyKernelLayer(KernelLayer *layer, + DeviceConfig *config, + char **errorPtr) + __attribute__((warn_unused_result)); + +/** + * Modify a kernel physical layer. + * + * @param layer The layer to modify + * @param config The new device configuration + * + * @return VDO_SUCCESS or an error + **/ +int modifyKernelLayer(KernelLayer *layer, + DeviceConfig *config) + __attribute__((warn_unused_result)); + +/** + * Free a kernel physical layer. + * + * @param layer The layer, which must have been created by + * makeKernelLayer + **/ +void freeKernelLayer(KernelLayer *layer); + +/** + * Make and configure a kernel layer. This method does not alter the VDO state + * on disk. It should be run from the VDO constructor for devices which have + * not been started. 
+ * + * @param layer The kernel layer + * @param loadConfig Load-time parameters for the VDO + * @param reason The reason for any failure during this call + * + * @return VDO_SUCCESS or an error + * + * @note redundant starts are silently ignored + **/ +int preloadKernelLayer(KernelLayer *layer, + const VDOLoadConfig *loadConfig, + char **reason); + +/** + * Start the kernel layer. This method finishes bringing a VDO online now that + * a table is being resumed for the first time. + * + * @param layer The kernel layer + * @param reason The reason for any failure during this call + * + * @return VDO_SUCCESS or an error + **/ +int startKernelLayer(KernelLayer *layer, char **reason); + +/** + * Stop the kernel layer. + * + * @param layer The kernel layer + **/ +void stopKernelLayer(KernelLayer *layer); + +/** + * Suspend the kernel layer. + * + * @param layer The kernel layer + * + * @return VDO_SUCCESS or an error + **/ +int suspendKernelLayer(KernelLayer *layer); + +/** + * Resume the kernel layer. + * + * @param layer The kernel layer + * + * @return VDO_SUCCESS or an error + **/ +int resumeKernelLayer(KernelLayer *layer); + +/** + * Get the kernel layer state. + * + * @param layer The kernel layer + * + * @return the instantaneously correct kernel layer state + **/ +static inline KernelLayerState getKernelLayerState(const KernelLayer *layer) +{ + return atomicLoad32(&layer->state); +} + +/** + * Function call to begin processing a bio passed in from the block layer + * + * @param layer The physical layer + * @param bio The bio from the block layer + * + * @return value to return from the VDO map function. Either an error code + * or DM_MAPIO_REMAPPED or DM_MAPPED_SUBMITTED (see vdoMapBio for + * details). + **/ +int kvdoMapBio(KernelLayer *layer, BIO *bio); + +/** + * Convert a generic PhysicalLayer to a kernelLayer. + * + * @param layer The PhysicalLayer to convert + * + * @return The PhysicalLayer as a KernelLayer + **/ +static inline KernelLayer *asKernelLayer(PhysicalLayer *layer) +{ + return container_of(layer, KernelLayer, common); +} + +/** + * Convert a block number (or count) to a (512-byte-)sector number. + * + * The argument type is sector_t to force conversion to the type we + * want, although the actual values passed are of various integral + * types. It's just too easy to forget and do the multiplication + * without casting, resulting in 32-bit arithmetic that accidentally + * produces wrong results in devices over 2TB (2**32 sectors). + * + * @param [in] layer the physical layer + * @param [in] blockNumber the block number/count + * + * @return the sector number/count + **/ +static inline sector_t blockToSector(KernelLayer *layer, sector_t blockNumber) +{ + return (blockNumber * VDO_SECTORS_PER_BLOCK); +} + +/** + * Convert a sector number (or count) to a block number. Does not + * check to make sure the sector number is an integral number of + * blocks. + * + * @param [in] layer the physical layer + * @param [in] sectorNumber the sector number/count + * + * @return the block number/count + **/ +static inline sector_t sectorToBlock(KernelLayer *layer, sector_t sectorNumber) +{ + return (sectorNumber / VDO_SECTORS_PER_BLOCK); +} + +/** + * Convert a sector number to an offset within a block. 
+ * + * @param [in] layer the physical layer + * @param [in] sectorNumber the sector number + * + * @return the offset within the block + **/ +static inline BlockSize sectorToBlockOffset(KernelLayer *layer, + sector_t sectorNumber) +{ + unsigned int sectorsPerBlockMask = VDO_SECTORS_PER_BLOCK - 1; + return to_bytes(sectorNumber & sectorsPerBlockMask); +} + +/** + * Get the block device object currently underlying a kernel layer. + * + * @param layer The kernel layer in question + * + * @return The block device object under the layer + **/ +struct block_device *getKernelLayerBdev(const KernelLayer *layer) + __attribute__((warn_unused_result)); + +/** + * Set the layer's active config. + * + * @param layer The kernel layer in question + * @param config The config in question + **/ +static inline void setKernelLayerActiveConfig(KernelLayer *layer, + DeviceConfig *config) +{ + layer->deviceConfig = config; +} + +/** + * Given an error code, return a value we can return to the OS. The + * input error code may be a system-generated value (such as -EIO), an + * errno macro used in our code (such as EIO), or a UDS or VDO status + * code; the result must be something the rest of the OS can consume + * (negative errno values such as -EIO, in the case of the kernel). + * + * @param error the error code to convert + * + * @return a system error code value + **/ +int mapToSystemError(int error); + +/** + * Record and eventually report that some number of dedupe requests + * reached their expiration time without getting an answer, so we + * timed out on them. + * + * This is called in a timer context, so it shouldn't do the reporting + * directly. + * + * @param layer The kernel layer for the device + * @param expiredCount The number of expired requests we timed out on + **/ +void kvdoReportDedupeTimeout(KernelLayer *layer, unsigned int expiredCount); + +/** + * Wait until there are no requests in progress. + * + * @param layer The kernel layer for the device + **/ +void waitForNoRequestsActive(KernelLayer *layer); + +/** + * Enqueues an item on our internal "cpu queues". Since there is more than + * one, we rotate through them in hopes of creating some general balance. + * + * @param layer The kernel layer + * @param item The work item to enqueue + */ +static inline void enqueueCPUWorkQueue(KernelLayer *layer, KvdoWorkItem *item) +{ + enqueueWorkQueue(layer->cpuQueue, item); +} + +/** + * Adjust parameters to prepare to use a larger physical space. + * The size must be larger than the current size. + * + * @param layer the kernel layer + * @param physicalCount the new physical size in blocks + * + * @return VDO_SUCCESS or an error + */ +int prepareToResizePhysical(KernelLayer *layer, BlockCount physicalCount); + +/** + * Adjusts parameters to reflect resizing the underlying device. + * The size must be larger than the current size. + * + * @param layer the kernel layer + * @param physicalCount the new physical count in blocks + * + * @return VDO_SUCCESS or an error + */ +int resizePhysical(KernelLayer *layer, BlockCount physicalCount); + +/** + * Adjust parameters to prepare to present a larger logical space. + * The size must be larger than the current size. + * + * @param layer the kernel layer + * @param logicalCount the new logical size in blocks + * + * @return VDO_SUCCESS or an error + */ +int prepareToResizeLogical(KernelLayer *layer, BlockCount logicalCount); + +/** + * Adjust parameters to present a larger logical space. + * The size must be larger than the current size. 
+ * + * @param layer the kernel layer + * @param logicalCount the new logical size in blocks + * + * @return VDO_SUCCESS or an error + */ +int resizeLogical(KernelLayer *layer, BlockCount logicalCount); + +/** + * Indicate whether the kernel layer is configured to use a separate + * work queue for acknowledging received and processed bios. + * + * Note that this directly controls handling of write operations, but + * the compile-time flag USE_BIO_ACK_QUEUE_FOR_READ is also checked + * for read operations. + * + * @param layer The kernel layer + * + * @return Whether a bio-acknowledgement work queue is in use + **/ +static inline bool useBioAckQueue(KernelLayer *layer) +{ + return layer->deviceConfig->threadCounts.bioAckThreads > 0; +} + +/** + * Update bookkeeping for the completion of some number of requests, so that + * more incoming requests can be accepted. + * + * @param layer The kernel layer + * @param count The number of completed requests + **/ +void completeManyRequests(KernelLayer *layer, uint32_t count); + +#endif /* KERNELLAYER_H */ diff --git a/source/vdo/kernel/kernelStatistics.h b/source/vdo/kernel/kernelStatistics.h new file mode 100644 index 0000000..a5c1210 --- /dev/null +++ b/source/vdo/kernel/kernelStatistics.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#ifndef KERNEL_STATISTICS_H +#define KERNEL_STATISTICS_H + +#include "header.h" +#include "types.h" + +typedef struct { + /** Number of not REQ_WRITE bios */ + uint64_t read; + /** Number of REQ_WRITE bios */ + uint64_t write; + /** Number of REQ_DISCARD bios */ + uint64_t discard; + /** Number of REQ_FLUSH bios */ + uint64_t flush; + /** Number of REQ_FUA bios */ + uint64_t fua; +} BioStats; + +typedef struct { + /** Tracked bytes currently allocated. */ + uint64_t bytesUsed; + /** Maximum tracked bytes allocated. 
*/ + uint64_t peakBytesUsed; +} MemoryUsage; + +/** UDS index statistics */ +typedef struct { + /** Number of chunk names stored in the index */ + uint64_t entriesIndexed; + /** Number of post calls that found an existing entry */ + uint64_t postsFound; + /** Number of post calls that added a new entry */ + uint64_t postsNotFound; + /** Number of query calls that found an existing entry */ + uint64_t queriesFound; + /** Number of query calls that added a new entry */ + uint64_t queriesNotFound; + /** Number of update calls that found an existing entry */ + uint64_t updatesFound; + /** Number of update calls that added a new entry */ + uint64_t updatesNotFound; + /** Current number of dedupe queries that are in flight */ + uint32_t currDedupeQueries; + /** Maximum number of dedupe queries that have been in flight */ + uint32_t maxDedupeQueries; +} IndexStatistics; + +typedef struct { + uint32_t version; + uint32_t releaseVersion; + /** The VDO instance */ + uint32_t instance; + /** Current number of active VIOs */ + uint32_t currentVIOsInProgress; + /** Maximum number of active VIOs */ + uint32_t maxVIOs; + /** Number of times the UDS index was too slow in responding */ + uint64_t dedupeAdviceTimeouts; + /** Number of flush requests submitted to the storage device */ + uint64_t flushOut; + /** Logical block size */ + uint64_t logicalBlockSize; + /** Bios submitted into VDO from above */ + BioStats biosIn; + BioStats biosInPartial; + /** Bios submitted onward for user data */ + BioStats biosOut; + /** Bios submitted onward for metadata */ + BioStats biosMeta; + BioStats biosJournal; + BioStats biosPageCache; + BioStats biosOutCompleted; + BioStats biosMetaCompleted; + BioStats biosJournalCompleted; + BioStats biosPageCacheCompleted; + BioStats biosAcknowledged; + BioStats biosAcknowledgedPartial; + /** Current number of bios in progress */ + BioStats biosInProgress; + /** Memory usage stats. */ + MemoryUsage memoryUsage; + /** The statistics for the UDS index */ + IndexStatistics index; +} KernelStatistics; + +/** + * Get the root for all stats proc files. + * + * @return The proc root + **/ +static inline const char *getProcRoot(void) { + return "vdo"; +} + +/** + * Get the proc file path for reading KernelStatistics. + * + * @return The proc file path + **/ +static inline const char *getKernelStatisticsProcFile(void) { + return "kernel_stats"; +} + +#endif /* not KERNEL_STATISTICS_H */ diff --git a/source/vdo/kernel/kernelTypes.h b/source/vdo/kernel/kernelTypes.h new file mode 100644 index 0000000..b338440 --- /dev/null +++ b/source/vdo/kernel/kernelTypes.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelTypes.h#3 $ + */ + +#ifndef KERNEL_TYPES_H +#define KERNEL_TYPES_H + +#include "types.h" + +/** + * The size of a discard request in bytes. + **/ +typedef uint32_t DiscardSize; + +/** + * A time in jiffies. + **/ +typedef uint64_t Jiffies; + +/** + * A timeout in jiffies. + **/ +typedef int64_t TimeoutJiffies; + +typedef struct atomicBioStats AtomicBioStats; +typedef struct bio BIO; +typedef struct dataKVIO DataKVIO; +typedef struct dedupeContext DedupeContext; +typedef struct dedupeIndex DedupeIndex; +typedef struct ioSubmitter IOSubmitter; +typedef struct kernelLayer KernelLayer; +typedef struct kvdo KVDO; +typedef struct kvdoFlush KVDOFlush; +typedef struct kvdoWorkItem KvdoWorkItem; +typedef struct kvdoWorkQueue KvdoWorkQueue; +typedef struct kvio KVIO; + +typedef void (*KVIOCallback)(KVIO *kvio); +typedef void (*DataKVIOCallback)(DataKVIO *dataKVIO); +typedef void (*KvdoWorkFunction)(KvdoWorkItem *workItem); + +/** + * Method type for layer matching methods. + * + * A LayerFilter method returns false if the layer doesn't match. + **/ +typedef bool LayerFilter(KernelLayer *layer, void *context); + +#endif /* KERNEL_TYPES_H */ diff --git a/source/vdo/kernel/kernelVDO.c b/source/vdo/kernel/kernelVDO.c new file mode 100644 index 0000000..5e1a72e --- /dev/null +++ b/source/vdo/kernel/kernelVDO.c @@ -0,0 +1,578 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelVDO.c#7 $
+ */
+
+#include "kernelVDOInternals.h"
+
+#include
+
+#include "memoryAlloc.h"
+
+#include "statistics.h"
+#include "threadConfig.h"
+#include "vdo.h"
+#include "vdoDebug.h"
+#include "vdoLoad.h"
+#include "vdoResize.h"
+#include "vdoResizeLogical.h"
+#include "vdoResume.h"
+#include "vdoSuspend.h"
+
+#include "kernelLayer.h"
+#include "kvio.h"
+#include "logger.h"
+
+enum { PARANOID_THREAD_CONSISTENCY_CHECKS = 0 };
+
+/**********************************************************************/
+static void startKVDORequestQueue(void *ptr)
+{
+  KVDOThread *thread = ptr;
+  KVDO *kvdo = thread->kvdo;
+  KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo);
+  registerAllocatingThread(&thread->allocatingThread,
+                           &layer->allocationsAllowed);
+  setWorkQueuePrivateData(thread);
+}
+
+/**********************************************************************/
+static void finishKVDORequestQueue(void *ptr)
+{
+  unregisterAllocatingThread();
+}
+
+/**********************************************************************/
+static const KvdoWorkQueueType requestQueueType = {
+  .start = startKVDORequestQueue,
+  .finish = finishKVDORequestQueue,
+  .actionTable = {
+    { .name = "req_completion",
+      .code = REQ_Q_ACTION_COMPLETION,
+      .priority = 1 },
+    { .name = "req_flush",
+      .code = REQ_Q_ACTION_FLUSH,
+      .priority = 2 },
+    { .name = "req_map_bio",
+      .code = REQ_Q_ACTION_MAP_BIO,
+      .priority = 0 },
+    { .name = "req_sync",
+      .code = REQ_Q_ACTION_SYNC,
+      .priority = 2 },
+    { .name = "req_vio_callback",
+      .code = REQ_Q_ACTION_VIO_CALLBACK,
+      .priority = 1 },
+  },
+};
+
+/**********************************************************************/
+int initializeKVDO(KVDO *kvdo,
+                   const ThreadConfig *threadConfig,
+                   char **reason)
+{
+  unsigned int baseThreads = threadConfig->baseThreadCount;
+  int result = ALLOCATE(baseThreads, KVDOThread,
+                        "request processing work queue",
+                        &kvdo->threads);
+  if (result != VDO_SUCCESS) {
+    *reason = "Cannot allocate thread structures";
+    return result;
+  }
+  KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo);
+  for (kvdo->initializedThreadCount = 0;
+       kvdo->initializedThreadCount < baseThreads;
+       kvdo->initializedThreadCount++) {
+    KVDOThread *thread = &kvdo->threads[kvdo->initializedThreadCount];
+
+    thread->kvdo = kvdo;
+    thread->threadID = kvdo->initializedThreadCount;
+
+    char queueName[MAX_QUEUE_NAME_LEN];
+    // Copy only LEN - 1 bytes and ensure NULL termination.
+ getVDOThreadName(threadConfig, kvdo->initializedThreadCount, + queueName, sizeof(queueName)); + int result = makeWorkQueue(layer->threadNamePrefix, queueName, + &layer->wqDirectory, layer, thread, + &requestQueueType, 1, &thread->requestQueue); + if (result != VDO_SUCCESS) { + *reason = "Cannot initialize request queue"; + while (kvdo->initializedThreadCount > 0) { + unsigned int threadToDestroy = kvdo->initializedThreadCount - 1; + thread = &kvdo->threads[threadToDestroy]; + finishWorkQueue(thread->requestQueue); + freeWorkQueue(&thread->requestQueue); + kvdo->initializedThreadCount--; + } + FREE(kvdo->threads); + return result; + } + + } + return VDO_SUCCESS; +} + +/**********************************************************************/ +int preloadKVDO(KVDO *kvdo, + PhysicalLayer *common, + const VDOLoadConfig *loadConfig, + bool vioTraceRecording, + char **reason) +{ + KernelLayer *layer = asKernelLayer(common); + init_completion(&layer->callbackSync); + int result = prepareToLoadVDO(kvdo->vdo, loadConfig); + if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) { + *reason = "Cannot load metadata from device"; + return result; + } + + setVDOTracingFlags(kvdo->vdo, vioTraceRecording); + return VDO_SUCCESS; +} + +/**********************************************************************/ +int startKVDO(KVDO *kvdo, PhysicalLayer *common, char **reason) +{ + KernelLayer *layer = asKernelLayer(common); + init_completion(&layer->callbackSync); + int result = performVDOLoad(kvdo->vdo); + if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) { + *reason = "Cannot load metadata from device"; + return result; + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int suspendKVDO(KVDO *kvdo) +{ + if (kvdo->vdo == NULL) { + return VDO_SUCCESS; + } + + KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); + init_completion(&layer->callbackSync); + int result = performVDOSuspend(kvdo->vdo, !layer->noFlushSuspend); + if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) { + char errorName[80] = ""; + char errorMessage[ERRBUF_SIZE] = ""; + logError("%s: Suspend device failed %d (%s: %s)", + __func__, result, + stringErrorName(result, errorName, sizeof(errorName)), + stringError(result, errorMessage, sizeof(errorMessage))); + return result; + } + + // Convert VDO_READ_ONLY to VDO_SUCCESS since a read-only suspension still + // leaves the VDO suspended. 
+ return VDO_SUCCESS; +} + +/**********************************************************************/ +int resumeKVDO(KVDO *kvdo) +{ + if (kvdo->vdo == NULL) { + return VDO_SUCCESS; + } + + KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); + init_completion(&layer->callbackSync); + return performVDOResume(kvdo->vdo); +} + +/**********************************************************************/ +void finishKVDO(KVDO *kvdo) +{ + for (int i = 0; i < kvdo->initializedThreadCount; i++) { + finishWorkQueue(kvdo->threads[i].requestQueue); + } +} + +/**********************************************************************/ +void destroyKVDO(KVDO *kvdo) +{ + destroyVDO(kvdo->vdo); + for (int i = 0; i < kvdo->initializedThreadCount; i++) { + freeWorkQueue(&kvdo->threads[i].requestQueue); + } + FREE(kvdo->threads); + kvdo->threads = NULL; +} + + +/**********************************************************************/ +void dumpKVDOWorkQueue(KVDO *kvdo) +{ + for (int i = 0; i < kvdo->initializedThreadCount; i++) { + dumpWorkQueue(kvdo->threads[i].requestQueue); + } +} + +/**********************************************************************/ +typedef struct { + KvdoWorkItem workItem; + KVDO *kvdo; + void *data; + struct completion *completion; +} SyncQueueWork; + +/** + * Initiate an arbitrary asynchronous base-code operation and wait for + * it. + * + * An async queue operation is performed and we wait for completion. + * + * @param kvdo The kvdo data handle + * @param action The operation to perform + * @param data Unique data that can be used by the operation + * @param threadID The thread on which to perform the operation + * @param completion The completion to wait on + * + * @return VDO_SUCCESS of an error code + **/ +static void performKVDOOperation(KVDO *kvdo, + KvdoWorkFunction action, + void *data, + ThreadID threadID, + struct completion *completion) +{ + SyncQueueWork sync; + + memset(&sync, 0, sizeof(sync)); + setupWorkItem(&sync.workItem, action, NULL, REQ_Q_ACTION_SYNC); + sync.kvdo = kvdo; + sync.data = data; + sync.completion = completion; + + init_completion(completion); + enqueueKVDOWork(kvdo, &sync.workItem, threadID); + wait_for_completion(completion); +} + +/**********************************************************************/ +typedef struct { + bool enable; + bool wasEnabled; +} VDOCompressData; + +/** + * Does the work of calling the base code to set compress state, then + * tells the function waiting on completion to go ahead. 
+ * + * @param item The work item + **/ +static void setCompressingWork(KvdoWorkItem *item) +{ + SyncQueueWork *work = container_of(item, SyncQueueWork, workItem); + VDOCompressData *data = (VDOCompressData *)work->data; + data->wasEnabled = setVDOCompressing(getVDO(work->kvdo), data->enable); + complete(work->completion); +} + +/***********************************************************************/ +bool setKVDOCompressing(KVDO *kvdo, bool enableCompression) +{ + struct completion compressWait; + VDOCompressData data; + data.enable = enableCompression; + performKVDOOperation(kvdo, setCompressingWork, &data, + getPackerZoneThread(getThreadConfig(kvdo->vdo)), + &compressWait); + return data.wasEnabled; +} + +/**********************************************************************/ +typedef struct { + int result; +} VDOReadOnlyData; + +/**********************************************************************/ +static void enterReadOnlyModeWork(KvdoWorkItem *item) +{ + SyncQueueWork *work = container_of(item, SyncQueueWork, workItem); + VDOReadOnlyData *data = work->data; + makeVDOReadOnly(getVDO(work->kvdo), data->result); + complete(work->completion); +} + +/***********************************************************************/ +void setKVDOReadOnly(KVDO *kvdo, int result) +{ + struct completion readOnlyWait; + VDOReadOnlyData data; + data.result = result; + performKVDOOperation(kvdo, enterReadOnlyModeWork, &data, + getAdminThread(getThreadConfig(kvdo->vdo)), + &readOnlyWait); +} + +/** + * Does the work of calling the vdo statistics gathering tool + * + * @param item The work item + **/ +static void getVDOStatisticsWork(KvdoWorkItem *item) +{ + SyncQueueWork *work = container_of(item, SyncQueueWork, workItem); + VDOStatistics *stats = (VDOStatistics *)work->data; + getVDOStatistics(getVDO(work->kvdo), stats); + complete(work->completion); +} + +/***********************************************************************/ +void getKVDOStatistics(KVDO *kvdo, VDOStatistics *stats) +{ + struct completion statsWait; + memset(stats, 0, sizeof(VDOStatistics)); + performKVDOOperation(kvdo, getVDOStatisticsWork, stats, + getAdminThread(getThreadConfig(kvdo->vdo)), + &statsWait); +} + +/** + * A structure to invoke an arbitrary VDO action. + **/ +typedef struct vdoActionData { + VDOAction *action; + VDOCompletion *vdoCompletion; + struct completion waiter; +} VDOActionData; + +/** + * Initialize a VDOActionData structure so that the specified action + * can be invoked on the specified completion. + * + * @param data A VDOActionData. + * @param action The VDOAction to execute. + * @param vdoCompletion The VDO completion upon which the action acts. + **/ +static void initializeVDOActionData(VDOActionData *data, + VDOAction *action, + VDOCompletion *vdoCompletion) +{ + *data = (VDOActionData) { + .action = action, + .vdoCompletion = vdoCompletion, + }; +} + +/** + * The VDO callback that completes the KVDO completion. + * + * @param vdoCompletion The VDO completion which was acted upon. + **/ +static void finishVDOAction(VDOCompletion *vdoCompletion) +{ + SyncQueueWork *work = vdoCompletion->parent; + complete(work->completion); +} + +/** + * Perform a VDO base code action as specified by a VDOActionData. + * + * Sets the completion callback and parent inside the VDOActionData + * so that the corresponding kernel completion is completed when + * the VDO completion is. + * + * @param item A KVDO work queue item. 
+ **/ +static void performVDOActionWork(KvdoWorkItem *item) +{ + SyncQueueWork *work = container_of(item, SyncQueueWork, workItem); + VDOActionData *data = work->data; + ThreadID id = getPhysicalLayer()->getCurrentThreadID(); + + setCallbackWithParent(data->vdoCompletion, finishVDOAction, id, work); + data->action(data->vdoCompletion); +} + +/**********************************************************************/ +int performKVDOExtendedCommand(KVDO *kvdo, int argc, char **argv) +{ + VDOActionData data; + VDOCommandCompletion cmd; + + int result = initializeVDOCommandCompletion(&cmd, getVDO(kvdo), argc, argv); + if (result != VDO_SUCCESS) { + return result; + } + + initializeVDOActionData(&data, executeVDOExtendedCommand, &cmd.completion); + performKVDOOperation(kvdo, performVDOActionWork, &data, + getAdminThread(getThreadConfig(kvdo->vdo)), + &data.waiter); + + return destroyVDOCommandCompletion(&cmd); +} + +/**********************************************************************/ +void dumpKVDOStatus(KVDO *kvdo) +{ + dumpVDOStatus(kvdo->vdo); +} + +/**********************************************************************/ +bool getKVDOCompressing(KVDO *kvdo) +{ + return getVDOCompressing(kvdo->vdo); +} + +/**********************************************************************/ +int kvdoPrepareToGrowPhysical(KVDO *kvdo, BlockCount physicalCount) +{ + VDO *vdo = kvdo->vdo; + return prepareToGrowPhysical(vdo, physicalCount); +} + +/**********************************************************************/ +int kvdoResizePhysical(KVDO *kvdo, BlockCount physicalCount) +{ + KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); + init_completion(&layer->callbackSync); + int result = performGrowPhysical(kvdo->vdo, physicalCount); + if (result != VDO_SUCCESS) { + logError("resize operation failed, result = %d", result); + return result; + } + + return VDO_SUCCESS; +} + +/**********************************************************************/ +int kvdoPrepareToGrowLogical(KVDO *kvdo, BlockCount logicalCount) +{ + VDO *vdo = kvdo->vdo; + return prepareToGrowLogical(vdo, logicalCount); +} + +/**********************************************************************/ +int kvdoResizeLogical(KVDO *kvdo, BlockCount logicalCount) +{ + KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); + init_completion(&layer->callbackSync); + int result = performGrowLogical(kvdo->vdo, logicalCount); + if (result != VDO_SUCCESS) { + logError("grow logical operation failed, result = %d", result); + } + + return result; +} + +/**********************************************************************/ +WritePolicy getKVDOWritePolicy(KVDO *kvdo) +{ + return getWritePolicy(kvdo->vdo); +} + +/**********************************************************************/ +void enqueueKVDOThreadWork(KVDOThread *thread, + KvdoWorkItem *item) +{ + enqueueWorkQueue(thread->requestQueue, item); +} + +/**********************************************************************/ +void enqueueKVDOWork(KVDO *kvdo, KvdoWorkItem *item, ThreadID threadID) +{ + enqueueKVDOThreadWork(&kvdo->threads[threadID], item); +} + +/**********************************************************************/ +void enqueueKVIO(KVIO *kvio, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action) +{ + ThreadID threadID = vioAsCompletion(kvio->vio)->callbackThreadID; + BUG_ON(threadID >= kvio->layer->kvdo.initializedThreadCount); + launchKVIO(kvio, work, statsFunction, action, + kvio->layer->kvdo.threads[threadID].requestQueue); +} + 
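+/*
+ * The next two functions bridge base-code completions onto the KVDO work
+ * queues: kvdoEnqueue() packages an Enqueueable's completion as a work item
+ * and queues it on the request queue of the completion's callback thread;
+ * kvdoEnqueueWork() is the work function which then runs the completion's
+ * callback on that thread.
+ */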
+/**********************************************************************/ +static void kvdoEnqueueWork(KvdoWorkItem *workItem) +{ + KvdoEnqueueable *kvdoEnqueueable = container_of(workItem, + KvdoEnqueueable, + workItem); + runCallback(kvdoEnqueueable->enqueueable.completion); +} + +/**********************************************************************/ +void kvdoEnqueue(Enqueueable *enqueueable) +{ + KvdoEnqueueable *kvdoEnqueueable = container_of(enqueueable, + KvdoEnqueueable, + enqueueable); + KernelLayer *layer = asKernelLayer(enqueueable->completion->layer); + ThreadID threadID = enqueueable->completion->callbackThreadID; + if (ASSERT(threadID < layer->kvdo.initializedThreadCount, + "threadID %u (completion type %d) is less than thread count %u", + threadID, enqueueable->completion->type, + layer->kvdo.initializedThreadCount) != UDS_SUCCESS) { + BUG(); + } + + if (enqueueable->completion->type == VIO_COMPLETION) { + vioAddTraceRecord(asVIO(enqueueable->completion), + THIS_LOCATION("$F($cb)")); + } + setupWorkItem(&kvdoEnqueueable->workItem, kvdoEnqueueWork, + (KvdoWorkFunction) enqueueable->completion->callback, + REQ_Q_ACTION_COMPLETION); + enqueueKVDOThreadWork(&layer->kvdo.threads[threadID], + &kvdoEnqueueable->workItem); +} + +/**********************************************************************/ +ThreadID kvdoGetCurrentThreadID(void) +{ + KVDOThread *thread = getWorkQueuePrivateData(); + if (thread == NULL) { + return INVALID_THREAD_ID; + } + + ThreadID threadID = thread->threadID; + if (PARANOID_THREAD_CONSISTENCY_CHECKS) { + KVDO *kvdo = thread->kvdo; + KernelLayer *kernelLayer = asKernelLayer(getPhysicalLayer()); + BUG_ON(&kernelLayer->kvdo != kvdo); + BUG_ON(threadID >= kvdo->initializedThreadCount); + BUG_ON(thread != &kvdo->threads[threadID]); + } + return threadID; +} + +/**********************************************************************/ +static PhysicalLayer *getKernelPhysicalLayer(void) +{ + KVDOThread *thread = getWorkQueuePrivateData(); + if (thread == NULL) { + return NULL; + } + KVDO *kvdo = thread->kvdo; + KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); + return &layer->common; +} + +void initKernelVDOOnce(void) +{ + registerPhysicalLayerGetter(getKernelPhysicalLayer); +} diff --git a/source/vdo/kernel/kernelVDO.h b/source/vdo/kernel/kernelVDO.h new file mode 100644 index 0000000..b65534d --- /dev/null +++ b/source/vdo/kernel/kernelVDO.h @@ -0,0 +1,299 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelVDO.h#4 $ + */ + +#ifndef KERNEL_VDO_H +#define KERNEL_VDO_H + +#include "completion.h" +#include "kernelTypes.h" +#include "threadRegistry.h" +#include "workQueue.h" + +typedef struct { + KVDO *kvdo; + ThreadID threadID; + KvdoWorkQueue *requestQueue; + RegisteredThread allocatingThread; +} KVDOThread; + +struct kvdo { + KVDOThread *threads; + ThreadID initializedThreadCount; + KvdoWorkItem workItem; + VDOAction *action; + VDOCompletion *completion; + // Base-code device info + VDO *vdo; +}; + +typedef enum reqQAction { + REQ_Q_ACTION_COMPLETION, + REQ_Q_ACTION_FLUSH, + REQ_Q_ACTION_MAP_BIO, + REQ_Q_ACTION_SYNC, + REQ_Q_ACTION_VIO_CALLBACK +} ReqQAction; + +/** + * Initialize the base code interface. + * + * @param [in] kvdo The KVDO to be initialized + * @param [in] threadConfig The base-code thread configuration + * @param [out] reason The reason for failure + * + * @return VDO_SUCCESS or an error code + **/ +int initializeKVDO(KVDO *kvdo, + const ThreadConfig *threadConfig, + char **reason); + +/** + * Load the VDO state from disk but don't alter the on-disk state. This method + * is ultimately called from the constructor for devices which have not been + * resumed. + * + * @param [in] kvdo The KVDO to be started + * @param [in] common The physical layer pointer + * @param [in] loadConfig Load-time parameters for the VDO + * @param [in] vioTraceRecording Debug flag to store + * @param [out] reason The reason for failure + **/ +int preloadKVDO(KVDO *kvdo, + PhysicalLayer *common, + const VDOLoadConfig *loadConfig, + bool vioTraceRecording, + char **reason); + +/** + * Starts the base VDO instance associated with the kernel layer. This method + * is ultimately called from preresume the first time an instance is resumed. + * + * @param [in] kvdo The KVDO to be started + * @param [in] common The physical layer pointer + * @param [out] reason The reason for failure + * + * @return VDO_SUCCESS if started, otherwise error + */ +int startKVDO(KVDO *kvdo, PhysicalLayer *common, char **reason); + +/** + * Suspend the base VDO instance associated with the kernel layer. + * + * @param kvdo The KVDO to be suspended + * + * @return VDO_SUCCESS if stopped, otherwise error + **/ +int suspendKVDO(KVDO *kvdo); + +/** + * Resume the base VDO instance associated with the kernel layer. + * + * @param kvdo The KVDO to be resumed + * + * @return VDO_SUCCESS or an error + **/ +int resumeKVDO(KVDO *kvdo); + +/** + * Shut down the base code interface. The kvdo object must first be + * stopped. + * + * @param kvdo The KVDO to be shut down + **/ +void finishKVDO(KVDO *kvdo); + +/** + * Free up storage of the base code interface. The KVDO object must + * first have been "finished". + * + * @param kvdo The KVDO object to be destroyed + **/ +void destroyKVDO(KVDO *kvdo); + + +/** + * Dump to the kernel log any work-queue info associated with the base + * code. + * + * @param kvdo The KVDO object to be examined + **/ +void dumpKVDOWorkQueue(KVDO *kvdo); + +/** + * Get the VDO pointer for a kvdo object + * + * @param kvdo The KVDO object + * + * @return the VDO pointer + */ +static inline VDO *getVDO(KVDO *kvdo) +{ + return kvdo->vdo; +} + +/** + * Set whether compression is enabled. 
+ * + * @param kvdo The KVDO object + * @param enableCompression The new compression mode + * + * @return state of compression before new value is set + **/ +bool setKVDOCompressing(KVDO *kvdo, bool enableCompression); + +/** + * Get the current compression mode + * + * @param kvdo The KVDO object to be queried + * + * @return whether compression is currently enabled + */ +bool getKVDOCompressing(KVDO *kvdo); + +/** + * Gets the latest statistics gathered by the base code. + * + * @param kvdo the KVDO object + * @param stats the statistics struct to fill in + */ +void getKVDOStatistics(KVDO *kvdo, VDOStatistics *stats); + +/** + * Get the current write policy + * + * @param kvdo The KVDO to be queried + * + * @return the write policy in effect + */ +WritePolicy getKVDOWritePolicy(KVDO *kvdo); + +/** + * Dump base code status information to the kernel log for debugging. + * + * @param kvdo The KVDO to be examined + */ +void dumpKVDOStatus(KVDO *kvdo); + +/** + * Request the base code prepare to grow the physical space. + * + * @param kvdo The KVDO to be updated + * @param physicalCount The new size + * + * @return VDO_SUCCESS or error + */ +int kvdoPrepareToGrowPhysical(KVDO *kvdo, BlockCount physicalCount); + +/** + * Notify the base code of resized physical storage. + * + * @param kvdo The KVDO to be updated + * @param physicalCount The new size + * + * @return VDO_SUCCESS or error + */ +int kvdoResizePhysical(KVDO *kvdo, BlockCount physicalCount); + +/** + * Request the base code prepare to grow the logical space. + * + * @param kvdo The KVDO to be updated + * @param logicalCount The new size + * + * @return VDO_SUCCESS or error + */ +int kvdoPrepareToGrowLogical(KVDO *kvdo, BlockCount logicalCount); + +/** + * Request the base code grow the logical space. + * + * @param kvdo The KVDO to be updated + * @param logicalCount The new size + * + * @return VDO_SUCCESS or error + */ +int kvdoResizeLogical(KVDO *kvdo, BlockCount logicalCount); + +/** + * Request the base code go read-only. + * + * @param kvdo The KVDO to be updated + * @param result The error code causing the read only + */ +void setKVDOReadOnly(KVDO *kvdo, int result); + +/** + * Perform an extended base-code command + * + * @param kvdo The KVDO upon which to perform the operation. + * @param argc The number of arguments to the command. + * @param argv The command arguments. Note that all extended + * command argv[0] strings start with "x-". + * + * @return VDO_SUCCESS or an error code + **/ +int performKVDOExtendedCommand(KVDO *kvdo, int argc, char **argv); + +/** + * Enqueue a work item to be processed in the base code context. + * + * @param kvdo The KVDO object in which to run the work item + * @param item The work item to be run + * @param threadID The thread on which to run the work item + **/ +void enqueueKVDOWork(KVDO *kvdo, KvdoWorkItem *item, ThreadID threadID); + +/** + * Set up and enqueue a VIO's work item to be processed in the base code + * context. + * + * @param kvio The VIO with the work item to be run + * @param work The function pointer to execute + * @param statsFunction A function pointer to record for stats, or NULL + * @param action Action code, mapping to a relative priority + **/ +void enqueueKVIO(KVIO *kvio, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action); + +/** + * Enqueue an arbitrary completion for execution on its indicated + * thread. 
+ * + * @param enqueueable The Enqueueable object containing the completion pointer + **/ +void kvdoEnqueue(Enqueueable *enqueueable); + +/** + * Get the base-code thread index for the current execution context. + * + * @return The thread ID, or (ThreadID)-1 if the current thread is + * not a base-code thread, or in an interrupt context. + **/ +ThreadID kvdoGetCurrentThreadID(void); + +/** + * Do one-time initialization of kernelVDO interface. + **/ +void initKernelVDOOnce(void); + +#endif // KERNEL_VDO_H diff --git a/source/vdo/kernel/kernelVDOInternals.h b/source/vdo/kernel/kernelVDOInternals.h new file mode 100644 index 0000000..aefe05a --- /dev/null +++ b/source/vdo/kernel/kernelVDOInternals.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelVDOInternals.h#1 $ + */ + +#ifndef KERNEL_VDO_INTERNALS_H +#define KERNEL_VDO_INTERNALS_H + +#include "kernelVDO.h" + +/** + * Enqueue a work item to be performed in the base code in a + * particular thread. + * + * @param thread The KVDO thread on which to run the work item + * @param item The work item to be run + **/ +void enqueueKVDOThreadWork(KVDOThread *thread, KvdoWorkItem *item); + +#endif // KERNEL_VDO_INTERNALS_H diff --git a/source/vdo/kernel/ktrace.c b/source/vdo/kernel/ktrace.c new file mode 100644 index 0000000..ebc654a --- /dev/null +++ b/source/vdo/kernel/ktrace.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/ktrace.c#2 $ + */ + +#include "ktrace.h" + +#include "memoryAlloc.h" + +#include "dataVIO.h" + +#include "kvio.h" +#include "logger.h" + +enum { + // How much data from a trace can we log in one call without messing + // up the log or losing data? 
+ TRACE_LOG_MAX = 820, + + // What fraction (1 out of TRACE_SAMPLE_INTERVAL VIOs) to trace + TRACE_SAMPLE_INTERVAL = 3, +}; + +bool traceRecording = false; + +static struct { + char buffer[2000]; + unsigned int counter; + struct mutex lock; +} traceLoggingState; + +/** + * Initialize a SampleCounter structure with the given sampling interval. + * + * @param counter The counter to initialize + * @param interval The desired sampling interval + **/ +static void initializeSampleCounter(SampleCounter *counter, + unsigned int interval) +{ + spin_lock_init(&counter->lock); + counter->tick = 0; + counter->interval = interval; +} + +/*************************************************************************/ +bool sampleThisOne(SampleCounter *counter) +{ + bool wantTracing = false; + spin_lock(&counter->lock); + counter->tick++; + if (counter->tick >= counter->interval) { + counter->tick = 0; + wantTracing = true; + } + spin_unlock(&counter->lock); + return wantTracing; +} + +/*************************************************************************/ +static void freeTraceDataBuffer(void *poolData, void *data) +{ + Trace *trace = (Trace *) data; + FREE(trace); +} + +/*************************************************************************/ +static int allocTraceDataBuffer(void *poolData, void **dataPtr) +{ + Trace *trace; + int result = ALLOCATE(1, Trace, __func__, &trace); + if (result != VDO_SUCCESS) { + logError("trace data allocation failure %d", result); + return result; + } + + *dataPtr = trace; + return VDO_SUCCESS; +} + +/*************************************************************************/ +int allocTraceFromPool(KernelLayer *layer, Trace **tracePointer) +{ + int result = allocBufferFromPool(layer->traceBufferPool, + (void **) tracePointer); + if (result == VDO_SUCCESS) { + (*tracePointer)->used = 0; + } + return result; +} + +/*************************************************************************/ +void freeTraceToPool(KernelLayer *layer, Trace *trace) +{ + freeBufferToPool(layer->traceBufferPool, trace); +} + +/*************************************************************************/ +int traceKernelLayerInit(KernelLayer *layer) +{ + layer->vioTraceRecording = traceRecording; + initializeSampleCounter(&layer->traceSampleCounter, TRACE_SAMPLE_INTERVAL); + unsigned int traceRecordsNeeded = 0; + if (layer->vioTraceRecording) { + traceRecordsNeeded += layer->requestLimiter.limit; + } + if (traceRecordsNeeded > 0) { + return makeBufferPool("KVDO Trace Data Pool", traceRecordsNeeded, + allocTraceDataBuffer, freeTraceDataBuffer, NULL, + layer, &layer->traceBufferPool); + } + return VDO_SUCCESS; +} + +/*************************************************************************/ +void initializeTraceLoggingOnce(void) +{ + mutex_init(&traceLoggingState.lock); +} + +/*************************************************************************/ +void logKvioTrace(KVIO *kvio) +{ + KernelLayer *layer = kvio->layer; + + mutex_lock(&traceLoggingState.lock); + traceLoggingState.counter++; + // Log about 0.1% to avoid spewing data faster than syslog can keep up + // (on certain of Permabit's test machines). + // Yes, the 37 is arbitrary and meaningless. 
+ + if (layer->traceLogging && ((traceLoggingState.counter % 1024) == 37)) { + kvioAddTraceRecord(kvio, THIS_LOCATION(NULL)); + size_t traceLen = 0; + formatTrace(kvio->vio->trace, traceLoggingState.buffer, + sizeof(traceLoggingState.buffer), &traceLen); + + if (isMetadata(kvio)) { + logInfo("finishing kvio %s meta @%" PRIptr " %s", + (isWriteVIO(kvio->vio) ? "read" : "write"), + kvio, traceLoggingState.buffer); + } else if (isCompressedWriter(kvio)) { + logInfo("finishing kvio write comp @%" PRIptr " %s", + kvio, traceLoggingState.buffer); + } else { + const char *dupeLabel = ""; + if (isWriteVIO(kvio->vio)) { + DataVIO *dataVIO = vioAsDataVIO(kvio->vio); + if (isTrimDataVIO(dataVIO)) { + dupeLabel = "trim "; + } else if (dataVIO->isZeroBlock) { + dupeLabel = "zero "; + } else if (dataVIO->isDuplicate) { + dupeLabel = "dupe "; + } else { + dupeLabel = "new "; + } + } + + logInfo("finishing kvio %s data %s@%" PRIptr " %.*s", + (isWriteVIO(kvio->vio) ? "read" : "write"), + dupeLabel, kvio, TRACE_LOG_MAX, traceLoggingState.buffer); + char *buf = traceLoggingState.buffer; + while (traceLen > TRACE_LOG_MAX) { + traceLen -= TRACE_LOG_MAX; + buf += TRACE_LOG_MAX; + logInfo("more kvio %" PRIptr " path: %.*s", kvio, TRACE_LOG_MAX, buf); + } + } + } + + mutex_unlock(&traceLoggingState.lock); +} diff --git a/source/vdo/kernel/ktrace.h b/source/vdo/kernel/ktrace.h new file mode 100644 index 0000000..99cda7a --- /dev/null +++ b/source/vdo/kernel/ktrace.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/ktrace.h#1 $ + */ + +#ifndef KTRACE_H +#define KTRACE_H + +#include + +#include "common.h" +#include "trace.h" + +struct kernelLayer; +struct kvio; + +// Implement event sampling once per N. +typedef struct { + unsigned int interval; + unsigned int tick; + spinlock_t lock; +} SampleCounter; + +/** + * Flag indicating whether newly created VDO devices should record trace info. + **/ +extern bool traceRecording; + +/** + * Updates the counter state and returns true once each time the + * sampling interval is reached. + * + * @param counter The sampling counter info + * + * @return whether to do sampling on this invocation + **/ +bool sampleThisOne(SampleCounter *counter); + +/** + * Initialize trace data in the KernelLayer + * + * @param layer The KernelLayer + * + * @return VDO_SUCCESS, or an error code + **/ +int traceKernelLayerInit(struct kernelLayer *layer); + +/** + * Initialize the mutex used when logging latency tracing data. 
+ **/ +void initializeTraceLoggingOnce(void); + +/** + * Allocate a trace buffer + * + * @param layer The KernelLayer + * @param tracePointer The trace buffer is returned here + * + * @return VDO_SUCCESS or an error code + **/ +int allocTraceFromPool(struct kernelLayer *layer, Trace **tracePointer); + +/** + * Free a trace buffer + * + * @param layer The KernelLayer + * @param trace The trace buffer + **/ +void freeTraceToPool(struct kernelLayer *layer, Trace *trace); + +/** + * Log the trace at kvio freeing time + * + * @param kvio The kvio structure + **/ +void logKvioTrace(struct kvio *kvio); + +#endif /* KTRACE_H */ diff --git a/source/vdo/kernel/kvdoFlush.c b/source/vdo/kernel/kvdoFlush.c new file mode 100644 index 0000000..7b38af1 --- /dev/null +++ b/source/vdo/kernel/kvdoFlush.c @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kvdoFlush.c#6 $ + */ + +#include "kvdoFlush.h" + +#include "logger.h" +#include "memoryAlloc.h" + +#include "threadConfig.h" + +#include "bio.h" +#include "ioSubmitter.h" + +/** + * A specific (concrete) encapsulation of flush requests. + * + *
We attempt to allocate a KVDOFlush object for each incoming flush bio. In
+ * case the allocation fails, a spare object is pre-allocated by, and stored
+ * in, the kernel layer. The first time an allocation fails, the spare is
+ * used. If another allocation fails while the spare is in use, the bio is
+ * simply queued for later processing.
+ *
+ *
When a KVDOFlush is complete, it will either be freed, immediately + * re-used for queued flushes, or stashed in the kernel layer as the new spare + * object. This ensures that we will always make forward progress. + **/ +struct kvdoFlush { + KvdoWorkItem workItem; + KernelLayer *layer; + struct bio_list bios; + Jiffies arrivalTime; // Time when earliest bio appeared + VDOFlush vdoFlush; +}; + +/**********************************************************************/ +int makeKVDOFlush(KVDOFlush **flushPtr) +{ + return ALLOCATE(1, KVDOFlush, __func__, flushPtr); +} + +/**********************************************************************/ +bool shouldProcessFlush(KernelLayer *layer) +{ + return (getKVDOWritePolicy(&layer->kvdo) != WRITE_POLICY_SYNC); +} + +/** + * Function call to handle an empty flush request from the request queue. + * + * @param item The work item representing the flush request + **/ +static void kvdoFlushWork(KvdoWorkItem *item) +{ + KVDOFlush *kvdoFlush = container_of(item, KVDOFlush, workItem); + flush(kvdoFlush->layer->kvdo.vdo, &kvdoFlush->vdoFlush); +} + +/** + * Initialize a KVDOFlush object, transferring all the bios in the kernel + * layer's waitingFlushes list to it. The caller MUST already hold the layer's + * flushLock. + * + * @param kvdoFlush The flush to initialize + * @param layer The kernel layer on which the flushLock is held + **/ +static void initializeKVDOFlush(KVDOFlush *kvdoFlush, KernelLayer *layer) +{ + kvdoFlush->layer = layer; + bio_list_init(&kvdoFlush->bios); + bio_list_merge(&kvdoFlush->bios, &layer->waitingFlushes); + bio_list_init(&layer->waitingFlushes); + kvdoFlush->arrivalTime = layer->flushArrivalTime; +} + +/**********************************************************************/ +static void enqueueKVDOFlush(KVDOFlush *kvdoFlush) +{ + setupWorkItem(&kvdoFlush->workItem, kvdoFlushWork, NULL, REQ_Q_ACTION_FLUSH); + KVDO *kvdo = &kvdoFlush->layer->kvdo; + enqueueKVDOWork(kvdo, &kvdoFlush->workItem, + getPackerZoneThread(getThreadConfig(kvdo->vdo))); +} + +/**********************************************************************/ +void launchKVDOFlush(KernelLayer *layer, BIO *bio) +{ + // Try to allocate a KVDOFlush to represent the flush request. If the + // allocation fails, we'll deal with it later. + KVDOFlush *kvdoFlush = ALLOCATE_NOWAIT(KVDOFlush, __func__); + + spin_lock(&layer->flushLock); + + // We have a new bio to start. Add it to the list. If it becomes the + // only entry on the list, record the time. + if (bio_list_empty(&layer->waitingFlushes)) { + layer->flushArrivalTime = jiffies; + } + bio_list_add(&layer->waitingFlushes, bio); + + if (kvdoFlush == NULL) { + // The KVDOFlush allocation failed. Try to use the spare KVDOFlush object. + if (layer->spareKVDOFlush == NULL) { + // The spare is already in use. This bio is on waitingFlushes and it + // will be handled by a flush completion or by a bio that can allocate. + spin_unlock(&layer->flushLock); + return; + } + + // Take and use the spare KVDOFlush object. + kvdoFlush = layer->spareKVDOFlush; + layer->spareKVDOFlush = NULL; + } + + // We have flushes to start. Capture them in the KVDOFlush object. + initializeKVDOFlush(kvdoFlush, layer); + + spin_unlock(&layer->flushLock); + + // Finish launching the flushes. + enqueueKVDOFlush(kvdoFlush); +} + +/** + * Release a KVDOFlush object that has completed its work. If there are any + * pending flush requests whose KVDOFlush allocation failed, they will be + * launched by immediately re-using the released KVDOFlush. 
If there is no + * spare KVDOFlush, the released object will become the spare. Otherwise, the + * KVDOFlush will be freed. + * + * @param kvdoFlush The completed flush object to re-use or free + **/ +static void releaseKVDOFlush(KVDOFlush *kvdoFlush) +{ + KernelLayer *layer = kvdoFlush->layer; + bool relaunchFlush = false; + bool freeFlush = false; + + spin_lock(&layer->flushLock); + if (bio_list_empty(&layer->waitingFlushes)) { + // Nothing needs to be started. Save one spare KVDOFlush object. + if (layer->spareKVDOFlush == NULL) { + // Make the new spare all zero, just like a newly allocated one. + memset(kvdoFlush, 0, sizeof(*kvdoFlush)); + layer->spareKVDOFlush = kvdoFlush; + } else { + freeFlush = true; + } + } else { + // We have flushes to start. Capture them in the KVDOFlush object. + initializeKVDOFlush(kvdoFlush, layer); + relaunchFlush = true; + } + spin_unlock(&layer->flushLock); + + if (relaunchFlush) { + // Finish launching the flushes. + enqueueKVDOFlush(kvdoFlush); + } else if (freeFlush) { + FREE(kvdoFlush); + } +} + +/** + * Function called to complete and free a flush request + * + * @param item The flush-request work item + **/ +static void kvdoCompleteFlushWork(KvdoWorkItem *item) +{ + KVDOFlush *kvdoFlush = container_of(item, KVDOFlush, workItem); + KernelLayer *layer = kvdoFlush->layer; + + BIO *bio; + while ((bio = bio_list_pop(&kvdoFlush->bios)) != NULL) { + // We're not acknowledging this bio now, but we'll never touch it + // again, so this is the last chance to account for it. + countBios(&layer->biosAcknowledged, bio); + + // Make sure the bio is a empty flush bio. + prepareFlushBIO(bio, bio->bi_private, getKernelLayerBdev(layer), + bio->bi_end_io); + atomic64_inc(&layer->flushOut); + generic_make_request(bio); + } + + + // Release the KVDOFlush object, freeing it, re-using it as the spare, or + // using it to launch any flushes that had to wait when allocations failed. + releaseKVDOFlush(kvdoFlush); +} + +/**********************************************************************/ +void kvdoCompleteFlush(VDOFlush **kfp) +{ + if (*kfp != NULL) { + KVDOFlush *kvdoFlush = container_of(*kfp, KVDOFlush, vdoFlush); + setupWorkItem(&kvdoFlush->workItem, kvdoCompleteFlushWork, NULL, + BIO_Q_ACTION_FLUSH); + enqueueBioWorkItem(kvdoFlush->layer->ioSubmitter, + &kvdoFlush->workItem); + *kfp = NULL; + } +} + +/**********************************************************************/ +int synchronousFlush(KernelLayer *layer) +{ + BIO bio; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) + bio_init(&bio, 0, 0); +#else + bio_init(&bio); +#endif + int result = 0; + + prepareFlushBIO(&bio, layer, getKernelLayerBdev(layer), NULL); + result = submitBioAndWait(&bio); + atomic64_inc(&layer->flushOut); + if (result != 0) { + logErrorWithStringError(result, "synchronous flush failed"); + result = -EIO; + } + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,12,0) + bio_uninit(&bio); +#endif + return result; +} diff --git a/source/vdo/kernel/kvdoFlush.h b/source/vdo/kernel/kvdoFlush.h new file mode 100644 index 0000000..2d90953 --- /dev/null +++ b/source/vdo/kernel/kvdoFlush.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
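
/*
 * To make the embedding used by kvdoCompleteFlush() above concrete: the base
 * code only ever holds the VDOFlush that is a field of struct kvdoFlush, and
 * container_of() maps that pointer back to the wrapper when the flush is
 * handed back for completion.  A minimal sketch of the recovery, assuming
 * only the struct layout shown in kvdoFlush.c; the function name is
 * illustrative.
 */
static inline KVDOFlush *exampleFlushFromVDOFlush(VDOFlush *vdoFlush)
{
  // vdoFlush points at the embedded field, so container_of() recovers the
  // enclosing KVDOFlush, exactly as kvdoCompleteFlush() does above.
  return container_of(vdoFlush, KVDOFlush, vdoFlush);
}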
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kvdoFlush.h#1 $ + */ + +#ifndef KVDO_FLUSH_H +#define KVDO_FLUSH_H + +#include "flush.h" + +#include "kernelLayer.h" + +/** + * Create a KVDOFlush. + * + * @param flushPtr A pointer to hold the new flush + **/ +int makeKVDOFlush(KVDOFlush **flushPtr); + +/** + * Answer the question as to whether VDO should be processing REQ_FLUSH + * requests or not. + * + * @param layer The layer + * + * @return true if VDO should process empty flush requests, or false if + * they should just be forwarded to our storage device. + **/ +bool shouldProcessFlush(KernelLayer *layer); + +/** + * Function called to start processing a flush request. It is called when we + * receive an empty flush bio from the block layer, and before acknowledging a + * non-empty bio with the FUA flag set. + * + * @param layer The physical layer + * @param bio The bio containing an empty flush request + **/ +void launchKVDOFlush(KernelLayer *layer, BIO *bio); + +/** + * Function called from base VDO to complete and free a flush request. + * + * @param kfp Pointer to the flush request + **/ +void kvdoCompleteFlush(VDOFlush **kfp); + +/** + * Issue a flush request and wait for it to complete. + * + * @param layer The kernel layer + * + * @return VDO_SUCCESS or an error + */ +int synchronousFlush(KernelLayer *layer); + +#endif /* KVDO_FLUSH_H */ diff --git a/source/vdo/kernel/kvio.c b/source/vdo/kernel/kvio.c new file mode 100644 index 0000000..336f86e --- /dev/null +++ b/source/vdo/kernel/kvio.c @@ -0,0 +1,415 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
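
/*
 * A condensed sketch of how the entry points declared in kvdoFlush.h above
 * fit into a bio-handling path: an empty flush bio is either handed to
 * launchKVDOFlush() for VDO to track, or forwarded to the backing store when
 * the write policy makes flushes unnecessary.  The function name below is
 * illustrative, and the forwarding branch simply mirrors the call shape of
 * kvdoCompleteFlushWork(); the real dispatch lives in the device-mapper
 * entry points, not in this file.
 */
static void exampleHandleEmptyFlush(KernelLayer *layer, BIO *bio)
{
  if (shouldProcessFlush(layer)) {
    // VDO processes the flush itself; the bio is acknowledged later, once
    // the corresponding KVDOFlush completes.
    launchKVDOFlush(layer, bio);
  } else {
    // Forward the empty flush straight to our storage device.
    prepareFlushBIO(bio, bio->bi_private, getKernelLayerBdev(layer),
                    bio->bi_end_io);
    generic_make_request(bio);
  }
}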
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kvio.c#7 $ + */ + +#include "kvio.h" + + +#include "logger.h" +#include "memoryAlloc.h" + +#include "numUtils.h" +#include "vdo.h" +#include "waitQueue.h" + +#include "bio.h" +#include "ioSubmitter.h" +#include "kvdoFlush.h" + +/** + * A function to tell vdo that we have completed the requested async + * operation for a vio + * + * @param item The work item of the VIO to complete + **/ +static void kvdoHandleVIOCallback(KvdoWorkItem *item) +{ + KVIO *kvio = workItemAsKVIO(item); + runCallback(vioAsCompletion(kvio->vio)); +} + +/**********************************************************************/ +void kvdoEnqueueVIOCallback(KVIO *kvio) +{ + enqueueKVIO(kvio, kvdoHandleVIOCallback, + (KvdoWorkFunction) vioAsCompletion(kvio->vio)->callback, + REQ_Q_ACTION_VIO_CALLBACK); +} + +/**********************************************************************/ +void kvdoContinueKvio(KVIO *kvio, int error) +{ + if (unlikely(error != VDO_SUCCESS)) { + setCompletionResult(vioAsCompletion(kvio->vio), error); + } + kvdoEnqueueVIOCallback(kvio); +} + +/**********************************************************************/ +// noinline ensures systemtap can hook in here +static noinline void maybeLogKvioTrace(KVIO *kvio) +{ + if (kvio->layer->traceLogging) { + logKvioTrace(kvio); + } +} + +/**********************************************************************/ +static void freeKVIO(KVIO **kvioPtr) +{ + KVIO *kvio = *kvioPtr; + if (kvio == NULL) { + return; + } + + if (unlikely(kvio->vio->trace != NULL)) { + maybeLogKvioTrace(kvio); + FREE(kvio->vio->trace); + } + + freeBio(kvio->bio, kvio->layer); + FREE(kvio); + *kvioPtr = NULL; +} + +/**********************************************************************/ +void freeMetadataKVIO(MetadataKVIO **metadataKVIOPtr) +{ + freeKVIO((KVIO **) metadataKVIOPtr); +} + +/**********************************************************************/ +void freeCompressedWriteKVIO(CompressedWriteKVIO **compressedWriteKVIOPtr) +{ + freeKVIO((KVIO **) compressedWriteKVIOPtr); +} + +/**********************************************************************/ +void kvdoWriteCompressedBlock(AllocatingVIO *allocatingVIO) +{ + // This method assumes that compressed writes never set the flush or FUA + // bits. + CompressedWriteKVIO *compressedWriteKVIO + = allocatingVIOAsCompressedWriteKVIO(allocatingVIO); + KVIO *kvio = compressedWriteKVIOAsKVIO(compressedWriteKVIO); + BIO *bio = kvio->bio; + resetBio(bio, kvio->layer); + setBioOperationWrite(bio); + setBioSector(bio, blockToSector(kvio->layer, kvio->vio->physical)); + submitBio(bio, BIO_Q_ACTION_COMPRESSED_DATA); +} + +/** + * Get the BioQueue action for a metadata VIO based on that VIO's priority. + * + * @param vio The VIO + * + * @return The action with which to submit the VIO's BIO. + **/ +static inline BioQAction getMetadataAction(VIO *vio) +{ + return ((vio->priority == VIO_PRIORITY_HIGH) + ? BIO_Q_ACTION_HIGH : BIO_Q_ACTION_METADATA); +} + +/**********************************************************************/ +void kvdoSubmitMetadataVIO(VIO *vio) +{ + KVIO *kvio = metadataKVIOAsKVIO(vioAsMetadataKVIO(vio)); + BIO *bio = kvio->bio; + resetBio(bio, kvio->layer); + + setBioSector(bio, blockToSector(kvio->layer, vio->physical)); + + // Metadata I/Os bypass the read cache. 
+  if (isReadVIO(vio)) {
+    ASSERT_LOG_ONLY(!vioRequiresFlushBefore(vio),
+                    "read VIO does not require flush before");
+    vioAddTraceRecord(vio, THIS_LOCATION("$F;io=readMeta"));
+    setBioOperationRead(bio);
+  } else {
+    KernelLayerState state = getKernelLayerState(kvio->layer);
+    ASSERT_LOG_ONLY(((state == LAYER_RUNNING)
+                     || (state == LAYER_RESUMING)
+                     || (state == LAYER_STARTING)),
+                    "write metadata in allowed state %d", state);
+    if (vioRequiresFlushBefore(vio)) {
+      setBioOperationWrite(bio);
+      setBioOperationFlagPreflush(bio);
+      vioAddTraceRecord(vio, THIS_LOCATION("$F;io=flushWriteMeta"));
+    } else {
+      setBioOperationWrite(bio);
+      vioAddTraceRecord(vio, THIS_LOCATION("$F;io=writeMeta"));
+    }
+  }
+
+  if (vioRequiresFlushAfter(vio)) {
+    setBioOperationFlagFua(bio);
+  }
+  submitBio(bio, getMetadataAction(vio));
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0)
+/**
+ * Handle the completion of a base-code initiated flush by continuing the flush
+ * VIO.
+ *
+ * @param bio    The bio to complete
+ **/
+static void completeFlushBio(BIO *bio)
+#else
+/**
+ * Handle the completion of a base-code initiated flush by continuing the flush
+ * VIO.
+ *
+ * @param bio    The bio to complete
+ * @param error  Possible error from underlying block device
+ **/
+static void completeFlushBio(BIO *bio, int error)
+#endif
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0)
+  int error = getBioResult(bio);
+#endif
+  KVIO *kvio = (KVIO *) bio->bi_private;
+  // XXX This assumes a VDO-created bio around a buffer contains exactly 1
+  // page, which we believe is true, but do not assert.
+  bio->bi_vcnt = 1;
+  // Restore the bio's notion of its own data.
+  resetBio(bio, kvio->layer);
+  kvdoContinueKvio(kvio, error);
+}
+
+/**********************************************************************/
+void kvdoFlushVIO(VIO *vio)
+{
+  KVIO        *kvio  = metadataKVIOAsKVIO(vioAsMetadataKVIO(vio));
+  BIO         *bio   = kvio->bio;
+  KernelLayer *layer = kvio->layer;
+  resetBio(bio, layer);
+  prepareFlushBIO(bio, kvio, getKernelLayerBdev(layer), completeFlushBio);
+  submitBio(bio, getMetadataAction(vio));
+}
+
+/*
+ * Hook for a SystemTap probe to potentially restrict the choices
+ * of which VIOs should have their latencies tracked.
+ *
+ * Normally returns true. Even if true is returned, sampleThisOne may
+ * cut down the monitored VIOs by some fraction so as to reduce the
+ * impact on system performance.
+ *
+ * Must be "noinline" so that SystemTap can find the return
+ * instruction and modify the return value.
+ *
+ * @param kvio   The KVIO being initialized
+ * @param layer  The kernel layer
+ * @param bio    The incoming I/O request
+ *
+ * @return whether it's useful to track latency for VIOs looking like
+ *         this one
+ */
+static noinline bool
+sampleThisVIO(KVIO *kvio, KernelLayer *layer, BIO *bio)
+{
+  bool result = true;
+  // Ensure the arguments and result exist at the same time, for SystemTap.
+  __asm__ __volatile__(""
+                       : "=g" (result)
+                       : "0" (result),
+                         "g" (kvio),
+                         "g" (layer),
+                         "g" (bio)
+                       : "memory");
+  return result;
+}
+
+/**********************************************************************/
+void initializeKVIO(KVIO        *kvio,
+                    KernelLayer *layer,
+                    VIOType      vioType,
+                    VIOPriority  priority,
+                    void        *parent,
+                    BIO         *bio)
+{
+  if (layer->vioTraceRecording
+      && sampleThisVIO(kvio, layer, bio)
+      && sampleThisOne(&layer->traceSampleCounter)) {
+    int result = (isDataVIOType(vioType)
+                  ? 
allocTraceFromPool(layer, &kvio->vio->trace) + : ALLOCATE(1, Trace, "trace", &kvio->vio->trace)); + if (result != VDO_SUCCESS) { + logError("trace record allocation failure %d", result); + } + } + + kvio->bio = bio; + kvio->layer = layer; + if (bio != NULL) { + bio->bi_private = kvio; + } + + initializeVIO(kvio->vio, vioType, priority, parent, getVDO(&layer->kvdo), + &layer->common); + + // XXX: The "init" label should be replaced depending on the + // write/read/flush path followed. + kvioAddTraceRecord(kvio, THIS_LOCATION("$F;io=?init;j=normal")); + + VDOCompletion *completion = vioAsCompletion(kvio->vio); + kvio->enqueueable.enqueueable.completion = completion; + completion->enqueueable = &kvio->enqueueable.enqueueable; +} + +/** + * Construct a metadata KVIO. + * + * @param [in] layer The physical layer + * @param [in] vioType The type of VIO to create + * @param [in] priority The relative priority to assign to the + * MetadataKVIO + * @param [in] parent The parent of the MetadataKVIO completion + * @param [in] bio The bio to associate with this MetadataKVIO + * @param [out] metadataKVIOPtr A pointer to hold the new MetadataKVIO + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int makeMetadataKVIO(KernelLayer *layer, + VIOType vioType, + VIOPriority priority, + void *parent, + BIO *bio, + MetadataKVIO **metadataKVIOPtr) +{ + // If MetadataKVIO grows past 256 bytes, we'll lose benefits of VDOSTORY-176. + STATIC_ASSERT(sizeof(MetadataKVIO) <= 256); + + // Metadata VIOs should use direct allocation and not use the buffer pool, + // which is reserved for submissions from the linux block layer. + MetadataKVIO *metadataKVIO; + int result = ALLOCATE(1, MetadataKVIO, __func__, &metadataKVIO); + if (result != VDO_SUCCESS) { + logError("metadata KVIO allocation failure %d", result); + return result; + } + + KVIO *kvio = &metadataKVIO->kvio; + kvio->vio = &metadataKVIO->vio; + initializeKVIO(kvio, layer, vioType, priority, parent, bio); + *metadataKVIOPtr = metadataKVIO; + return VDO_SUCCESS; +} + +/** + * Construct a CompressedWriteKVIO. + * + * @param [in] layer The physical layer + * @param [in] parent The parent of the CompressedWriteKVIO + * completion + * @param [in] bio The bio to associate with this + * CompressedWriteKVIO + * @param [out] compressedWriteKVIOPtr A pointer to hold the new + * CompressedWriteKVIO + * + * @return VDO_SUCCESS or an error + **/ +__attribute__((warn_unused_result)) +static int +makeCompressedWriteKVIO(KernelLayer *layer, + void *parent, + BIO *bio, + CompressedWriteKVIO **compressedWriteKVIOPtr) +{ + // Compressed write VIOs should use direct allocation and not use the buffer + // pool, which is reserved for submissions from the linux block layer. 
+ CompressedWriteKVIO *compressedWriteKVIO; + int result = ALLOCATE(1, CompressedWriteKVIO, __func__, + &compressedWriteKVIO); + if (result != VDO_SUCCESS) { + logError("compressed write KVIO allocation failure %d", result); + return result; + } + + KVIO *kvio = &compressedWriteKVIO->kvio; + kvio->vio = allocatingVIOAsVIO(&compressedWriteKVIO->allocatingVIO); + initializeKVIO(kvio, layer, VIO_TYPE_COMPRESSED_BLOCK, + VIO_PRIORITY_COMPRESSED_DATA, parent, bio); + *compressedWriteKVIOPtr = compressedWriteKVIO; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int kvdoCreateMetadataVIO(PhysicalLayer *layer, + VIOType vioType, + VIOPriority priority, + void *parent, + char *data, + VIO **vioPtr) +{ + int result = ASSERT(isMetadataVIOType(vioType), + "%d is a metadata type", vioType); + if (result != VDO_SUCCESS) { + return result; + } + + BIO *bio; + KernelLayer *kernelLayer = asKernelLayer(layer); + result = createBio(kernelLayer, data, &bio); + if (result != VDO_SUCCESS) { + return result; + } + + MetadataKVIO *metadataKVIO; + result = makeMetadataKVIO(kernelLayer, vioType, priority, parent, bio, + &metadataKVIO); + if (result != VDO_SUCCESS) { + freeBio(bio, kernelLayer); + return result; + } + + *vioPtr = &metadataKVIO->vio; + return VDO_SUCCESS; +} + +/**********************************************************************/ +int kvdoCreateCompressedWriteVIO(PhysicalLayer *layer, + void *parent, + char *data, + AllocatingVIO **allocatingVIOPtr) +{ + BIO *bio; + KernelLayer *kernelLayer = asKernelLayer(layer); + int result = createBio(kernelLayer, data, &bio); + if (result != VDO_SUCCESS) { + return result; + } + + CompressedWriteKVIO *compressedWriteKVIO; + result = makeCompressedWriteKVIO(kernelLayer, parent, bio, + &compressedWriteKVIO); + if (result != VDO_SUCCESS) { + freeBio(bio, kernelLayer); + return result; + } + + *allocatingVIOPtr = &compressedWriteKVIO->allocatingVIO; + return VDO_SUCCESS; +} diff --git a/source/vdo/kernel/kvio.h b/source/vdo/kernel/kvio.h new file mode 100644 index 0000000..64200cd --- /dev/null +++ b/source/vdo/kernel/kvio.h @@ -0,0 +1,340 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kvio.h#3 $ + */ + +#ifndef KVIO_H +#define KVIO_H + +#include "allocatingVIO.h" +#include "vio.h" + +#include "kernelLayer.h" + +/** + * A specific (semi-opaque) encapsulation of a single block + **/ +struct kvio { + KvdoEnqueueable enqueueable; + VIO *vio; + KernelLayer *layer; + BIO *bio; + + /** + * A bio pointer used in enqueueBioMap (used via submitBio etc), to + * pass information -- which bio to submit to the storage device -- + * across a thread switch. This may match another bio pointer in + * this structure, or could point somewhere else. 
+ **/ + BIO *bioToSubmit; + /** + * A list of enqueued bios with consecutive block numbers, stored by + * enqueueBioMap under the first-enqueued KVIO. The other KVIOs are + * found via their bio entries in this list, and are not added to + * the work queue as separate work items. + **/ + struct bio_list biosMerged; + /** A slot for an arbitrary bit of data, for use by systemtap. */ + long debugSlot; +}; + +typedef struct { + KVIO kvio; + VIO vio; +} MetadataKVIO; + +typedef struct { + KVIO kvio; + AllocatingVIO allocatingVIO; +} CompressedWriteKVIO; + +/** + * Determine whether a KVIO is a data VIO or not + * + * @param kvio The KVIO to check + * + * @return true if a data KVIO + */ +static inline bool isData(KVIO *kvio) +{ + return isDataVIO(kvio->vio); +} + +/** + * Determine whether a KVIO is a compressed block write VIO or not + * + * @param kvio The KVIO to check + * + * @return true if a compressed block writer + */ +static inline bool isCompressedWriter(KVIO *kvio) +{ + return isCompressedWriteVIO(kvio->vio); +} + +/** + * Determine whether a KVIO is a metadata VIO or not + * + * @param kvio The KVIO to check + * + * @return true if a metadata KVIO + */ +static inline bool isMetadata(KVIO *kvio) +{ + return isMetadataVIO(kvio->vio); +} + +/** + * Convert a VIO to a MetadataKVIO. + * + * @param vio The VIO to convert + * + * @return the VIO as a KVIO + **/ +static inline MetadataKVIO *vioAsMetadataKVIO(VIO *vio) +{ + ASSERT_LOG_ONLY(isMetadataVIO(vio), "VIO is a metadata VIO"); + return container_of(vio, MetadataKVIO, vio); +} + +/** + * Convert a MetadataKVIO to a KVIO. + * + * @param metadataKVIO The MetadataKVIO to convert + * + * @return The MetadataKVIO as a KVIO + **/ +static inline KVIO *metadataKVIOAsKVIO(MetadataKVIO *metadataKVIO) +{ + return &metadataKVIO->kvio; +} + +/** + * Returns a pointer to the CompressedWriteKVIO wrapping an AllocatingVIO. + * + * @param allocatingVIO The AllocatingVIO to convert + * + * @return the CompressedWriteKVIO + **/ +static inline CompressedWriteKVIO * +allocatingVIOAsCompressedWriteKVIO(AllocatingVIO *allocatingVIO) +{ + ASSERT_LOG_ONLY(isCompressedWriteAllocatingVIO(allocatingVIO), + "AllocatingVIO is a compressed write"); + return container_of(allocatingVIO, CompressedWriteKVIO, allocatingVIO); +} + +/** + * Convert a CompressedWriteKVIO to a KVIO. + * + * @param compressedWriteKVIO The CompressedWriteKVIO to convert + * + * @return The CompressedWriteKVIO as a KVIO + **/ +static inline +KVIO *compressedWriteKVIOAsKVIO(CompressedWriteKVIO *compressedWriteKVIO) +{ + return &compressedWriteKVIO->kvio; +} + +/** + * Returns a pointer to the KVIO wrapping a work item + * + * @param item the work item + * + * @return the KVIO + **/ +static inline KVIO *workItemAsKVIO(KvdoWorkItem *item) +{ + return container_of(item, KVIO, enqueueable.workItem); +} + +/** + * Enqueue a KVIO on a work queue. + * + * @param queue The queue + * @param kvio The KVIO + **/ +static inline void enqueueKVIOWork(KvdoWorkQueue *queue, KVIO *kvio) +{ + enqueueWorkQueue(queue, &kvio->enqueueable.workItem); +} + +/** + * Add a trace record for the current source location. + * + * @param kvio The KVIO structure to be updated + * @param location The source-location descriptor to be recorded + **/ +static inline void kvioAddTraceRecord(KVIO *kvio, TraceLocation location) +{ + vioAddTraceRecord(kvio->vio, location); +} + +/** + * Set up the work item for a KVIO. 
+ * + * @param kvio The KVIO to set up + * @param work The function pointer to execute + * @param statsFunction A function pointer to record for stats, or NULL + * @param action Action code, mapping to a relative priority + **/ +static inline void setupKVIOWork(KVIO *kvio, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action) +{ + setupWorkItem(&kvio->enqueueable.workItem, work, statsFunction, action); +} + +/** + * Set up and enqueue a KVIO. + * + * @param kvio The KVIO to set up + * @param work The function pointer to execute + * @param statsFunction A function pointer to record for stats, or NULL + * @param action Action code, mapping to a relative priority + * @param queue The queue on which to enqueue the KVIO + **/ +static inline void launchKVIO(KVIO *kvio, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action, + KvdoWorkQueue *queue) +{ + setupKVIOWork(kvio, work, statsFunction, action); + enqueueKVIOWork(queue, kvio); +} + +/** + * Move a KVIO back to the base threads. + * + * @param kvio The KVIO to enqueue + **/ +void kvdoEnqueueVIOCallback(KVIO *kvio); + +/** + * Handles kvio-related I/O post-processing. + * + * @param kvio The kvio to finalize + * @param error Possible error + **/ +void kvdoContinueKvio(KVIO *kvio, int error); + +/** + * Initialize a KVIO. + * + * @param kvio The KVIO to initialize + * @param layer The physical layer + * @param vioType The type of VIO to create + * @param priority The relative priority to assign to the KVIO + * @param parent The parent of the KVIO completion + * @param bio The bio to associate with this KVIO + **/ +void initializeKVIO(KVIO *kvio, + KernelLayer *layer, + VIOType vioType, + VIOPriority priority, + void *parent, + BIO *bio); + +/** + * Destroy a MetadataKVIO and NULL out the pointer to it. + * + * @param metadataKVIOPtr A pointer to the MetadataKVIO to destroy + **/ +void freeMetadataKVIO(MetadataKVIO **metadataKVIOPtr); + +/** + * Destroy a CompressedWriteKVIO and NULL out the pointer to it. + * + * @param compressedWriteKVIOPtr A pointer to the CompressedWriteKVIO to + * destroy + **/ +void freeCompressedWriteKVIO(CompressedWriteKVIO **compressedWriteKVIOPtr); + +/** + * Create a new VIO (and its enclosing KVIO) for metadata operations. + * + *
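
/*
 * The setupKVIOWork()/launchKVIO() helpers above hand the KVIO's embedded
 * work item to a work queue; on the worker thread only the KvdoWorkItem
 * comes back, and workItemAsKVIO() recovers the wrapper (this is the shape
 * of kvdoHandleVIOCallback() in kvio.c).  A minimal sketch of such a work
 * function; the function name is illustrative, and it would typically be
 * queued as:
 *
 *   launchKVIO(kvio, exampleKVIOWork, NULL, REQ_Q_ACTION_VIO_CALLBACK, queue);
 */
static void exampleKVIOWork(KvdoWorkItem *item)
{
  KVIO *kvio = workItemAsKVIO(item);   // container_of on the embedded item
  kvioAddTraceRecord(kvio, THIS_LOCATION(NULL));
  // Hand the I/O back to the base threads once this stage is done.
  kvdoEnqueueVIOCallback(kvio);
}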
Implements MetadataVIOCreator. + * + * @param [in] layer The physical layer + * @param [in] vioType The type of VIO to create + * @param [in] priority The relative priority to assign to the VIO + * @param [in] parent The parent to assign to the VIO's completion + * @param [in] data The buffer + * @param [out] vioPtr A pointer to hold new VIO + * + * @return VDO_SUCCESS or an error + **/ +int kvdoCreateMetadataVIO(PhysicalLayer *layer, + VIOType vioType, + VIOPriority priority, + void *parent, + char *data, + VIO **vioPtr) + __attribute__((warn_unused_result)); + +/** + * Create a new AllocatingVIO (and its enclosing KVIO) for compressed writes. + * + *
Implements CompressedWriteVIOCreator. + * + * @param [in] layer The physical layer + * @param [in] parent The parent to assign to the AllocatingVIO's + * completion + * @param [in] data The buffer + * @param [out] allocatingVIOPtr A pointer to hold new AllocatingVIO + * + * @return VDO_SUCCESS or an error + **/ +int kvdoCreateCompressedWriteVIO(PhysicalLayer *layer, + void *parent, + char *data, + AllocatingVIO **allocatingVIOPtr) + __attribute__((warn_unused_result)); + +/** + * Submit a compressed block write. + * + *
Implements CompressedWriter. + * + * @param allocatingVIO The AllocatingVIO for the compressed write + **/ +void kvdoWriteCompressedBlock(AllocatingVIO *allocatingVIO); + +/** + * Read or write a single metadata VIO. + * + *
Implements MetadataReader and MetadataWriter + * + * @param vio The VIO to read or write + **/ +void kvdoSubmitMetadataVIO(VIO *vio); + +/** + * Issue an empty flush to the lower layer using the BIO in a metadata VIO. + * + *
Implements MetadataWriter. + * + * @param vio The VIO to flush + **/ +void kvdoFlushVIO(VIO *vio); + +#endif /* KVIO_H */ diff --git a/source/vdo/kernel/limiter.c b/source/vdo/kernel/limiter.c new file mode 100644 index 0000000..72a4bb5 --- /dev/null +++ b/source/vdo/kernel/limiter.c @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/limiter.c#2 $ + */ + +#include "limiter.h" + +#include + +/**********************************************************************/ +void getLimiterValuesAtomically(Limiter *limiter, + uint32_t *active, + uint32_t *maximum) +{ + spin_lock(&limiter->lock); + *active = limiter->active; + *maximum = limiter->maximum; + spin_unlock(&limiter->lock); +} + +/**********************************************************************/ +void initializeLimiter(Limiter *limiter, uint32_t limit) +{ + limiter->active = 0; + limiter->limit = limit; + limiter->maximum = 0; + init_waitqueue_head(&limiter->waiterQueue); + spin_lock_init(&limiter->lock); +} + +/**********************************************************************/ +bool limiterIsIdle(Limiter *limiter) +{ + spin_lock(&limiter->lock); + bool idle = limiter->active == 0; + spin_unlock(&limiter->lock); + return idle; +} + +/**********************************************************************/ +void limiterReleaseMany(Limiter *limiter, uint32_t count) +{ + spin_lock(&limiter->lock); + limiter->active -= count; + spin_unlock(&limiter->lock); + if (waitqueue_active(&limiter->waiterQueue)) { + wake_up_nr(&limiter->waiterQueue, count); + } +} + +/**********************************************************************/ +void limiterWaitForIdle(Limiter *limiter) +{ + spin_lock(&limiter->lock); + while (limiter->active > 0) { + DEFINE_WAIT(wait); + prepare_to_wait_exclusive(&limiter->waiterQueue, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&limiter->lock); + io_schedule(); + spin_lock(&limiter->lock); + finish_wait(&limiter->waiterQueue, &wait); + }; + spin_unlock(&limiter->lock); +} + +/** + * Take one permit from the limiter, if one is available, and update + * the maximum active count if appropriate. + * + * The limiter's lock must already be locked. 
+ * + * @param limiter The limiter to update + * + * @return true iff the permit was acquired + **/ +static bool takePermitLocked(Limiter *limiter) +{ + if (limiter->active >= limiter->limit) { + return false; + } + limiter->active += 1; + if (limiter->active > limiter->maximum) { + limiter->maximum = limiter->active; + } + return true; +} + +/**********************************************************************/ +void limiterWaitForOneFree(Limiter *limiter) +{ + spin_lock(&limiter->lock); + while (!takePermitLocked(limiter)) { + DEFINE_WAIT(wait); + prepare_to_wait_exclusive(&limiter->waiterQueue, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&limiter->lock); + io_schedule(); + spin_lock(&limiter->lock); + finish_wait(&limiter->waiterQueue, &wait); + }; + spin_unlock(&limiter->lock); +} + +/**********************************************************************/ +bool limiterPoll(Limiter *limiter) +{ + spin_lock(&limiter->lock); + bool acquired = takePermitLocked(limiter); + spin_unlock(&limiter->lock); + return acquired; +} diff --git a/source/vdo/kernel/limiter.h b/source/vdo/kernel/limiter.h new file mode 100644 index 0000000..a9ee8fc --- /dev/null +++ b/source/vdo/kernel/limiter.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/limiter.h#2 $ + */ + +#ifndef LIMITER_H +#define LIMITER_H + +#include + +/* + * A Limiter is a fancy counter used to limit resource usage. We have a + * limit to number of resources that we are willing to use, and a Limiter + * holds us to that limit. 
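
/*
 * A small usage sketch of the calling pattern the functions above implement:
 * block until a permit is free, do the limited work, then release the permit
 * (limiterPoll() is the non-blocking variant of the first step).  Only the
 * Limiter calls come from this file; the function name is illustrative.
 */
static void exampleLimitedOperation(Limiter *limiter)
{
  limiterWaitForOneFree(limiter);   // may sleep until another user releases
  /* ...work that at most "limit" callers may perform at once... */
  limiterRelease(limiter);          // wakes one waiter, if any
}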
+ */ + +typedef struct limiter { + // A spinlock controlling access to the contents of this struct + spinlock_t lock; + // The queue of threads waiting for a resource to become available + wait_queue_head_t waiterQueue; + // The number of resources in use + uint32_t active; + // The maximum number number of resources that have ever been in use + uint32_t maximum; + // The limit to the number of resources that are allowed to be used + uint32_t limit; +} Limiter; + +/** + * Get the Limiter variable values (atomically under the lock) + * + * @param limiter The limiter + * @param active The number of requests in progress + * @param maximum The maximum number of requests that have ever been active + **/ +void getLimiterValuesAtomically(Limiter *limiter, + uint32_t *active, + uint32_t *maximum); + +/** + * Initialize a Limiter + * + * @param limiter The limiter + * @param limit The limit to the number of active resources + **/ +void initializeLimiter(Limiter *limiter, uint32_t limit); + +/** + * Determine whether there are any active resources + * + * @param limiter The limiter + * + * @return true if there are no active resources + **/ +bool limiterIsIdle(Limiter *limiter); + +/** + * Release resources, making them available for other uses + * + * @param limiter The limiter + * @param count The number of resources to release + **/ +void limiterReleaseMany(Limiter *limiter, uint32_t count); + +/** + * Release one resource, making it available for another use + * + * @param limiter The limiter + **/ +static inline void limiterRelease(Limiter *limiter) +{ + limiterReleaseMany(limiter, 1); +} + +/** + * Wait until there are no active resources + * + * @param limiter The limiter + **/ +void limiterWaitForIdle(Limiter *limiter); + +/** + * Prepare to start using one resource, waiting if there are too many resources + * already in use. After returning from this routine, the caller may use the + * resource, and must call limiterRelease after freeing the resource. + * + * @param limiter The limiter + **/ +void limiterWaitForOneFree(Limiter *limiter); + +/** + * Attempt to reserve one resource, without waiting. After returning from this + * routine, if allocation was successful, the caller may use the resource, and + * must call limiterRelease after freeing the resource. + * + * @param limiter The limiter + * + * @return true iff the resource was allocated + **/ +bool limiterPoll(Limiter *limiter); + +#endif /* LIMITER_H */ diff --git a/source/vdo/kernel/logger.c b/source/vdo/kernel/logger.c new file mode 100644 index 0000000..d18f5ea --- /dev/null +++ b/source/vdo/kernel/logger.c @@ -0,0 +1,520 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/logger.c#4 $ + */ + +#include "logger.h" + +#include +#include +#include + +#include "errors.h" +#include "threadDevice.h" + +static const int DEFAULT_PRIORITY = LOG_INFO; + +typedef struct { + const char *name; + const int priority; +} PRIORITY_NAMES; + +static const PRIORITY_NAMES PRIORITIES[] = { + { "ALERT", LOG_ALERT }, + { "CRIT", LOG_CRIT }, + { "CRITICAL", LOG_CRIT }, + { "DEBUG", LOG_DEBUG }, + { "EMERG", LOG_EMERG }, + { "EMERGENCY", LOG_EMERG }, + { "ERR", LOG_ERR }, + { "ERROR", LOG_ERR }, + { "INFO", LOG_INFO }, + { "NOTICE", LOG_NOTICE }, + { "PANIC", LOG_EMERG }, + { "WARN", LOG_WARNING }, + { "WARNING", LOG_WARNING }, + { NULL, -1 }, +}; + +enum { + PRIORITY_COUNT = 8 +}; + +static const char *PRIORITY_STRINGS[] = { + "EMERGENCY", + "ALERT", + "CRITICAL", + "ERROR", + "WARN", + "NOTICE", + "INFO", + "DEBUG", +}; + +static int logLevel = LOG_INFO; + +/**********************************************************************/ +int stringToPriority(const char *string) +{ + for (int i = 0; PRIORITIES[i].name != NULL; i++) { + if (strcasecmp(string, PRIORITIES[i].name) == 0) { + return PRIORITIES[i].priority; + } + } + return DEFAULT_PRIORITY; +} + +/**********************************************************************/ +int getLogLevel(void) +{ + return logLevel; +} + +/**********************************************************************/ +void setLogLevel(int newLogLevel) +{ + logLevel = newLogLevel; +} + +/**********************************************************************/ +const char *priorityToString(int priority) +{ + if ((priority < 0) || (priority >= PRIORITY_COUNT)) { + return "unknown"; + } + return PRIORITY_STRINGS[priority]; +} + +/**********************************************************************/ +static const char *priorityToLogLevel(int priority) +{ + switch (priority) { + case LOG_EMERG: + case LOG_ALERT: + case LOG_CRIT: + return KERN_CRIT; + case LOG_ERR: + return KERN_ERR; + case LOG_WARNING: + return KERN_WARNING; + case LOG_NOTICE: + return KERN_NOTICE; + case LOG_INFO: + return KERN_INFO; + case LOG_DEBUG: + return KERN_DEBUG; + default: + return ""; + } +} + +/**********************************************************************/ +static const char *getCurrentInterruptType(void) +{ + if (in_nmi()) { + return "NMI"; + } + if (in_irq()) { + return "HI"; + } + if (in_softirq()) { + return "SI"; + } + return "INTR"; +} + +/** + * Emit a log message to the kernel log in a format suited to the current + * thread context. Context info formats: + * + * interrupt: kvdo[NMI]: blah + * thread w/dev id: kvdo12:myprog: blah + * kvdo thread: kvdo12:foobarQ: blah + * other thread: kvdo: myprog: blah + * + * Fields: module name, interrupt level, process name, device ID. + * + * @param level A string describing the logging level + * @param moduleName The name of the module doing the logging + * @param prefix The prefix of the log message + * @param vaf1 The first message format descriptor + * @param vaf2 The second message format descriptor + **/ +static void emitLogMessage(const char *level, + const char *moduleName, + const char *prefix, + const struct va_format *vaf1, + const struct va_format *vaf2) +{ + if (in_interrupt()) { + printk("%s%s[%s]: %s%pV%pV\n", + level, moduleName, getCurrentInterruptType(), + prefix, vaf1, vaf2); + return; + } + + // Not at interrupt level; we have a process we can look at, and + // might have a device ID. 
+ int deviceInstance = getThreadDeviceID(); + if (deviceInstance != -1) { + printk("%s%s%u:%s: %s%pV%pV\n", + level, moduleName, deviceInstance, current->comm, + prefix, vaf1, vaf2); + return; + } + + if (((current->flags & PF_KTHREAD) != 0) + && (strncmp(moduleName, current->comm, strlen(moduleName)) == 0)) { + /* + * It's a kernel thread starting with "kvdo" (or whatever). Assume it's + * ours and that its name is sufficient. + */ + printk("%s%s: %s%pV%pV\n", + level, current->comm, + prefix, vaf1, vaf2); + return; + } + + // Identify the module and the process. + printk("%s%s: %s: %s%pV%pV\n", + level, moduleName, current->comm, + prefix, vaf1, vaf2); +} + +/**********************************************************************/ +void logMessagePack(int priority, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + va_list args2) +{ + if (priority > getLogLevel()) { + return; + } + + /* + * The kernel's printk has some magic for indirection to a secondary + * va_list. It wants us to supply a pointer to the va_list. + * + * However, va_list varies across platforms and can be an array + * type, which makes passing it around as an argument kind of + * tricky, due to the automatic conversion to a pointer. This makes + * taking the address of the argument a dicey thing; if we use "&a" + * it works fine for non-array types, but for array types we get the + * address of a pointer. Functions like va_copy and sprintf don't + * care as they get "va_list" values passed and are written to do + * the right thing, but printk explicitly wants the address of the + * va_list. + * + * So, we copy the va_list values to ensure that "&" consistently + * works the way we want. + */ + va_list args1Copy; + va_copy(args1Copy, args1); + va_list args2Copy; + va_copy(args2Copy, args2); + struct va_format vaf1 = { + .fmt = (fmt1 != NULL) ? fmt1 : "", + .va = &args1Copy, + }; + struct va_format vaf2 = { + .fmt = (fmt2 != NULL) ? fmt2 : "", + .va = &args2Copy, + }; + + if (prefix == NULL) { + prefix = ""; + } + + emitLogMessage(priorityToLogLevel(priority), THIS_MODULE->name, + prefix, &vaf1, &vaf2); + + va_end(args1Copy); + va_end(args2Copy); +} + +/**********************************************************************/ +void logEmbeddedMessage(int priority, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + ...) +{ + va_list ap; + va_start(ap, fmt2); + logMessagePack(priority, prefix, fmt1, args1, fmt2, ap); + va_end(ap); +} + +#pragma GCC diagnostic push +/* + * GCC (version 8.1.1 20180502 (Red Hat 8.1.1-1)) on Fedora 28 seems + * to think that this function should get a printf format + * attribute. But we have no second format string, and no additional + * arguments at the call site, and GCC also gets unhappy trying to + * analyze the format and values when there are none. So we'll just + * shut it up. + */ +#pragma GCC diagnostic ignored "-Wsuggest-attribute=format" +/** + * Log a message. + * + * This helper function exists solely to create a valid va_list with + * no useful info. It does the real work of vLogMessage, which wants a + * second va_list object to pass down. + * + * @param priority The syslog priority value for the message. + * @param format The format of the message (a printf style format) + * @param args The variadic argument list of format parameters. + **/ +static void vLogMessageHelper(int priority, + const char *format, + va_list args, + ...) 
+{ + va_list dummy; + va_start(dummy, args); + logMessagePack(priority, NULL, format, args, NULL, dummy); + va_end(dummy); +} +#pragma GCC diagnostic pop + +/*****************************************************************************/ +void vLogMessage(int priority, const char *format, va_list args) +{ + vLogMessageHelper(priority, format, args); +} + +/**********************************************************************/ +void logMessage(int priority, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(priority, format, args); + va_end(args); +} + +/**********************************************************************/ +__attribute__((format(printf, 2, 3))) +static void logAtLevel(int priority, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(priority, format, args); + va_end(args); +} + +/**********************************************************************/ +void logDebug(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_DEBUG, format, args); + va_end(args); +} + +/**********************************************************************/ +void logInfo(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_INFO, format, args); + va_end(args); +} + +/**********************************************************************/ +void logNotice(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_NOTICE, format, args); + va_end(args); +} + +/**********************************************************************/ +void logWarning(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_WARNING, format, args); + va_end(args); +} + +/**********************************************************************/ +void logError(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_ERR, format, args); + va_end(args); +} + +/**********************************************************************/ +void vLogError(const char *format, va_list args) +{ + vLogMessage(LOG_ERR, format, args); +} + +/**********************************************************************/ +void logBacktrace(int priority) +{ + logAtLevel(priority, "[backtrace]"); + if (priority > logLevel) { + return; + } + dump_stack(); +} + +/**********************************************************************/ +int vLogWithStringError(int priority, + int errnum, + const char *format, + va_list args) +{ + char errbuf[ERRBUF_SIZE] = ""; + logEmbeddedMessage(priority, NULL, format, args, ": %s (%d)", + stringError(errnum, errbuf, sizeof(errbuf)), + errnum); + return errnum; +} + +/**********************************************************************/ +int logWithStringError(int priority, int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(priority, errnum, format, args); + va_end(args); + return errnum; +} + +/**********************************************************************/ +int logErrorWithStringError(int errnum, const char *format, ...) 
+{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_ERR, errnum, format, args); + va_end(args); + return errnum; +} + +/**********************************************************************/ +int vLogErrorWithStringError(int errnum, const char *format, va_list args) +{ + vLogWithStringError(LOG_ERR, errnum, format, args); + return errnum; +} + +/**********************************************************************/ +int logWarningWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_WARNING, errnum, format, args); + va_end(args); + return errnum; +} + +/**********************************************************************/ +int logDebugWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_DEBUG, errnum, format, args); + va_end(args); + return errnum; +} + +/**********************************************************************/ +int logInfoWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_INFO, errnum, format, args); + va_end(args); + return errnum; +} + +/**********************************************************************/ +int logNoticeWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_NOTICE, errnum, format, args); + va_end(args); + return errnum; +} + +/**********************************************************************/ +int logFatalWithStringError(int errnum, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogWithStringError(LOG_CRIT, errnum, format, args); + va_end(args); + return errnum; +} + +/**********************************************************************/ +int logUnrecoverable(int errnum, const char *format, ...) +{ + if ((errnum == UDS_SUCCESS || errnum == UDS_QUEUED) || (errnum == 0)) { + return errnum; + } + + va_list args; + va_start(args, format); + vLogWithStringError(LOG_CRIT, errnum, format, args); + va_end(args); + return makeUnrecoverable(errnum); +} + +/**********************************************************************/ +void logFatal(const char *format, ...) +{ + va_list args; + + va_start(args, format); + vLogMessage(LOG_CRIT, format, args); + va_end(args); +} + +/**********************************************************************/ +void pauseForLogger(void) +{ + // Hopefully, a few milliseconds of sleep will be large enough + // for the kernel log buffer to be flushed. + msleep(4); +} diff --git a/source/vdo/kernel/logger.h b/source/vdo/kernel/logger.h new file mode 100644 index 0000000..6e8088e --- /dev/null +++ b/source/vdo/kernel/logger.h @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
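
/*
 * The ...WithStringError() helpers above all return errnum, which lets a
 * caller log a failure and propagate the error code in one expression.  A
 * short sketch of that idiom; the function name and message text are
 * illustrative.
 */
static int exampleStep(int result)
{
  if (result != VDO_SUCCESS) {
    // Logs "example step failed: <description> (<errnum>)" at ERROR level
    // and returns result unchanged, so the error keeps flowing up the stack.
    return logErrorWithStringError(result, "example step failed");
  }
  logDebug("example step succeeded");
  return VDO_SUCCESS;
}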
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/logger.h#2 $ + */ + +#ifndef LOGGER_H +#define LOGGER_H 1 + +#include +#include +#include +#include + +#define LOG_EMERG 0 /* system is unusable */ +#define LOG_ALERT 1 /* action must be taken immediately */ +#define LOG_CRIT 2 /* critical conditions */ +#define LOG_ERR 3 /* error conditions */ +#define LOG_WARNING 4 /* warning conditions */ +#define LOG_NOTICE 5 /* normal but significant condition */ +#define LOG_INFO 6 /* informational */ +#define LOG_DEBUG 7 /* debug-level messages */ + +// Make it easy to log real pointer values using %px when in development. +#define PRIptr "pK" + +/** + * @file + * + * The functions in this file are not thread safe in the sense that nothing + * prevents multiple threads from opening or closing loggers out from under + * other threads. In reality this isn't a problem since the only calls in + * production code to openLogger() and closeLogger() are made in uds.c while + * uds mutex is held, and uds does not make any logging calls before it calls + * openLogger or after it calls closeLogger(). + * + * All of the log() functions will preserve the callers value of errno. + **/ + +/** + * Get the current logging level. + * + * @return the current logging priority level. + **/ +int getLogLevel(void); + +/** + * Set the current logging level. + * + * @param newLogLevel the new value for the logging priority level. + **/ +void setLogLevel(int newLogLevel); + +/** + * Return the integer logging priority represented by a name. + * + * @param string the name of the logging priority (case insensitive). + * + * @return the integer priority named by string, or DEFAULT_PRIORITY + * if not recognized. + **/ +int stringToPriority(const char *string); + +/** + * Return the printable name of a logging priority. + * + * @return the priority name + **/ +const char *priorityToString(int priority); + +/** + * Log a debug message. + * + * @param format The format of the message (a printf style format) + **/ +void logDebug(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log an informational message. + * + * @param format The format of the message (a printf style format) + **/ +void logInfo(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log a normal (but notable) condition. + * + * @param format The format of the message (a printf style format) + **/ +void logNotice(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log a warning. + * + * @param format The format of the message (a printf style format) + **/ +void logWarning(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log an error. + * + * @param format The format of the message (a printf style format) + **/ +void logError(const char *format, ...) __attribute__((format(printf, 1, 2))); + +/** + * Log an error. + * + * @param format The format of the message (a printf style format) + * @param args args for format. + **/ + +void vLogError(const char *format, va_list args) + __attribute__((format(printf, 1, 0))); + +/** + * Log a message embedded within another message. 
+ * + * @param priority the priority at which to log the message + * @param prefix optional string prefix to message, may be NULL + * @param fmt1 format of message first part, may be NULL + * @param args1 arguments for message first part + * @param fmt2 format of message second part + **/ +void logEmbeddedMessage(int priority, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + ...) + __attribute__((format(printf, 3, 0), format(printf, 5, 6))); + +/** + * Log a message pack consisting of multiple variable sections. + * + * @param priority the priority at which to log the message + * @param prefix optional string prefix to message, may be NULL + * @param fmt1 format of message first part, may be NULL + * @param args1 arguments for message first part + * @param fmt2 format of message second part, may be NULL + * @param args2 arguments for message second part + **/ +void logMessagePack(int priority, + const char *prefix, + const char *fmt1, + va_list args1, + const char *fmt2, + va_list args2) + __attribute__((format(printf, 3, 0))); + +/** + * Log a stack backtrace. + * + * @param priority The priority at which to log the backtrace + **/ +void logBacktrace(int priority); + +/** + * Log a message with an error from an error code. + * + * @param priority The priority of the logging entry + * @param errnum Int value of errno or a UDS_* value. + * @param format The format of the message (a printf style format) + * + * @return errnum + **/ +int logWithStringError(int priority, int errnum, const char *format, ...) + __attribute__((format(printf, 3, 4))); + +/** + * Log a message with an error from an error code. + * + * @param priority The priority of the logging entry + * @param errnum Int value of errno or a UDS_* value. + * @param format The format of the message (a printf style format) + * @param args The list of arguments with format. + * + * @return errnum + **/ +int vLogWithStringError(int priority, + int errnum, + const char *format, + va_list args) + __attribute__((format(printf, 3, 0))); + +/** + * Log an error prefixed with the string associated with the errnum. + * + * @param errnum Int value of errno or a UDS_* value. + * @param format The format of the message (a printf style format) + * + * @return errnum + **/ +int logErrorWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logDebugWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logInfoWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logNoticeWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logWarningWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/**********************************************************************/ +int logFatalWithStringError(int errnum, const char *format, ...) + __attribute__((format(printf, 2, 3))); + +/** + * Log an error prefixed with the string associated with the errnum. + * + * @param errnum Int value of errno or a UDS_* value. 
+ * @param format The format of the message (a printf style format)
+ * @param args a va_list of args for the format.
+ * @return errnum
+ **/
+int vLogErrorWithStringError(int errnum, const char *format, va_list args)
+  __attribute__((format(printf, 2, 0)));
+
+/**
+ * Log an ERROR level message and return makeUnrecoverable(errnum);
+ * UDS_SUCCESS is ignored and returned.
+ *
+ * @param errnum Int value of errno or a UDS_* value.
+ * @param format The format of the message (a printf style format)
+ * @return makeUnrecoverable(errnum) or UDS_SUCCESS.
+ **/
+int logUnrecoverable(int errnum, const char *format, ...)
+  __attribute__((format(printf, 2, 3)));
+
+/**
+ * Log a fatal error.
+ *
+ * @param format The format of the message (a printf style format)
+ **/
+void logFatal(const char *format, ...) __attribute__((format(printf, 1, 2)));
+
+/**
+ * Log a message -- for internal use only.
+ *
+ * @param priority The syslog priority value for the message.
+ * @param format The format of the message (a printf style format)
+ * @param args The variadic argument list of format parameters.
+ **/
+void vLogMessage(int priority, const char *format, va_list args)
+  __attribute__((format(printf, 2, 0)));
+
+/**
+ * Log a message.
+ *
+ * @param priority The syslog priority value for the message.
+ * @param format The format of the message (a printf style format)
+ **/
+void logMessage(int priority, const char *format, ...)
+  __attribute__((format(printf, 2, 3)));
+
+/**
+ * Sleep or delay a short time (likely a few milliseconds) in an attempt to
+ * allow the log buffers to be written out in case they might be overrun.
+ * This is unnecessary in user-space (and is a no-op there), but is needed
+ * when quickly issuing a lot of log output in the Linux kernel, as when
+ * dumping a large number of data structures.
+ **/
+void pauseForLogger(void);
+
+#endif /* LOGGER_H */
diff --git a/source/vdo/kernel/memoryUsage.c b/source/vdo/kernel/memoryUsage.c
new file mode 100644
index 0000000..86521a4
--- /dev/null
+++ b/source/vdo/kernel/memoryUsage.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/memoryUsage.c#3 $
+ */
+
+#include "memoryUsage.h"
+
+#include "memoryAlloc.h"
+
+#include "kernelStatistics.h"
+
+/**********************************************************************/
+MemoryUsage getMemoryUsage()
+{
+  MemoryUsage memoryUsage;
+  getMemoryStats(&memoryUsage.bytesUsed, &memoryUsage.peakBytesUsed);
+  return memoryUsage;
+}
+
diff --git a/source/vdo/kernel/memoryUsage.h b/source/vdo/kernel/memoryUsage.h
new file mode 100644
index 0000000..336ab0a
--- /dev/null
+++ b/source/vdo/kernel/memoryUsage.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/memoryUsage.h#1 $ + */ + +#ifndef MEMORY_USAGE_H +#define MEMORY_USAGE_H 1 + +#include "memoryAlloc.h" + +#include "kernelStatistics.h" + +/** + * Get the memory usage for statistics reporting. + * + * @return The memory usage + **/ +MemoryUsage getMemoryUsage(void) + __attribute__((warn_unused_result)); + +#endif /* MEMORY_USAGE_H */ diff --git a/source/vdo/kernel/poolSysfs.c b/source/vdo/kernel/poolSysfs.c new file mode 100644 index 0000000..7f37480 --- /dev/null +++ b/source/vdo/kernel/poolSysfs.c @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/poolSysfs.c#1 $ + */ + +#include "poolSysfs.h" + +#include "memoryAlloc.h" + +#include "vdo.h" + +#include "dedupeIndex.h" + +typedef struct poolAttribute { + struct attribute attr; + ssize_t (*show)(KernelLayer *layer, char *buf); + ssize_t (*store)(KernelLayer *layer, const char *value, size_t count); +} PoolAttribute; + +/**********************************************************************/ +static ssize_t vdoPoolAttrShow(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + PoolAttribute *poolAttr = container_of(attr, PoolAttribute, attr); + if (poolAttr->show == NULL) { + return -EINVAL; + } + KernelLayer *layer = container_of(kobj, KernelLayer, kobj); + return poolAttr->show(layer, buf); +} + +/**********************************************************************/ +static ssize_t vdoPoolAttrStore(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t length) +{ + PoolAttribute *poolAttr = container_of(attr, PoolAttribute, attr); + if (poolAttr->store == NULL) { + return -EINVAL; + } + KernelLayer *layer = container_of(kobj, KernelLayer, kobj); + return poolAttr->store(layer, buf, length); +} + +static struct sysfs_ops vdoPoolSysfsOps = { + .show = vdoPoolAttrShow, + .store = vdoPoolAttrStore, +}; + +/**********************************************************************/ +static ssize_t poolCompressingShow(KernelLayer *layer, char *buf) +{ + return sprintf(buf, "%s\n", (getKVDOCompressing(&layer->kvdo) ? "1" : "0")); +} + +/**********************************************************************/ +static ssize_t poolDiscardsActiveShow(KernelLayer *layer, char *buf) +{ + return sprintf(buf, "%" PRIu32 "\n", layer->discardLimiter.active); +} + +/**********************************************************************/ +static ssize_t poolDiscardsLimitShow(KernelLayer *layer, char *buf) +{ + return sprintf(buf, "%" PRIu32 "\n", layer->discardLimiter.limit); +} + +/**********************************************************************/ +static ssize_t poolDiscardsLimitStore(KernelLayer *layer, + const char *buf, + size_t length) +{ + unsigned int value; + if ((length > 12) || (sscanf(buf, "%u", &value) != 1) || (value < 1)) { + return -EINVAL; + } + layer->discardLimiter.limit = value; + return length; +} + +/**********************************************************************/ +static ssize_t poolDiscardsMaximumShow(KernelLayer *layer, char *buf) +{ + return sprintf(buf, "%" PRIu32 "\n", layer->discardLimiter.maximum); +} + +/**********************************************************************/ +static ssize_t poolInstanceShow(KernelLayer *layer, char *buf) +{ + return sprintf(buf, "%u\n", layer->instance); +} + +/**********************************************************************/ +static ssize_t poolRequestsActiveShow(KernelLayer *layer, char *buf) +{ + return sprintf(buf, "%" PRIu32 "\n", layer->requestLimiter.active); +} + +/**********************************************************************/ +static ssize_t poolRequestsLimitShow(KernelLayer *layer, char *buf) +{ + return sprintf(buf, "%" PRIu32 "\n", layer->requestLimiter.limit); +} + +/**********************************************************************/ +static ssize_t poolRequestsMaximumShow(KernelLayer *layer, char *buf) +{ + return sprintf(buf, "%" PRIu32 "\n", layer->requestLimiter.maximum); +} + +/**********************************************************************/ +static void vdoPoolRelease(struct kobject *kobj) +{ + 
KernelLayer *layer = container_of(kobj, KernelLayer, kobj); + freeVDO(&layer->kvdo.vdo); + FREE(layer); +} + +static PoolAttribute vdoPoolCompressingAttr = { + .attr = { .name = "compressing", .mode = 0444, }, + .show = poolCompressingShow, +}; + +static PoolAttribute vdoPoolDiscardsActiveAttr = { + .attr = { .name = "discards_active", .mode = 0444, }, + .show = poolDiscardsActiveShow, +}; + +static PoolAttribute vdoPoolDiscardsLimitAttr = { + .attr = { .name = "discards_limit", .mode = 0644, }, + .show = poolDiscardsLimitShow, + .store = poolDiscardsLimitStore, +}; + +static PoolAttribute vdoPoolDiscardsMaximumAttr = { + .attr = { .name = "discards_maximum", .mode = 0444, }, + .show = poolDiscardsMaximumShow, +}; + +static PoolAttribute vdoPoolInstanceAttr = { + .attr = { .name = "instance", .mode = 0444, }, + .show = poolInstanceShow, +}; + +static PoolAttribute vdoPoolRequestsActiveAttr = { + .attr = { .name = "requests_active", .mode = 0444, }, + .show = poolRequestsActiveShow, +}; + +static PoolAttribute vdoPoolRequestsLimitAttr = { + .attr = { .name = "requests_limit", .mode = 0444, }, + .show = poolRequestsLimitShow, +}; + +static PoolAttribute vdoPoolRequestsMaximumAttr = { + .attr = { .name = "requests_maximum", .mode = 0444, }, + .show = poolRequestsMaximumShow, +}; + +static struct attribute *poolAttrs[] = { + &vdoPoolCompressingAttr.attr, + &vdoPoolDiscardsActiveAttr.attr, + &vdoPoolDiscardsLimitAttr.attr, + &vdoPoolDiscardsMaximumAttr.attr, + &vdoPoolInstanceAttr.attr, + &vdoPoolRequestsActiveAttr.attr, + &vdoPoolRequestsLimitAttr.attr, + &vdoPoolRequestsMaximumAttr.attr, + NULL, +}; + +struct kobj_type kernelLayerKobjType = { + .release = vdoPoolRelease, + .sysfs_ops = &vdoPoolSysfsOps, + .default_attrs = poolAttrs, +}; + +/**********************************************************************/ +static void workQueueDirectoryRelease(struct kobject *kobj) +{ + /* + * The workQueueDirectory holds an implicit reference to its parent, + * the kernelLayer object (->kobj), so even if there are some + * external references held to the workQueueDirectory when work + * queue shutdown calls kobject_put on the kernelLayer object, the + * kernelLayer object won't actually be released and won't free the + * KernelLayer storage until the workQueueDirectory object is + * released first. + * + * So, we don't need to do any additional explicit management here. + * + * (But we aren't allowed to use a NULL function pointer to indicate + * a no-op.) + */ +} + +/**********************************************************************/ +static struct attribute *noAttrs[] = { + NULL, +}; + +static struct sysfs_ops noSysfsOps = { + // These should never be reachable since there are no attributes. + .show = NULL, + .store = NULL, +}; + +struct kobj_type workQueueDirectoryKobjType = { + .release = workQueueDirectoryRelease, + .sysfs_ops = &noSysfsOps, + .default_attrs = noAttrs, +}; diff --git a/source/vdo/kernel/poolSysfs.h b/source/vdo/kernel/poolSysfs.h new file mode 100644 index 0000000..85fe11c --- /dev/null +++ b/source/vdo/kernel/poolSysfs.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/poolSysfs.h#1 $ + */ + +#ifndef POOL_SYSFS_H +#define POOL_SYSFS_H + +#include + +// The kobj_type used for setting up the kernel layer kobject. +extern struct kobj_type kernelLayerKobjType; +// The kobj_type used for the "work_queues" subdirectory. +extern struct kobj_type workQueueDirectoryKobjType; + +// The sysfs_ops used for the "statistics" subdirectory. +extern struct sysfs_ops poolStatsSysfsOps; +// The attribute used for the "statistics" subdirectory. +extern struct attribute *poolStatsAttrs[]; + +#endif /* POOL_SYSFS_H */ diff --git a/source/vdo/kernel/poolSysfsStats.c b/source/vdo/kernel/poolSysfsStats.c new file mode 100644 index 0000000..daa0cf0 --- /dev/null +++ b/source/vdo/kernel/poolSysfsStats.c @@ -0,0 +1,2628 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include "dedupeIndex.h" +#include "logger.h" +#include "poolSysfs.h" +#include "statistics.h" +#include "statusProcfs.h" +#include "threadDevice.h" +#include "vdo.h" + +typedef struct poolStatsAttribute { + struct attribute attr; + ssize_t (*show)(KernelLayer *layer, char *buf); +} PoolStatsAttribute; + +static ssize_t poolStatsAttrShow(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + PoolStatsAttribute *poolStatsAttr = container_of(attr, PoolStatsAttribute, + attr); + + if (poolStatsAttr->show == NULL) { + return -EINVAL; + } + KernelLayer *layer = container_of(kobj, KernelLayer, statsDirectory); + return poolStatsAttr->show(layer, buf); +} + +struct sysfs_ops poolStatsSysfsOps = { + .show = poolStatsAttrShow, + .store = NULL, +}; + +/**********************************************************************/ +/** Number of blocks used for data */ +static ssize_t poolStatsDataBlocksUsedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.dataBlocksUsed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsDataBlocksUsedAttr = { + .attr = { .name = "data_blocks_used", .mode = 0444, }, + .show = poolStatsDataBlocksUsedShow, +}; + +/**********************************************************************/ +/** Number of blocks used for VDO metadata */ +static ssize_t poolStatsOverheadBlocksUsedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.overheadBlocksUsed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsOverheadBlocksUsedAttr = { + .attr = { .name = "overhead_blocks_used", .mode = 0444, }, + .show = poolStatsOverheadBlocksUsedShow, +}; + +/**********************************************************************/ +/** Number of logical blocks that are currently mapped to physical blocks */ +static ssize_t poolStatsLogicalBlocksUsedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.logicalBlocksUsed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsLogicalBlocksUsedAttr = { + .attr = { .name = "logical_blocks_used", .mode = 0444, }, + .show = poolStatsLogicalBlocksUsedShow, +}; + +/**********************************************************************/ +/** number of physical blocks */ +static ssize_t poolStatsPhysicalBlocksShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.physicalBlocks); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsPhysicalBlocksAttr = { + .attr = { .name = "physical_blocks", .mode = 0444, }, + .show = poolStatsPhysicalBlocksShow, +}; + +/**********************************************************************/ +/** number of logical blocks */ +static ssize_t poolStatsLogicalBlocksShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, 
"%llu\n", layer->vdoStatsStorage.logicalBlocks); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsLogicalBlocksAttr = { + .attr = { .name = "logical_blocks", .mode = 0444, }, + .show = poolStatsLogicalBlocksShow, +}; + +/**********************************************************************/ +/** Size of the block map page cache, in bytes */ +static ssize_t poolStatsBlockMapCacheSizeShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMapCacheSize); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapCacheSizeAttr = { + .attr = { .name = "block_map_cache_size", .mode = 0444, }, + .show = poolStatsBlockMapCacheSizeShow, +}; + +/**********************************************************************/ +/** String describing the active write policy of the VDO */ +static ssize_t poolStatsWritePolicyShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%s\n", layer->vdoStatsStorage.writePolicy); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsWritePolicyAttr = { + .attr = { .name = "write_policy", .mode = 0444, }, + .show = poolStatsWritePolicyShow, +}; + +/**********************************************************************/ +/** The physical block size */ +static ssize_t poolStatsBlockSizeShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockSize); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockSizeAttr = { + .attr = { .name = "block_size", .mode = 0444, }, + .show = poolStatsBlockSizeShow, +}; + +/**********************************************************************/ +/** Number of times the VDO has successfully recovered */ +static ssize_t poolStatsCompleteRecoveriesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.completeRecoveries); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsCompleteRecoveriesAttr = { + .attr = { .name = "complete_recoveries", .mode = 0444, }, + .show = poolStatsCompleteRecoveriesShow, +}; + +/**********************************************************************/ +/** Number of times the VDO has recovered from read-only mode */ +static ssize_t poolStatsReadOnlyRecoveriesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.readOnlyRecoveries); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsReadOnlyRecoveriesAttr = { + .attr = { .name = "read_only_recoveries", .mode = 0444, }, + .show = poolStatsReadOnlyRecoveriesShow, +}; + +/**********************************************************************/ +/** String describing the operating mode of the VDO */ +static ssize_t poolStatsModeShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + 
mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%s\n", layer->vdoStatsStorage.mode); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsModeAttr = { + .attr = { .name = "mode", .mode = 0444, }, + .show = poolStatsModeShow, +}; + +/**********************************************************************/ +/** Whether the VDO is in recovery mode */ +static ssize_t poolStatsInRecoveryModeShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%d\n", layer->vdoStatsStorage.inRecoveryMode); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsInRecoveryModeAttr = { + .attr = { .name = "in_recovery_mode", .mode = 0444, }, + .show = poolStatsInRecoveryModeShow, +}; + +/**********************************************************************/ +/** What percentage of recovery mode work has been completed */ +static ssize_t poolStatsRecoveryPercentageShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%u\n", layer->vdoStatsStorage.recoveryPercentage); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsRecoveryPercentageAttr = { + .attr = { .name = "recovery_percentage", .mode = 0444, }, + .show = poolStatsRecoveryPercentageShow, +}; + +/**********************************************************************/ +/** Number of compressed data items written since startup */ +static ssize_t poolStatsPackerCompressedFragmentsWrittenShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.packer.compressedFragmentsWritten); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsPackerCompressedFragmentsWrittenAttr = { + .attr = { .name = "packer_compressed_fragments_written", .mode = 0444, }, + .show = poolStatsPackerCompressedFragmentsWrittenShow, +}; + +/**********************************************************************/ +/** Number of blocks containing compressed items written since startup */ +static ssize_t poolStatsPackerCompressedBlocksWrittenShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.packer.compressedBlocksWritten); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsPackerCompressedBlocksWrittenAttr = { + .attr = { .name = "packer_compressed_blocks_written", .mode = 0444, }, + .show = poolStatsPackerCompressedBlocksWrittenShow, +}; + +/**********************************************************************/ +/** Number of VIOs that are pending in the packer */ +static ssize_t poolStatsPackerCompressedFragmentsInPackerShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.packer.compressedFragmentsInPacker); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute 
poolStatsPackerCompressedFragmentsInPackerAttr = { + .attr = { .name = "packer_compressed_fragments_in_packer", .mode = 0444, }, + .show = poolStatsPackerCompressedFragmentsInPackerShow, +}; + +/**********************************************************************/ +/** The total number of slabs from which blocks may be allocated */ +static ssize_t poolStatsAllocatorSlabCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.allocator.slabCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsAllocatorSlabCountAttr = { + .attr = { .name = "allocator_slab_count", .mode = 0444, }, + .show = poolStatsAllocatorSlabCountShow, +}; + +/**********************************************************************/ +/** The total number of slabs from which blocks have ever been allocated */ +static ssize_t poolStatsAllocatorSlabsOpenedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.allocator.slabsOpened); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsAllocatorSlabsOpenedAttr = { + .attr = { .name = "allocator_slabs_opened", .mode = 0444, }, + .show = poolStatsAllocatorSlabsOpenedShow, +}; + +/**********************************************************************/ +/** The number of times since loading that a slab has been re-opened */ +static ssize_t poolStatsAllocatorSlabsReopenedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.allocator.slabsReopened); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsAllocatorSlabsReopenedAttr = { + .attr = { .name = "allocator_slabs_reopened", .mode = 0444, }, + .show = poolStatsAllocatorSlabsReopenedShow, +}; + +/**********************************************************************/ +/** Number of times the on-disk journal was full */ +static ssize_t poolStatsJournalDiskFullShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.diskFull); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsJournalDiskFullAttr = { + .attr = { .name = "journal_disk_full", .mode = 0444, }, + .show = poolStatsJournalDiskFullShow, +}; + +/**********************************************************************/ +/** Number of times the recovery journal requested slab journal commits. 
*/ +static ssize_t poolStatsJournalSlabJournalCommitsRequestedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.slabJournalCommitsRequested); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsJournalSlabJournalCommitsRequestedAttr = { + .attr = { .name = "journal_slab_journal_commits_requested", .mode = 0444, }, + .show = poolStatsJournalSlabJournalCommitsRequestedShow, +}; + +/**********************************************************************/ +/** The total number of items on which processing has started */ +static ssize_t poolStatsJournalEntriesStartedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.entries.started); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsJournalEntriesStartedAttr = { + .attr = { .name = "journal_entries_started", .mode = 0444, }, + .show = poolStatsJournalEntriesStartedShow, +}; + +/**********************************************************************/ +/** The total number of items for which a write operation has been issued */ +static ssize_t poolStatsJournalEntriesWrittenShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.entries.written); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsJournalEntriesWrittenAttr = { + .attr = { .name = "journal_entries_written", .mode = 0444, }, + .show = poolStatsJournalEntriesWrittenShow, +}; + +/**********************************************************************/ +/** The total number of items for which a write operation has completed */ +static ssize_t poolStatsJournalEntriesCommittedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.entries.committed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsJournalEntriesCommittedAttr = { + .attr = { .name = "journal_entries_committed", .mode = 0444, }, + .show = poolStatsJournalEntriesCommittedShow, +}; + +/**********************************************************************/ +/** The total number of items on which processing has started */ +static ssize_t poolStatsJournalBlocksStartedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.blocks.started); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsJournalBlocksStartedAttr = { + .attr = { .name = "journal_blocks_started", .mode = 0444, }, + .show = poolStatsJournalBlocksStartedShow, +}; + +/**********************************************************************/ +/** The total number of items for which a write operation has been issued */ +static ssize_t poolStatsJournalBlocksWrittenShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + 
getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.blocks.written); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsJournalBlocksWrittenAttr = { + .attr = { .name = "journal_blocks_written", .mode = 0444, }, + .show = poolStatsJournalBlocksWrittenShow, +}; + +/**********************************************************************/ +/** The total number of items for which a write operation has completed */ +static ssize_t poolStatsJournalBlocksCommittedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.blocks.committed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsJournalBlocksCommittedAttr = { + .attr = { .name = "journal_blocks_committed", .mode = 0444, }, + .show = poolStatsJournalBlocksCommittedShow, +}; + +/**********************************************************************/ +/** Number of times the on-disk journal was full */ +static ssize_t poolStatsSlabJournalDiskFullCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.slabJournal.diskFullCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsSlabJournalDiskFullCountAttr = { + .attr = { .name = "slab_journal_disk_full_count", .mode = 0444, }, + .show = poolStatsSlabJournalDiskFullCountShow, +}; + +/**********************************************************************/ +/** Number of times an entry was added over the flush threshold */ +static ssize_t poolStatsSlabJournalFlushCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.slabJournal.flushCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsSlabJournalFlushCountAttr = { + .attr = { .name = "slab_journal_flush_count", .mode = 0444, }, + .show = poolStatsSlabJournalFlushCountShow, +}; + +/**********************************************************************/ +/** Number of times an entry was added over the block threshold */ +static ssize_t poolStatsSlabJournalBlockedCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.slabJournal.blockedCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsSlabJournalBlockedCountAttr = { + .attr = { .name = "slab_journal_blocked_count", .mode = 0444, }, + .show = poolStatsSlabJournalBlockedCountShow, +}; + +/**********************************************************************/ +/** Number of times a tail block was written */ +static ssize_t poolStatsSlabJournalBlocksWrittenShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.slabJournal.blocksWritten); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute 
poolStatsSlabJournalBlocksWrittenAttr = { + .attr = { .name = "slab_journal_blocks_written", .mode = 0444, }, + .show = poolStatsSlabJournalBlocksWrittenShow, +}; + +/**********************************************************************/ +/** Number of times we had to wait for the tail to write */ +static ssize_t poolStatsSlabJournalTailBusyCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.slabJournal.tailBusyCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsSlabJournalTailBusyCountAttr = { + .attr = { .name = "slab_journal_tail_busy_count", .mode = 0444, }, + .show = poolStatsSlabJournalTailBusyCountShow, +}; + +/**********************************************************************/ +/** Number of blocks written */ +static ssize_t poolStatsSlabSummaryBlocksWrittenShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.slabSummary.blocksWritten); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsSlabSummaryBlocksWrittenAttr = { + .attr = { .name = "slab_summary_blocks_written", .mode = 0444, }, + .show = poolStatsSlabSummaryBlocksWrittenShow, +}; + +/**********************************************************************/ +/** Number of reference blocks written */ +static ssize_t poolStatsRefCountsBlocksWrittenShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.refCounts.blocksWritten); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsRefCountsBlocksWrittenAttr = { + .attr = { .name = "ref_counts_blocks_written", .mode = 0444, }, + .show = poolStatsRefCountsBlocksWrittenShow, +}; + +/**********************************************************************/ +/** number of dirty (resident) pages */ +static ssize_t poolStatsBlockMapDirtyPagesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.dirtyPages); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapDirtyPagesAttr = { + .attr = { .name = "block_map_dirty_pages", .mode = 0444, }, + .show = poolStatsBlockMapDirtyPagesShow, +}; + +/**********************************************************************/ +/** number of clean (resident) pages */ +static ssize_t poolStatsBlockMapCleanPagesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.cleanPages); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapCleanPagesAttr = { + .attr = { .name = "block_map_clean_pages", .mode = 0444, }, + .show = poolStatsBlockMapCleanPagesShow, +}; + +/**********************************************************************/ +/** number of free pages */ +static ssize_t poolStatsBlockMapFreePagesShow(KernelLayer *layer, char *buf) 
+{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.freePages); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapFreePagesAttr = { + .attr = { .name = "block_map_free_pages", .mode = 0444, }, + .show = poolStatsBlockMapFreePagesShow, +}; + +/**********************************************************************/ +/** number of pages in failed state */ +static ssize_t poolStatsBlockMapFailedPagesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.failedPages); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapFailedPagesAttr = { + .attr = { .name = "block_map_failed_pages", .mode = 0444, }, + .show = poolStatsBlockMapFailedPagesShow, +}; + +/**********************************************************************/ +/** number of pages incoming */ +static ssize_t poolStatsBlockMapIncomingPagesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.incomingPages); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapIncomingPagesAttr = { + .attr = { .name = "block_map_incoming_pages", .mode = 0444, }, + .show = poolStatsBlockMapIncomingPagesShow, +}; + +/**********************************************************************/ +/** number of pages outgoing */ +static ssize_t poolStatsBlockMapOutgoingPagesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.outgoingPages); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapOutgoingPagesAttr = { + .attr = { .name = "block_map_outgoing_pages", .mode = 0444, }, + .show = poolStatsBlockMapOutgoingPagesShow, +}; + +/**********************************************************************/ +/** how many times free page not avail */ +static ssize_t poolStatsBlockMapCachePressureShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.cachePressure); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapCachePressureAttr = { + .attr = { .name = "block_map_cache_pressure", .mode = 0444, }, + .show = poolStatsBlockMapCachePressureShow, +}; + +/**********************************************************************/ +/** number of getVDOPageAsync() for read */ +static ssize_t poolStatsBlockMapReadCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.readCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapReadCountAttr = { + .attr = { .name = "block_map_read_count", .mode = 0444, }, + .show = 
poolStatsBlockMapReadCountShow, +}; + +/**********************************************************************/ +/** number or getVDOPageAsync() for write */ +static ssize_t poolStatsBlockMapWriteCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.writeCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapWriteCountAttr = { + .attr = { .name = "block_map_write_count", .mode = 0444, }, + .show = poolStatsBlockMapWriteCountShow, +}; + +/**********************************************************************/ +/** number of times pages failed to read */ +static ssize_t poolStatsBlockMapFailedReadsShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.failedReads); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapFailedReadsAttr = { + .attr = { .name = "block_map_failed_reads", .mode = 0444, }, + .show = poolStatsBlockMapFailedReadsShow, +}; + +/**********************************************************************/ +/** number of times pages failed to write */ +static ssize_t poolStatsBlockMapFailedWritesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.failedWrites); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapFailedWritesAttr = { + .attr = { .name = "block_map_failed_writes", .mode = 0444, }, + .show = poolStatsBlockMapFailedWritesShow, +}; + +/**********************************************************************/ +/** number of gets that are reclaimed */ +static ssize_t poolStatsBlockMapReclaimedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.reclaimed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapReclaimedAttr = { + .attr = { .name = "block_map_reclaimed", .mode = 0444, }, + .show = poolStatsBlockMapReclaimedShow, +}; + +/**********************************************************************/ +/** number of gets for outgoing pages */ +static ssize_t poolStatsBlockMapReadOutgoingShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.readOutgoing); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapReadOutgoingAttr = { + .attr = { .name = "block_map_read_outgoing", .mode = 0444, }, + .show = poolStatsBlockMapReadOutgoingShow, +}; + +/**********************************************************************/ +/** number of gets that were already there */ +static ssize_t poolStatsBlockMapFoundInCacheShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", 
layer->vdoStatsStorage.blockMap.foundInCache); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapFoundInCacheAttr = { + .attr = { .name = "block_map_found_in_cache", .mode = 0444, }, + .show = poolStatsBlockMapFoundInCacheShow, +}; + +/**********************************************************************/ +/** number of gets requiring discard */ +static ssize_t poolStatsBlockMapDiscardRequiredShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.discardRequired); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapDiscardRequiredAttr = { + .attr = { .name = "block_map_discard_required", .mode = 0444, }, + .show = poolStatsBlockMapDiscardRequiredShow, +}; + +/**********************************************************************/ +/** number of gets enqueued for their page */ +static ssize_t poolStatsBlockMapWaitForPageShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.waitForPage); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapWaitForPageAttr = { + .attr = { .name = "block_map_wait_for_page", .mode = 0444, }, + .show = poolStatsBlockMapWaitForPageShow, +}; + +/**********************************************************************/ +/** number of gets that have to fetch */ +static ssize_t poolStatsBlockMapFetchRequiredShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.fetchRequired); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapFetchRequiredAttr = { + .attr = { .name = "block_map_fetch_required", .mode = 0444, }, + .show = poolStatsBlockMapFetchRequiredShow, +}; + +/**********************************************************************/ +/** number of page fetches */ +static ssize_t poolStatsBlockMapPagesLoadedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.pagesLoaded); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapPagesLoadedAttr = { + .attr = { .name = "block_map_pages_loaded", .mode = 0444, }, + .show = poolStatsBlockMapPagesLoadedShow, +}; + +/**********************************************************************/ +/** number of page saves */ +static ssize_t poolStatsBlockMapPagesSavedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.pagesSaved); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapPagesSavedAttr = { + .attr = { .name = "block_map_pages_saved", .mode = 0444, }, + .show = poolStatsBlockMapPagesSavedShow, +}; + +/**********************************************************************/ +/** the number of flushes issued */ +static ssize_t 
poolStatsBlockMapFlushCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.flushCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBlockMapFlushCountAttr = { + .attr = { .name = "block_map_flush_count", .mode = 0444, }, + .show = poolStatsBlockMapFlushCountShow, +}; + +/**********************************************************************/ +/** Number of times the UDS advice proved correct */ +static ssize_t poolStatsHashLockDedupeAdviceValidShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.hashLock.dedupeAdviceValid); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsHashLockDedupeAdviceValidAttr = { + .attr = { .name = "hash_lock_dedupe_advice_valid", .mode = 0444, }, + .show = poolStatsHashLockDedupeAdviceValidShow, +}; + +/**********************************************************************/ +/** Number of times the UDS advice proved incorrect */ +static ssize_t poolStatsHashLockDedupeAdviceStaleShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.hashLock.dedupeAdviceStale); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsHashLockDedupeAdviceStaleAttr = { + .attr = { .name = "hash_lock_dedupe_advice_stale", .mode = 0444, }, + .show = poolStatsHashLockDedupeAdviceStaleShow, +}; + +/**********************************************************************/ +/** Number of writes with the same data as another in-flight write */ +static ssize_t poolStatsHashLockConcurrentDataMatchesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.hashLock.concurrentDataMatches); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsHashLockConcurrentDataMatchesAttr = { + .attr = { .name = "hash_lock_concurrent_data_matches", .mode = 0444, }, + .show = poolStatsHashLockConcurrentDataMatchesShow, +}; + +/**********************************************************************/ +/** Number of writes whose hash collided with an in-flight write */ +static ssize_t poolStatsHashLockConcurrentHashCollisionsShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.hashLock.concurrentHashCollisions); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsHashLockConcurrentHashCollisionsAttr = { + .attr = { .name = "hash_lock_concurrent_hash_collisions", .mode = 0444, }, + .show = poolStatsHashLockConcurrentHashCollisionsShow, +}; + +/**********************************************************************/ +/** number of times VDO got an invalid dedupe advice PBN from UDS */ +static ssize_t poolStatsErrorsInvalidAdvicePBNCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + 
getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.errors.invalidAdvicePBNCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsErrorsInvalidAdvicePBNCountAttr = { + .attr = { .name = "errors_invalid_advicePBNCount", .mode = 0444, }, + .show = poolStatsErrorsInvalidAdvicePBNCountShow, +}; + +/**********************************************************************/ +/** number of times a VIO completed with a VDO_NO_SPACE error */ +static ssize_t poolStatsErrorsNoSpaceErrorCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.errors.noSpaceErrorCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsErrorsNoSpaceErrorCountAttr = { + .attr = { .name = "errors_no_space_error_count", .mode = 0444, }, + .show = poolStatsErrorsNoSpaceErrorCountShow, +}; + +/**********************************************************************/ +/** number of times a VIO completed with a VDO_READ_ONLY error */ +static ssize_t poolStatsErrorsReadOnlyErrorCountShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); + retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.errors.readOnlyErrorCount); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsErrorsReadOnlyErrorCountAttr = { + .attr = { .name = "errors_read_only_error_count", .mode = 0444, }, + .show = poolStatsErrorsReadOnlyErrorCountShow, +}; + +/**********************************************************************/ +/** The VDO instance */ +static ssize_t poolStatsInstanceShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->kernelStatsStorage.instance); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsInstanceAttr = { + .attr = { .name = "instance", .mode = 0444, }, + .show = poolStatsInstanceShow, +}; + +/**********************************************************************/ +/** Current number of active VIOs */ +static ssize_t poolStatsCurrentVIOsInProgressShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->kernelStatsStorage.currentVIOsInProgress); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsCurrentVIOsInProgressAttr = { + .attr = { .name = "currentVIOs_in_progress", .mode = 0444, }, + .show = poolStatsCurrentVIOsInProgressShow, +}; + +/**********************************************************************/ +/** Maximum number of active VIOs */ +static ssize_t poolStatsMaxVIOsShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->kernelStatsStorage.maxVIOs); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsMaxVIOsAttr = { + .attr = { .name = "maxVIOs", .mode = 0444, }, + .show = poolStatsMaxVIOsShow, +}; + 
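Each PoolStatsAttribute defined above only becomes visible in sysfs once its embedded struct attribute is gathered into the NULL-terminated poolStatsAttrs[] array that poolSysfs.h declares; the full array sits further down in this (truncated) file. A rough sketch of that shape, using a hypothetical name and only a few of the attributes shown above:

/*
 * Illustrative sketch, not part of the patch: the real array is named
 * poolStatsAttrs and lists every PoolStatsAttribute defined in this file.
 */
static struct attribute *poolStatsAttrsSketch[] = {
  &poolStatsDataBlocksUsedAttr.attr,
  &poolStatsLogicalBlocksUsedAttr.attr,
  &poolStatsInstanceAttr.attr,
  &poolStatsCurrentVIOsInProgressAttr.attr,
  &poolStatsMaxVIOsAttr.attr,
  NULL,
};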
+/**********************************************************************/ +/** Number of times the UDS index was too slow in responding */ +static ssize_t poolStatsDedupeAdviceTimeoutsShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.dedupeAdviceTimeouts); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsDedupeAdviceTimeoutsAttr = { + .attr = { .name = "dedupe_advice_timeouts", .mode = 0444, }, + .show = poolStatsDedupeAdviceTimeoutsShow, +}; + +/**********************************************************************/ +/** Number of flush requests submitted to the storage device */ +static ssize_t poolStatsFlushOutShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.flushOut); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsFlushOutAttr = { + .attr = { .name = "flush_out", .mode = 0444, }, + .show = poolStatsFlushOutShow, +}; + +/**********************************************************************/ +/** Logical block size */ +static ssize_t poolStatsLogicalBlockSizeShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.logicalBlockSize); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsLogicalBlockSizeAttr = { + .attr = { .name = "logical_block_size", .mode = 0444, }, + .show = poolStatsLogicalBlockSizeShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosInReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosIn.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInReadAttr = { + .attr = { .name = "bios_in_read", .mode = 0444, }, + .show = poolStatsBiosInReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosInWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosIn.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInWriteAttr = { + .attr = { .name = "bios_in_write", .mode = 0444, }, + .show = poolStatsBiosInWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosInDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosIn.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInDiscardAttr = { + .attr = { .name = "bios_in_discard", .mode = 0444, }, + .show = poolStatsBiosInDiscardShow, +}; + 
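Once the statistics kobject is registered, each of the counters above is an ordinary read-only sysfs file containing a single number. A minimal user-space reader is sketched below; the /sys path and device name are assumptions for illustration, since the actual location depends on how the kernel layer kobject is registered.

#include <stdio.h>

int main(void)
{
  /* Hypothetical path; substitute the real VDO device name. */
  const char *path = "/sys/kvdo/vdo0/statistics/bios_in_read";
  unsigned long long biosInRead;
  FILE *file = fopen(path, "r");

  if (file == NULL) {
    perror(path);
    return 1;
  }
  if (fscanf(file, "%llu", &biosInRead) != 1) {
    fclose(file);
    return 1;
  }
  fclose(file);
  printf("bios_in_read = %llu\n", biosInRead);
  return 0;
}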
+/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosInFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosIn.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInFlushAttr = { + .attr = { .name = "bios_in_flush", .mode = 0444, }, + .show = poolStatsBiosInFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosInFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosIn.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInFuaAttr = { + .attr = { .name = "bios_in_fua", .mode = 0444, }, + .show = poolStatsBiosInFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosInPartialReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInPartial.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInPartialReadAttr = { + .attr = { .name = "bios_in_partial_read", .mode = 0444, }, + .show = poolStatsBiosInPartialReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosInPartialWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInPartial.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInPartialWriteAttr = { + .attr = { .name = "bios_in_partial_write", .mode = 0444, }, + .show = poolStatsBiosInPartialWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosInPartialDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInPartial.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInPartialDiscardAttr = { + .attr = { .name = "bios_in_partial_discard", .mode = 0444, }, + .show = poolStatsBiosInPartialDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosInPartialFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInPartial.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInPartialFlushAttr = { + .attr = { .name = "bios_in_partial_flush", .mode = 0444, }, + .show = 
poolStatsBiosInPartialFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosInPartialFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInPartial.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInPartialFuaAttr = { + .attr = { .name = "bios_in_partial_fua", .mode = 0444, }, + .show = poolStatsBiosInPartialFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosOutReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOut.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutReadAttr = { + .attr = { .name = "bios_out_read", .mode = 0444, }, + .show = poolStatsBiosOutReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosOutWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOut.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutWriteAttr = { + .attr = { .name = "bios_out_write", .mode = 0444, }, + .show = poolStatsBiosOutWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosOutDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOut.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutDiscardAttr = { + .attr = { .name = "bios_out_discard", .mode = 0444, }, + .show = poolStatsBiosOutDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosOutFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOut.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutFlushAttr = { + .attr = { .name = "bios_out_flush", .mode = 0444, }, + .show = poolStatsBiosOutFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosOutFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOut.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutFuaAttr = { + .attr = { .name = "bios_out_fua", .mode = 0444, }, + .show = poolStatsBiosOutFuaShow, +}; + 
+/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosMetaReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMeta.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaReadAttr = { + .attr = { .name = "bios_meta_read", .mode = 0444, }, + .show = poolStatsBiosMetaReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosMetaWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMeta.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaWriteAttr = { + .attr = { .name = "bios_meta_write", .mode = 0444, }, + .show = poolStatsBiosMetaWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosMetaDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMeta.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaDiscardAttr = { + .attr = { .name = "bios_meta_discard", .mode = 0444, }, + .show = poolStatsBiosMetaDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosMetaFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMeta.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaFlushAttr = { + .attr = { .name = "bios_meta_flush", .mode = 0444, }, + .show = poolStatsBiosMetaFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosMetaFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMeta.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaFuaAttr = { + .attr = { .name = "bios_meta_fua", .mode = 0444, }, + .show = poolStatsBiosMetaFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosJournalReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournal.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalReadAttr = { + .attr = { .name = "bios_journal_read", .mode = 0444, }, + .show = poolStatsBiosJournalReadShow, +}; + 
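From userspace, each of these attributes reads back as a single decimal value followed by a newline. A minimal reader is sketched below, assuming a sysfs location such as /sys/kvdo/<pool>/statistics/bios_journal_read; the path is an assumption for illustration and is not defined in this patch.

/* Userspace sketch: read one statistics attribute and print it.
 * The default path below is an assumed example location. */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
  const char *path = (argc > 1)
    ? argv[1]
    : "/sys/kvdo/vdo0/statistics/bios_journal_read"; /* assumed path */
  FILE *file = fopen(path, "r");
  if (file == NULL) {
    perror(path);
    return EXIT_FAILURE;
  }
  unsigned long long value;
  int parsed = (fscanf(file, "%llu", &value) == 1);
  fclose(file);
  if (!parsed) {
    fprintf(stderr, "could not parse a counter from %s\n", path);
    return EXIT_FAILURE;
  }
  printf("%s = %llu\n", path, value);
  return EXIT_SUCCESS;
}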
+/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosJournalWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournal.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalWriteAttr = { + .attr = { .name = "bios_journal_write", .mode = 0444, }, + .show = poolStatsBiosJournalWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosJournalDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournal.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalDiscardAttr = { + .attr = { .name = "bios_journal_discard", .mode = 0444, }, + .show = poolStatsBiosJournalDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosJournalFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournal.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalFlushAttr = { + .attr = { .name = "bios_journal_flush", .mode = 0444, }, + .show = poolStatsBiosJournalFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosJournalFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournal.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalFuaAttr = { + .attr = { .name = "bios_journal_fua", .mode = 0444, }, + .show = poolStatsBiosJournalFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosPageCacheReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCache.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheReadAttr = { + .attr = { .name = "bios_page_cache_read", .mode = 0444, }, + .show = poolStatsBiosPageCacheReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosPageCacheWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCache.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheWriteAttr = { + .attr = { .name = "bios_page_cache_write", .mode = 0444, }, 
+ .show = poolStatsBiosPageCacheWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosPageCacheDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCache.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheDiscardAttr = { + .attr = { .name = "bios_page_cache_discard", .mode = 0444, }, + .show = poolStatsBiosPageCacheDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosPageCacheFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCache.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheFlushAttr = { + .attr = { .name = "bios_page_cache_flush", .mode = 0444, }, + .show = poolStatsBiosPageCacheFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosPageCacheFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCache.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheFuaAttr = { + .attr = { .name = "bios_page_cache_fua", .mode = 0444, }, + .show = poolStatsBiosPageCacheFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosOutCompletedReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOutCompleted.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutCompletedReadAttr = { + .attr = { .name = "bios_out_completed_read", .mode = 0444, }, + .show = poolStatsBiosOutCompletedReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosOutCompletedWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOutCompleted.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutCompletedWriteAttr = { + .attr = { .name = "bios_out_completed_write", .mode = 0444, }, + .show = poolStatsBiosOutCompletedWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosOutCompletedDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOutCompleted.discard); + mutex_unlock(&layer->statsMutex); + return 
retval; +} + +static PoolStatsAttribute poolStatsBiosOutCompletedDiscardAttr = { + .attr = { .name = "bios_out_completed_discard", .mode = 0444, }, + .show = poolStatsBiosOutCompletedDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosOutCompletedFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOutCompleted.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutCompletedFlushAttr = { + .attr = { .name = "bios_out_completed_flush", .mode = 0444, }, + .show = poolStatsBiosOutCompletedFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosOutCompletedFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOutCompleted.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosOutCompletedFuaAttr = { + .attr = { .name = "bios_out_completed_fua", .mode = 0444, }, + .show = poolStatsBiosOutCompletedFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosMetaCompletedReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMetaCompleted.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaCompletedReadAttr = { + .attr = { .name = "bios_meta_completed_read", .mode = 0444, }, + .show = poolStatsBiosMetaCompletedReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosMetaCompletedWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMetaCompleted.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaCompletedWriteAttr = { + .attr = { .name = "bios_meta_completed_write", .mode = 0444, }, + .show = poolStatsBiosMetaCompletedWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosMetaCompletedDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMetaCompleted.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaCompletedDiscardAttr = { + .attr = { .name = "bios_meta_completed_discard", .mode = 0444, }, + .show = poolStatsBiosMetaCompletedDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosMetaCompletedFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + 
mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMetaCompleted.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaCompletedFlushAttr = { + .attr = { .name = "bios_meta_completed_flush", .mode = 0444, }, + .show = poolStatsBiosMetaCompletedFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosMetaCompletedFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMetaCompleted.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosMetaCompletedFuaAttr = { + .attr = { .name = "bios_meta_completed_fua", .mode = 0444, }, + .show = poolStatsBiosMetaCompletedFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosJournalCompletedReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournalCompleted.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalCompletedReadAttr = { + .attr = { .name = "bios_journal_completed_read", .mode = 0444, }, + .show = poolStatsBiosJournalCompletedReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosJournalCompletedWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournalCompleted.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalCompletedWriteAttr = { + .attr = { .name = "bios_journal_completed_write", .mode = 0444, }, + .show = poolStatsBiosJournalCompletedWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosJournalCompletedDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournalCompleted.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalCompletedDiscardAttr = { + .attr = { .name = "bios_journal_completed_discard", .mode = 0444, }, + .show = poolStatsBiosJournalCompletedDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosJournalCompletedFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournalCompleted.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalCompletedFlushAttr = { + .attr = { .name = "bios_journal_completed_flush", .mode = 0444, 
}, + .show = poolStatsBiosJournalCompletedFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosJournalCompletedFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournalCompleted.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosJournalCompletedFuaAttr = { + .attr = { .name = "bios_journal_completed_fua", .mode = 0444, }, + .show = poolStatsBiosJournalCompletedFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosPageCacheCompletedReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCacheCompleted.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheCompletedReadAttr = { + .attr = { .name = "bios_page_cache_completed_read", .mode = 0444, }, + .show = poolStatsBiosPageCacheCompletedReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosPageCacheCompletedWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCacheCompleted.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheCompletedWriteAttr = { + .attr = { .name = "bios_page_cache_completed_write", .mode = 0444, }, + .show = poolStatsBiosPageCacheCompletedWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosPageCacheCompletedDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCacheCompleted.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheCompletedDiscardAttr = { + .attr = { .name = "bios_page_cache_completed_discard", .mode = 0444, }, + .show = poolStatsBiosPageCacheCompletedDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosPageCacheCompletedFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCacheCompleted.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheCompletedFlushAttr = { + .attr = { .name = "bios_page_cache_completed_flush", .mode = 0444, }, + .show = poolStatsBiosPageCacheCompletedFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosPageCacheCompletedFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + 
mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCacheCompleted.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosPageCacheCompletedFuaAttr = { + .attr = { .name = "bios_page_cache_completed_fua", .mode = 0444, }, + .show = poolStatsBiosPageCacheCompletedFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosAcknowledgedReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledged.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedReadAttr = { + .attr = { .name = "bios_acknowledged_read", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosAcknowledgedWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledged.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedWriteAttr = { + .attr = { .name = "bios_acknowledged_write", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosAcknowledgedDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledged.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedDiscardAttr = { + .attr = { .name = "bios_acknowledged_discard", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosAcknowledgedFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledged.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedFlushAttr = { + .attr = { .name = "bios_acknowledged_flush", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosAcknowledgedFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledged.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedFuaAttr = { + .attr = { .name = "bios_acknowledged_fua", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedFuaShow, +}; + 
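All of the kernel-layer counters follow one template: take statsMutex, refresh the cached KernelStatistics snapshot with getKernelStats(), format a single field, and release the lock. Adding another counter would mean one more show/attribute pair in the same shape; the sketch below uses a hypothetical field someNewCounter that does not exist in KernelStatistics.

/* Sketch of the recurring template with a hypothetical field;
 * someNewCounter is not a real KernelStatistics member. */
static ssize_t poolStatsSomeNewCounterShow(KernelLayer *layer, char *buf)
{
  ssize_t retval;
  mutex_lock(&layer->statsMutex);                    /* serialize snapshot refreshes */
  getKernelStats(layer, &layer->kernelStatsStorage); /* update the cached statistics */
  retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.someNewCounter);
  mutex_unlock(&layer->statsMutex);
  return retval;
}

static PoolStatsAttribute poolStatsSomeNewCounterAttr = {
  .attr = { .name = "some_new_counter", .mode = 0444, },
  .show = poolStatsSomeNewCounterShow,
};

The new attribute would also have to be appended to the poolStatsAttrs[] array (before its NULL terminator) to become visible.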
+/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosAcknowledgedPartialReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledgedPartial.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedPartialReadAttr = { + .attr = { .name = "bios_acknowledged_partial_read", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedPartialReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosAcknowledgedPartialWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledgedPartial.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedPartialWriteAttr = { + .attr = { .name = "bios_acknowledged_partial_write", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedPartialWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosAcknowledgedPartialDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledgedPartial.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedPartialDiscardAttr = { + .attr = { .name = "bios_acknowledged_partial_discard", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedPartialDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosAcknowledgedPartialFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledgedPartial.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedPartialFlushAttr = { + .attr = { .name = "bios_acknowledged_partial_flush", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedPartialFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosAcknowledgedPartialFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledgedPartial.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosAcknowledgedPartialFuaAttr = { + .attr = { .name = "bios_acknowledged_partial_fua", .mode = 0444, }, + .show = poolStatsBiosAcknowledgedPartialFuaShow, +}; + +/**********************************************************************/ +/** Number of not REQ_WRITE bios */ +static ssize_t poolStatsBiosInProgressReadShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + 
getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInProgress.read); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInProgressReadAttr = { + .attr = { .name = "bios_in_progress_read", .mode = 0444, }, + .show = poolStatsBiosInProgressReadShow, +}; + +/**********************************************************************/ +/** Number of REQ_WRITE bios */ +static ssize_t poolStatsBiosInProgressWriteShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInProgress.write); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInProgressWriteAttr = { + .attr = { .name = "bios_in_progress_write", .mode = 0444, }, + .show = poolStatsBiosInProgressWriteShow, +}; + +/**********************************************************************/ +/** Number of REQ_DISCARD bios */ +static ssize_t poolStatsBiosInProgressDiscardShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInProgress.discard); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInProgressDiscardAttr = { + .attr = { .name = "bios_in_progress_discard", .mode = 0444, }, + .show = poolStatsBiosInProgressDiscardShow, +}; + +/**********************************************************************/ +/** Number of REQ_FLUSH bios */ +static ssize_t poolStatsBiosInProgressFlushShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInProgress.flush); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInProgressFlushAttr = { + .attr = { .name = "bios_in_progress_flush", .mode = 0444, }, + .show = poolStatsBiosInProgressFlushShow, +}; + +/**********************************************************************/ +/** Number of REQ_FUA bios */ +static ssize_t poolStatsBiosInProgressFuaShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInProgress.fua); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsBiosInProgressFuaAttr = { + .attr = { .name = "bios_in_progress_fua", .mode = 0444, }, + .show = poolStatsBiosInProgressFuaShow, +}; + +/**********************************************************************/ +/** Tracked bytes currently allocated. 
*/ +static ssize_t poolStatsMemoryUsageBytesUsedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.memoryUsage.bytesUsed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsMemoryUsageBytesUsedAttr = { + .attr = { .name = "memory_usage_bytes_used", .mode = 0444, }, + .show = poolStatsMemoryUsageBytesUsedShow, +}; + +/**********************************************************************/ +/** Maximum tracked bytes allocated. */ +static ssize_t poolStatsMemoryUsagePeakBytesUsedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.memoryUsage.peakBytesUsed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsMemoryUsagePeakBytesUsedAttr = { + .attr = { .name = "memory_usage_peak_bytes_used", .mode = 0444, }, + .show = poolStatsMemoryUsagePeakBytesUsedShow, +}; + +/**********************************************************************/ +/** Number of chunk names stored in the index */ +static ssize_t poolStatsIndexEntriesIndexedShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.entriesIndexed); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexEntriesIndexedAttr = { + .attr = { .name = "index_entries_indexed", .mode = 0444, }, + .show = poolStatsIndexEntriesIndexedShow, +}; + +/**********************************************************************/ +/** Number of post calls that found an existing entry */ +static ssize_t poolStatsIndexPostsFoundShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.postsFound); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexPostsFoundAttr = { + .attr = { .name = "index_posts_found", .mode = 0444, }, + .show = poolStatsIndexPostsFoundShow, +}; + +/**********************************************************************/ +/** Number of post calls that added a new entry */ +static ssize_t poolStatsIndexPostsNotFoundShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.postsNotFound); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexPostsNotFoundAttr = { + .attr = { .name = "index_posts_not_found", .mode = 0444, }, + .show = poolStatsIndexPostsNotFoundShow, +}; + +/**********************************************************************/ +/** Number of query calls that found an existing entry */ +static ssize_t poolStatsIndexQueriesFoundShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.queriesFound); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexQueriesFoundAttr = { 
+ .attr = { .name = "index_queries_found", .mode = 0444, }, + .show = poolStatsIndexQueriesFoundShow, +}; + +/**********************************************************************/ +/** Number of query calls that added a new entry */ +static ssize_t poolStatsIndexQueriesNotFoundShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.queriesNotFound); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexQueriesNotFoundAttr = { + .attr = { .name = "index_queries_not_found", .mode = 0444, }, + .show = poolStatsIndexQueriesNotFoundShow, +}; + +/**********************************************************************/ +/** Number of update calls that found an existing entry */ +static ssize_t poolStatsIndexUpdatesFoundShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.updatesFound); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexUpdatesFoundAttr = { + .attr = { .name = "index_updates_found", .mode = 0444, }, + .show = poolStatsIndexUpdatesFoundShow, +}; + +/**********************************************************************/ +/** Number of update calls that added a new entry */ +static ssize_t poolStatsIndexUpdatesNotFoundShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.updatesNotFound); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexUpdatesNotFoundAttr = { + .attr = { .name = "index_updates_not_found", .mode = 0444, }, + .show = poolStatsIndexUpdatesNotFoundShow, +}; + +/**********************************************************************/ +/** Current number of dedupe queries that are in flight */ +static ssize_t poolStatsIndexCurrDedupeQueriesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->kernelStatsStorage.index.currDedupeQueries); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexCurrDedupeQueriesAttr = { + .attr = { .name = "index_curr_dedupe_queries", .mode = 0444, }, + .show = poolStatsIndexCurrDedupeQueriesShow, +}; + +/**********************************************************************/ +/** Maximum number of dedupe queries that have been in flight */ +static ssize_t poolStatsIndexMaxDedupeQueriesShow(KernelLayer *layer, char *buf) +{ + ssize_t retval; + mutex_lock(&layer->statsMutex); + getKernelStats(layer, &layer->kernelStatsStorage); + retval = sprintf(buf, "%" PRIu32 "\n", layer->kernelStatsStorage.index.maxDedupeQueries); + mutex_unlock(&layer->statsMutex); + return retval; +} + +static PoolStatsAttribute poolStatsIndexMaxDedupeQueriesAttr = { + .attr = { .name = "index_max_dedupe_queries", .mode = 0444, }, + .show = poolStatsIndexMaxDedupeQueriesShow, +}; + +struct attribute *poolStatsAttrs[] = { + &poolStatsDataBlocksUsedAttr.attr, + &poolStatsOverheadBlocksUsedAttr.attr, + &poolStatsLogicalBlocksUsedAttr.attr, + &poolStatsPhysicalBlocksAttr.attr, + 
&poolStatsLogicalBlocksAttr.attr, + &poolStatsBlockMapCacheSizeAttr.attr, + &poolStatsWritePolicyAttr.attr, + &poolStatsBlockSizeAttr.attr, + &poolStatsCompleteRecoveriesAttr.attr, + &poolStatsReadOnlyRecoveriesAttr.attr, + &poolStatsModeAttr.attr, + &poolStatsInRecoveryModeAttr.attr, + &poolStatsRecoveryPercentageAttr.attr, + &poolStatsPackerCompressedFragmentsWrittenAttr.attr, + &poolStatsPackerCompressedBlocksWrittenAttr.attr, + &poolStatsPackerCompressedFragmentsInPackerAttr.attr, + &poolStatsAllocatorSlabCountAttr.attr, + &poolStatsAllocatorSlabsOpenedAttr.attr, + &poolStatsAllocatorSlabsReopenedAttr.attr, + &poolStatsJournalDiskFullAttr.attr, + &poolStatsJournalSlabJournalCommitsRequestedAttr.attr, + &poolStatsJournalEntriesStartedAttr.attr, + &poolStatsJournalEntriesWrittenAttr.attr, + &poolStatsJournalEntriesCommittedAttr.attr, + &poolStatsJournalBlocksStartedAttr.attr, + &poolStatsJournalBlocksWrittenAttr.attr, + &poolStatsJournalBlocksCommittedAttr.attr, + &poolStatsSlabJournalDiskFullCountAttr.attr, + &poolStatsSlabJournalFlushCountAttr.attr, + &poolStatsSlabJournalBlockedCountAttr.attr, + &poolStatsSlabJournalBlocksWrittenAttr.attr, + &poolStatsSlabJournalTailBusyCountAttr.attr, + &poolStatsSlabSummaryBlocksWrittenAttr.attr, + &poolStatsRefCountsBlocksWrittenAttr.attr, + &poolStatsBlockMapDirtyPagesAttr.attr, + &poolStatsBlockMapCleanPagesAttr.attr, + &poolStatsBlockMapFreePagesAttr.attr, + &poolStatsBlockMapFailedPagesAttr.attr, + &poolStatsBlockMapIncomingPagesAttr.attr, + &poolStatsBlockMapOutgoingPagesAttr.attr, + &poolStatsBlockMapCachePressureAttr.attr, + &poolStatsBlockMapReadCountAttr.attr, + &poolStatsBlockMapWriteCountAttr.attr, + &poolStatsBlockMapFailedReadsAttr.attr, + &poolStatsBlockMapFailedWritesAttr.attr, + &poolStatsBlockMapReclaimedAttr.attr, + &poolStatsBlockMapReadOutgoingAttr.attr, + &poolStatsBlockMapFoundInCacheAttr.attr, + &poolStatsBlockMapDiscardRequiredAttr.attr, + &poolStatsBlockMapWaitForPageAttr.attr, + &poolStatsBlockMapFetchRequiredAttr.attr, + &poolStatsBlockMapPagesLoadedAttr.attr, + &poolStatsBlockMapPagesSavedAttr.attr, + &poolStatsBlockMapFlushCountAttr.attr, + &poolStatsHashLockDedupeAdviceValidAttr.attr, + &poolStatsHashLockDedupeAdviceStaleAttr.attr, + &poolStatsHashLockConcurrentDataMatchesAttr.attr, + &poolStatsHashLockConcurrentHashCollisionsAttr.attr, + &poolStatsErrorsInvalidAdvicePBNCountAttr.attr, + &poolStatsErrorsNoSpaceErrorCountAttr.attr, + &poolStatsErrorsReadOnlyErrorCountAttr.attr, + &poolStatsInstanceAttr.attr, + &poolStatsCurrentVIOsInProgressAttr.attr, + &poolStatsMaxVIOsAttr.attr, + &poolStatsDedupeAdviceTimeoutsAttr.attr, + &poolStatsFlushOutAttr.attr, + &poolStatsLogicalBlockSizeAttr.attr, + &poolStatsBiosInReadAttr.attr, + &poolStatsBiosInWriteAttr.attr, + &poolStatsBiosInDiscardAttr.attr, + &poolStatsBiosInFlushAttr.attr, + &poolStatsBiosInFuaAttr.attr, + &poolStatsBiosInPartialReadAttr.attr, + &poolStatsBiosInPartialWriteAttr.attr, + &poolStatsBiosInPartialDiscardAttr.attr, + &poolStatsBiosInPartialFlushAttr.attr, + &poolStatsBiosInPartialFuaAttr.attr, + &poolStatsBiosOutReadAttr.attr, + &poolStatsBiosOutWriteAttr.attr, + &poolStatsBiosOutDiscardAttr.attr, + &poolStatsBiosOutFlushAttr.attr, + &poolStatsBiosOutFuaAttr.attr, + &poolStatsBiosMetaReadAttr.attr, + &poolStatsBiosMetaWriteAttr.attr, + &poolStatsBiosMetaDiscardAttr.attr, + &poolStatsBiosMetaFlushAttr.attr, + &poolStatsBiosMetaFuaAttr.attr, + &poolStatsBiosJournalReadAttr.attr, + &poolStatsBiosJournalWriteAttr.attr, + &poolStatsBiosJournalDiscardAttr.attr, 
+ &poolStatsBiosJournalFlushAttr.attr, + &poolStatsBiosJournalFuaAttr.attr, + &poolStatsBiosPageCacheReadAttr.attr, + &poolStatsBiosPageCacheWriteAttr.attr, + &poolStatsBiosPageCacheDiscardAttr.attr, + &poolStatsBiosPageCacheFlushAttr.attr, + &poolStatsBiosPageCacheFuaAttr.attr, + &poolStatsBiosOutCompletedReadAttr.attr, + &poolStatsBiosOutCompletedWriteAttr.attr, + &poolStatsBiosOutCompletedDiscardAttr.attr, + &poolStatsBiosOutCompletedFlushAttr.attr, + &poolStatsBiosOutCompletedFuaAttr.attr, + &poolStatsBiosMetaCompletedReadAttr.attr, + &poolStatsBiosMetaCompletedWriteAttr.attr, + &poolStatsBiosMetaCompletedDiscardAttr.attr, + &poolStatsBiosMetaCompletedFlushAttr.attr, + &poolStatsBiosMetaCompletedFuaAttr.attr, + &poolStatsBiosJournalCompletedReadAttr.attr, + &poolStatsBiosJournalCompletedWriteAttr.attr, + &poolStatsBiosJournalCompletedDiscardAttr.attr, + &poolStatsBiosJournalCompletedFlushAttr.attr, + &poolStatsBiosJournalCompletedFuaAttr.attr, + &poolStatsBiosPageCacheCompletedReadAttr.attr, + &poolStatsBiosPageCacheCompletedWriteAttr.attr, + &poolStatsBiosPageCacheCompletedDiscardAttr.attr, + &poolStatsBiosPageCacheCompletedFlushAttr.attr, + &poolStatsBiosPageCacheCompletedFuaAttr.attr, + &poolStatsBiosAcknowledgedReadAttr.attr, + &poolStatsBiosAcknowledgedWriteAttr.attr, + &poolStatsBiosAcknowledgedDiscardAttr.attr, + &poolStatsBiosAcknowledgedFlushAttr.attr, + &poolStatsBiosAcknowledgedFuaAttr.attr, + &poolStatsBiosAcknowledgedPartialReadAttr.attr, + &poolStatsBiosAcknowledgedPartialWriteAttr.attr, + &poolStatsBiosAcknowledgedPartialDiscardAttr.attr, + &poolStatsBiosAcknowledgedPartialFlushAttr.attr, + &poolStatsBiosAcknowledgedPartialFuaAttr.attr, + &poolStatsBiosInProgressReadAttr.attr, + &poolStatsBiosInProgressWriteAttr.attr, + &poolStatsBiosInProgressDiscardAttr.attr, + &poolStatsBiosInProgressFlushAttr.attr, + &poolStatsBiosInProgressFuaAttr.attr, + &poolStatsMemoryUsageBytesUsedAttr.attr, + &poolStatsMemoryUsagePeakBytesUsedAttr.attr, + &poolStatsIndexEntriesIndexedAttr.attr, + &poolStatsIndexPostsFoundAttr.attr, + &poolStatsIndexPostsNotFoundAttr.attr, + &poolStatsIndexQueriesFoundAttr.attr, + &poolStatsIndexQueriesNotFoundAttr.attr, + &poolStatsIndexUpdatesFoundAttr.attr, + &poolStatsIndexUpdatesNotFoundAttr.attr, + &poolStatsIndexCurrDedupeQueriesAttr.attr, + &poolStatsIndexMaxDedupeQueriesAttr.attr, + NULL, +}; diff --git a/source/vdo/kernel/statusCodeBlocks.h b/source/vdo/kernel/statusCodeBlocks.h new file mode 100644 index 0000000..bca19c5 --- /dev/null +++ b/source/vdo/kernel/statusCodeBlocks.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/statusCodeBlocks.h#1 $
+ */
+
+#ifndef STATUS_CODE_BLOCKS_H
+#define STATUS_CODE_BLOCKS_H
+
+enum {
+  UDS_BLOCK_SIZE = UDS_ERROR_CODE_BLOCK_END - UDS_ERROR_CODE_BASE,
+  VDO_BLOCK_START = UDS_ERROR_CODE_BLOCK_END,
+  VDO_BLOCK_END = VDO_BLOCK_START + UDS_BLOCK_SIZE,
+  PRP_BLOCK_START = VDO_BLOCK_END,
+  PRP_BLOCK_END = PRP_BLOCK_START + UDS_BLOCK_SIZE,
+};
+
+#endif // STATUS_CODE_BLOCKS_H
diff --git a/source/vdo/kernel/statusProcfs.c b/source/vdo/kernel/statusProcfs.c
new file mode 100644
index 0000000..70e8c9b
--- /dev/null
+++ b/source/vdo/kernel/statusProcfs.c
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/statusProcfs.c#4 $
+ *
+ * Proc filesystem interface to the old GET_DEDUPE_STATS and
+ * GET_KERNEL_STATS ioctls, which can no longer be supported in 4.4
+ * and later kernels. These files return the same data as the old
+ * ioctls do, in order to require minimal changes to our (and
+ * customers') utilities and test code.
+ *
+ *   +--+----- /proc/vdo        procfsRoot
+ *      |
+ *      +-+----- vdo            config->poolName
+ *        |
+ *        +------- dedupe_stats   GET_DEDUPE_STATS ioctl
+ *        +------- kernel_stats   GET_KERNEL_STATS ioctl
+ *
+ */
+#include "statusProcfs.h"
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/version.h>
+
+#include "memoryAlloc.h"
+
+#include "releaseVersions.h"
+#include "statistics.h"
+#include "vdo.h"
+
+#include "dedupeIndex.h"
+#include "ioSubmitter.h"
+#include "kernelStatistics.h"
+#include "logger.h"
+#include "memoryUsage.h"
+#include "threadDevice.h"
+#include "vdoCommon.h"
+
+static struct proc_dir_entry *procfsRoot = NULL;
+
+/**********************************************************************/
+static int statusDedupeShow(struct seq_file *m, void *v)
+{
+  KernelLayer *layer = (KernelLayer *) m->private;
+  VDOStatistics *stats;
+  size_t len = sizeof(VDOStatistics);
+  RegisteredThread allocatingThread, instanceThread;
+  registerAllocatingThread(&allocatingThread, NULL);
+  registerThreadDevice(&instanceThread, layer);
+  int result = ALLOCATE(1, VDOStatistics, __func__, &stats);
+  if (result == VDO_SUCCESS) {
+    getKVDOStatistics(&layer->kvdo, stats);
+    seq_write(m, stats, len);
+    FREE(stats);
+  }
+  unregisterThreadDeviceID();
+  unregisterAllocatingThread();
+  return result;
+}
+
+/**********************************************************************/
+static int statusDedupeOpen(struct inode *inode, struct file *file)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
+  return single_open(file, statusDedupeShow, PDE_DATA(inode));
+#else
+  return single_open(file, statusDedupeShow, PDE(inode)->data);
+#endif
+}
+
+static const struct file_operations vdoProcfsDedupeOps = {
+  .open = statusDedupeOpen,
+  .read = seq_read,
+  .llseek = seq_lseek,
+  .release = single_release,
+};
+
+/**********************************************************************/
+static void copyBioStat(BioStats *b, const AtomicBioStats *a)
+{
+  b->read = atomic64_read(&a->read);
+  b->write = atomic64_read(&a->write);
+  b->discard = atomic64_read(&a->discard);
+  b->flush = atomic64_read(&a->flush);
+  b->fua = atomic64_read(&a->fua);
+}
+
+/**********************************************************************/
+static BioStats subtractBioStats(BioStats minuend, BioStats subtrahend)
+{
+  return (BioStats) {
+    .read = minuend.read - subtrahend.read,
+    .write = minuend.write - subtrahend.write,
+    .discard = minuend.discard - subtrahend.discard,
+    .flush = minuend.flush - subtrahend.flush,
+    .fua = minuend.fua - subtrahend.fua,
+  };
+}
+
+/**********************************************************************/
+void getKernelStats(KernelLayer *layer, KernelStatistics *stats)
+{
+  stats->version = STATISTICS_VERSION;
+  stats->releaseVersion = CURRENT_RELEASE_VERSION_NUMBER;
+  stats->instance = layer->instance;
+  getLimiterValuesAtomically(&layer->requestLimiter,
+                             &stats->currentVIOsInProgress, &stats->maxVIOs);
+  // albireoTimeoutReport gives the number of timeouts, and dedupeContextBusy
+  // gives the number of queries not made because of earlier timeouts.
+ stats->dedupeAdviceTimeouts = (getEventCount(&layer->albireoTimeoutReporter) + + atomic64_read(&layer->dedupeContextBusy)); + stats->flushOut = atomic64_read(&layer->flushOut); + stats->logicalBlockSize = layer->deviceConfig->logicalBlockSize; + copyBioStat(&stats->biosIn, &layer->biosIn); + copyBioStat(&stats->biosInPartial, &layer->biosInPartial); + copyBioStat(&stats->biosOut, &layer->biosOut); + copyBioStat(&stats->biosMeta, &layer->biosMeta); + copyBioStat(&stats->biosJournal, &layer->biosJournal); + copyBioStat(&stats->biosPageCache, &layer->biosPageCache); + copyBioStat(&stats->biosOutCompleted, &layer->biosOutCompleted); + copyBioStat(&stats->biosMetaCompleted, &layer->biosMetaCompleted); + copyBioStat(&stats->biosJournalCompleted, &layer->biosJournalCompleted); + copyBioStat(&stats->biosPageCacheCompleted, + &layer->biosPageCacheCompleted); + copyBioStat(&stats->biosAcknowledged, &layer->biosAcknowledged); + copyBioStat(&stats->biosAcknowledgedPartial, + &layer->biosAcknowledgedPartial); + stats->biosInProgress = subtractBioStats(stats->biosIn, + stats->biosAcknowledged); + stats->memoryUsage = getMemoryUsage(); + getIndexStatistics(layer->dedupeIndex, &stats->index); +} + +/**********************************************************************/ +static int statusKernelShow(struct seq_file *m, void *v) +{ + KernelLayer *layer = (KernelLayer *) m->private; + KernelStatistics *stats; + size_t len = sizeof(KernelStatistics); + RegisteredThread allocatingThread, instanceThread; + registerAllocatingThread(&allocatingThread, NULL); + registerThreadDevice(&instanceThread, layer); + int result = ALLOCATE(1, KernelStatistics, __func__, &stats); + if (result == VDO_SUCCESS) { + getKernelStats(layer, stats); + seq_write(m, stats, len); + FREE(stats); + } + unregisterThreadDeviceID(); + unregisterAllocatingThread(); + return result; +} + +/**********************************************************************/ +static int statusKernelOpen(struct inode *inode, struct file *file) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) + return single_open(file, statusKernelShow, PDE_DATA(inode)); +#else + return single_open(file, statusKernelShow, PDE(inode)->data); +#endif +} + +static const struct file_operations vdoProcfsKernelOps = { + .open = statusKernelOpen, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/**********************************************************************/ +int vdoInitProcfs() +{ + const char *procfsName = getProcRoot(); + procfsRoot = proc_mkdir(procfsName, NULL); + if (procfsRoot == NULL) { + logWarning("Could not create proc filesystem root %s\n", procfsName); + return -ENOMEM; + } + return VDO_SUCCESS; +} + +/**********************************************************************/ +void vdoDestroyProcfs() +{ + remove_proc_entry(getProcRoot(), NULL); + procfsRoot = NULL; +} + +/**********************************************************************/ +int vdoCreateProcfsEntry(KernelLayer *layer, const char *name, void **private) +{ + int result = VDO_SUCCESS; + + if (procfsRoot != NULL) { + struct proc_dir_entry *fsDir; + fsDir = proc_mkdir(name, procfsRoot); + if (fsDir == NULL) { + result = -ENOMEM; + } else { + if (proc_create_data(getVDOStatisticsProcFile(), 0644, fsDir, + &vdoProcfsDedupeOps, layer) == NULL) { + result = -ENOMEM; + } else if (proc_create_data(getKernelStatisticsProcFile(), 0644, fsDir, + &vdoProcfsKernelOps, layer) == NULL) { + result = -ENOMEM; + } + } + if (result < 0) { + vdoDestroyProcfsEntry(name, fsDir); + } 
else { + *private = fsDir; + } + } else { + logWarning("No proc filesystem root set, skipping %s\n", name); + } + return result; +} + +/**********************************************************************/ +void vdoDestroyProcfsEntry(const char *name, void *private) +{ + if (procfsRoot != NULL) { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) + remove_proc_subtree(name, procfsRoot); +#else + struct proc_dir_entry *fsDir = (struct proc_dir_entry *) private; + remove_proc_entry(getVDOStatisticsProcFile(), fsDir); + remove_proc_entry(getKernelStatisticsProcFile(), fsDir); + remove_proc_entry(name, procfsRoot); +#endif + } +} diff --git a/source/vdo/kernel/statusProcfs.h b/source/vdo/kernel/statusProcfs.h new file mode 100644 index 0000000..a884c8e --- /dev/null +++ b/source/vdo/kernel/statusProcfs.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/statusProcfs.h#1 $ + * + */ + +#ifndef STATUS_PROC_H +#define STATUS_PROC_H + +#include +#include +#include "kernelLayer.h" + +/** + * Initializes the /proc/vdo directory. Should be called once when the + * module is loaded. + * + * @return 0 on success, nonzero on failure + */ +int vdoInitProcfs(void); + +/** + * Destroys the /proc/vdo directory. Should be called once when the + * module is unloaded. + */ +void vdoDestroyProcfs(void); + +/** + * Creates a subdirectory in the /proc/vdo filesystem for a particular + * vdo. + * + * @param layer the kernel layer + * @param name the subdirectory name + * @param private pointer to private storage for procfs data + * + * @return 0 on success, nonzero on failure + */ +int vdoCreateProcfsEntry(KernelLayer *layer, const char *name, void **private); + +/** + * Destroys a subdirectory in the /proc/vdo filesystem for a + * particular vdo. + * + * @param name the subdirectory name + * @param private private storage for procfs data + */ +void vdoDestroyProcfsEntry(const char *name, void *private); + +/** + * Retrieves the current kernel statistics. + * + * @param layer the kernel layer + * @param stats pointer to the structure to fill in + */ +void getKernelStats(KernelLayer *layer, KernelStatistics *stats); + +#endif /* STATUS_PROC_H */ diff --git a/source/vdo/kernel/sysfs.c b/source/vdo/kernel/sysfs.c new file mode 100644 index 0000000..9244bf1 --- /dev/null +++ b/source/vdo/kernel/sysfs.c @@ -0,0 +1,343 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/sysfs.c#5 $ + */ + +#include "sysfs.h" + +#include +#include + +#include "dedupeIndex.h" +#include "dmvdo.h" +#include "logger.h" + +extern int defaultMaxRequestsActive; + +typedef struct vdoAttribute { + struct attribute attr; + ssize_t (*show)(struct kvdoDevice *d, struct attribute *attr, char *buf); + ssize_t (*store)(struct kvdoDevice *d, const char *value, size_t count); + // Location of value, if .show == showInt or showUInt or showBool. + void *valuePtr; +} VDOAttribute; + +static char *statusStrings[] = { + "UNINITIALIZED", + "READY", + "SHUTTING DOWN", +}; + +/**********************************************************************/ +static ssize_t vdoStatusShow(struct kvdoDevice *device, + struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", statusStrings[device->status]); +} + +/**********************************************************************/ +static ssize_t vdoLogLevelShow(struct kvdoDevice *device, + struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", priorityToString(getLogLevel())); +} + +/**********************************************************************/ +static ssize_t vdoLogLevelStore(struct kvdoDevice *device, + const char *buf, size_t n) +{ + static char internalBuf[11]; + + if (n > 10) { + return -EINVAL; + } + + memset(internalBuf, '\000', sizeof(internalBuf)); + memcpy(internalBuf, buf, n); + if (internalBuf[n - 1] == '\n') { + internalBuf[n - 1] = '\000'; + } + setLogLevel(stringToPriority(internalBuf)); + return n; +} + +/**********************************************************************/ +static ssize_t scanInt(const char *buf, + size_t n, + int *valuePtr, + int minimum, + int maximum) +{ + if (n > 12) { + return -EINVAL; + } + unsigned int value; + if (sscanf(buf, "%d", &value) != 1) { + return -EINVAL; + } + if (value < minimum) { + value = minimum; + } else if (value > maximum) { + value = maximum; + } + *valuePtr = value; + return n; +} + +/**********************************************************************/ +static ssize_t showInt(struct kvdoDevice *device, + struct attribute *attr, + char *buf) +{ + VDOAttribute *vdoAttr = container_of(attr, VDOAttribute, attr); + + return sprintf(buf, "%d\n", *(int *)vdoAttr->valuePtr); +} + +/**********************************************************************/ +static ssize_t scanUInt(const char *buf, + size_t n, + unsigned int *valuePtr, + unsigned int minimum, + unsigned int maximum) +{ + if (n > 12) { + return -EINVAL; + } + unsigned int value; + if (sscanf(buf, "%u", &value) != 1) { + return -EINVAL; + } + if (value < minimum) { + value = minimum; + } else if (value > maximum) { + value = maximum; + } + *valuePtr = value; + return n; +} + +/**********************************************************************/ +static ssize_t showUInt(struct kvdoDevice *device, + struct attribute *attr, + char *buf) +{ + VDOAttribute *vdoAttr = container_of(attr, VDOAttribute, attr); + + return sprintf(buf, "%u\n", *(unsigned int *)vdoAttr->valuePtr); 
+} + +/**********************************************************************/ +static ssize_t scanBool(const char *buf, size_t n, bool *valuePtr) +{ + unsigned int intValue = 0; + n = scanUInt(buf, n, &intValue, 0, 1); + if (n > 0) { + *valuePtr = (intValue != 0); + } + return n; +} + +/**********************************************************************/ +static ssize_t showBool(struct kvdoDevice *device, + struct attribute *attr, + char *buf) +{ + VDOAttribute *vdoAttr = container_of(attr, VDOAttribute, attr); + + return sprintf(buf, "%u\n", *(bool *)vdoAttr->valuePtr ? 1 : 0); +} + +/**********************************************************************/ +static ssize_t vdoTraceRecordingStore(struct kvdoDevice *device, + const char *buf, + size_t n) +{ + return scanBool(buf, n, &traceRecording); +} + +/**********************************************************************/ +static ssize_t vdoMaxReqActiveStore(struct kvdoDevice *device, + const char *buf, + size_t n) +{ + /* + * The base code has some hardcoded assumptions about the maximum + * number of requests that can be in progress. Maybe someday we'll + * do calculations with the actual number; for now, just make sure + * the assumption holds. + */ + return scanInt(buf, n, &defaultMaxRequestsActive, 1, MAXIMUM_USER_VIOS); +} + +/**********************************************************************/ +static ssize_t vdoAlbireoTimeoutIntervalStore(struct kvdoDevice *device, + const char *buf, + size_t n) +{ + unsigned int value; + ssize_t result = scanUInt(buf, n, &value, 0, UINT_MAX); + if (result > 0) { + setAlbireoTimeoutInterval(value); + } + return result; +} + +/**********************************************************************/ +static ssize_t vdoMinAlbireoTimerIntervalStore(struct kvdoDevice *device, + const char *buf, + size_t n) +{ + unsigned int value; + ssize_t result = scanUInt(buf, n, &value, 0, UINT_MAX); + if (result > 0) { + setMinAlbireoTimerInterval(value); + } + return result; +} + +/**********************************************************************/ +static ssize_t vdoVersionShow(struct kvdoDevice *device, + struct attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", CURRENT_VERSION); +} + +/**********************************************************************/ +static ssize_t vdoAttrShow(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + VDOAttribute *vdoAttr = container_of(attr, VDOAttribute, attr); + if (vdoAttr->show == NULL) { + return -EINVAL; + } + + struct kvdoDevice *device = container_of(kobj, struct kvdoDevice, kobj); + return (*vdoAttr->show)(device, attr, buf); +} + +/**********************************************************************/ +static ssize_t vdoAttrStore(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t length) +{ + VDOAttribute *vdoAttr = container_of(attr, VDOAttribute, attr); + if (vdoAttr->store == NULL) { + return -EINVAL; + } + + struct kvdoDevice *device = container_of(kobj, struct kvdoDevice, kobj); + return (*vdoAttr->store)(device, buf, length); +} + +static VDOAttribute vdoStatusAttr = { + .attr = { .name = "status", .mode = 0444, }, + .show = vdoStatusShow, +}; + +static VDOAttribute vdoLogLevelAttr = { + .attr = {.name = "log_level", .mode = 0644, }, + .show = vdoLogLevelShow, + .store = vdoLogLevelStore, +}; + +static VDOAttribute vdoMaxReqActiveAttr = { + .attr = {.name = "max_requests_active", .mode = 0644, }, + .show = showInt, + .store = vdoMaxReqActiveStore, + .valuePtr = &defaultMaxRequestsActive, +}; + 
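+/*
+ * Each of these VDOAttribute definitions becomes a file under the kobject
+ * registered by vdoInitSysfs below (named after the module), so the values
+ * can be inspected and tuned from user space. A minimal sketch (the sysfs
+ * path depends on the module name):
+ *
+ *   FILE *f = fopen("/sys/<module name>/max_requests_active", "w");
+ *   if (f != NULL) {
+ *     fprintf(f, "%d\n", 2000);
+ *     fclose(f);
+ *   }
+ *
+ * The write is routed through vdoAttrStore to vdoMaxReqActiveStore, which
+ * clamps the value to the range [1, MAXIMUM_USER_VIOS].
+ */
+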
+static VDOAttribute vdoAlbireoTimeoutInterval = { + .attr = {.name = "deduplication_timeout_interval", .mode = 0644, }, + .show = showUInt, + .store = vdoAlbireoTimeoutIntervalStore, + .valuePtr = &albireoTimeoutInterval, +}; + +static VDOAttribute vdoMinAlbireoTimerInterval = { + .attr = {.name = "min_deduplication_timer_interval", .mode = 0644, }, + .show = showUInt, + .store = vdoMinAlbireoTimerIntervalStore, + .valuePtr = &minAlbireoTimerInterval, +}; + +static VDOAttribute vdoTraceRecording = { + .attr = {.name = "trace_recording", .mode = 0644, }, + .show = showBool, + .store = vdoTraceRecordingStore, + .valuePtr = &traceRecording, +}; + +static VDOAttribute vdoVersionAttr = { + .attr = { .name = "version", .mode = 0444, }, + .show = vdoVersionShow, +}; + +static struct attribute *defaultAttrs[] = { + &vdoStatusAttr.attr, + &vdoLogLevelAttr.attr, + &vdoMaxReqActiveAttr.attr, + &vdoAlbireoTimeoutInterval.attr, + &vdoMinAlbireoTimerInterval.attr, + &vdoTraceRecording.attr, + &vdoVersionAttr.attr, + NULL +}; + +static struct sysfs_ops vdoSysfsOps = { + .show = vdoAttrShow, + .store = vdoAttrStore, +}; + +/**********************************************************************/ +static void vdoRelease(struct kobject *kobj) +{ + return; +} + +struct kobj_type vdo_ktype = { + .release = vdoRelease, + .sysfs_ops = &vdoSysfsOps, + .default_attrs = defaultAttrs, +}; + +/**********************************************************************/ +int vdoInitSysfs(struct kobject *deviceObject) +{ + kobject_init(deviceObject, &vdo_ktype); + int result = kobject_add(deviceObject, NULL, THIS_MODULE->name); + if (result < 0) { + logError("kobject_add failed with status %d", -result); + kobject_put(deviceObject); + } + logDebug("added sysfs objects"); + return result; +}; + +/**********************************************************************/ +void vdoPutSysfs(struct kobject *deviceObject) +{ + kobject_put(deviceObject); +} diff --git a/source/vdo/kernel/sysfs.h b/source/vdo/kernel/sysfs.h new file mode 100644 index 0000000..3dbac04 --- /dev/null +++ b/source/vdo/kernel/sysfs.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/sysfs.h#2 $ + */ + +#ifndef ALBIREO_SYSFS_H +#define ALBIREO_SYSFS_H + +#include "kernelLayer.h" + +struct kvdoDevice; + +/** +* Initializes the sysfs objects global to all vdo devices. +* +* @param deviceObject the kobject of the kvdoDevice to initialize. +*/ +int vdoInitSysfs(struct kobject *deviceObject); + +/** + * Releases the global sysfs objects. + * + * @param deviceObject the kobject of the kvdoDevice to release. 
+ */ +void vdoPutSysfs(struct kobject *deviceObject); + +#endif /* ALBIREO_SYSFS_H */ diff --git a/source/vdo/kernel/threadDevice.c b/source/vdo/kernel/threadDevice.c new file mode 100644 index 0000000..49fb909 --- /dev/null +++ b/source/vdo/kernel/threadDevice.c @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/threadDevice.c#1 $ + */ + +#include "threadDevice.h" + +#include "threadRegistry.h" + +/* + * A registry of all threads temporarily associated with particular + * VDO devices. + */ +static ThreadRegistry deviceIDThreadRegistry; + +/**********************************************************************/ +void registerThreadDeviceID(RegisteredThread *newThread, unsigned int *idPtr) +{ + registerThread(&deviceIDThreadRegistry, newThread, idPtr); +} + +/**********************************************************************/ +void unregisterThreadDeviceID(void) +{ + unregisterThread(&deviceIDThreadRegistry); +} + +/**********************************************************************/ +int getThreadDeviceID(void) +{ + const unsigned int *pointer = lookupThread(&deviceIDThreadRegistry); + return pointer ? *pointer : -1; +} + +/**********************************************************************/ +void initializeThreadDeviceRegistry(void) +{ + initializeThreadRegistry(&deviceIDThreadRegistry); +} diff --git a/source/vdo/kernel/threadDevice.h b/source/vdo/kernel/threadDevice.h new file mode 100644 index 0000000..61b4ce6 --- /dev/null +++ b/source/vdo/kernel/threadDevice.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/threadDevice.h#1 $ + */ + +#include "kernelLayer.h" + +/** + * Temporarily register the current thread as being associated with a + * VDO device id number, for logging purposes. + * + * Any such registered thread must later be unregistered via + * unregisterThreadDeviceID. + * + * The pointed-to ID number should be nonzero. 
+ * + * @param newThread RegisteredThread structure to use for the current thread + * @param idPtr Location where the ID number is stored + **/ +void registerThreadDeviceID(RegisteredThread *newThread, unsigned int *idPtr); + +/** + * Temporarily register the current thread as being associated with an + * existing VDO device, for logging purposes. + * + * Any such registered thread must later be unregistered via + * unregisterThreadDeviceID. + * + * @param newThread RegisteredThread structure to use for the current thread + * @param layer The KernelLayer object for the VDO device + **/ +static inline void registerThreadDevice(RegisteredThread *newThread, + KernelLayer *layer) +{ + registerThreadDeviceID(newThread, &layer->instance); +} + +/** + * Cancel registration of the current thread as being associated with + * a VDO device or device ID number. + **/ +void unregisterThreadDeviceID(void); + +/** + * Get the VDO device ID number temporarily associated with the + * current thread, if any. + * + * @return the device ID number, if any, or -1 + **/ +int getThreadDeviceID(void); + +/** + * Initialize the thread device-ID registry. + **/ +void initializeThreadDeviceRegistry(void); diff --git a/source/vdo/kernel/threadRegistry.c b/source/vdo/kernel/threadRegistry.c new file mode 100644 index 0000000..6184d3c --- /dev/null +++ b/source/vdo/kernel/threadRegistry.c @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/threadRegistry.c#1 $ + */ + +#include "threadRegistry.h" + +#include +#include + +#include "permassert.h" + +/* + * We need to be careful when using other facilities that may use + * threadRegistry functions in their normal operation. For example, + * we do not want to invoke the logger while holding a lock. + */ + +/*****************************************************************************/ +void registerThread(ThreadRegistry *registry, + RegisteredThread *newThread, + const void *pointer) +{ + INIT_LIST_HEAD(&newThread->links); + newThread->pointer = pointer; + newThread->task = current; + + bool foundIt = false; + RegisteredThread *thread; + write_lock(®istry->lock); + list_for_each_entry(thread, ®istry->links, links) { + if (thread->task == current) { + // This should not have been there. + // We'll complain after releasing the lock. 
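+      // (Complaining here would mean logging while holding registry->lock,
+      // which the note at the top of this file warns against.)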
+ list_del_init(&thread->links); + foundIt = true; + break; + } + } + list_add_tail(&newThread->links, ®istry->links); + write_unlock(®istry->lock); + ASSERT_LOG_ONLY(!foundIt, "new thread not already in registry"); +} + +/*****************************************************************************/ +void unregisterThread(ThreadRegistry *registry) +{ + bool foundIt = false; + RegisteredThread *thread; + write_lock(®istry->lock); + list_for_each_entry(thread, ®istry->links, links) { + if (thread->task == current) { + list_del_init(&thread->links); + foundIt = true; + break; + } + } + write_unlock(®istry->lock); + ASSERT_LOG_ONLY(foundIt, "thread found in registry"); +} + +/*****************************************************************************/ +void initializeThreadRegistry(ThreadRegistry *registry) +{ + INIT_LIST_HEAD(®istry->links); + rwlock_init(®istry->lock); +} + +/*****************************************************************************/ +const void *lookupThread(ThreadRegistry *registry) +{ + const void *result = NULL; + read_lock(®istry->lock); + RegisteredThread *thread; + list_for_each_entry(thread, ®istry->links, links) { + if (thread->task == current) { + result = thread->pointer; + break; + } + } + read_unlock(®istry->lock); + return result; +} diff --git a/source/vdo/kernel/threadRegistry.h b/source/vdo/kernel/threadRegistry.h new file mode 100644 index 0000000..f32325e --- /dev/null +++ b/source/vdo/kernel/threadRegistry.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/threadRegistry.h#1 $ + */ + +#ifndef THREAD_REGISTRY_H +#define THREAD_REGISTRY_H 1 + +#include +#include + +/* + * We don't expect this set to ever get really large, so a linked list + * is adequate. + */ + +typedef struct threadRegistry { + struct list_head links; + rwlock_t lock; +} ThreadRegistry; + +typedef struct registeredThread { + struct list_head links; + const void *pointer; + struct task_struct *task; +} RegisteredThread; + +/*****************************************************************************/ + +/** + * Initialize a registry of threads and associated data pointers. + * + * @param registry The registry to initialize + **/ +void initializeThreadRegistry(ThreadRegistry *registry); + +/** + * Register the current thread and associate it with a data pointer. + * + * This call will log messages if the thread is already registered. + * + * @param registry The thread registry + * @param newThread RegisteredThread structure to use for the current thread + * @param pointer The value to associated with the current thread + **/ +void registerThread(ThreadRegistry *registry, + RegisteredThread *newThread, + const void *pointer); + +/** + * Remove the registration for the current thread. 
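+ *
+ * Once this returns, the RegisteredThread structure that was passed to
+ * registerThread is no longer referenced by the registry and may be
+ * reclaimed (it is typically stack-allocated by the caller).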
+ * + * A message may be logged if the thread was not registered. + * + * @param registry The thread registry + **/ +void unregisterThread(ThreadRegistry *registry); + +/** + * Fetch a pointer that may have been registered for the current + * thread. If the thread is not registered, a null pointer is + * returned. + * + * @param registry The thread registry + * + * @return the registered pointer, if any, or NULL + **/ +const void *lookupThread(ThreadRegistry *registry); + +#endif /* THREAD_REGISTRY_H */ diff --git a/source/vdo/kernel/threads.c b/source/vdo/kernel/threads.c new file mode 100644 index 0000000..2f905ed --- /dev/null +++ b/source/vdo/kernel/threads.c @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/threads.c#1 $ + */ + +#include "threads.h" + +#include +#include + +/**********************************************************************/ +pid_t getThreadId(void) +{ + return in_interrupt() ? -1 : current->pid; +} diff --git a/source/vdo/kernel/threads.h b/source/vdo/kernel/threads.h new file mode 100644 index 0000000..25f8b47 --- /dev/null +++ b/source/vdo/kernel/threads.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/threads.h#1 $ + */ + +#ifndef THREADS_H +#define THREADS_H + +#include + +/** + * Return the id of the current thread. + * In kernel interrupt context, returns -1. + * + * @return the thread id + **/ +pid_t getThreadId(void) + __attribute__((warn_unused_result)); + +#endif /* THREADS_H */ diff --git a/source/vdo/kernel/udsIndex.c b/source/vdo/kernel/udsIndex.c new file mode 100644 index 0000000..a202446 --- /dev/null +++ b/source/vdo/kernel/udsIndex.c @@ -0,0 +1,835 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/udsIndex.c#16 $ + */ + +#include "udsIndex.h" + +#include "logger.h" +#include "memoryAlloc.h" +#include "murmur/MurmurHash3.h" +#include "numeric.h" +#include "stringUtils.h" +#include "uds-block.h" + +/*****************************************************************************/ + +typedef struct udsAttribute { + struct attribute attr; + const char *(*showString)(DedupeIndex *); +} UDSAttribute; + +/*****************************************************************************/ + +enum { UDS_Q_ACTION }; + +/*****************************************************************************/ + +// These are the values in the atomic dedupeContext.requestState field +enum { + // The UdsRequest object is not in use. + UR_IDLE = 0, + // The UdsRequest object is in use, and VDO is waiting for the result. + UR_BUSY = 1, + // The UdsRequest object is in use, but has timed out. + UR_TIMED_OUT = 2, +}; + +/*****************************************************************************/ + +typedef enum { + // The UDS index is closed + IS_CLOSED = 0, + // The UDS index session is opening or closing + IS_CHANGING = 1, + // The UDS index is open. There is a UDS index session. + IS_OPENED = 2, +} IndexState; + +/*****************************************************************************/ + +typedef struct udsIndex { + DedupeIndex common; + struct kobject dedupeObject; + RegisteredThread allocatingThread; + char *indexName; + UdsConfiguration configuration; + struct uds_parameters udsParams; + struct uds_index_session *indexSession; + atomic_t active; + // This spinlock protects the state fields and the starting of dedupe + // requests. 
+ spinlock_t stateLock; + KvdoWorkItem workItem; // protected by stateLock + KvdoWorkQueue *udsQueue; // protected by stateLock + unsigned int maximum; // protected by stateLock + IndexState indexState; // protected by stateLock + IndexState indexTarget; // protected by stateLock + bool changing; // protected by stateLock + bool createFlag; // protected by stateLock + bool dedupeFlag; // protected by stateLock + bool deduping; // protected by stateLock + bool errorFlag; // protected by stateLock + bool suspended; // protected by stateLock + // This spinlock protects the pending list, the pending flag in each KVIO, + // and the timeout list. + spinlock_t pendingLock; + struct list_head pendingHead; // protected by pendingLock + struct timer_list pendingTimer; // protected by pendingLock + bool startedTimer; // protected by pendingLock +} UDSIndex; + +/*****************************************************************************/ + +// Version 1: user space albireo index (limited to 32 bytes) +// Version 2: kernel space albireo index (limited to 16 bytes) +enum { + UDS_ADVICE_VERSION = 2, + // version byte + state byte + 64-bit little-endian PBN + UDS_ADVICE_SIZE = 1 + 1 + sizeof(uint64_t), +}; + +/*****************************************************************************/ + + // We want to ensure that there is only one copy of the following constants. +static const char *CLOSED = "closed"; +static const char *CLOSING = "closing"; +static const char *ERROR = "error"; +static const char *OFFLINE = "offline"; +static const char *ONLINE = "online"; +static const char *OPENING = "opening"; +static const char *SUSPENDED = "suspended"; +static const char *UNKNOWN = "unknown"; + +/*****************************************************************************/ +static const char *indexStateToString(UDSIndex *index, IndexState state) +{ + if (index->suspended) { + return SUSPENDED; + } + + switch (state) { + case IS_CLOSED: + // Closed. The errorFlag tells if it is because of an error. + return index->errorFlag ? ERROR : CLOSED; + case IS_CHANGING: + // The indexTarget tells if we are opening or closing the index. + return index->indexTarget == IS_OPENED ? OPENING : CLOSING; + case IS_OPENED: + // Opened. The dedupeFlag tells if we are online or offline. + return index->dedupeFlag ? ONLINE : OFFLINE; + default: + return UNKNOWN; + } +} + +/** + * Encode VDO duplicate advice into the newMetadata field of a UDS request. + * + * @param request The UDS request to receive the encoding + * @param advice The advice to encode + **/ +static void encodeUDSAdvice(UdsRequest *request, DataLocation advice) +{ + size_t offset = 0; + struct udsChunkData *encoding = &request->newMetadata; + encoding->data[offset++] = UDS_ADVICE_VERSION; + encoding->data[offset++] = advice.state; + encodeUInt64LE(encoding->data, &offset, advice.pbn); + BUG_ON(offset != UDS_ADVICE_SIZE); +} + +/** + * Decode VDO duplicate advice from the oldMetadata field of a UDS request. 
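+ *
+ * (The encoding mirrors encodeUDSAdvice above: a version byte, a state byte,
+ * and then the PBN in little-endian byte order.)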
+ * + * @param request The UDS request containing the encoding + * @param advice The DataLocation to receive the decoded advice + * + * @return true if valid advice was found and decoded + **/ +static bool decodeUDSAdvice(const UdsRequest *request, DataLocation *advice) +{ + if ((request->status != UDS_SUCCESS) || !request->found) { + return false; + } + + size_t offset = 0; + const struct udsChunkData *encoding = &request->oldMetadata; + byte version = encoding->data[offset++]; + if (version != UDS_ADVICE_VERSION) { + logError("invalid UDS advice version code %u", version); + return false; + } + + advice->state = encoding->data[offset++]; + decodeUInt64LE(encoding->data, &offset, &advice->pbn); + BUG_ON(offset != UDS_ADVICE_SIZE); + return true; +} + +/*****************************************************************************/ +static void finishIndexOperation(UdsRequest *udsRequest) +{ + DataKVIO *dataKVIO = container_of(udsRequest, DataKVIO, + dedupeContext.udsRequest); + DedupeContext *dedupeContext = &dataKVIO->dedupeContext; + if (compareAndSwap32(&dedupeContext->requestState, UR_BUSY, UR_IDLE)) { + KVIO *kvio = dataKVIOAsKVIO(dataKVIO); + UDSIndex *index = container_of(kvio->layer->dedupeIndex, UDSIndex, common); + + spin_lock_bh(&index->pendingLock); + if (dedupeContext->isPending) { + list_del(&dedupeContext->pendingList); + dedupeContext->isPending = false; + } + spin_unlock_bh(&index->pendingLock); + + dedupeContext->status = udsRequest->status; + if ((udsRequest->type == UDS_POST) || (udsRequest->type == UDS_QUERY)) { + DataLocation advice; + if (decodeUDSAdvice(udsRequest, &advice)) { + setDedupeAdvice(dedupeContext, &advice); + } else { + setDedupeAdvice(dedupeContext, NULL); + } + } + invokeDedupeCallback(dataKVIO); + atomic_dec(&index->active); + } else { + compareAndSwap32(&dedupeContext->requestState, UR_TIMED_OUT, UR_IDLE); + } +} + +/*****************************************************************************/ +static void startExpirationTimer(UDSIndex *index, DataKVIO *dataKVIO) +{ + if (!index->startedTimer) { + index->startedTimer = true; + mod_timer(&index->pendingTimer, + getAlbireoTimeout(dataKVIO->dedupeContext.submissionTime)); + } +} + +/*****************************************************************************/ +static void startIndexOperation(KvdoWorkItem *item) +{ + KVIO *kvio = workItemAsKVIO(item); + DataKVIO *dataKVIO = kvioAsDataKVIO(kvio); + UDSIndex *index = container_of(kvio->layer->dedupeIndex, UDSIndex, common); + DedupeContext *dedupeContext = &dataKVIO->dedupeContext; + + spin_lock_bh(&index->pendingLock); + list_add_tail(&dedupeContext->pendingList, &index->pendingHead); + dedupeContext->isPending = true; + startExpirationTimer(index, dataKVIO); + spin_unlock_bh(&index->pendingLock); + + UdsRequest *udsRequest = &dedupeContext->udsRequest; + int status = udsStartChunkOperation(udsRequest); + if (status != UDS_SUCCESS) { + udsRequest->status = status; + finishIndexOperation(udsRequest); + } +} + +/*****************************************************************************/ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) +static void timeoutIndexOperations(struct timer_list *t) +#else +static void timeoutIndexOperations(unsigned long arg) +#endif +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) + UDSIndex *index = from_timer(index, t, pendingTimer); +#else + UDSIndex *index = (UDSIndex *) arg; +#endif + LIST_HEAD(expiredHead); + uint64_t timeoutJiffies = msecs_to_jiffies(albireoTimeoutInterval); + unsigned long 
earliestSubmissionAllowed = jiffies - timeoutJiffies; + spin_lock_bh(&index->pendingLock); + index->startedTimer = false; + while (!list_empty(&index->pendingHead)) { + DataKVIO *dataKVIO = list_first_entry(&index->pendingHead, DataKVIO, + dedupeContext.pendingList); + DedupeContext *dedupeContext = &dataKVIO->dedupeContext; + if (earliestSubmissionAllowed <= dedupeContext->submissionTime) { + startExpirationTimer(index, dataKVIO); + break; + } + list_del(&dedupeContext->pendingList); + dedupeContext->isPending = false; + list_add_tail(&dedupeContext->pendingList, &expiredHead); + } + spin_unlock_bh(&index->pendingLock); + while (!list_empty(&expiredHead)) { + DataKVIO *dataKVIO = list_first_entry(&expiredHead, DataKVIO, + dedupeContext.pendingList); + DedupeContext *dedupeContext = &dataKVIO->dedupeContext; + list_del(&dedupeContext->pendingList); + if (compareAndSwap32(&dedupeContext->requestState, + UR_BUSY, UR_TIMED_OUT)) { + dedupeContext->status = ETIMEDOUT; + invokeDedupeCallback(dataKVIO); + atomic_dec(&index->active); + kvdoReportDedupeTimeout(dataKVIOAsKVIO(dataKVIO)->layer, 1); + } + } +} + +/*****************************************************************************/ +static void enqueueIndexOperation(DataKVIO *dataKVIO, + UdsCallbackType operation) +{ + KVIO *kvio = dataKVIOAsKVIO(dataKVIO); + DedupeContext *dedupeContext = &dataKVIO->dedupeContext; + UDSIndex *index = container_of(kvio->layer->dedupeIndex, UDSIndex, common); + dedupeContext->status = UDS_SUCCESS; + dedupeContext->submissionTime = jiffies; + if (compareAndSwap32(&dedupeContext->requestState, UR_IDLE, UR_BUSY)) { + UdsRequest *udsRequest = &dataKVIO->dedupeContext.udsRequest; + udsRequest->chunkName = *dedupeContext->chunkName; + udsRequest->callback = finishIndexOperation; + udsRequest->session = index->indexSession; + udsRequest->type = operation; + udsRequest->update = true; + if ((operation == UDS_POST) || (operation == UDS_UPDATE)) { + encodeUDSAdvice(udsRequest, getDedupeAdvice(dedupeContext)); + } + + setupWorkItem(&kvio->enqueueable.workItem, startIndexOperation, NULL, + UDS_Q_ACTION); + + spin_lock(&index->stateLock); + if (index->deduping) { + enqueueWorkQueue(index->udsQueue, &kvio->enqueueable.workItem); + unsigned int active = atomic_inc_return(&index->active); + if (active > index->maximum) { + index->maximum = active; + } + kvio = NULL; + } else { + atomicStore32(&dedupeContext->requestState, UR_IDLE); + } + spin_unlock(&index->stateLock); + } else { + // A previous user of the KVIO had a dedupe timeout + // and its request is still outstanding. + atomic64_inc(&kvio->layer->dedupeContextBusy); + } + if (kvio != NULL) { + invokeDedupeCallback(dataKVIO); + } +} + +/*****************************************************************************/ +static void closeIndex(UDSIndex *index) +{ + // Change the index state so that getIndexStatistics will not try to + // use the index session we are closing. + index->indexState = IS_CHANGING; + spin_unlock(&index->stateLock); + int result = udsCloseIndex(index->indexSession); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "Error closing index %s", + index->indexName); + } + spin_lock(&index->stateLock); + index->indexState = IS_CLOSED; + index->errorFlag |= result != UDS_SUCCESS; + // ASSERTION: We leave in IS_CLOSED state. +} + +/*****************************************************************************/ +static void openIndex(UDSIndex *index) +{ + // ASSERTION: We enter in IS_CLOSED state. 
+ bool createFlag = index->createFlag; + index->createFlag = false; + // Change the index state so that the it will be reported to the outside + // world as "opening". + index->indexState = IS_CHANGING; + index->errorFlag = false; + // Open the index session, while not holding the stateLock + spin_unlock(&index->stateLock); + + int result = udsOpenIndex(createFlag ? UDS_CREATE : UDS_LOAD, + index->indexName, &index->udsParams, + index->configuration, index->indexSession); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "Error opening index %s", + index->indexName); + } + spin_lock(&index->stateLock); + if (!createFlag) { + switch (result) { + case UDS_CORRUPT_COMPONENT: + case UDS_NO_INDEX: + // Either there is no index, or there is no way we can recover the index. + // We will be called again and try to create a new index. + index->indexState = IS_CLOSED; + index->createFlag = true; + return; + default: + break; + } + } + if (result == UDS_SUCCESS) { + index->indexState = IS_OPENED; + } else { + index->indexState = IS_CLOSED; + index->indexTarget = IS_CLOSED; + index->errorFlag = true; + spin_unlock(&index->stateLock); + logInfo("Setting UDS index target state to error"); + spin_lock(&index->stateLock); + } + // ASSERTION: On success, we leave in IS_OPEN state. + // ASSERTION: On failure, we leave in IS_CLOSED state. +} + +/*****************************************************************************/ +static void changeDedupeState(KvdoWorkItem *item) +{ + UDSIndex *index = container_of(item, UDSIndex, workItem); + spin_lock(&index->stateLock); + // Loop until the index is in the target state and the create flag is + // clear. + while (!index->suspended && + ((index->indexState != index->indexTarget) || + index->createFlag)) { + if (index->indexState == IS_OPENED) { + closeIndex(index); + } else { + openIndex(index); + } + } + index->changing = false; + index->deduping = index->dedupeFlag && (index->indexState == IS_OPENED); + spin_unlock(&index->stateLock); +} + + +/*****************************************************************************/ +static void launchDedupeStateChange(UDSIndex *index) +{ + // ASSERTION: We enter with the state_lock held. + if (index->changing || index->suspended) { + // Either a change is already in progress, or changes are + // not allowed. + return; + } + + if (index->createFlag || + (index->indexState != index->indexTarget)) { + index->changing = true; + index->deduping = false; + setupWorkItem(&index->workItem, + changeDedupeState, + NULL, + UDS_Q_ACTION); + enqueueWorkQueue(index->udsQueue, &index->workItem); + return; + } + + // Online vs. offline changes happen immediately + index->deduping = (index->dedupeFlag && !index->suspended && + (index->indexState == IS_OPENED)); + + // ASSERTION: We exit with the state_lock held. 
+} + +/*****************************************************************************/ +static void setTargetState(UDSIndex *index, + IndexState target, + bool changeDedupe, + bool dedupe, + bool setCreate) +{ + spin_lock(&index->stateLock); + const char *oldState = indexStateToString(index, index->indexTarget); + if (changeDedupe) { + index->dedupeFlag = dedupe; + } + if (setCreate) { + index->createFlag = true; + } + index->indexTarget = target; + launchDedupeStateChange(index); + const char *newState = indexStateToString(index, index->indexTarget); + spin_unlock(&index->stateLock); + if (oldState != newState) { + logInfo("Setting UDS index target state to %s", newState); + } +} + +/*****************************************************************************/ +static void suspendUDSIndex(DedupeIndex *dedupeIndex, bool saveFlag) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + spin_lock(&index->stateLock); + index->suspended = true; + IndexState indexState = index->indexState; + spin_unlock(&index->stateLock); + if (indexState != IS_CLOSED) { + int result = udsSuspendIndexSession(index->indexSession, saveFlag); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "Error suspending dedupe index"); + } + } +} + +/*****************************************************************************/ +static void resumeUDSIndex(DedupeIndex *dedupeIndex) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + int result = udsResumeIndexSession(index->indexSession); + if (result != UDS_SUCCESS) { + logErrorWithStringError(result, "Error resuming dedupe index"); + } + spin_lock(&index->stateLock); + index->suspended = false; + launchDedupeStateChange(index); + spin_unlock(&index->stateLock); +} + +/*****************************************************************************/ + +/*****************************************************************************/ +static void dumpUDSIndex(DedupeIndex *dedupeIndex, bool showQueue) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + spin_lock(&index->stateLock); + const char *state = indexStateToString(index, index->indexState); + const char *target = (index->changing + ? 
indexStateToString(index, index->indexTarget) + : NULL); + spin_unlock(&index->stateLock); + logInfo("UDS index: state: %s", state); + if (target != NULL) { + logInfo("UDS index: changing to state: %s", target); + } + if (showQueue) { + dumpWorkQueue(index->udsQueue); + } +} + +/*****************************************************************************/ +static void finishUDSIndex(DedupeIndex *dedupeIndex) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + setTargetState(index, IS_CLOSED, false, false, false); + udsDestroyIndexSession(index->indexSession); + finishWorkQueue(index->udsQueue); +} + +/*****************************************************************************/ +static void freeUDSIndex(DedupeIndex *dedupeIndex) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + freeWorkQueue(&index->udsQueue); + spin_lock_bh(&index->pendingLock); + if (index->startedTimer) { + del_timer_sync(&index->pendingTimer); + } + spin_unlock_bh(&index->pendingLock); + kobject_put(&index->dedupeObject); +} + +/*****************************************************************************/ +static const char *getUDSStateName(DedupeIndex *dedupeIndex) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + spin_lock(&index->stateLock); + const char *state = indexStateToString(index, index->indexState); + spin_unlock(&index->stateLock); + return state; +} + +/*****************************************************************************/ +static void getUDSStatistics(DedupeIndex *dedupeIndex, IndexStatistics *stats) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + spin_lock(&index->stateLock); + IndexState indexState = index->indexState; + stats->maxDedupeQueries = index->maximum; + spin_unlock(&index->stateLock); + stats->currDedupeQueries = atomic_read(&index->active); + if (indexState == IS_OPENED) { + UdsIndexStats indexStats; + int result = udsGetIndexStats(index->indexSession, &indexStats); + if (result == UDS_SUCCESS) { + stats->entriesIndexed = indexStats.entriesIndexed; + } else { + logErrorWithStringError(result, "Error reading index stats"); + } + UdsContextStats contextStats; + result = udsGetIndexSessionStats(index->indexSession, &contextStats); + if (result == UDS_SUCCESS) { + stats->postsFound = contextStats.postsFound; + stats->postsNotFound = contextStats.postsNotFound; + stats->queriesFound = contextStats.queriesFound; + stats->queriesNotFound = contextStats.queriesNotFound; + stats->updatesFound = contextStats.updatesFound; + stats->updatesNotFound = contextStats.updatesNotFound; + } else { + logErrorWithStringError(result, "Error reading context stats"); + } + } +} + + +/*****************************************************************************/ +static int processMessage(DedupeIndex *dedupeIndex, const char *name) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + if (strcasecmp(name, "index-close") == 0) { + setTargetState(index, IS_CLOSED, false, false, false); + return 0; + } else if (strcasecmp(name, "index-create") == 0) { + setTargetState(index, IS_OPENED, false, false, true); + return 0; + } else if (strcasecmp(name, "index-disable") == 0) { + setTargetState(index, IS_OPENED, true, false, false); + return 0; + } else if (strcasecmp(name, "index-enable") == 0) { + setTargetState(index, IS_OPENED, true, true, false); + return 0; + } + return -EINVAL; +} + +/*****************************************************************************/ +static void udsPost(DataKVIO *dataKVIO) +{ + 
enqueueIndexOperation(dataKVIO, UDS_POST); +} + +/*****************************************************************************/ +static void udsQuery(DataKVIO *dataKVIO) +{ + enqueueIndexOperation(dataKVIO, UDS_QUERY); +} + +/*****************************************************************************/ +static void startUDSIndex(DedupeIndex *dedupeIndex, bool createFlag) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + setTargetState(index, IS_OPENED, true, true, createFlag); +} + +/*****************************************************************************/ +static void stopUDSIndex(DedupeIndex *dedupeIndex) +{ + UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); + setTargetState(index, IS_CLOSED, false, false, false); +} + +/*****************************************************************************/ +static void udsUpdate(DataKVIO *dataKVIO) +{ + enqueueIndexOperation(dataKVIO, UDS_UPDATE); +} + +/*****************************************************************************/ +static void dedupeKobjRelease(struct kobject *kobj) +{ + UDSIndex *index = container_of(kobj, UDSIndex, dedupeObject); + udsFreeConfiguration(index->configuration); + FREE(index->indexName); + FREE(index); +} + +/*****************************************************************************/ +static ssize_t dedupeStatusShow(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + UDSAttribute *ua = container_of(attr, UDSAttribute, attr); + UDSIndex *index = container_of(kobj, UDSIndex, dedupeObject); + if (ua->showString != NULL) { + return sprintf(buf, "%s\n", ua->showString(&index->common)); + } else { + return -EINVAL; + } +} + +/*****************************************************************************/ +static ssize_t dedupeStatusStore(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t length) +{ + return -EINVAL; +} + +/*****************************************************************************/ + +static struct sysfs_ops dedupeSysfsOps = { + .show = dedupeStatusShow, + .store = dedupeStatusStore, +}; + +static UDSAttribute dedupeStatusAttribute = { + .attr = {.name = "status", .mode = 0444, }, + .showString = getUDSStateName, +}; + +static struct attribute *dedupeAttributes[] = { + &dedupeStatusAttribute.attr, + NULL, +}; + +static struct kobj_type dedupeKobjType = { + .release = dedupeKobjRelease, + .sysfs_ops = &dedupeSysfsOps, + .default_attrs = dedupeAttributes, +}; + +/*****************************************************************************/ +static void startUDSQueue(void *ptr) +{ + /* + * Allow the UDS dedupe worker thread to do memory allocations. It will + * only do allocations during the UDS calls that open or close an index, + * but those allocations can safely sleep while reserving a large amount + * of memory. We could use an allocationsAllowed boolean (like the base + * threads do), but it would be an unnecessary embellishment. 
+ */ + UDSIndex *index = ptr; + registerAllocatingThread(&index->allocatingThread, NULL); +} + +/*****************************************************************************/ +static void finishUDSQueue(void *ptr) +{ + unregisterAllocatingThread(); +} + +/*****************************************************************************/ +int makeUDSIndex(KernelLayer *layer, DedupeIndex **indexPtr) +{ + UDSIndex *index; + int result = ALLOCATE(1, UDSIndex, "UDS index data", &index); + if (result != UDS_SUCCESS) { + return result; + } + + result = allocSprintf("index name", &index->indexName, + "dev=%s offset=4096 size=%llu", + layer->deviceConfig->parentDeviceName, + getIndexRegionSize(layer->geometry) * VDO_BLOCK_SIZE); + if (result != UDS_SUCCESS) { + logError("Creating index name failed (%d)", result); + FREE(index); + return result; + } + + index->udsParams = (struct uds_parameters) UDS_PARAMETERS_INITIALIZER; + indexConfigToUdsParameters(&layer->geometry.indexConfig, &index->udsParams); + result = indexConfigToUdsConfiguration(&layer->geometry.indexConfig, + &index->configuration); + if (result != VDO_SUCCESS) { + FREE(index->indexName); + FREE(index); + return result; + } + udsConfigurationSetNonce(index->configuration, + (UdsNonce) layer->geometry.nonce); + + result = udsCreateIndexSession(&index->indexSession); + if (result != UDS_SUCCESS) { + udsFreeConfiguration(index->configuration); + FREE(index->indexName); + FREE(index); + return result; + } + + static const KvdoWorkQueueType udsQueueType = { + .start = startUDSQueue, + .finish = finishUDSQueue, + .actionTable = { + { .name = "uds_action", .code = UDS_Q_ACTION, .priority = 0 }, + }, + }; + result = makeWorkQueue(layer->threadNamePrefix, "dedupeQ", + &layer->wqDirectory, layer, index, &udsQueueType, 1, + &index->udsQueue); + if (result != VDO_SUCCESS) { + logError("UDS index queue initialization failed (%d)", result); + udsDestroyIndexSession(index->indexSession); + udsFreeConfiguration(index->configuration); + FREE(index->indexName); + FREE(index); + return result; + } + + kobject_init(&index->dedupeObject, &dedupeKobjType); + result = kobject_add(&index->dedupeObject, &layer->kobj, "dedupe"); + if (result != VDO_SUCCESS) { + freeWorkQueue(&index->udsQueue); + udsDestroyIndexSession(index->indexSession); + udsFreeConfiguration(index->configuration); + FREE(index->indexName); + FREE(index); + return result; + } + + index->common.dump = dumpUDSIndex; + index->common.free = freeUDSIndex; + index->common.getDedupeStateName = getUDSStateName; + index->common.getStatistics = getUDSStatistics; + index->common.message = processMessage; + index->common.post = udsPost; + index->common.query = udsQuery; + index->common.resume = resumeUDSIndex; + index->common.start = startUDSIndex; + index->common.stop = stopUDSIndex; + index->common.suspend = suspendUDSIndex; + index->common.finish = finishUDSIndex; + index->common.update = udsUpdate; + + INIT_LIST_HEAD(&index->pendingHead); + spin_lock_init(&index->pendingLock); + spin_lock_init(&index->stateLock); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) + timer_setup(&index->pendingTimer, timeoutIndexOperations, 0); +#else + setup_timer(&index->pendingTimer, timeoutIndexOperations, + (unsigned long) index); +#endif + + *indexPtr = &index->common; + return VDO_SUCCESS; +} diff --git a/source/vdo/kernel/udsIndex.h b/source/vdo/kernel/udsIndex.h new file mode 100644 index 0000000..19a7470 --- /dev/null +++ b/source/vdo/kernel/udsIndex.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/udsIndex.h#1 $ + */ + +#ifndef UDS_INDEX_H +#define UDS_INDEX_H + +#include "dedupeIndex.h" + +/** + * Make a UDS index + * + * @param layer the kernel layer + * @param indexPtr dedupe index returned here + * + * @return VDO_SUCCESS or an error code + **/ +int makeUDSIndex(KernelLayer *layer, DedupeIndex **indexPtr) + __attribute__ ((__warn_unused_result__)); + +#endif /* UDS_INDEX_H */ diff --git a/source/vdo/kernel/vdoCommon.h b/source/vdo/kernel/vdoCommon.h new file mode 100644 index 0000000..c83e066 --- /dev/null +++ b/source/vdo/kernel/vdoCommon.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/vdoCommon.h#1 $ + */ + +#ifndef VDO_COMMON_H +#define VDO_COMMON_H + +enum { + // Whether the bio acknowledgement queue is used for acks of reads. + USE_BIO_ACK_QUEUE_FOR_READ = 0, +}; + +#endif /* VDO_COMMON_H */ diff --git a/source/vdo/kernel/vdoStringUtils.c b/source/vdo/kernel/vdoStringUtils.c new file mode 100644 index 0000000..d12580c --- /dev/null +++ b/source/vdo/kernel/vdoStringUtils.c @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/vdoStringUtils.c#1 $ + */ + +#include "vdoStringUtils.h" + +#include "errors.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "stringUtils.h" + +#include "statusCodes.h" + +/**********************************************************************/ +char *vAppendToBuffer(char *buffer, + char *bufEnd, + const char *fmt, + va_list args) +{ + size_t n = vsnprintf(buffer, bufEnd - buffer, fmt, args); + if (n >= (size_t) (bufEnd - buffer)) { + buffer = bufEnd; + } else { + buffer += n; + } + return buffer; +} + +/**********************************************************************/ +char *appendToBuffer(char *buffer, char *bufEnd, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + char *pos = vAppendToBuffer(buffer, bufEnd, fmt, ap); + va_end(ap); + return pos; +} + +/**********************************************************************/ +void freeStringArray(char **stringArray) +{ + for (unsigned int offset = 0; stringArray[offset] != NULL; offset++) { + FREE(stringArray[offset]); + } + FREE(stringArray); +} + +/**********************************************************************/ +int splitString(const char *string, char separator, char ***substringArrayPtr) +{ + unsigned int substringCount = 1; + for (const char *s = string; *s != 0; s++) { + if (*s == separator) { + substringCount++; + } + } + + char **substrings; + int result = ALLOCATE(substringCount + 1, char *, "string-splitting array", + &substrings); + if (result != UDS_SUCCESS) { + return result; + } + unsigned int currentSubstring = 0; + for (const char *s = string; *s != 0; s++) { + if (*s == separator) { + ptrdiff_t length = s - string; + result = ALLOCATE(length + 1, char, "split string", + &substrings[currentSubstring]); + if (result != UDS_SUCCESS) { + freeStringArray(substrings); + return result; + } + // Trailing NUL is already in place after allocation; deal with + // the zero or more non-NUL bytes in the string. + if (length > 0) { + memcpy(substrings[currentSubstring], string, length); + } + string = s + 1; + currentSubstring++; + BUG_ON(currentSubstring >= substringCount); + } + } + // Process final string, with no trailing separator. 
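+  // At this point every separator has been consumed, so exactly one substring slot remains.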
+ BUG_ON(currentSubstring != (substringCount - 1)); + ptrdiff_t length = strlen(string); + result = ALLOCATE(length + 1, char, "split string", + &substrings[currentSubstring]); + if (result != UDS_SUCCESS) { + freeStringArray(substrings); + return result; + } + memcpy(substrings[currentSubstring], string, length); + currentSubstring++; + // substrings[currentSubstring] is NULL already + *substringArrayPtr = substrings; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int joinStrings(char **substringArray, + size_t arrayLength, + char separator, + char **stringPtr) +{ + size_t stringLength = 0; + for (size_t i = 0; (i < arrayLength) && (substringArray[i] != NULL); i++) { + stringLength += strlen(substringArray[i]) + 1; + } + + char *output; + int result = ALLOCATE(stringLength, char, __func__, &output); + if (result != VDO_SUCCESS) { + return result; + } + + char *currentPosition = &output[0]; + for (size_t i = 0; (i < arrayLength) && (substringArray[i] != NULL); i++) { + currentPosition = appendToBuffer(currentPosition, output + stringLength, + "%s", substringArray[i]); + *currentPosition = separator; + currentPosition++; + } + + // We output one too many separators; replace the last with a zero byte. + if (currentPosition != output) { + *(currentPosition - 1) = '\0'; + } + + *stringPtr = output; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int stringToUInt(const char *input, unsigned int *valuePtr) +{ + unsigned long longValue; + int result = kstrtoul(input, 10, &longValue); + if (result != 0) { + return result; + } + + if (longValue > UINT_MAX) { + return -ERANGE; + } + + *valuePtr = longValue; + return UDS_SUCCESS; +} diff --git a/source/vdo/kernel/vdoStringUtils.h b/source/vdo/kernel/vdoStringUtils.h new file mode 100644 index 0000000..067ed9e --- /dev/null +++ b/source/vdo/kernel/vdoStringUtils.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/vdoStringUtils.h#1 $ + */ + +#ifndef VDO_STRING_UTILS_H +#define VDO_STRING_UTILS_H + +#include +#include + +/** + * Helper to append a string to a buffer. + * + * @param buffer the place at which to append the string + * @param bufEnd pointer to the end of the buffer + * @param fmt a printf format string + * + * @return the updated buffer position after the append + * + * if insufficient space is available, the contents are silently truncated + **/ +char *appendToBuffer(char *buffer, char *bufEnd, const char *fmt, ...); + +/** + * Variable-arglist helper to append a string to a buffer. + * If insufficient space is available, the contents are silently truncated. 
+ * + * @param buffer the place at which to append the string + * @param bufEnd pointer to the end of the buffer + * @param fmt a printf format string + * @param args printf arguments + * + * @return the updated buffer position after the append + **/ +char *vAppendToBuffer(char *buffer, + char *bufEnd, + const char *fmt, + va_list args); + +/** + * Split the input string into substrings, separated at occurrences of + * the indicated character, returning a null-terminated list of string + * pointers. + * + * The string pointers and the pointer array itself should both be + * freed with FREE() when no longer needed. This can be done with + * freeStringArray (below) if the pointers in the array are not + * changed. Since the array and copied strings are allocated by this + * function, it may only be used in contexts where allocation is + * permitted. + * + * Empty substrings are not ignored; that is, returned substrings may + * be empty strings if the separator occurs twice in a row. + * + * @param [in] string The input string to be broken apart + * @param [in] separator The separator character + * @param [out] substringArrayPtr The NULL-terminated substring array + * + * @return UDS_SUCCESS or -ENOMEM + **/ +int splitString(const char *string, char separator, char ***substringArrayPtr) + __attribute__((warn_unused_result)); + +/** + * Join the input substrings into one string, joined with the indicated + * character, returning a string. + * + * @param [in] substringArray The NULL-terminated substring array + * @param [in] arrayLength A bound on the number of valid elements + * in substringArray, in case it is not + * NULL-terminated. + * @param [in] separator The separator character + * @param [out] stringPtr A pointer to hold the joined string + * + * @return VDO_SUCCESS or an error + **/ +int joinStrings(char **substringArray, + size_t arrayLength, + char separator, + char **stringPtr) + __attribute__((warn_unused_result)); + +/** + * Free a list of non-NULL string pointers, and then the list itself. + * + * @param stringArray The string list + **/ +void freeStringArray(char **stringArray); + +/** + * Parse a string as an "unsigned int" value, yielding the value. + * On overflow, -ERANGE is returned. On invalid number, -EINVAL is + * returned. + * + * @param [in] input The string to be processed + * @param [out] valuePtr The value of the number read + * + * @return UDS_SUCCESS or -EINVAL or -ERANGE. + **/ +int stringToUInt(const char *input, unsigned int *valuePtr) + __attribute__((warn_unused_result)); + +#endif /* VDO_STRING_UTILS_H */ diff --git a/source/vdo/kernel/verify.c b/source/vdo/kernel/verify.c new file mode 100644 index 0000000..672ac91 --- /dev/null +++ b/source/vdo/kernel/verify.c @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/verify.c#3 $ + */ + +#include "verify.h" + +#include "logger.h" + +#include "dataKVIO.h" +#include "numeric.h" + +/** + * Compare blocks of memory for equality. + * + * This assumes the blocks are likely to be large; it's not well + * optimized for comparing just a few bytes. This is desirable + * because the Linux kernel memcmp() routine on x86 is not well + * optimized for large blocks, and the performance penalty turns out + * to be significant if you're doing lots of 4KB comparisons. + * + * @param pointerArgument1 first data block + * @param pointerArgument2 second data block + * @param length length of the data block + * + * @return true iff the two blocks are equal + **/ +__attribute__((warn_unused_result)) +static bool memoryEqual(void *pointerArgument1, + void *pointerArgument2, + size_t length) +{ + byte *pointer1 = pointerArgument1; + byte *pointer2 = pointerArgument2; + while (length >= sizeof(uint64_t)) { + /* + * GET_UNALIGNED is just for paranoia. (1) On x86_64 it is + * treated the same as an aligned access. (2) In this use case, + * one or both of the inputs will almost(?) always be aligned. + */ + if (GET_UNALIGNED(uint64_t, pointer1) + != GET_UNALIGNED(uint64_t, pointer2)) { + return false; + } + pointer1 += sizeof(uint64_t); + pointer2 += sizeof(uint64_t); + length -= sizeof(uint64_t); + } + while (length > 0) { + if (*pointer1 != *pointer2) { + return false; + } + pointer1++; + pointer2++; + length--; + } + return true; +} + +/** + * Verify the Albireo-provided deduplication advice, and invoke a + * callback once the answer is available. + * + * After we've compared the stored data with the data to be written, + * or after we've failed to be able to do so, the stored VIO callback + * is queued to be run in the main (kvdoReqQ) thread. + * + * If the advice turns out to be stale and the deduplication session + * is still active, submit a correction. (Currently the correction + * must be sent before the callback can be invoked, if the dedupe + * session is still live.) + * + * @param item The workitem from the queue + **/ +static void verifyDuplicationWork(KvdoWorkItem *item) +{ + DataKVIO *dataKVIO = workItemAsDataKVIO(item); + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION("$F;j=dedupe;cb=verify")); + + if (likely(memoryEqual(dataKVIO->dataBlock, dataKVIO->readBlock.data, + VDO_BLOCK_SIZE))) { + // Leave dataKVIO->dataVIO.isDuplicate set to true. + } else { + dataKVIO->dataVIO.isDuplicate = false; + } + + kvdoEnqueueDataVIOCallback(dataKVIO); +} + +/** + * Verify the Albireo-provided deduplication advice, and invoke a + * callback once the answer is available. + * + * @param dataKVIO The DataKVIO that we are looking to dedupe. 
+ **/ +static void verifyReadBlockCallback(DataKVIO *dataKVIO) +{ + dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); + int err = dataKVIO->readBlock.status; + if (unlikely(err != 0)) { + logDebug("%s: err %d", __func__, err); + dataKVIO->dataVIO.isDuplicate = false; + kvdoEnqueueDataVIOCallback(dataKVIO); + return; + } + + launchDataKVIOOnCPUQueue(dataKVIO, verifyDuplicationWork, NULL, + CPU_Q_ACTION_COMPRESS_BLOCK); +} + +/**********************************************************************/ +void kvdoVerifyDuplication(DataVIO *dataVIO) +{ + ASSERT_LOG_ONLY(dataVIO->isDuplicate, "advice to verify must be valid"); + ASSERT_LOG_ONLY(dataVIO->duplicate.state != MAPPING_STATE_UNMAPPED, + "advice to verify must not be a discard"); + ASSERT_LOG_ONLY(dataVIO->duplicate.pbn != ZERO_BLOCK, + "advice to verify must not point to the zero block"); + ASSERT_LOG_ONLY(!dataVIO->isZeroBlock, + "zeroed block should not have advice to verify"); + + TraceLocation location + = THIS_LOCATION("verifyDuplication;dup=update(verify);io=verify"); + dataVIOAddTraceRecord(dataVIO, location); + kvdoReadBlock(dataVIO, dataVIO->duplicate.pbn, dataVIO->duplicate.state, + BIO_Q_ACTION_VERIFY, verifyReadBlockCallback); +} + +/**********************************************************************/ +bool kvdoCompareDataVIOs(DataVIO *first, DataVIO *second) +{ + dataVIOAddTraceRecord(second, THIS_LOCATION(NULL)); + DataKVIO *a = dataVIOAsDataKVIO(first); + DataKVIO *b = dataVIOAsDataKVIO(second); + return memoryEqual(a->dataBlock, b->dataBlock, VDO_BLOCK_SIZE); +} diff --git a/source/vdo/kernel/verify.h b/source/vdo/kernel/verify.h new file mode 100644 index 0000000..5b03dd7 --- /dev/null +++ b/source/vdo/kernel/verify.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/verify.h#1 $ + */ + +#include "kernelLayer.h" + +/** + * Verify the Albireo-provided deduplication advice, and invoke a callback once + * the answer is available. This is done through a call to kvdoReadBlock() + * which will eventually call back to verifyDuplication() once the block is + * read and possibly uncompressed. + * + * @param dataVIO The DataVIO with advice filled in. + **/ +void kvdoVerifyDuplication(DataVIO *dataVIO); + +/** + * Implements DataVIOComparator. 
+ * + * @param first The first DataVIO to compare + * @param second The second DataVIO to compare + * + * @return true if the contents of the two DataVIOs are the same + **/ +bool kvdoCompareDataVIOs(DataVIO *first, DataVIO *second) + __attribute__((warn_unused_result)); diff --git a/source/vdo/kernel/workItemStats.c b/source/vdo/kernel/workItemStats.c new file mode 100644 index 0000000..2027cd8 --- /dev/null +++ b/source/vdo/kernel/workItemStats.c @@ -0,0 +1,357 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workItemStats.c#4 $ + */ + +#include "workItemStats.h" + +#include "atomic.h" +#include "logger.h" + +/** + * Scan the work queue stats table for the provided work function and + * priority value. If it's not found, see if an empty slot is + * available. + * + * @param table The work queue's function table + * @param work The function we want to record stats for + * @param priority The priority of the work item + * + * @return The index of the slot to use (matching or empty), or + * NUM_WORK_QUEUE_ITEM_STATS if the table is full of + * non-matching entries. + **/ +static inline unsigned int scanStatTable(const KvdoWorkFunctionTable *table, + KvdoWorkFunction work, + unsigned int priority) +{ + unsigned int i; + /* + * See comments in getStatTableIndex regarding order of memory + * accesses. Work function first, then a barrier, then priority. + */ + for (i = 0; i < NUM_WORK_QUEUE_ITEM_STATS; i++) { + if (table->functions[i] == NULL) { + return i; + } else if (table->functions[i] == work) { + smp_rmb(); + if (table->priorities[i] == priority) { + return i; + } + } + } + return NUM_WORK_QUEUE_ITEM_STATS; +} + +/** + * Scan the work queue stats table for the provided work function and + * priority value. Assign an empty slot if necessary. + * + * @param stats The stats structure + * @param work The function we want to record stats for + * @param priority The priority of the work item + * + * @return The index of the matching slot, or NUM_WORK_QUEUE_ITEM_STATS + * if the table is full of non-matching entries. + **/ +static unsigned int getStatTableIndex(KvdoWorkItemStats *stats, + KvdoWorkFunction work, + unsigned int priority) +{ + KvdoWorkFunctionTable *functionTable = &stats->functionTable; + + unsigned int index = scanStatTable(functionTable, work, priority); + if (unlikely(index == NUM_WORK_QUEUE_ITEM_STATS) + || likely(functionTable->functions[index] != NULL)) { + return index; + } + + unsigned long flags = 0; + // The delayed-work-item processing uses queue->lock in some cases, + // and one case may call into this function, so we can't reuse + // queue->lock here. + spin_lock_irqsave(&functionTable->lock, flags); + // Recheck now that we've got the lock... 
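+  // (another enqueueing thread may have claimed a slot for this function/priority pair while we waited)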
+ index = scanStatTable(functionTable, work, priority); + if ((index == NUM_WORK_QUEUE_ITEM_STATS) + || (functionTable->functions[index] != NULL)) { + spin_unlock_irqrestore(&functionTable->lock, flags); + return index; + } + + /* + * An uninitialized priority is indistinguishable from a zero + * priority. So store the priority first, and enforce the ordering, + * so that a non-null work function pointer indicates we've finished + * filling in the value. (And, to make this work, we have to read + * the work function first and priority second, when comparing.) + */ + functionTable->priorities[index] = priority; + smp_wmb(); + functionTable->functions[index] = work; + spin_unlock_irqrestore(&functionTable->lock, flags); + return index; +} + +/** + * Get counters on work items, identified by index into the internal + * array. + * + * @param [in] stats The collected statistics + * @param [in] index The index + * @param [out] enqueuedPtr The total work items enqueued + * @param [out] processedPtr The number of work items processed + * @param [out] pendingPtr The number of work items still pending + **/ +static void getWorkItemCountsByItem(const KvdoWorkItemStats *stats, + unsigned int index, + uint64_t *enqueuedPtr, + uint64_t *processedPtr, + unsigned int *pendingPtr) +{ + uint64_t enqueued = atomic64_read(&stats->enqueued[index]); + uint64_t processed = stats->times[index].count; + unsigned int pending; + if (enqueued < processed) { + // Probably just out of sync. + pending = 1; + } else { + pending = enqueued - processed; + // Pedantic paranoia: Check for overflow of the 32-bit "pending". + if ((pending + processed) < enqueued) { + pending = UINT_MAX; + } + } + *enqueuedPtr = enqueued; + *processedPtr = processed; + *pendingPtr = pending; +} + +/** + * Get counters on work items not covered by any index value. + * + * @param [in] stats The collected statistics + * @param [out] enqueuedPtr The total work items enqueued + * @param [out] processedPtr The number of work items processed + **/ +static void getOtherWorkItemCounts(const KvdoWorkItemStats *stats, + uint64_t *enqueuedPtr, + uint64_t *processedPtr) +{ + unsigned int pending; + getWorkItemCountsByItem(stats, NUM_WORK_QUEUE_ITEM_STATS, + enqueuedPtr, processedPtr, &pending); +} + +/** + * Get timing stats on work items, identified by index into the + * internal array. + * + * @param [in] stats The collected statistics + * @param [in] index The index into the array + * @param [out] min The minimum execution time + * @param [out] mean The mean execution time + * @param [out] max The maximum execution time + **/ +static void getWorkItemTimesByItem(const KvdoWorkItemStats *stats, + unsigned int index, + uint64_t *min, + uint64_t *mean, + uint64_t *max) +{ + *min = stats->times[index].min; + *mean = getSampleAverage(&stats->times[index]); + *max = stats->times[index].max; +} + +/**********************************************************************/ +void updateWorkItemStatsForEnqueue(KvdoWorkItemStats *stats, + KvdoWorkItem *item, + int priority) +{ + item->statTableIndex = getStatTableIndex(stats, item->statsFunction, + priority); + atomic64_add(1, &stats->enqueued[item->statTableIndex]); +} + +/**********************************************************************/ +char *getFunctionName(void *pointer, char *buffer, size_t bufferLength) +{ + if (pointer == NULL) { + /* + * Format "%ps" logs a null pointer as "(null)" with a bunch of + * leading spaces. We sometimes use this when logging lots of + * data; don't be so verbose. 
+ */ + strncpy(buffer, "-", bufferLength); + } else { + /* + * Use a non-const array instead of a string literal below to + * defeat gcc's format checking, which doesn't understand that + * "%ps" actually does support a precision spec in Linux kernel + * code. + */ + static char truncatedFunctionNameFormatString[] = "%.*ps"; + snprintf(buffer, bufferLength, + truncatedFunctionNameFormatString, + bufferLength - 1, + pointer); + + char *space = strchr(buffer, ' '); + if (space != NULL) { + *space = '\0'; + } + } + + return buffer; +} + +/**********************************************************************/ +size_t formatWorkItemStats(const KvdoWorkItemStats *stats, + char *buffer, + size_t length) +{ + const KvdoWorkFunctionTable *functionIDs = &stats->functionTable; + size_t currentOffset = 0; + + uint64_t enqueued, processed; + int i; + for (i = 0; i < NUM_WORK_QUEUE_ITEM_STATS; i++) { + if (functionIDs->functions[i] == NULL) { + break; + } + if (atomic64_read(&stats->enqueued[i]) == 0) { + continue; + } + /* + * The reporting of all of "pending", "enqueued" and "processed" + * here seems redundant, but "pending" is limited to 0 in the case + * where "processed" exceeds "enqueued", either through current + * activity and a lack of synchronization when fetching stats, or + * a coding bug. This report is intended largely for debugging, so + * we'll go ahead and print the not-necessarily-redundant values. + */ + unsigned int pending; + getWorkItemCountsByItem(stats, i, &enqueued, &processed, &pending); + + // Format: fn prio enq proc timeo [ min max mean ] + if (ENABLE_PER_FUNCTION_TIMING_STATS) { + uint64_t min, mean, max; + getWorkItemTimesByItem(stats, i, &min, &mean, &max); + currentOffset += snprintf(buffer + currentOffset, + length - currentOffset, + "%-36ps %d %10llu %10" PRIu64 + " %10llu %10llu %10" PRIu64 + "\n", + functionIDs->functions[i], + functionIDs->priorities[i], + enqueued, processed, + min, max, mean); + } else { + currentOffset += snprintf(buffer + currentOffset, + length - currentOffset, + "%-36ps %d %10llu %10" PRIu64 + "\n", + functionIDs->functions[i], + functionIDs->priorities[i], + enqueued, processed); + } + if (currentOffset >= length) { + break; + } + } + if ((i == NUM_WORK_QUEUE_ITEM_STATS) && (currentOffset < length)) { + uint64_t enqueued, processed; + getOtherWorkItemCounts(stats, &enqueued, &processed); + if (enqueued > 0) { + currentOffset += snprintf(buffer + currentOffset, + length - currentOffset, + "%-36s %d %10llu %10" PRIu64 + "\n", + "OTHER", 0, + enqueued, processed); + } + } + return currentOffset; +} + +/**********************************************************************/ +void logWorkItemStats(const KvdoWorkItemStats *stats) +{ + uint64_t totalEnqueued = 0; + uint64_t totalProcessed = 0; + + const KvdoWorkFunctionTable *functionIDs = &stats->functionTable; + + int i; + for (i = 0; i < NUM_WORK_QUEUE_ITEM_STATS; i++) { + if (functionIDs->functions[i] == NULL) { + break; + } + if (atomic64_read(&stats->enqueued[i]) == 0) { + continue; + } + /* + * The reporting of all of "pending", "enqueued" and "processed" + * here seems redundant, but "pending" is limited to 0 in the case + * where "processed" exceeds "enqueued", either through current + * activity and a lack of synchronization when fetching stats, or + * a coding bug. This report is intended largely for debugging, so + * we'll go ahead and print the not-necessarily-redundant values. 
+ */ + uint64_t enqueued, processed; + unsigned int pending; + getWorkItemCountsByItem(stats, i, &enqueued, &processed, &pending); + totalEnqueued += enqueued; + totalProcessed += processed; + + static char work[256]; // arbitrary size + getFunctionName(functionIDs->functions[i], work, sizeof(work)); + + if (ENABLE_PER_FUNCTION_TIMING_STATS) { + uint64_t min, mean, max; + getWorkItemTimesByItem(stats, i, &min, &mean, &max); + logInfo(" priority %d: %u pending" + " %llu enqueued %llu processed" + " %s" + " times %llu/%llu/%lluns", + functionIDs->priorities[i], + pending, enqueued, processed, work, + min, mean, max); + } else { + logInfo(" priority %d: %u pending" + " %llu enqueued %llu processed" + " %s", + functionIDs->priorities[i], + pending, enqueued, processed, work); + } + } + if (i == NUM_WORK_QUEUE_ITEM_STATS) { + uint64_t enqueued, processed; + getOtherWorkItemCounts(stats, &enqueued, &processed); + if (enqueued > 0) { + totalEnqueued += enqueued; + totalProcessed += processed; + logInfo(" ... others: %llu enqueued %llu processed", + enqueued, processed); + } + } + logInfo(" total: %llu enqueued %llu processed", + totalEnqueued, totalProcessed); +} diff --git a/source/vdo/kernel/workItemStats.h b/source/vdo/kernel/workItemStats.h new file mode 100644 index 0000000..0898f3b --- /dev/null +++ b/source/vdo/kernel/workItemStats.h @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workItemStats.h#2 $ + */ + +#ifndef WORK_ITEM_STATS_H +#define WORK_ITEM_STATS_H + +#include "timeUtils.h" + +#include "workQueue.h" + +enum { + // Whether to enable tracking of per-work-function run-time stats. + ENABLE_PER_FUNCTION_TIMING_STATS = 0, + // How many work function/priority pairs to track call stats for + NUM_WORK_QUEUE_ITEM_STATS = 18, +}; + +typedef struct simpleStats { + uint64_t count; + uint64_t sum; + uint64_t min; + uint64_t max; +} SimpleStats; + +/* + * We track numbers of work items handled (and optionally the + * wall-clock time to run the work functions), broken down by + * individual work functions (or alternate functions that the caller + * wants recorded, like the VIO completion callback function if we're + * just enqueueing a work function that invokes that indirectly) and + * priority. + * + * The first part of this structure manages the function/priority + * pairs, and is read frequently but updated rarely (once for each + * pair, plus possibly spin lock contention). + * + * The second part holds counters, and is updated often; different + * parts are updated by various threads as described below. The last + * element of each array, index NUM_WORK_QUEUE_ITEM_STATS, is updated + * only if we have filled the arrays and can't add the current work + * function/priority. 
See how the statTableIndex field is set in + * workItemStats.c. + * + * All fields may additionally be read when reporting statistics + * (including optionally reporting stats when the worker thread shuts + * down), but that's rare and shouldn't significantly affect cache + * contention issues. + * + * There is no "pending" count per work function here. For reporting + * statistics, it can be approximated by looking at the other fields. + * Do not rely on them being precise and synchronized, though. + */ +typedef struct kvdoWorkItemStatsFunctionTable { + /* + * The spin lock is used to protect .functions and .priorities + * during updates. All three are modified by producers (enqueueing + * threads) but only rarely. The .functions and .priorities arrays + * are read by producers very frequently. + */ + spinlock_t lock; + KvdoWorkFunction functions[NUM_WORK_QUEUE_ITEM_STATS]; + uint8_t priorities[NUM_WORK_QUEUE_ITEM_STATS]; +} KvdoWorkFunctionTable; + +typedef struct kvdoWorkItemStats { + /* + * Table of functions and priorities, for determining the index to + * use into the counter arrays below. + * + * This table is read by producers (usually multiple entries) for + * every work item enqueued, and when reporting stats. It is updated + * by producers, and only the first time a new (work-function, + * priority) combination is seen. + */ + KvdoWorkFunctionTable functionTable; + // Skip to (somewhere on) the next cache line + char pad[CACHE_LINE_BYTES - sizeof(atomic64_t)]; + /* + * The .enqueued field is updated by producers only, once per work + * item processed; __sync operations are used to update these + * values. + */ + atomic64_t enqueued[NUM_WORK_QUEUE_ITEM_STATS + 1]; + // Skip to (somewhere on) the next cache line + char pad2[CACHE_LINE_BYTES - sizeof(atomic64_t)]; + /* + * These values are updated only by the consumer (worker thread). We + * overload the .times[].count field as a count of items processed, + * so if we're not doing the optional processing-time tracking + * (controlled via an option in workQueue.c), we need to explicitly + * update the count. + * + * Since only one thread can ever update these values, no + * synchronization is used. + */ + SimpleStats times[NUM_WORK_QUEUE_ITEM_STATS + 1]; +} KvdoWorkItemStats; + +/** + * Initialize a statistics structure for tracking sample + * values. Assumes the storage was already zeroed out at allocation + * time. + * + * @param stats The statistics structure + **/ +static inline void initSimpleStats(SimpleStats *stats) +{ + // Assume other fields are initialized to zero at allocation. + stats->min = UINT64_MAX; +} + +/** + * Update the statistics being tracked for a new sample value. + * + * @param stats The statistics structure + * @param value The new value to be folded in + **/ +static inline void addSample(SimpleStats *stats, uint64_t value) +{ + stats->count++; + stats->sum += value; + if (stats->min > value) { + stats->min = value; + } + if (stats->max < value) { + stats->max = value; + } +} + +/** + * Return the average of the samples collected. + * + * @param stats The statistics structure + * + * @return The average sample value + **/ +static inline uint64_t getSampleAverage(const SimpleStats *stats) +{ + uint64_t slop = stats->count / 2; + return (stats->sum + slop) / stats->count; +} + +/** + * Update all work queue statistics (work-item and otherwise) after + * enqueueing a work item. 
+ * + * @param stats The statistics structure + * @param item The work item enqueued + * @param priority The work item's priority + **/ +void updateWorkItemStatsForEnqueue(KvdoWorkItemStats *stats, + KvdoWorkItem *item, + int priority); + +/** + * Update all work queue statistics (work-item and otherwise) after enqueueing + * a work item. + * + * This is a very lightweight function (after optimizing away conditionals and + * no-ops) and is called for every work item processed, hence the inline + * definition. + * + * This function requires that recordStartTime and + * updateWorkItemStatsForWorkTime below both get called as well; in some cases + * counters may be updated in updateWorkItemStatsForWorkTime rather than here. + * + * @param stats The statistics structure + * @param item The work item enqueued + **/ +static inline void updateWorkItemStatsForDequeue(KvdoWorkItemStats *stats, + KvdoWorkItem *item) +{ + // The times[].count field is overloaded as a count of items + // processed. + if (!ENABLE_PER_FUNCTION_TIMING_STATS) { + stats->times[item->statTableIndex].count++; + } else { + // In this case, updateWorkItemStatsForWorkTime will bump the counter. + } +} + +/** + * Record the starting time for processing a work item, if timing + * stats are enabled and if we haven't run out of room for recording + * stats in the table. + * + * @param index The work item's index into the internal array + * + * @return The current time, or zero + **/ +static inline uint64_t recordStartTime(unsigned int index) +{ + return (ENABLE_PER_FUNCTION_TIMING_STATS ? currentTime(CLOCK_MONOTONIC) : 0); +} + +/** + * Update the work queue statistics with the wall-clock time for + * processing a work item, if timing stats are enabled and if we + * haven't run out of room for recording stats in the table. + * + * @param stats The statistics structure + * @param index The work item's index into the internal array + * @param startTime The start time as reported by recordStartTime + **/ +static inline void updateWorkItemStatsForWorkTime(KvdoWorkItemStats *stats, + unsigned int index, + uint64_t startTime) +{ + if (ENABLE_PER_FUNCTION_TIMING_STATS) { + uint64_t endTime = currentTime(CLOCK_MONOTONIC); + addSample(&stats->times[index], endTime - startTime); + } +} + +/** + * Convert the pointer into a string representation, using a function + * name if available. + * + * @param pointer The pointer to be converted + * @param buffer The output buffer + * @param bufferLength The size of the output buffer + **/ +char *getFunctionName(void *pointer, char *buffer, size_t bufferLength); + +/** + * Dump statistics broken down by work function and priority into the + * kernel log. + * + * @param stats The statistics structure + **/ +void logWorkItemStats(const KvdoWorkItemStats *stats); + +/** + * Format counters for per-work-function stats for reporting via /sys. + * + * @param [in] stats The statistics structure + * @param [out] buffer The output buffer + * @param [in] length The size of the output buffer + * + * @return The size of the string actually written + **/ +size_t formatWorkItemStats(const KvdoWorkItemStats *stats, + char *buffer, + size_t length); + +#endif // WORK_ITEM_STATS_H diff --git a/source/vdo/kernel/workQueue.c b/source/vdo/kernel/workQueue.c new file mode 100644 index 0000000..8be3285 --- /dev/null +++ b/source/vdo/kernel/workQueue.c @@ -0,0 +1,1152 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueue.c#11 $ + */ + +#include "workQueue.h" + +#include +#include +#include + +#include "atomic.h" +#include "logger.h" +#include "memoryAlloc.h" +#include "permassert.h" +#include "stringUtils.h" + +#include "numeric.h" +#include "workItemStats.h" +#include "workQueueHandle.h" +#include "workQueueInternals.h" +#include "workQueueStats.h" +#include "workQueueSysfs.h" + +enum { + // Time between work queue heartbeats in usec. The default kernel + // configurations generally have 1ms or 4ms tick rates, so let's make this a + // multiple for accuracy. + FUNNEL_HEARTBEAT_INTERVAL = 4000, + + // Time to wait for a work queue to flush remaining items during shutdown. + // Specified in milliseconds. + FUNNEL_FINISH_SLEEP = 5000, +}; + +static struct mutex queueDataLock; +static SimpleWorkQueue queueData; + +static void freeSimpleWorkQueue(SimpleWorkQueue *queue); +static void finishSimpleWorkQueue(SimpleWorkQueue *queue); + +// work item lists (used for delayed work items) + +/**********************************************************************/ +static void initializeWorkItemList(KvdoWorkItemList *list) +{ + list->tail = NULL; +} + +/**********************************************************************/ +static void addToWorkItemList(KvdoWorkItemList *list, KvdoWorkItem *item) +{ + if (list->tail == NULL) { + item->next = item; + } else { + KvdoWorkItem *head = list->tail->next; + list->tail->next = item; + item->next = head; + } + list->tail = item; +} + +/**********************************************************************/ +static bool isWorkItemListEmpty(KvdoWorkItemList *list) +{ + return list->tail == NULL; +} + +/**********************************************************************/ +static KvdoWorkItem *workItemListPoll(KvdoWorkItemList *list) +{ + KvdoWorkItem *tail = list->tail; + if (tail == NULL) { + return NULL; + } + // Extract and return head of list. + KvdoWorkItem *head = tail->next; + // Only one entry? + if (head == tail) { + list->tail = NULL; + } else { + tail->next = head->next; + } + head->next = NULL; + return head; +} + +/**********************************************************************/ +static KvdoWorkItem *workItemListPeek(KvdoWorkItemList *list) +{ + KvdoWorkItem *tail = list->tail; + return tail ? tail->next : NULL; +} + +// Finding the SimpleWorkQueue to actually operate on. + +/** + * Pick the next subordinate service queue in rotation. + * + * This doesn't need to be 100% precise in distributing work items around, so + * playing loose with concurrent field modifications isn't going to hurt us. + * (Avoiding the atomic ops may help us a bit in performance, but we'll still + * have contention over the fields.) 
+ * + * @param queue The round-robin-type work queue + * + * @return A subordinate work queue + **/ +static inline SimpleWorkQueue *nextServiceQueue(RoundRobinWorkQueue *queue) +{ + unsigned int index = (queue->serviceQueueRotor++ % queue->numServiceQueues); + return queue->serviceQueues[index]; +} + +/** + * Find a simple work queue on which to operate. + * + * If the argument is already a simple work queue, use it. If it's a + * round-robin work queue, pick the next subordinate service queue and use it. + * + * @param queue a work queue (round-robin or simple) + * + * @return a simple work queue + **/ +static inline SimpleWorkQueue *pickSimpleQueue(KvdoWorkQueue *queue) +{ + return (queue->roundRobinMode + ? nextServiceQueue(asRoundRobinWorkQueue(queue)) + : asSimpleWorkQueue(queue)); +} + +// Processing normal work items. + +/** + * Scan the work queue's work item lists, and dequeue and return the next + * waiting work item, if any. + * + * We scan the funnel queues from highest priority to lowest, once; there is + * therefore a race condition where a high-priority work item can be enqueued + * followed by a lower-priority one, and we'll grab the latter (but we'll catch + * the high-priority item on the next call). If strict enforcement of + * priorities becomes necessary, this function will need fixing. + * + * @param queue the work queue + * + * @return a work item pointer, or NULL + **/ +static KvdoWorkItem *pollForWorkItem(SimpleWorkQueue *queue) +{ + KvdoWorkItem *item = NULL; + for (int i = READ_ONCE(queue->numPriorityLists) - 1; i >= 0; i--) { + FunnelQueueEntry *link = funnelQueuePoll(queue->priorityLists[i]); + if (link != NULL) { + item = container_of(link, KvdoWorkItem, workQueueEntryLink); + break; + } + } + + return item; +} + +/** + * Add a work item into the queue, and inform the caller of any additional + * processing necessary. + * + * If the worker thread may not be awake, true is returned, and the caller + * should attempt a wakeup. + * + * @param queue The work queue + * @param item The work item to add + * + * @return true iff the caller should wake the worker thread + **/ +__attribute__((warn_unused_result)) +static bool enqueueWorkQueueItem(SimpleWorkQueue *queue, KvdoWorkItem *item) +{ + ASSERT_LOG_ONLY(item->myQueue == NULL, + "item %" PRIptr " (fn %" PRIptr "/%" PRIptr + ") to enqueue (%" PRIptr + ") is not already queued (%" PRIptr ")", + item, item->work, item->statsFunction, queue, + item->myQueue); + if (ASSERT(item->action < WORK_QUEUE_ACTION_COUNT, + "action is in range for queue") != VDO_SUCCESS) { + item->action = 0; + } + unsigned int priority = READ_ONCE(queue->priorityMap[item->action]); + + // Update statistics. + updateStatsForEnqueue(&queue->stats, item, priority); + + item->myQueue = &queue->common; + + // Funnel queue handles the synchronization for the put. + funnelQueuePut(queue->priorityLists[priority], &item->workQueueEntryLink); + + /* + * Due to how funnel-queue synchronization is handled (just atomic + * operations), the simplest safe implementation here would be to wake-up any + * waiting threads after enqueueing each item. Even if the funnel queue is + * not empty at the time of adding an item to the queue, the consumer thread + * may not see this since it is not guaranteed to have the same view of the + * queue as a producer thread. + * + * However, the above is wasteful so instead we attempt to minimize the + * number of thread wakeups. This is normally unsafe due to the above + * consumer-producer synchronization constraints. 
To correct this a timeout + * mechanism is used to wake the thread periodically to handle the occasional + * race condition that triggers and results in this thread not being woken + * properly. + * + * In most cases, the above timeout will not occur prior to some other work + * item being added after the queue is set to idle state, so thread wakeups + * will generally be triggered much faster than this interval. The timeout + * provides protection against the cases where more work items are either not + * added or are added too infrequently. + * + * This is also why we can get away with the normally-unsafe optimization for + * the common case by checking queue->idle first without synchronization. The + * race condition exists, but another work item getting enqueued can wake us + * up, and if we don't get that either, we still have the timeout to fall + * back on. + * + * Developed and tuned for some x86 boxes; untested whether this is any + * better or worse for other platforms, with or without the explicit memory + * barrier. + */ + smp_mb(); + return ((atomic_read(&queue->idle) == 1) + && (atomic_cmpxchg(&queue->idle, 1, 0) == 1)); +} + +/** + * Compute an approximate indication of the number of pending work items. + * + * No synchronization is used, so it's guaranteed to be correct only if there + * is no activity. + * + * @param queue The work queue to examine + * + * @return the estimate of the number of pending work items + **/ +static unsigned int getPendingCount(SimpleWorkQueue *queue) +{ + KvdoWorkItemStats *stats = &queue->stats.workItemStats; + long long pending = 0; + for (int i = 0; i < NUM_WORK_QUEUE_ITEM_STATS + 1; i++) { + pending += atomic64_read(&stats->enqueued[i]); + pending -= stats->times[i].count; + } + if (pending < 0) { + /* + * If we fetched numbers that were changing, we can get negative results. + * Just return an indication that there's some activity. + */ + pending = 1; + } + return pending; +} + +/** + * Run any start hook that may be defined for the work queue. + * + * @param queue The work queue + **/ +static void runStartHook(SimpleWorkQueue *queue) +{ + if (queue->type->start != NULL) { + queue->type->start(queue->private); + } +} + +/** + * Run any finish hook that may be defined for the work queue. + * + * @param queue The work queue + **/ +static void runFinishHook(SimpleWorkQueue *queue) +{ + if (queue->type->finish != NULL) { + queue->type->finish(queue->private); + } +} + +/** + * If the work queue has a suspend hook, invoke it, and when it finishes, check + * again for any pending work items. + * + * We assume a check for pending work items has just been done and turned up + * empty; so, if no suspend hook exists, we can just return NULL without doing + * another check. + * + * @param [in] queue The work queue preparing to suspend + * + * @return the newly found work item, if any + **/ +static KvdoWorkItem *runSuspendHook(SimpleWorkQueue *queue) +{ + if (queue->type->suspend == NULL) { + return NULL; + } + + queue->type->suspend(queue->private); + return pollForWorkItem(queue); +} + +/** + * Check whether a work queue has delayed work items pending. 
+ * + * @param queue The work queue + * + * @return true iff delayed work items are pending + **/ +static bool hasDelayedWorkItems(SimpleWorkQueue *queue) +{ + bool result; + unsigned long flags; + spin_lock_irqsave(&queue->lock, flags); + result = !isWorkItemListEmpty(&queue->delayedItems); + spin_unlock_irqrestore(&queue->lock, flags); + return result; +} + +/** + * Wait for the next work item to process, or until kthread_should_stop + * indicates that it's time for us to shut down. + * + * If kthread_should_stop says it's time to stop but we have pending work + * items, return a work item. + * + * Update statistics relating to scheduler interactions. + * + * @param [in] queue The work queue to wait on + * @param [in] timeoutInterval How long to wait each iteration + * + * @return the next work item, or NULL to indicate shutdown is requested + **/ +static KvdoWorkItem *waitForNextWorkItem(SimpleWorkQueue *queue, + TimeoutJiffies timeoutInterval) +{ + KvdoWorkItem *item = runSuspendHook(queue); + if (item != NULL) { + return item; + } + + DEFINE_WAIT(wait); + while (true) { + atomic64_set(&queue->firstWakeup, 0); + prepare_to_wait(&queue->waitingWorkerThreads, &wait, TASK_INTERRUPTIBLE); + /* + * Don't set the idle flag until a wakeup will not be lost. + * + * Force synchronization between setting the idle flag and checking the + * funnel queue; the producer side will do them in the reverse order. + * (There's still a race condition we've chosen to allow, because we've got + * a timeout below that unwedges us if we hit it, but this may narrow the + * window a little.) + */ + atomic_set(&queue->idle, 1); + memoryFence(); // store-load barrier between "idle" and funnel queue + + item = pollForWorkItem(queue); + if (item != NULL) { + break; + } + + /* + * We need to check for thread-stop after setting TASK_INTERRUPTIBLE state + * up above. Otherwise, schedule() will put the thread to sleep and might + * miss a wakeup from kthread_stop() call in finishWorkQueue(). + * + * If there are delayed work items, we need to wait for them to + * get run. Then, when we check kthread_should_stop again, we'll + * finally exit. + */ + if (kthread_should_stop() && !hasDelayedWorkItems(queue)) { + /* + * Recheck once again in case we *just* converted a delayed work item to + * a regular enqueued work item. + * + * It's important that processDelayedWorkItems holds the spin lock until + * it finishes enqueueing the work item to run. + * + * Funnel queues aren't synchronized between producers and consumer. + * Normally a producer interrupted mid-update can hide a later producer's + * entry until the first completes. This would be a problem, except that + * when kthread_stop is called, we should already have ceased adding new + * work items and have waited for all the regular work items to finish; + * (recurring) delayed work items should be the only exception. + * + * Worker thread shutdown would be simpler if even the delayed work items + * were required to be completed and not re-queued before shutting down a + * work queue. + */ + item = pollForWorkItem(queue); + break; + } + + /* + * We don't need to update the wait count atomically since this is the only + * place it is modified and there is only one thread involved. + */ + queue->stats.waits++; + uint64_t timeBeforeSchedule = currentTime(CLOCK_MONOTONIC); + atomic64_add(timeBeforeSchedule - queue->mostRecentWakeup, + &queue->stats.runTime); + // Wake up often, to address the missed-wakeup race. 
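+  // (schedule_timeout returns early if a wakeup arrives before the interval expires)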
+ schedule_timeout(timeoutInterval); + queue->mostRecentWakeup = currentTime(CLOCK_MONOTONIC); + uint64_t callDurationNS = queue->mostRecentWakeup - timeBeforeSchedule; + enterHistogramSample(queue->stats.scheduleTimeHistogram, + callDurationNS / 1000); + + /* + * Check again before resetting firstWakeup for more accurate + * stats. (It's still racy, which can't be fixed without requiring + * tighter synchronization between producer and consumer sides.) + */ + item = pollForWorkItem(queue); + if (item != NULL) { + break; + } + } + + if (item != NULL) { + uint64_t firstWakeup = atomic64_read(&queue->firstWakeup); + /* + * We sometimes register negative wakeup latencies without this fencing. + * Whether it's forcing full serialization between the read of firstWakeup + * and the "rdtsc" that might be used depending on the clock source that + * helps, or some extra nanoseconds of delay covering for high-resolution + * clocks not being quite in sync between CPUs, is not yet clear. + */ + loadFence(); + if (firstWakeup != 0) { + enterHistogramSample(queue->stats.wakeupLatencyHistogram, + (currentTime(CLOCK_MONOTONIC) - firstWakeup) / 1000); + enterHistogramSample(queue->stats.wakeupQueueLengthHistogram, + getPendingCount(queue)); + } + } + finish_wait(&queue->waitingWorkerThreads, &wait); + atomic_set(&queue->idle, 0); + + return item; +} + +/** + * Get the next work item to process, possibly waiting for one, unless + * kthread_should_stop indicates that it's time for us to shut down. + * + * If kthread_should_stop says it's time to stop but we have pending work + * items, return a work item. + * + * @param [in] queue The work queue to wait on + * @param [in] timeoutInterval How long to wait each iteration + * + * @return the next work item, or NULL to indicate shutdown is requested + **/ +static KvdoWorkItem *getNextWorkItem(SimpleWorkQueue *queue, + TimeoutJiffies timeoutInterval) +{ + KvdoWorkItem *item = pollForWorkItem(queue); + if (item != NULL) { + return item; + } + return waitForNextWorkItem(queue, timeoutInterval); +} + +/** + * Execute a work item from a work queue, and do associated bookkeeping. + * + * @param [in] queue the work queue the item is from + * @param [in] item the work item to run + **/ +static void processWorkItem(SimpleWorkQueue *queue, + KvdoWorkItem *item) +{ + if (ASSERT(item->myQueue == &queue->common, + "item %" PRIptr " from queue %" PRIptr + " marked as being in this queue (%" PRIptr ")", + item, queue, item->myQueue) == UDS_SUCCESS) { + updateStatsForDequeue(&queue->stats, item); + item->myQueue = NULL; + } + + // Save the index, so we can use it after the work function. + unsigned int index = item->statTableIndex; + uint64_t workStartTime = recordStartTime(index); + item->work(item); + // We just surrendered control of the work item; no more access. + item = NULL; + updateWorkItemStatsForWorkTime(&queue->stats.workItemStats, index, + workStartTime); + + /* + * Be friendly to a CPU that has other work to do, if the kernel has told us + * to. This speeds up some performance tests; that "other work" might include + * other VDO threads. + * + * N.B.: We compute the pending count info here without any synchronization, + * but it's for stats reporting only, so being imprecise isn't too big a + * deal, as long as reads and writes are atomic operations. + */ + if (need_resched()) { + uint64_t timeBeforeReschedule = currentTime(CLOCK_MONOTONIC); + // Record the queue length we have *before* rescheduling. 
+ unsigned int queueLen = getPendingCount(queue); + cond_resched(); + uint64_t timeAfterReschedule = currentTime(CLOCK_MONOTONIC); + + enterHistogramSample(queue->stats.rescheduleQueueLengthHistogram, + queueLen); + uint64_t runTimeNS = timeBeforeReschedule - queue->mostRecentWakeup; + enterHistogramSample(queue->stats.runTimeBeforeRescheduleHistogram, + runTimeNS / 1000); + atomic64_add(runTimeNS, &queue->stats.runTime); + uint64_t callTimeNS = timeAfterReschedule - timeBeforeReschedule; + enterHistogramSample(queue->stats.rescheduleTimeHistogram, + callTimeNS / 1000); + atomic64_add(callTimeNS, &queue->stats.rescheduleTime); + queue->mostRecentWakeup = timeAfterReschedule; + } +} + +/** + * Main loop of the work queue worker thread. + * + * Waits for work items and runs them, until told to stop. + * + * @param queue The work queue to run + **/ +static void serviceWorkQueue(SimpleWorkQueue *queue) +{ + TimeoutJiffies timeoutInterval = + maxLong(2, usecs_to_jiffies(FUNNEL_HEARTBEAT_INTERVAL + 1) - 1); + + runStartHook(queue); + + while (true) { + KvdoWorkItem *item = getNextWorkItem(queue, timeoutInterval); + if (item == NULL) { + // No work items but kthread_should_stop was triggered. + break; + } + // Process the work item + processWorkItem(queue, item); + } + + runFinishHook(queue); +} + +/** + * Initialize per-thread data for a new worker thread and run the work queue. + * Called in a new thread created by kthread_run(). + * + * @param ptr A pointer to the KvdoWorkQueue to run. + * + * @return 0 (indicating success to kthread_run()) + **/ +static int workQueueRunner(void *ptr) +{ + SimpleWorkQueue *queue = ptr; + kobject_get(&queue->common.kobj); + + WorkQueueStackHandle queueHandle; + initializeWorkQueueStackHandle(&queueHandle, queue); + queue->stats.startTime = queue->mostRecentWakeup = currentTime(CLOCK_MONOTONIC); + unsigned long flags; + spin_lock_irqsave(&queue->lock, flags); + queue->started = true; + spin_unlock_irqrestore(&queue->lock, flags); + wake_up(&queue->startWaiters); + serviceWorkQueue(queue); + + // Zero out handle structure for safety. + memset(&queueHandle, 0, sizeof(queueHandle)); + + kobject_put(&queue->common.kobj); + return 0; +} + +// Preparing work items + +/**********************************************************************/ +void setupWorkItem(KvdoWorkItem *item, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action) +{ + ASSERT_LOG_ONLY(item->myQueue == NULL, + "setupWorkItem not called on enqueued work item"); + item->work = work; + item->statsFunction = ((statsFunction == NULL) ? work : statsFunction); + item->statTableIndex = 0; + item->action = action; + item->myQueue = NULL; + item->executionTime = 0; + item->next = NULL; +} + +// Thread management + +/**********************************************************************/ +static inline void wakeWorkerThread(SimpleWorkQueue *queue) +{ + smp_mb(); + atomic64_cmpxchg(&queue->firstWakeup, 0, currentTime(CLOCK_MONOTONIC)); + // Despite the name, there's a maximum of one thread in this list. + wake_up(&queue->waitingWorkerThreads); +} + +// Delayed work items + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) +/** + * Timer function invoked when a delayed work item is ready to run. + * + * @param timer The timer which has just finished + **/ +static void processDelayedWorkItems(struct timer_list *timer) +#else +/** + * Timer function invoked when a delayed work item is ready to run. 
+ * + * @param data The queue pointer, as an unsigned long + **/ +static void processDelayedWorkItems(unsigned long data) +#endif +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) + SimpleWorkQueue *queue = from_timer(queue, timer, delayedItemsTimer); +#else + SimpleWorkQueue *queue = (SimpleWorkQueue *) data; +#endif + Jiffies nextExecutionTime = 0; + bool reschedule = false; + bool needsWakeup = false; + + unsigned long flags; + spin_lock_irqsave(&queue->lock, flags); + while (!isWorkItemListEmpty(&queue->delayedItems)) { + KvdoWorkItem *item = workItemListPeek(&queue->delayedItems); + if (item->executionTime > jiffies) { + nextExecutionTime = item->executionTime; + reschedule = true; + break; + } + workItemListPoll(&queue->delayedItems); + item->executionTime = 0; // not actually looked at... + item->myQueue = NULL; + needsWakeup |= enqueueWorkQueueItem(queue, item); + } + spin_unlock_irqrestore(&queue->lock, flags); + if (reschedule) { + mod_timer(&queue->delayedItemsTimer, nextExecutionTime); + } + if (needsWakeup) { + wakeWorkerThread(queue); + } +} + +// Creation & teardown + +/**********************************************************************/ +static bool queueStarted(SimpleWorkQueue *queue) +{ + unsigned long flags; + spin_lock_irqsave(&queue->lock, flags); + bool started = queue->started; + spin_unlock_irqrestore(&queue->lock, flags); + return started; +} + +/** + * Create a simple work queue with a worker thread. + * + * @param [in] threadNamePrefix The per-device prefix to use in thread names + * @param [in] name The queue name + * @param [in] parentKobject The parent sysfs node + * @param [in] owner The kernel layer owning the work queue + * @param [in] private Private data of the queue for use by work + * items or other queue-specific functions + * @param [in] type The work queue type defining the lifecycle + * functions, queue actions, priorities, and + * timeout behavior + * @param [out] queuePtr Where to store the queue handle + * + * @return VDO_SUCCESS or an error code + **/ +static int makeSimpleWorkQueue(const char *threadNamePrefix, + const char *name, + struct kobject *parentKobject, + KernelLayer *owner, + void *private, + const KvdoWorkQueueType *type, + SimpleWorkQueue **queuePtr) +{ + SimpleWorkQueue *queue; + int result = ALLOCATE(1, SimpleWorkQueue, "simple work queue", &queue); + if (result != UDS_SUCCESS) { + return result; + } + + queue->type = type; + queue->private = private; + queue->common.owner = owner; + + unsigned int numPriorityLists = 1; + for (int i = 0; i < WORK_QUEUE_ACTION_COUNT; i++) { + const KvdoWorkQueueAction *action = &queue->type->actionTable[i]; + if (action->name == NULL) { + break; + } + unsigned int code = action->code; + unsigned int priority = action->priority; + + result = ASSERT(code < WORK_QUEUE_ACTION_COUNT, + "invalid action code %u in work queue initialization", + code); + if (result != VDO_SUCCESS) { + FREE(queue); + return result; + } + result = ASSERT(priority < WORK_QUEUE_PRIORITY_COUNT, + "invalid action priority %u in work queue initialization", + priority); + if (result != VDO_SUCCESS) { + FREE(queue); + return result; + } + queue->priorityMap[code] = priority; + if (numPriorityLists <= priority) { + numPriorityLists = priority + 1; + } + } + + result = duplicateString(name, "queue name", &queue->common.name); + if (result != VDO_SUCCESS) { + FREE(queue); + return -ENOMEM; + } + + init_waitqueue_head(&queue->waitingWorkerThreads); + init_waitqueue_head(&queue->startWaiters); + spin_lock_init(&queue->lock); + + 
initializeWorkItemList(&queue->delayedItems); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) + timer_setup(&queue->delayedItemsTimer, processDelayedWorkItems, 0); +#else + setup_timer(&queue->delayedItemsTimer, processDelayedWorkItems, + (unsigned long) queue); +#endif + + kobject_init(&queue->common.kobj, &simpleWorkQueueKobjType); + result = kobject_add(&queue->common.kobj, parentKobject, queue->common.name); + if (result != 0) { + logError("Cannot add sysfs node: %d", result); + freeSimpleWorkQueue(queue); + return result; + } + queue->numPriorityLists = numPriorityLists; + for (int i = 0; i < WORK_QUEUE_PRIORITY_COUNT; i++) { + result = makeFunnelQueue(&queue->priorityLists[i]); + if (result != UDS_SUCCESS) { + freeSimpleWorkQueue(queue); + return result; + } + } + result = initializeWorkQueueStats(&queue->stats, &queue->common.kobj); + if (result != 0) { + logError("Cannot initialize statistics tracking: %d", result); + freeSimpleWorkQueue(queue); + return result; + } + + queue->started = false; + struct task_struct *thread = NULL; + thread = kthread_run(workQueueRunner, queue, "%s:%s", threadNamePrefix, + queue->common.name); + + if (IS_ERR(thread)) { + freeSimpleWorkQueue(queue); + return (int) PTR_ERR(thread); + } + queue->thread = thread; + atomic_set(&queue->threadID, thread->pid); + /* + * If we don't wait to ensure the thread is running VDO code, a + * quick kthread_stop (due to errors elsewhere) could cause it to + * never get as far as running VDO, skipping the cleanup code. + * + * Eventually we should just make that path safe too, and then we + * won't need this synchronization. + */ + wait_event(queue->startWaiters, queueStarted(queue) == true); + *queuePtr = queue; + return UDS_SUCCESS; +} + +/**********************************************************************/ +int makeWorkQueue(const char *threadNamePrefix, + const char *name, + struct kobject *parentKobject, + KernelLayer *owner, + void *private, + const KvdoWorkQueueType *type, + unsigned int threadCount, + KvdoWorkQueue **queuePtr) +{ + if (threadCount == 1) { + SimpleWorkQueue *simpleQueue; + int result = makeSimpleWorkQueue(threadNamePrefix, name, parentKobject, + owner, private, type, &simpleQueue); + if (result == VDO_SUCCESS) { + *queuePtr = &simpleQueue->common; + } + return result; + } + + RoundRobinWorkQueue *queue; + int result = ALLOCATE(1, RoundRobinWorkQueue, "round-robin work queue", + &queue); + if (result != UDS_SUCCESS) { + return result; + } + + result = ALLOCATE(threadCount, SimpleWorkQueue *, "subordinate work queues", + &queue->serviceQueues); + if (result != UDS_SUCCESS) { + FREE(queue); + return result; + } + + queue->numServiceQueues = threadCount; + queue->common.roundRobinMode = true; + queue->common.owner = owner; + + result = duplicateString(name, "queue name", &queue->common.name); + if (result != VDO_SUCCESS) { + FREE(queue->serviceQueues); + FREE(queue); + return -ENOMEM; + } + + kobject_init(&queue->common.kobj, &roundRobinWorkQueueKobjType); + result = kobject_add(&queue->common.kobj, parentKobject, queue->common.name); + if (result != 0) { + logError("Cannot add sysfs node: %d", result); + finishWorkQueue(&queue->common); + kobject_put(&queue->common.kobj); + return result; + } + + *queuePtr = &queue->common; + + char threadName[TASK_COMM_LEN]; + for (unsigned int i = 0; i < threadCount; i++) { + snprintf(threadName, sizeof(threadName), "%s%u", name, i); + result = makeSimpleWorkQueue(threadNamePrefix, threadName, + &queue->common.kobj, owner, private, type, + 
&queue->serviceQueues[i]); + if (result != VDO_SUCCESS) { + queue->numServiceQueues = i; + // Destroy previously created subordinates. + finishWorkQueue(*queuePtr); + freeWorkQueue(queuePtr); + return result; + } + queue->serviceQueues[i]->parentQueue = *queuePtr; + } + + return VDO_SUCCESS; +} + +/** + * Shut down a simple work queue's worker thread. + * + * @param queue The work queue to shut down + **/ +static void finishSimpleWorkQueue(SimpleWorkQueue *queue) +{ + // Tell the worker thread to shut down. + if (queue->thread != NULL) { + atomic_set(&queue->threadID, 0); + // Waits for thread to exit. + kthread_stop(queue->thread); + } + + queue->thread = NULL; +} + +/** + * Shut down a round-robin work queue's service queues. + * + * @param queue The work queue to shut down + **/ +static void finishRoundRobinWorkQueue(RoundRobinWorkQueue *queue) +{ + SimpleWorkQueue **queueTable = queue->serviceQueues; + unsigned int count = queue->numServiceQueues; + + for (unsigned int i = 0; i < count; i++) { + finishSimpleWorkQueue(queueTable[i]); + } +} + +/**********************************************************************/ +void finishWorkQueue(KvdoWorkQueue *queue) +{ + if (queue->roundRobinMode) { + finishRoundRobinWorkQueue(asRoundRobinWorkQueue(queue)); + } else { + finishSimpleWorkQueue(asSimpleWorkQueue(queue)); + } +} + +/** + * Tear down a simple work queue, and decrement the kobject reference + * count on it. + * + * @param queue The work queue + **/ +static void freeSimpleWorkQueue(SimpleWorkQueue *queue) +{ + for (unsigned int i = 0; i < WORK_QUEUE_PRIORITY_COUNT; i++) { + freeFunnelQueue(queue->priorityLists[i]); + } + cleanupWorkQueueStats(&queue->stats); + kobject_put(&queue->common.kobj); +} + +/** + * Tear down a round-robin work queue and its service queues, and + * decrement the kobject reference count on it. + * + * @param queue The work queue + **/ +static void freeRoundRobinWorkQueue(RoundRobinWorkQueue *queue) +{ + SimpleWorkQueue **queueTable = queue->serviceQueues; + unsigned int count = queue->numServiceQueues; + + queue->serviceQueues = NULL; + for (unsigned int i = 0; i < count; i++) { + freeSimpleWorkQueue(queueTable[i]); + } + FREE(queueTable); + kobject_put(&queue->common.kobj); +} + +/**********************************************************************/ +void freeWorkQueue(KvdoWorkQueue **queuePtr) +{ + KvdoWorkQueue *queue = *queuePtr; + if (queue == NULL) { + return; + } + *queuePtr = NULL; + + finishWorkQueue(queue); + + if (queue->roundRobinMode) { + freeRoundRobinWorkQueue(asRoundRobinWorkQueue(queue)); + } else { + freeSimpleWorkQueue(asSimpleWorkQueue(queue)); + } +} + +// Debugging dumps + +/**********************************************************************/ +static void dumpSimpleWorkQueue(SimpleWorkQueue *queue) +{ + mutex_lock(&queueDataLock); + // Take a snapshot to reduce inconsistency in logged numbers. 
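+ // queueData is a shared scratch copy of the queue, guarded by
+ // queueDataLock (initialized in initWorkQueueOnce), so concurrent dumps
+ // do not interleave their snapshots.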
+ queueData = *queue; + const char *threadStatus; + + char taskStateReport = '-'; + if (queueData.thread != NULL) { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,14,0) + taskStateReport = task_state_to_char(queue->thread); +#else + unsigned int taskState = queue->thread->state & TASK_REPORT; + taskState &= 0x1ff; + unsigned int taskStateIndex; + if (taskState != 0) { + taskStateIndex = __ffs(taskState)+1; + BUG_ON(taskStateIndex >= sizeof(TASK_STATE_TO_CHAR_STR)); + } else { + taskStateIndex = 0; + } + taskStateReport = TASK_STATE_TO_CHAR_STR[taskStateIndex]; +#endif + } + + if (queueData.thread == NULL) { + threadStatus = "no threads"; + } else if (atomic_read(&queueData.idle)) { + threadStatus = "idle"; + } else { + threadStatus = "running"; + } + logInfo("workQ %" PRIptr " (%s) %u entries %llu waits, %s (%c)", + &queue->common, + queueData.common.name, + getPendingCount(&queueData), + queueData.stats.waits, + threadStatus, + taskStateReport); + + logWorkItemStats(&queueData.stats.workItemStats); + logWorkQueueStats(queue); + + mutex_unlock(&queueDataLock); + + // ->lock spin lock status? + // ->waitingWorkerThreads wait queue status? anyone waiting? +} + +/**********************************************************************/ +void dumpWorkQueue(KvdoWorkQueue *queue) +{ + if (queue->roundRobinMode) { + RoundRobinWorkQueue *roundRobinQueue = asRoundRobinWorkQueue(queue); + for (unsigned int i = 0; i < roundRobinQueue->numServiceQueues; i++) { + dumpSimpleWorkQueue(roundRobinQueue->serviceQueues[i]); + } + } else { + dumpSimpleWorkQueue(asSimpleWorkQueue(queue)); + } +} + +/**********************************************************************/ +void dumpWorkItemToBuffer(KvdoWorkItem *item, char *buffer, size_t length) +{ + size_t currentLength + = snprintf(buffer, length, "%.*s/", TASK_COMM_LEN, + item->myQueue == NULL ? "-" : item->myQueue->name); + if (currentLength < length) { + getFunctionName(item->statsFunction, buffer + currentLength, + length - currentLength); + } +} + +// Work submission + +/**********************************************************************/ +void enqueueWorkQueue(KvdoWorkQueue *kvdoWorkQueue, KvdoWorkItem *item) +{ + SimpleWorkQueue *queue = pickSimpleQueue(kvdoWorkQueue); + + item->executionTime = 0; + + if (enqueueWorkQueueItem(queue, item)) { + wakeWorkerThread(queue); + } +} + +/**********************************************************************/ +void enqueueWorkQueueDelayed(KvdoWorkQueue *kvdoWorkQueue, + KvdoWorkItem *item, + Jiffies executionTime) +{ + if (executionTime <= jiffies) { + enqueueWorkQueue(kvdoWorkQueue, item); + return; + } + + SimpleWorkQueue *queue = pickSimpleQueue(kvdoWorkQueue); + bool rescheduleTimer = false; + unsigned long flags; + + item->executionTime = executionTime; + + // Lock if the work item is delayed. All delayed items are handled via a + // single linked list. + spin_lock_irqsave(&queue->lock, flags); + + if (isWorkItemListEmpty(&queue->delayedItems)) { + rescheduleTimer = true; + } + /* + * XXX We should keep the list sorted, but at the moment the list won't + * grow above a single entry anyway. 
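+ * Should that change, insertion here ought to keep the list ordered by
+ * executionTime, since processDelayedWorkItems stops scanning at the
+ * first item whose execution time has not yet arrived.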
+ */ + item->myQueue = &queue->common; + addToWorkItemList(&queue->delayedItems, item); + + spin_unlock_irqrestore(&queue->lock, flags); + + if (rescheduleTimer) { + mod_timer(&queue->delayedItemsTimer, executionTime); + } +} + +// Misc + + +/**********************************************************************/ +KvdoWorkQueue *getCurrentWorkQueue(void) +{ + SimpleWorkQueue *queue = getCurrentThreadWorkQueue(); + return (queue == NULL) ? NULL : &queue->common; +} + +/**********************************************************************/ +KernelLayer *getWorkQueueOwner(KvdoWorkQueue *queue) +{ + return queue->owner; +} + +/**********************************************************************/ +void *getWorkQueuePrivateData(void) +{ + SimpleWorkQueue *queue = getCurrentThreadWorkQueue(); + return (queue != NULL) ? queue->private : NULL; +} + +/**********************************************************************/ +void setWorkQueuePrivateData(void *newData) +{ + SimpleWorkQueue *queue = getCurrentThreadWorkQueue(); + BUG_ON(queue == NULL); + queue->private = newData; +} + +/**********************************************************************/ +void initWorkQueueOnce(void) +{ + // We can't use DEFINE_MUTEX because it's not compatible with c99 mode. + mutex_init(&queueDataLock); + initWorkQueueStackHandleOnce(); +} diff --git a/source/vdo/kernel/workQueue.h b/source/vdo/kernel/workQueue.h new file mode 100644 index 0000000..4043295 --- /dev/null +++ b/source/vdo/kernel/workQueue.h @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueue.h#2 $ + */ + +#ifndef ALBIREO_WORK_QUEUE_H +#define ALBIREO_WORK_QUEUE_H + +#include +#include /* for TASK_COMM_LEN */ + +#include "kernelTypes.h" +#include "util/funnelQueue.h" + +enum { + MAX_QUEUE_NAME_LEN = TASK_COMM_LEN, + /** Maximum number of action definitions per work queue type */ + WORK_QUEUE_ACTION_COUNT = 8, + /** Number of priority values available */ + WORK_QUEUE_PRIORITY_COUNT = 4, +}; + +struct kvdoWorkItem { + /** Entry link for lock-free work queue */ + FunnelQueueEntry workQueueEntryLink; + /** Function to be called */ + KvdoWorkFunction work; + /** Optional alternate function for display in queue stats */ + void *statsFunction; + /** An index into the statistics table; filled in by workQueueStats code */ + unsigned int statTableIndex; + /** + * The action code given to setupWorkItem, from which a priority will be + * determined. + **/ + unsigned int action; + /** The work queue in which the item is enqueued, or NULL if not enqueued. */ + KvdoWorkQueue *myQueue; + /** + * Time at which to execute in jiffies for a delayed work item, or zero to + * queue for execution ASAP. 
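+ * (Set by enqueueWorkQueue and enqueueWorkQueueDelayed; consumed and
+ * cleared by processDelayedWorkItems.)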
+ **/ + Jiffies executionTime; + /** List management for delayed or expired work items */ + KvdoWorkItem *next; + /** Time of enqueueing, in ns, for recording queue (waiting) time stats */ + uint64_t enqueueTime; +}; + +/** + * Table entries defining an action. + * + * Actions are intended to distinguish general classes of activity for + * prioritization purposes, but not necessarily to indicate specific work + * functions. They are indicated to setupWorkItem numerically, using an + * enumerator defined per kind of work queue -- bio submission work queue + * actions use BioQAction, cpu actions use CPUQAction, etc. For example, for + * the CPU work queues, data compression can be prioritized separately from + * final cleanup processing of a KVIO or from dedupe verification; base code + * threads prioritize all VIO callback invocation the same, but separate from + * sync or heartbeat operations. The bio acknowledgement work queue, on the + * other hand, only does one thing, so it only defines one action code. + * + * Action codes values must be small integers, 0 through + * WORK_QUEUE_ACTION_COUNT-1, and should not be duplicated for a queue type. + * + * A table of KvdoWorkQueueAction entries embedded in KvdoWorkQueueType + * specifies the name, code, and priority for each type of action in the work + * queue. The table can have at most WORK_QUEUE_ACTION_COUNT entries, but a + * NULL name indicates an earlier end to the table. + * + * Priorities may be specified as values from 0 through + * WORK_QUEUE_PRIORITY_COUNT-1, higher values indicating higher priority. + * Priorities are just strong suggestions; it's possible for a lower-priority + * work item scheduled right after a high-priority one to be run first, if the + * worker thread happens to be scanning its queues at just the wrong moment, + * but the high-priority item will be picked up next. + * + * Internally, the priorities in this table are used to initialize another + * table in the constructed work queue object, and in internal builds, + * device-mapper messages can be sent to change the priority for an action, + * identified by name, in a running VDO device. Doing so does not affect the + * priorities for other devices, or for future VDO device creation. + **/ +typedef struct kvdoWorkQueueAction { + /** Name of the action */ + char *name; + + /** The action code (per-type enum) */ + unsigned int code; + + /** The initial priority for this action */ + unsigned int priority; +} KvdoWorkQueueAction; + +typedef void (*KvdoWorkQueueFunction)(void *); + +/** + * Static attributes of a work queue that are fixed at compile time + * for a given call site. (Attributes that may be computed at run time + * are passed as separate arguments.) + **/ +typedef struct kvdoWorkQueueType { + /** A function to call in the new thread before servicing requests */ + KvdoWorkQueueFunction start; + + /** A function to call in the new thread when shutting down */ + KvdoWorkQueueFunction finish; + + /** A function to call in the new thread after running out of work */ + KvdoWorkQueueFunction suspend; + + /** Table of actions for this work queue */ + KvdoWorkQueueAction actionTable[WORK_QUEUE_ACTION_COUNT]; +} KvdoWorkQueueType; + +/** + * Create a work queue. + * + * If multiple threads are requested, work items will be distributed to them in + * round-robin fashion. 
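+ *
+ * A minimal usage sketch (the queue type, action, and variable names here
+ * are illustrative only, not part of this interface):
+ *
+ *   static const KvdoWorkQueueType exampleQueueType = {
+ *     .actionTable = {
+ *       { .name = "example", .code = 0, .priority = 1 },
+ *     },
+ *   };
+ *
+ *   KvdoWorkQueue *exampleQueue;
+ *   int result = makeWorkQueue("vdo0:", "exampleQ", parentKobject, layer,
+ *                              NULL, &exampleQueueType, 1, &exampleQueue);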
+ * + * @param [in] threadNamePrefix The per-device prefix to use in thread names + * @param [in] name The queue name + * @param [in] parentKobject The parent sysfs node + * @param [in] owner The kernel layer owning the work queue + * @param [in] private Private data of the queue for use by work + * items or other queue-specific functions + * @param [in] type The work queue type defining the lifecycle + * functions, queue actions, priorities, and + * timeout behavior + * @param [in] threadCount Number of service threads to set up + * @param [out] queuePtr Where to store the queue handle + * + * @return VDO_SUCCESS or an error code + **/ +int makeWorkQueue(const char *threadNamePrefix, + const char *name, + struct kobject *parentKobject, + KernelLayer *owner, + void *private, + const KvdoWorkQueueType *type, + unsigned int threadCount, + KvdoWorkQueue **queuePtr); + +/** + * Set up the fields of a work queue item. + * + * Before the first setup call (setupWorkItem or setupWorkItemWithTimeout), the + * work item must have been initialized to all-zero. Resetting a + * previously-used work item does not require another memset. + * + * The action code is typically defined in a work-queue-type-specific + * enumeration; see the description of KvdoWorkQueueAction. + * + * @param item The work item to initialize + * @param work The function pointer to execute + * @param statsFunction A function pointer to record for stats, or NULL + * @param action Action code, for determination of priority + **/ +void setupWorkItem(KvdoWorkItem *item, + KvdoWorkFunction work, + void *statsFunction, + unsigned int action); + +/** + * Add a work item to a work queue. + * + * If the work item has a timeout that has already passed, the timeout + * handler function may be invoked at this time. + * + * @param queue The queue handle + * @param item The work item to be processed + **/ +void enqueueWorkQueue(KvdoWorkQueue *queue, KvdoWorkItem *item); + +/** + * Add a work item to a work queue, to be run at a later point in time. + * + * Currently delayed work items are used only in a very limited fashion -- at + * most one at a time for any of the work queue types that use them -- and some + * shortcuts have been taken that assume that that's the case. Multiple delayed + * work items should work, but they will execute in the order they were + * enqueued. + * + * @param queue The queue handle + * @param item The work item to be processed + * @param executionTime When to run the work item (jiffies) + **/ +void enqueueWorkQueueDelayed(KvdoWorkQueue *queue, + KvdoWorkItem *item, + Jiffies executionTime); + +/** + * Shut down a work queue's worker thread. + * + * Alerts the worker thread that it should shut down, and then waits + * for it to do so. + * + * There should not be any new enqueueing of work items done once this + * function is called. Any pending delayed work items will be + * processed, as scheduled, before the worker thread shuts down, but + * they must not re-queue themselves to run again. + * + * @param queue The work queue to shut down + **/ +void finishWorkQueue(KvdoWorkQueue *queue); + +/** + * Free a work queue and null out the reference to it. + * + * @param queuePtr Where the queue handle is found + **/ +void freeWorkQueue(KvdoWorkQueue **queuePtr); + +/** + * Print work queue state and statistics to the kernel log. + * + * @param queue The work queue to examine + **/ +void dumpWorkQueue(KvdoWorkQueue *queue); + +/** + * Write to the buffer some info about the work item, for logging. 
+ * Since the common use case is dumping info about a lot of work items + * to syslog all at once, the format favors brevity over readability. + * + * @param item The work item + * @param buffer The message buffer to fill in + * @param length The length of the message buffer + **/ +void dumpWorkItemToBuffer(KvdoWorkItem *item, char *buffer, size_t length); + + +/** + * Initialize work queue internals at module load time. + **/ +void initWorkQueueOnce(void); + +/** + * Checks whether two work items have the same action codes + * + * @param item1 The first item + * @param item2 The second item + * + * @return TRUE if the actions are the same, FALSE otherwise + */ +static inline bool areWorkItemActionsEqual(KvdoWorkItem *item1, + KvdoWorkItem *item2) +{ + return item1->action == item2->action; +} + +/** + * Returns the private data for the current thread's work queue. + * + * @return The private data pointer, or NULL if none or if the current + * thread is not a work queue thread. + **/ +void *getWorkQueuePrivateData(void); + +/** + * Updates the private data pointer for the current thread's work queue. + * + * @param newData The new private data pointer + **/ +void setWorkQueuePrivateData(void *newData); + +/** + * Returns the work queue pointer for the current thread, if any. + * + * @return The work queue pointer or NULL + **/ +KvdoWorkQueue *getCurrentWorkQueue(void); + +/** + * Returns the kernel layer that owns the work queue. + * + * @param queue The work queue + * + * @return The owner pointer supplied at work queue creation + **/ +KernelLayer *getWorkQueueOwner(KvdoWorkQueue *queue); + +#endif /* ALBIREO_WORK_QUEUE_H */ diff --git a/source/vdo/kernel/workQueueHandle.c b/source/vdo/kernel/workQueueHandle.c new file mode 100644 index 0000000..65b3e02 --- /dev/null +++ b/source/vdo/kernel/workQueueHandle.c @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueHandle.c#2 $ + */ + +#include "workQueueHandle.h" + +WorkQueueStackHandleGlobals workQueueStackHandleGlobals; + +/**********************************************************************/ +void initializeWorkQueueStackHandle(WorkQueueStackHandle *handle, + SimpleWorkQueue *queue) +{ + handle->nonce = workQueueStackHandleGlobals.nonce; + handle->queue = queue; + + long offset = (char *) handle - (char *) task_stack_page(current); + spin_lock(&workQueueStackHandleGlobals.offsetLock); + if (workQueueStackHandleGlobals.offset == 0) { + workQueueStackHandleGlobals.offset = offset; + spin_unlock(&workQueueStackHandleGlobals.offsetLock); + } else { + long foundOffset = workQueueStackHandleGlobals.offset; + spin_unlock(&workQueueStackHandleGlobals.offsetLock); + BUG_ON(foundOffset != offset); + } +} + +/**********************************************************************/ +void initWorkQueueStackHandleOnce(void) +{ + spin_lock_init(&workQueueStackHandleGlobals.offsetLock); + workQueueStackHandleGlobals.nonce = currentTime(CLOCK_MONOTONIC); +} diff --git a/source/vdo/kernel/workQueueHandle.h b/source/vdo/kernel/workQueueHandle.h new file mode 100644 index 0000000..e72ce42 --- /dev/null +++ b/source/vdo/kernel/workQueueHandle.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueHandle.h#1 $ + */ + +#ifndef WORK_QUEUE_HANDLE_H +#define WORK_QUEUE_HANDLE_H + +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,11,0) +#include +#else +#include +#endif + +#include "workQueueInternals.h" + +/* + * Layout of a special structure stored at a consistent place on the + * stack in work queue threads. + */ +typedef struct workQueueStackHandle { + unsigned long nonce; + SimpleWorkQueue *queue; +} WorkQueueStackHandle; + +typedef struct workQueueStackHandleGlobals { + /* + * Location in the stack, relative to the task structure which is + * contained in the same memory allocation. + */ + long offset; + /* + * A lock is used to guard against multiple updaters, but once an + * update is done, the offset variable will be read-only. + */ + spinlock_t offsetLock; + /* + * A nonce chosen differently each time the module is loaded, used + * as a marker so we can check that the current thread really is a + * work queue thread. Set at module initialization time, before any + * work queues are created. + */ + unsigned long nonce; +} WorkQueueStackHandleGlobals; + +extern WorkQueueStackHandleGlobals workQueueStackHandleGlobals; + +/** + * Initialize a stack handle associated with a work queue. 
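+ *
+ * The handle sits at a fixed offset from task_stack_page(current) on every
+ * worker thread's stack: the first caller records that offset in
+ * workQueueStackHandleGlobals, later callers verify it (BUG_ON on mismatch),
+ * and getCurrentThreadWorkQueue combines the offset with the per-load nonce
+ * to recognize work queue threads.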
+ * + * @param [out] handle The handle to be initialized + * @param [in] queue The work queue pointer + **/ +void initializeWorkQueueStackHandle(WorkQueueStackHandle *handle, + SimpleWorkQueue *queue); + +/** + * Return the work queue pointer recorded at initialization time in + * the work-queue stack handle initialized on the stack of the current + * thread, if any. + * + * @return the work queue pointer, or NULL + **/ +static inline SimpleWorkQueue *getCurrentThreadWorkQueue(void) +{ + WorkQueueStackHandle *handle + = (WorkQueueStackHandle *)(task_stack_page(current) + + workQueueStackHandleGlobals.offset); + if (likely(handle->nonce == workQueueStackHandleGlobals.nonce)) { + return handle->queue; + } else { + return NULL; + } +} + +/** + * Initialize the global state used by the work-queue stack-handle + * code. + **/ +void initWorkQueueStackHandleOnce(void); + +#endif // WORK_QUEUE_HANDLE_H diff --git a/source/vdo/kernel/workQueueInternals.h b/source/vdo/kernel/workQueueInternals.h new file mode 100644 index 0000000..fc7a2a3 --- /dev/null +++ b/source/vdo/kernel/workQueueInternals.h @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueInternals.h#4 $ + */ + +#ifndef WORK_QUEUE_INTERNALS_H +#define WORK_QUEUE_INTERNALS_H + +#include +#include +#include +#include +#include + +#include "workItemStats.h" +#include "workQueueStats.h" + +typedef struct kvdoWorkItemList { + KvdoWorkItem *tail; +} KvdoWorkItemList; + +/** + * Work queue definition. + * + * There are two types of work queues: simple, with one worker thread, and + * round-robin, which uses a group of the former to do the work, and assigns + * work to them in -- you guessed it -- round-robin fashion. Externally, both + * are represented via the same common sub-structure, though there's actually + * not a great deal of overlap between the two types internally. + **/ +struct kvdoWorkQueue { + /** Name of just the work queue (e.g., "cpuQ12") */ + char *name; + /** + * Whether this is a round-robin work queue or a simple (one-thread) + * work queue. + **/ + bool roundRobinMode; + /** A handle to a sysfs tree for reporting stats and other info */ + struct kobject kobj; + /** The kernel layer owning this work queue */ + KernelLayer *owner; +}; + +typedef struct simpleWorkQueue SimpleWorkQueue; +typedef struct roundRobinWorkQueue RoundRobinWorkQueue; + +struct simpleWorkQueue { + /** Common work queue bits */ + KvdoWorkQueue common; + /** A copy of .thread->pid, for safety in the sysfs support */ + atomic_t threadID; + /** + * Number of priorities actually used, so we don't keep re-checking unused + * funnel queues. + **/ + unsigned int numPriorityLists; + /** + * Map from action codes to priorities. 
+ * + * This mapping can be changed at run time in internal builds, for tuning + * purposes. + **/ + uint8_t priorityMap[WORK_QUEUE_ACTION_COUNT]; + /** The funnel queues */ + FunnelQueue *priorityLists[WORK_QUEUE_PRIORITY_COUNT]; + /** The kernel thread */ + struct task_struct *thread; + /** Life cycle functions, etc */ + const KvdoWorkQueueType *type; + /** Opaque private data pointer, defined by higher level code */ + void *private; + /** In a subordinate work queue, a link back to the round-robin parent */ + KvdoWorkQueue *parentQueue; + /** Padding for cache line separation */ + char pad[CACHE_LINE_BYTES - sizeof(KvdoWorkQueue *)]; + /** Lock protecting delayedItems, priorityMap, numPriorityLists, started */ + spinlock_t lock; + /** Any worker threads (zero or one) waiting for new work to do */ + wait_queue_head_t waitingWorkerThreads; + /** + * Hack to reduce wakeup calls if the worker thread is running. See comments + * in workQueue.c. + * + * There is a lot of redundancy with "firstWakeup", though, and the pair + * should be re-examined. + **/ + atomic_t idle; + /** Wait list for synchronization during worker thread startup */ + wait_queue_head_t startWaiters; + /** Worker thread status (boolean) */ + bool started; + + /** List of delayed work items; usually only one, if any */ + KvdoWorkItemList delayedItems; + /** + * Timer for pulling delayed work items off their list and submitting them to + * run. + * + * If the spinlock "lock" above is not held, this timer is scheduled (or + * currently firing and the callback about to acquire the lock) iff + * delayedItems is nonempty. + **/ + struct timer_list delayedItemsTimer; + + /** + * Timestamp (ns) from the submitting thread that decided to wake us up; also + * used as a flag to indicate whether a wakeup is needed. + * + * Written by submitting threads with atomic64_cmpxchg, and by the worker + * thread setting to 0. + * + * If the value is 0, the worker is probably asleep; the submitting thread + * stores a non-zero value and becomes responsible for calling wake_up on the + * worker thread. If the value is non-zero, either the worker is running or + * another thread has the responsibility for issuing the wakeup. + * + * The "sleep" mode has periodic wakeups and the worker thread may happen to + * wake up while a work item is being enqueued. If that happens, the wakeup + * may be unneeded but will be attempted anyway. + * + * So the return value from cmpxchg(firstWakeup,0,nonzero) can always be + * done, and will tell the submitting thread whether to issue the wakeup or + * not; cmpxchg is atomic, so no other synchronization is needed. + * + * A timestamp is used rather than, say, 1, so that the worker thread can + * record stats on how long it takes to actually get the worker thread + * running. + * + * There is some redundancy between this and "idle" above. 
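+ *
+ * See wakeWorkerThread() (producer side) and waitForNextWorkItem()
+ * (consumer side) in workQueue.c for the code that uses this field.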
+ **/ + atomic64_t firstWakeup; + /** Padding for cache line separation */ + char pad2[CACHE_LINE_BYTES - sizeof(atomic64_t)]; + /** Scheduling and work-function statistics */ + KvdoWorkQueueStats stats; + /** Last time (ns) the scheduler actually woke us up */ + uint64_t mostRecentWakeup; +}; + +struct roundRobinWorkQueue { + /** Common work queue bits */ + KvdoWorkQueue common; + /** Simple work queues, for actually getting stuff done */ + SimpleWorkQueue **serviceQueues; + /** Number of subordinate work queues */ + unsigned int numServiceQueues; + /** Padding for cache line separation */ + char pad[CACHE_LINE_BYTES - sizeof(unsigned int)]; + /** + * Rotor used for dispatching across subordinate service queues. + * + * Used and updated by submitting threads. (Not atomically or with locking, + * because we don't really care about it being precise, only about getting a + * roughly even spread; if an increment is missed here and there, it's not a + * problem.) + **/ + unsigned int serviceQueueRotor; +}; + +static inline SimpleWorkQueue *asSimpleWorkQueue(KvdoWorkQueue *queue) +{ + return ((queue == NULL) + ? NULL + : container_of(queue, SimpleWorkQueue, common)); +} + +static inline const SimpleWorkQueue * +asConstSimpleWorkQueue(const KvdoWorkQueue *queue) +{ + return ((queue == NULL) + ? NULL + : container_of(queue, SimpleWorkQueue, common)); +} + +static inline RoundRobinWorkQueue *asRoundRobinWorkQueue(KvdoWorkQueue *queue) +{ + return ((queue == NULL) + ? NULL + : container_of(queue, RoundRobinWorkQueue, common)); +} + +#endif // WORK_QUEUE_INTERNALS_H diff --git a/source/vdo/kernel/workQueueStats.c b/source/vdo/kernel/workQueueStats.c new file mode 100644 index 0000000..d5a38ae --- /dev/null +++ b/source/vdo/kernel/workQueueStats.c @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueStats.c#6 $ + */ + +#include "workQueueStats.h" + +#include "atomic.h" +#include "logger.h" +#include "workItemStats.h" +#include "workQueueInternals.h" + +/**********************************************************************/ +int initializeWorkQueueStats(KvdoWorkQueueStats *stats, + struct kobject *queueKObject) +{ + spin_lock_init(&stats->workItemStats.functionTable.lock); + if (ENABLE_PER_FUNCTION_TIMING_STATS) { + for (int i = 0; i < NUM_WORK_QUEUE_ITEM_STATS + 1; i++) { + initSimpleStats(&stats->workItemStats.times[i]); + } + } + + stats->queueTimeHistogram + = makeLogarithmicHistogram(queueKObject, "queue_time", + "Queue Time", "work items", "wait time", + "microseconds", 9); + if (stats->queueTimeHistogram == NULL) { + return -ENOMEM; + } + + stats->rescheduleQueueLengthHistogram + = makeLogarithmicHistogram(queueKObject, "reschedule_queue_length", + "Reschedule Queue Length", "calls", + "queued work items", NULL, 4); + if (stats->rescheduleQueueLengthHistogram == NULL) { + return -ENOMEM; + } + + stats->rescheduleTimeHistogram + = makeLogarithmicHistogram(queueKObject, "reschedule_time", + "Reschedule Time", "calls", + "sleep interval", "microseconds", 9); + if (stats->rescheduleTimeHistogram == NULL) { + return -ENOMEM; + } + + stats->runTimeBeforeRescheduleHistogram + = makeLogarithmicHistogram(queueKObject, "run_time_before_reschedule", + "Run Time Before Reschedule", + "calls", "run time", "microseconds", 9); + if (stats->runTimeBeforeRescheduleHistogram == NULL) { + return -ENOMEM; + } + + stats->scheduleTimeHistogram + = makeLogarithmicHistogram(queueKObject, "schedule_time", + "Schedule Time", + "calls", "sleep interval", "microseconds", 9); + if (stats->scheduleTimeHistogram == NULL) { + return -ENOMEM; + } + + stats->wakeupLatencyHistogram + = makeLogarithmicHistogram(queueKObject, "wakeup_latency", + "Wakeup Latency", + "wakeups", "latency", "microseconds", 9); + if (stats->wakeupLatencyHistogram == NULL) { + return -ENOMEM; + } + + stats->wakeupQueueLengthHistogram + = makeLogarithmicHistogram(queueKObject, "wakeup_queue_length", + "Wakeup Queue Length", "wakeups", + "queued work items", NULL, 4); + if (stats->wakeupQueueLengthHistogram == NULL) { + return -ENOMEM; + } + + return 0; +} + +/**********************************************************************/ +void cleanupWorkQueueStats(KvdoWorkQueueStats *stats) +{ + freeHistogram(&stats->queueTimeHistogram); + freeHistogram(&stats->rescheduleQueueLengthHistogram); + freeHistogram(&stats->rescheduleTimeHistogram); + freeHistogram(&stats->runTimeBeforeRescheduleHistogram); + freeHistogram(&stats->scheduleTimeHistogram); + freeHistogram(&stats->wakeupLatencyHistogram); + freeHistogram(&stats->wakeupQueueLengthHistogram); +} + +/**********************************************************************/ +static uint64_t getTotalProcessed(const SimpleWorkQueue *queue) +{ + uint64_t totalProcessed = 0; + for (int i = 0; i < NUM_WORK_QUEUE_ITEM_STATS + 1; i++) { + totalProcessed += queue->stats.workItemStats.times[i].count; + } + return totalProcessed; +} + +/**********************************************************************/ +void logWorkQueueStats(const SimpleWorkQueue *queue) +{ + uint64_t runtimeNS = 0; + if (queue->thread != NULL) { + runtimeNS += queue->thread->se.sum_exec_runtime; + } + + unsigned long nsPerWorkItem = 0; + uint64_t totalProcessed = getTotalProcessed(queue); + if (totalProcessed > 0) { + nsPerWorkItem = runtimeNS / totalProcessed; 
+ } + unsigned long runtimeMS = runtimeNS / 1000; + logInfo("workQ %" PRIptr " (%s) thread cpu usage %lu.%06lus, %" PRIu64 + " tasks, %lu.%03luus/task", + queue, + queue->common.name, + runtimeMS / 1000000, runtimeMS % 1000000, + totalProcessed, + nsPerWorkItem / 1000, nsPerWorkItem % 1000); +} + +/**********************************************************************/ +ssize_t formatRunTimeStats(const KvdoWorkQueueStats *stats, char *buffer) +{ + // Get snapshots of all three at approximately the same time. + uint64_t startTime = stats->startTime; + uint64_t runTime = atomic64_read(&stats->runTime); + uint64_t rescheduleTime = atomic64_read(&stats->rescheduleTime); + loadFence(); // rdtsc barrier + uint64_t now = currentTime(CLOCK_MONOTONIC); + uint64_t lifetime = now - startTime; + + return sprintf(buffer, + "%llu %llu %llu\n", + lifetime, runTime, rescheduleTime); +} diff --git a/source/vdo/kernel/workQueueStats.h b/source/vdo/kernel/workQueueStats.h new file mode 100644 index 0000000..914f5f4 --- /dev/null +++ b/source/vdo/kernel/workQueueStats.h @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueStats.h#2 $ + */ + +#ifndef WORK_QUEUE_STATS_H +#define WORK_QUEUE_STATS_H + +#include "workQueue.h" + +#include "timeUtils.h" + +#include "histogram.h" +#include "workItemStats.h" + +// Defined in workQueueInternals.h after inclusion of workQueueStats.h. +struct simpleWorkQueue; + +/* + * Tracking statistics. + * + * Cache line contention issues: + * + * In workItemStats, there are read-only fields accessed mostly by + * work submitters, then fields updated by the work submitters (for + * which there will be contention), then fields rarely if ever updated + * (more than two cache lines' worth), then fields updated only by the + * worker thread. The trailing fields here are updated only by the + * worker thread. + */ +typedef struct kvdoWorkQueueStats { + // Per-work-function counters and optional nanosecond timing data + KvdoWorkItemStats workItemStats; + // How often we go to sleep waiting for work + uint64_t waits; + + // Run time data, for monitoring utilization levels. + + // Thread start time, from which we can compute lifetime thus far. + uint64_t startTime; + /* + * Time the thread has not been blocked waiting for a new work item, + * nor in cond_resched(). This will include time the thread has been + * blocked by some kernel function invoked by the work functions + * (e.g., waiting for socket buffer space). + * + * This is not redundant with runTimeBeforeRescheduleHistogram, as + * the latter doesn't count run time not followed by a cond_resched + * call. + */ + atomic64_t runTime; + // Time the thread has been suspended via cond_resched(). 
+ // (Duplicates data hidden within rescheduleTimeHistogram.) + atomic64_t rescheduleTime; + + // Histogram of the queue times of work items (microseconds) + Histogram *queueTimeHistogram; + // How busy we are when cond_resched is called + Histogram *rescheduleQueueLengthHistogram; + // Histogram of the time cond_resched makes us sleep for (microseconds) + Histogram *rescheduleTimeHistogram; + // Histogram of the run time between cond_resched calls (microseconds) + Histogram *runTimeBeforeRescheduleHistogram; + // Histogram of the time schedule_timeout lets us sleep for (microseconds) + Histogram *scheduleTimeHistogram; + // How long from thread wakeup call to thread actually running (microseconds) + Histogram *wakeupLatencyHistogram; + // How much work is pending by the time we start running + Histogram *wakeupQueueLengthHistogram; +} KvdoWorkQueueStats; + +/** + * Initialize the work queue's statistics tracking. + * + * @param stats The statistics structure + * @param queueKObject The sysfs directory kobject for the work queue + * + * @return 0 or a kernel error code + **/ +int initializeWorkQueueStats(KvdoWorkQueueStats *stats, + struct kobject *queueKObject) + __attribute__((warn_unused_result)); + +/** + * Tear down any allocated storage or objects for statistics tracking. + * + * @param stats The statistics structure + **/ +void cleanupWorkQueueStats(KvdoWorkQueueStats *stats); + +/** + * Update the work queue statistics tracking to note the enqueueing of + * a work item. + * + * @param stats The statistics structure + * @param item The work item being enqueued + * @param priority The priority of the work item + **/ +static inline void updateStatsForEnqueue(KvdoWorkQueueStats *stats, + KvdoWorkItem *item, + int priority) +{ + updateWorkItemStatsForEnqueue(&stats->workItemStats, item, priority); + item->enqueueTime = currentTime(CLOCK_MONOTONIC); +} + +/** + * Update the work queue statistics tracking to note the dequeueing of + * a work item. + * + * @param stats The statistics structure + * @param item The work item being enqueued + **/ +static inline void updateStatsForDequeue(KvdoWorkQueueStats *stats, + KvdoWorkItem *item) +{ + updateWorkItemStatsForDequeue(&stats->workItemStats, item); + enterHistogramSample(stats->queueTimeHistogram, + (currentTime(CLOCK_MONOTONIC) - item->enqueueTime) / 1000); + item->enqueueTime = 0; +} + +/** + * Write the work queue's accumulated statistics to the kernel log. + * + * The queue pointer is needed so that its address and name can be + * logged along with the statistics. + * + * @param queue The work queue + **/ +void logWorkQueueStats(const struct simpleWorkQueue *queue); + +/** + * Format the thread lifetime, run time, and suspend time into a + * supplied buffer for reporting via sysfs. + * + * @param [in] stats The stats structure containing the run-time info + * @param [out] buffer The buffer in which to report the info + **/ +ssize_t formatRunTimeStats(const KvdoWorkQueueStats *stats, char *buffer); + +#endif // WORK_QUEUE_STATS_H diff --git a/source/vdo/kernel/workQueueSysfs.c b/source/vdo/kernel/workQueueSysfs.c new file mode 100644 index 0000000..f9dd9cb --- /dev/null +++ b/source/vdo/kernel/workQueueSysfs.c @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueSysfs.c#1 $ + */ + +#include "workQueueSysfs.h" + +#include + +#include "logger.h" +#include "memoryAlloc.h" + +#include "workQueueInternals.h" + +typedef struct workQueueAttribute { + struct attribute attr; + ssize_t (*show)(const KvdoWorkQueue *queue, char *buf); + ssize_t (*store)(KvdoWorkQueue *queue, const char *buf, size_t length); +} WorkQueueAttribute; + +/**********************************************************************/ +static ssize_t nameShow(const KvdoWorkQueue *queue, char *buf) +{ + return sprintf(buf, "%s\n", queue->name); +} + +/**********************************************************************/ +static ssize_t pidShow(const KvdoWorkQueue *queue, char *buf) +{ + return sprintf(buf, "%ld\n", + (long) atomic_read(&asConstSimpleWorkQueue(queue)->threadID)); +} + +/**********************************************************************/ +static ssize_t timesShow(const KvdoWorkQueue *queue, char *buf) +{ + return formatRunTimeStats(&asConstSimpleWorkQueue(queue)->stats, buf); +} + +/**********************************************************************/ +static ssize_t typeShow(const KvdoWorkQueue *queue, char *buf) +{ + strcpy(buf, queue->roundRobinMode ? "round-robin\n" : "simple\n"); + return strlen(buf); +} + +/**********************************************************************/ +static ssize_t workFunctionsShow(const KvdoWorkQueue *queue, char *buf) +{ + const SimpleWorkQueue *simpleQueue = asConstSimpleWorkQueue(queue); + return formatWorkItemStats(&simpleQueue->stats.workItemStats, buf, + PAGE_SIZE); +} + +/**********************************************************************/ +static WorkQueueAttribute nameAttr = { + .attr = { .name = "name", .mode = 0444, }, + .show = nameShow, +}; + +/**********************************************************************/ +static WorkQueueAttribute pidAttr = { + .attr = { .name = "pid", .mode = 0444, }, + .show = pidShow, +}; + +/**********************************************************************/ +static WorkQueueAttribute timesAttr = { + .attr = { .name = "times", .mode = 0444 }, + .show = timesShow, +}; + +/**********************************************************************/ +static WorkQueueAttribute typeAttr = { + .attr = { .name = "type", .mode = 0444, }, + .show = typeShow, +}; + +/**********************************************************************/ +static WorkQueueAttribute workFunctionsAttr = { + .attr = { .name = "work_functions", .mode = 0444, }, + .show = workFunctionsShow, +}; + +/**********************************************************************/ +static struct attribute *simpleWorkQueueAttrs[] = { + &nameAttr.attr, + &pidAttr.attr, + ×Attr.attr, + &typeAttr.attr, + &workFunctionsAttr.attr, + NULL, +}; + +/**********************************************************************/ +static struct attribute *roundRobinWorkQueueAttrs[] = { + &nameAttr.attr, + &typeAttr.attr, + NULL, +}; + +/**********************************************************************/ 
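+// Generic ->show dispatch for work queue sysfs attributes: recover the
+// WorkQueueAttribute wrapper and the owning KvdoWorkQueue from the kobject,
+// then invoke the attribute's show callback, if any.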
+static ssize_t workQueueAttrShow(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + WorkQueueAttribute *wqAttr = container_of(attr, WorkQueueAttribute, attr); + if (wqAttr->show == NULL) { + return -EINVAL; + } + KvdoWorkQueue *queue = container_of(kobj, KvdoWorkQueue, kobj); + return wqAttr->show(queue, buf); +} + +/**********************************************************************/ +static ssize_t workQueueAttrStore(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t length) +{ + WorkQueueAttribute *wqAttr = container_of(attr, WorkQueueAttribute, attr); + if (wqAttr->store == NULL) { + return -EINVAL; + } + KvdoWorkQueue *queue = container_of(kobj, KvdoWorkQueue, kobj); + return wqAttr->store(queue, buf, length); +} + +/**********************************************************************/ +static struct sysfs_ops workQueueSysfsOps = { + .show = workQueueAttrShow, + .store = workQueueAttrStore, +}; + +/**********************************************************************/ +static void workQueueRelease(struct kobject *kobj) +{ + KvdoWorkQueue *queue = container_of(kobj, KvdoWorkQueue, kobj); + FREE(queue->name); + if (queue->roundRobinMode) { + FREE(asRoundRobinWorkQueue(queue)); + } else { + FREE(asSimpleWorkQueue(queue)); + } +} + +/**********************************************************************/ +struct kobj_type simpleWorkQueueKobjType = { + .default_attrs = simpleWorkQueueAttrs, + .release = workQueueRelease, + .sysfs_ops = &workQueueSysfsOps, +}; + +/**********************************************************************/ +struct kobj_type roundRobinWorkQueueKobjType = { + .default_attrs = roundRobinWorkQueueAttrs, + .release = workQueueRelease, + .sysfs_ops = &workQueueSysfsOps, +}; diff --git a/source/vdo/kernel/workQueueSysfs.h b/source/vdo/kernel/workQueueSysfs.h new file mode 100644 index 0000000..41f6af5 --- /dev/null +++ b/source/vdo/kernel/workQueueSysfs.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2020 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * + * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueSysfs.h#1 $ + */ + +#ifndef WORK_QUEUE_SYSFS_H +#define WORK_QUEUE_SYSFS_H + +#include + +extern struct kobj_type roundRobinWorkQueueKobjType; +extern struct kobj_type simpleWorkQueueKobjType; + +#endif // WORK_QUEUE_SYSFS_H diff --git a/uds/Makefile b/uds/Makefile deleted file mode 100644 index 5afc64a..0000000 --- a/uds/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -UDS_VERSION = 8.0.2.4 - -SOURCES = $(notdir $(wildcard $(src)/*.c)) murmur/MurmurHash3.c -SOURCES += $(addprefix util/,$(notdir $(wildcard $(src)/util/*.c))) -OBJECTS = $(SOURCES:%.c=%.o) -INCLUDES = -I$(src) - -EXTRA_CFLAGS = -std=gnu99 \ - -fno-builtin-memset \ - -Werror \ - -Wframe-larger-than=400 \ - -Wno-declaration-after-statement \ - -DUDS_VERSION=\"$(UDS_VERSION)\" \ - $(INCLUDES) - -CFLAGS_REMOVE_deltaIndex.o = -std=gnu99 -CFLAGS_REMOVE_masterIndex005.o = -std=gnu99 - -obj-m += uds.o - -uds-objs = $(OBJECTS) diff --git a/uds/atomicDefs.h b/uds/atomicDefs.h deleted file mode 100644 index 0c82bca..0000000 --- a/uds/atomicDefs.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/kernelLinux/uds/atomicDefs.h#2 $ - */ - -#ifndef LINUX_KERNEL_ATOMIC_DEFS_H -#define LINUX_KERNEL_ATOMIC_DEFS_H - -#include - -#endif /* LINUX_KERNEL_ATOMIC_DEFS_H */ diff --git a/uds/bits.c b/uds/bits.c deleted file mode 100644 index eea4912..0000000 --- a/uds/bits.c +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/bits.c#1 $ - */ - -#include "bits.h" - -#include "compiler.h" - -/** - * This is the largest field size supported by getBigField & setBigField. - * Any field that is larger is not guaranteed to fit in a single, byte - * aligned uint64_t. 
- **/ -enum { MAX_BIG_FIELD_BITS = (sizeof(uint64_t) - 1) * CHAR_BIT + 1 }; - -/** - * Get a big bit field from a bit stream - * - * @param memory The base memory byte address - * @param offset The bit offset into the memory for the start of the field - * @param size The number of bits in the field - * - * @return the bit field - **/ -static INLINE uint64_t getBigField(const byte *memory, - uint64_t offset, - int size) -{ - const void *addr = memory + offset / CHAR_BIT; - return (getUInt64LE(addr) >> (offset % CHAR_BIT)) & ((1UL << size) - 1); -} - -/** - * Set a big bit field in a bit stream - * - * @param value The value to put into the field - * @param memory The base memory byte address - * @param offset The bit offset into the memory for the start of the field - * @param size The number of bits in the field - * - * @return the bit field - **/ -static INLINE void setBigField(uint64_t value, byte *memory, uint64_t offset, - int size) -{ - void *addr = memory + offset / CHAR_BIT; - int shift = offset % CHAR_BIT; - uint64_t data = getUInt64LE(addr); - data &= ~(((1UL << size) - 1) << shift); - data |= value << shift; - storeUInt64LE(addr, data); -} - -/***********************************************************************/ -void getBytes(const byte *memory, uint64_t offset, byte *destination, int size) -{ - const byte *addr = memory + offset / CHAR_BIT; - int shift = offset % CHAR_BIT; - while (--size >= 0) { - *destination++ = getUInt16LE(addr++) >> shift; - } -} - -/***********************************************************************/ -void setBytes(byte *memory, uint64_t offset, const byte *source, int size) -{ - byte *addr = memory + offset / CHAR_BIT; - int shift = offset % CHAR_BIT; - uint16_t mask = ~((uint16_t) 0xFF << shift); - while (--size >= 0) { - uint16_t data = (getUInt16LE(addr) & mask) | (*source++ << shift); - storeUInt16LE(addr++, data); - } -} - -/***********************************************************************/ -void moveBits(const byte *sMemory, uint64_t source, byte *dMemory, - uint64_t destination, int size) -{ - enum { UINT32_BIT = sizeof(uint32_t) * CHAR_BIT }; - if (size > MAX_BIG_FIELD_BITS) { - if (source > destination) { - // This is a large move from a higher to a lower address. We move - // the lower addressed bits first. Start by moving one field that - // ends on a destination int boundary - int count - = MAX_BIG_FIELD_BITS - (destination + MAX_BIG_FIELD_BITS) % UINT32_BIT; - uint64_t field = getBigField(sMemory, source, count); - setBigField(field, dMemory, destination, count); - source += count; - destination += count; - size -= count; - // Now do the main loop to copy 32 bit chunks that are int-aligned - // at the destination. - int offset = source % UINT32_BIT; - const byte *src = sMemory + (source - offset) / CHAR_BIT; - byte *dest = dMemory + destination / CHAR_BIT; - while (size > MAX_BIG_FIELD_BITS) { - storeUInt32LE(dest, getUInt64LE(src) >> offset); - src += sizeof(uint32_t); - dest += sizeof(uint32_t); - source += UINT32_BIT; - destination += UINT32_BIT; - size -= UINT32_BIT; - } - } else { - // This is a large move from a lower to a higher address. We move - // the higher addressed bits first. 
Start by moving one field that - // begins on a destination int boundary - int count = (destination + size) % UINT32_BIT; - if (count > 0) { - size -= count; - uint64_t field = getBigField(sMemory, source + size, count); - setBigField(field, dMemory, destination + size, count); - } - // Now do the main loop to copy 32 bit chunks that are int-aligned - // at the destination. - int offset = (source + size) % UINT32_BIT; - const byte *src = sMemory + (source + size - offset) / CHAR_BIT; - byte *dest = dMemory + (destination + size) / CHAR_BIT; - while (size > MAX_BIG_FIELD_BITS) { - src -= sizeof(uint32_t); - dest -= sizeof(uint32_t); - size -= UINT32_BIT; - storeUInt32LE(dest, getUInt64LE(src) >> offset); - } - } - } - // Finish up by doing the last chunk, which can have any arbitrary alignment - if (size > 0) { - uint64_t field = getBigField(sMemory, source, size); - setBigField(field, dMemory, destination, size); - } -} - -/***********************************************************************/ -bool sameBits(const byte *mem1, uint64_t offset1, const byte *mem2, - uint64_t offset2, int size) -{ - while (size >= MAX_FIELD_BITS) { - unsigned int field1 = getField(mem1, offset1, MAX_FIELD_BITS); - unsigned int field2 = getField(mem2, offset2, MAX_FIELD_BITS); - if (field1 != field2) return false; - offset1 += MAX_FIELD_BITS; - offset2 += MAX_FIELD_BITS; - size -= MAX_FIELD_BITS; - } - if (size > 0) { - unsigned int field1 = getField(mem1, offset1, size); - unsigned int field2 = getField(mem2, offset2, size); - if (field1 != field2) return false; - } - return true; -} diff --git a/uds/bits.h b/uds/bits.h deleted file mode 100644 index 2c2d4ea..0000000 --- a/uds/bits.h +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/bits.h#1 $ - */ - -#ifndef BITS_H -#define BITS_H 1 - -#include "compiler.h" -#include "numeric.h" -#include "typeDefs.h" - -/* - * These bit stream and bit field utility routines are used for the - * non-byte aligned delta indices. - * - * Bits and bytes are numbered in little endian order. For example: Within - * a byte, bit 0 is the least significant bit (0x1), and bit 7 is the most - * significant bit (0x80). Within a bit stream, bit 7 is the most - * signficant bit of byte 0, and bit 8 is the least significant bit of byte - * 1. Within a byte array, a byte's number corresponds to it's index in - * the array. - * - * The implementation assumes that the native machine is little endian, and - * that performance is very important. These assumptions match our current - * operating environment. - */ - -/** - * This is the largest field size supported by getField & setField. Any - * field that is larger is not guaranteed to fit in a single, byte aligned - * uint32_t. 
- **/ -enum { MAX_FIELD_BITS = (sizeof(uint32_t) - 1) * CHAR_BIT + 1 }; - -/** - * This is the number of guard bytes needed at the end of the memory byte - * array when using the bit utilities. 3 bytes are needed when getField & - * setField access a field, because they will access some "extra" bytes - * past the end of the field. And 7 bytes are needed when getBigField & - * setBigField access a big field, for the same reason. Note that moveBits - * calls getBigField & setBigField. 7 is rewritten to make it clear how it - * is derived. - **/ -enum { POST_FIELD_GUARD_BYTES = sizeof(uint64_t) - 1 }; - -/** - * Get a bit field from a bit stream - * - * @param memory The base memory byte address - * @param offset The bit offset into the memory for the start of the field - * @param size The number of bits in the field - * - * @return the bit field - **/ -static INLINE unsigned int getField(const byte *memory, uint64_t offset, - int size) -{ - const void *addr = memory + offset / CHAR_BIT; - return (getUInt32LE(addr) >> (offset % CHAR_BIT)) & ((1 << size) - 1); -} - -/** - * Set a bit field in a bit stream - * - * @param value The value to put into the field - * @param memory The base memory byte address - * @param offset The bit offset into the memory for the start of the field - * @param size The number of bits in the field - * - * @return the bit field - **/ -static INLINE void setField(unsigned int value, byte *memory, uint64_t offset, - int size) -{ - void *addr = memory + offset / CHAR_BIT; - int shift = offset % CHAR_BIT; - uint32_t data = getUInt32LE(addr); - data &= ~(((1 << size) - 1) << shift); - data |= value << shift; - storeUInt32LE(addr, data); -} - -/** - * Set a bit field in a bit stream to all ones - * - * @param memory The base memory byte address - * @param offset The bit offset into the memory for the start of the field - * @param size The number of bits in the field - * - * @return the bit field - **/ -static INLINE void setOne(byte *memory, uint64_t offset, int size) -{ - if (size > 0) { - byte *addr = memory + offset / CHAR_BIT; - int shift = offset % CHAR_BIT; - int count = size + shift > CHAR_BIT ? CHAR_BIT - shift : size; - *addr++ |= ((1 << count) - 1) << shift; - for (size -= count; size > CHAR_BIT; size -= CHAR_BIT) { - *addr++ = 0xFF; - } - if (size) { - *addr |= ~(0xFF << size); - } - } -} - -/** - * Set a bit field in a bit stream to all zeros - * - * @param memory The base memory byte address - * @param offset The bit offset into the memory for the start of the field - * @param size The number of bits in the field - * - * @return the bit field - **/ -static INLINE void setZero(byte *memory, uint64_t offset, int size) -{ - if (size > 0) { - byte *addr = memory + offset / CHAR_BIT; - int shift = offset % CHAR_BIT; - int count = size + shift > CHAR_BIT ? CHAR_BIT - shift : size; - *addr++ &= ~(((1 << count) - 1) << shift); - for (size -= count; size > CHAR_BIT; size -= CHAR_BIT) { - *addr++ = 0; - } - if (size) { - *addr &= 0xFF << size; - } - } -} - -/** - * Get a byte stream from a bit stream, reading a whole number of bytes - * from an arbitrary bit boundary. 
- * - * @param memory The base memory byte address for the bit stream - * @param offset The bit offset of the start of the bit stream - * @param destination Where to store the bytes - * @param size The number of bytes - **/ -void getBytes(const byte *memory, uint64_t offset, byte *destination, int size); - -/** - * Store a byte stream into a bit stream, writing a whole number of bytes - * to an arbitrary bit boundary. - * - * @param memory The base memory byte address for the bit stream - * @param offset The bit offset of the start of the bit stream - * @param source Where to read the bytes - * @param size The number of bytes - **/ -void setBytes(byte *memory, uint64_t offset, const byte *source, int size); - -/** - * Move bits from one field to another. When the fields overlap, behave as - * if we first move all the bits from the source to a temporary value, and - * then move all the bits from the temporary value to the destination. - * - * @param sMemory The base source memory byte address - * @param source Bit offset into memory for the source start - * @param dMemory The base destination memory byte address - * @param destination Bit offset into memory for the destination start - * @param size The number of bits in the field - **/ -void moveBits(const byte *sMemory, uint64_t source, byte *dMemory, - uint64_t destination, int size); - -/** - * Compare bits from one field to another, testing for sameness - * - * @param mem1 The base memory byte address (first field) - * @param offset1 Bit offset into the memory for the start (first field) - * @param mem2 The base memory byte address (second field) - * @param offset2 Bit offset into the memory for the start (second field) - * @param size The number of bits in the field - * - * @return true if fields are the same, false if different - **/ -bool sameBits(const byte *mem1, uint64_t offset1, const byte *mem2, - uint64_t offset2, int size) - __attribute__((warn_unused_result)); - -#endif /* BITS_H */ diff --git a/uds/buffer.c b/uds/buffer.c deleted file mode 100644 index 2bf6d20..0000000 --- a/uds/buffer.c +++ /dev/null @@ -1,596 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/jasper/src/uds/buffer.c#3 $ - */ - -#include "buffer.h" - -#include "bufferPrivate.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "numeric.h" -#include "permassert.h" -#include "typeDefs.h" - -/**********************************************************************/ -int wrapBuffer(byte *bytes, - size_t length, - size_t contentLength, - Buffer **bufferPtr) -{ - int result = ASSERT((contentLength <= length), - "content length, %zu, fits in buffer size, %zu", - length, contentLength); - Buffer *buffer; - result = ALLOCATE(1, Buffer, "buffer", &buffer); - if (result != UDS_SUCCESS) { - return result; - } - - buffer->data = bytes; - buffer->start = 0; - buffer->end = contentLength; - buffer->length = length; - buffer->wrapped = true; - - *bufferPtr = buffer; - return UDS_SUCCESS; -} - -/***********************************************************************/ -int makeBuffer(size_t size, Buffer **newBuffer) -{ - byte *data; - int result = ALLOCATE(size, byte, "buffer data", &data); - if (result != UDS_SUCCESS) { - return result; - } - - Buffer *buffer; - result = wrapBuffer(data, size, 0, &buffer); - if (result != UDS_SUCCESS) { - FREE(data); - return result; - } - - buffer->wrapped = false; - *newBuffer = buffer; - return UDS_SUCCESS; -} - -/***********************************************************************/ -void freeBuffer(Buffer **pBuffer) -{ - Buffer *buffer = *pBuffer; - *pBuffer = NULL; - if (buffer == NULL) { - return; - } - if (!buffer->wrapped) { - FREE(buffer->data); - } - FREE(buffer); -} - -/**********************************************************************/ -size_t bufferLength(Buffer *buffer) -{ - return buffer->length; -} - -/**********************************************************************/ -size_t contentLength(Buffer *buffer) -{ - return buffer->end - buffer->start; -} - -/**********************************************************************/ -size_t uncompactedAmount(Buffer *buffer) -{ - return buffer->start; -} - -/**********************************************************************/ -size_t availableSpace(Buffer *buffer) -{ - return buffer->length - buffer->end; -} - -/**********************************************************************/ -size_t bufferUsed(Buffer *buffer) -{ - return buffer->end; -} - -/***********************************************************************/ -int growBuffer(Buffer *buffer, size_t length) -{ - if (buffer == NULL) { - return logWarningWithStringError(UDS_INVALID_ARGUMENT, - "cannot resize NULL buffer"); - } - - if (buffer->wrapped) { - return logWarningWithStringError(UDS_INVALID_ARGUMENT, - "cannot resize wrapped buffer"); - } - if (buffer->end > length) { - return logWarningWithStringError(UDS_INVALID_ARGUMENT, - "cannot shrink buffer"); - } - - byte *data; - int result = reallocateMemory(buffer->data, buffer->length, length, - "buffer data", &data); - if (result != UDS_SUCCESS) { - return result; - } - - buffer->data = data; - buffer->length = length; - return UDS_SUCCESS; -} - -/***********************************************************************/ -bool ensureAvailableSpace(Buffer *buffer, size_t bytes) -{ - if (availableSpace(buffer) >= bytes) { - return true; - } - compactBuffer(buffer); - return (availableSpace(buffer) >= bytes); -} - -/***********************************************************************/ -void clearBuffer(Buffer *buffer) -{ - buffer->start = 0; - buffer->end = buffer->length; -} - 
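The routines above cover a Buffer's storage management: makeBuffer() allocates backing memory, wrapBuffer() adopts caller-owned memory, growBuffer() enlarges only unwrapped buffers, and ensureAvailableSpace() falls back to compaction before giving up. A minimal lifecycle sketch, offered only as an illustration and assuming the usual UDS headers (UDS_SUCCESS and the byte type come in through buffer.h / common.h):

    #include "buffer.h"

    static void bufferLifecycleExample(void)
    {
      Buffer *buffer = NULL;
      if (makeBuffer(64, &buffer) != UDS_SUCCESS) {
        return;
      }
      /* growBuffer() applies only to buffers allocated by makeBuffer();
       * wrapped buffers cannot be resized, and buffers never shrink. */
      if (growBuffer(buffer, 128) == UDS_SUCCESS) {
        clearBuffer(buffer);  /* start = 0, end = length (now 128) */
      }
      freeBuffer(&buffer);    /* frees the backing memory too, since it was not wrapped */
    }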
-/***********************************************************************/ -void compactBuffer(Buffer *buffer) -{ - if ((buffer->start == 0) || (buffer->end == 0)) { - return; - } - size_t bytesToMove = buffer->end - buffer->start; - memmove(buffer->data, buffer->data + buffer->start, bytesToMove); - buffer->start = 0; - buffer->end = bytesToMove; -} - -/**********************************************************************/ -int resetBufferEnd(Buffer *buffer, size_t end) -{ - if (end > buffer->length) { - return UDS_BUFFER_ERROR; - } - buffer->end = end; - if (buffer->start > buffer->end) { - buffer->start = buffer->end; - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int skipForward(Buffer *buffer, size_t bytesToSkip) -{ - if (contentLength(buffer) < bytesToSkip) { - return UDS_BUFFER_ERROR; - } - - buffer->start += bytesToSkip; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int rewindBuffer(Buffer *buffer, size_t bytesToRewind) -{ - if (buffer->start < bytesToRewind) { - return UDS_BUFFER_ERROR; - } - - buffer->start -= bytesToRewind; - return UDS_SUCCESS; -} - -/**********************************************************************/ -bool hasSameBytes(Buffer *buffer, const byte *data, size_t length) -{ - return ((contentLength(buffer) >= length) - && (memcmp(buffer->data + buffer->start, data, length) == 0)); -} - -/**********************************************************************/ -bool equalBuffers(Buffer *buffer1, Buffer *buffer2) -{ - return hasSameBytes(buffer1, buffer2->data + buffer2->start, - contentLength(buffer2)); -} - -/**********************************************************************/ -int getByte(Buffer *buffer, byte *bytePtr) -{ - if (contentLength(buffer) < sizeof(byte)) { - return UDS_BUFFER_ERROR; - } - - *bytePtr = buffer->data[buffer->start++]; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int peekByte(Buffer *buffer, size_t offset, byte *bytePtr) -{ - if (contentLength(buffer) < (offset + sizeof(byte))) { - return UDS_BUFFER_ERROR; - } - - *bytePtr = buffer->data[buffer->start + offset]; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int putByte(Buffer *buffer, byte b) -{ - if (!ensureAvailableSpace(buffer, sizeof(byte))) { - return UDS_BUFFER_ERROR; - } - - buffer->data[buffer->end++] = b; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int getBytesFromBuffer(Buffer *buffer, size_t length, void *destination) -{ - if (contentLength(buffer) < length) { - return UDS_BUFFER_ERROR; - } - - memcpy(destination, buffer->data + buffer->start, length); - buffer->start += length; - return UDS_SUCCESS; -} - -/**********************************************************************/ -byte *getBufferContents(Buffer *buffer) -{ - return buffer->data + buffer->start; -} - -/**********************************************************************/ -int copyBytes(Buffer *buffer, size_t length, byte **destinationPtr) -{ - byte *destination; - int result = ALLOCATE(length, byte, "copyBytes() buffer", - &destination); - if (result != UDS_SUCCESS) { - return result; - } - - result = getBytesFromBuffer(buffer, length, destination); - if (result != UDS_SUCCESS) { - FREE(destination); - } else { - *destinationPtr = destination; - } - return result; -} - 
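The accessors above make the two-pointer discipline concrete: put* calls append at the end pointer, get* calls consume from the start pointer, and compactBuffer() slides any unread content back to the front so later writes can reuse the consumed space. A hedged round-trip sketch built only from functions defined in this file:

    #include "buffer.h"

    static void bufferRoundTripExample(void)
    {
      Buffer *buffer = NULL;
      if (makeBuffer(8, &buffer) != UDS_SUCCESS) {
        return;
      }
      byte value = 0;
      if ((putByte(buffer, 0x2A) == UDS_SUCCESS)         /* appends at the end pointer    */
          && (getByte(buffer, &value) == UDS_SUCCESS)) {  /* consumes from the start pointer */
        /* value is now 0x2A; the consumed byte shows up as uncompactedAmount(). */
        compactBuffer(buffer);                            /* reclaim the consumed space */
      }
      freeBuffer(&buffer);
    }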
-/**********************************************************************/ -int putBytes(Buffer *buffer, size_t length, const void *source) -{ - if (!ensureAvailableSpace(buffer, length)) { - return UDS_BUFFER_ERROR; - } - memcpy(buffer->data + buffer->end, source, length); - buffer->end += length; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int putBuffer(Buffer *target, Buffer *source, size_t length) -{ - if (contentLength(source) < length) { - return UDS_BUFFER_ERROR; - } - - int result = putBytes(target, length, getBufferContents(source)); - if (result != UDS_SUCCESS) { - return result; - } - - source->start += length; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int zeroBytes(Buffer *buffer, size_t length) -{ - if (!ensureAvailableSpace(buffer, length)) { - return UDS_BUFFER_ERROR; - } - memset(buffer->data + buffer->end, 0, length); - buffer->end += length; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int getBoolean(Buffer *buffer, bool *b) -{ - byte by; - int result = getByte(buffer, &by); - if (result == UDS_SUCCESS) { - *b = (by == 1); - } - return result; -} - -/**********************************************************************/ -int putBoolean(Buffer *buffer, bool b) -{ - return putByte(buffer, (byte) (b ? 1 : 0)); -} - -/**********************************************************************/ -int getUInt16BEFromBuffer(Buffer *buffer, uint16_t *ui) -{ - if (contentLength(buffer) < sizeof(uint16_t)) { - return UDS_BUFFER_ERROR; - } - - decodeUInt16BE(buffer->data, &buffer->start, ui); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int putUInt16BEIntoBuffer(Buffer *buffer, uint16_t ui) -{ - if (!ensureAvailableSpace(buffer, sizeof(uint16_t))) { - return UDS_BUFFER_ERROR; - } - - encodeUInt16BE(buffer->data, &buffer->end, ui); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int getUInt32BEFromBuffer(Buffer *buffer, uint32_t *ui) -{ - if (contentLength(buffer) < sizeof(uint32_t)) { - return UDS_BUFFER_ERROR; - } - - decodeUInt32BE(buffer->data, &buffer->start, ui); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int putUInt32BEIntoBuffer(Buffer *buffer, uint32_t ui) -{ - if (!ensureAvailableSpace(buffer, sizeof(uint32_t))) { - return UDS_BUFFER_ERROR; - } - - encodeUInt32BE(buffer->data, &buffer->end, ui); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int getUInt32BEsFromBuffer(Buffer *buffer, size_t count, uint32_t *ui) -{ - if (contentLength(buffer) < (sizeof(uint32_t) * count)) { - return UDS_BUFFER_ERROR; - } - - unsigned int i; - for (i = 0; i < count; i++) { - decodeUInt32BE(buffer->data, &buffer->start, ui + i); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int putUInt32BEsIntoBuffer(Buffer *buffer, size_t count, const uint32_t *ui) -{ - if (!ensureAvailableSpace(buffer, sizeof(uint32_t) * count)) { - return UDS_BUFFER_ERROR; - } - - unsigned int i; - for (i = 0; i < count; i++) { - encodeUInt32BE(buffer->data, &buffer->end, ui[i]); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int getUInt64BEsFromBuffer(Buffer *buffer, size_t count, uint64_t *ui) -{ - if 
(contentLength(buffer) < (sizeof(uint64_t) * count)) { - return UDS_BUFFER_ERROR; - } - - unsigned int i; - for (i = 0; i < count; i++) { - decodeUInt64BE(buffer->data, &buffer->start, ui + i); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int putUInt64BEsIntoBuffer(Buffer *buffer, size_t count, const uint64_t *ui) -{ - if (!ensureAvailableSpace(buffer, sizeof(uint64_t) * count)) { - return UDS_BUFFER_ERROR; - } - - unsigned int i; - for (i = 0; i < count; i++) { - encodeUInt64BE(buffer->data, &buffer->end, ui[i]); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int getUInt16LEFromBuffer(Buffer *buffer, uint16_t *ui) -{ - if (contentLength(buffer) < sizeof(uint16_t)) { - return UDS_BUFFER_ERROR; - } - - decodeUInt16LE(buffer->data, &buffer->start, ui); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int putUInt16LEIntoBuffer(Buffer *buffer, uint16_t ui) -{ - if (!ensureAvailableSpace(buffer, sizeof(uint16_t))) { - return UDS_BUFFER_ERROR; - } - - encodeUInt16LE(buffer->data, &buffer->end, ui); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int getUInt16LEsFromBuffer(Buffer *buffer, size_t count, uint16_t *ui) -{ - if (contentLength(buffer) < (sizeof(uint16_t) * count)) { - return UDS_BUFFER_ERROR; - } - - unsigned int i; - for (i = 0; i < count; i++) { - decodeUInt16LE(buffer->data, &buffer->start, ui + i); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int putUInt16LEsIntoBuffer(Buffer *buffer, size_t count, const uint16_t *ui) -{ - if (!ensureAvailableSpace(buffer, sizeof(uint16_t) * count)) { - return UDS_BUFFER_ERROR; - } - - unsigned int i; - for (i = 0; i < count; i++) { - encodeUInt16LE(buffer->data, &buffer->end, ui[i]); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int getInt32LEFromBuffer(Buffer *buffer, int32_t *i) -{ - if (contentLength(buffer) < sizeof(int32_t)) { - return UDS_BUFFER_ERROR; - } - - decodeInt32LE(buffer->data, &buffer->start, i); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int getUInt32LEFromBuffer(Buffer *buffer, uint32_t *ui) -{ - if (contentLength(buffer) < sizeof(uint32_t)) { - return UDS_BUFFER_ERROR; - } - - decodeUInt32LE(buffer->data, &buffer->start, ui); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int putUInt32LEIntoBuffer(Buffer *buffer, uint32_t ui) -{ - if (!ensureAvailableSpace(buffer, sizeof(uint32_t))) { - return UDS_BUFFER_ERROR; - } - - encodeUInt32LE(buffer->data, &buffer->end, ui); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int putInt64LEIntoBuffer(Buffer *buffer, int64_t i) -{ - if (!ensureAvailableSpace(buffer, sizeof(int64_t))) { - return UDS_BUFFER_ERROR; - } - - encodeInt64LE(buffer->data, &buffer->end, i); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int getUInt64LEFromBuffer(Buffer *buffer, uint64_t *ui) -{ - if (contentLength(buffer) < sizeof(uint64_t)) { - return UDS_BUFFER_ERROR; - } - - decodeUInt64LE(buffer->data, &buffer->start, ui); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int 
putUInt64LEIntoBuffer(Buffer *buffer, uint64_t ui) -{ - if (!ensureAvailableSpace(buffer, sizeof(uint64_t))) { - return UDS_BUFFER_ERROR; - } - - encodeUInt64LE(buffer->data, &buffer->end, ui); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int getUInt64LEsFromBuffer(Buffer *buffer, size_t count, uint64_t *ui) -{ - if (contentLength(buffer) < (sizeof(uint64_t) * count)) { - return UDS_BUFFER_ERROR; - } - - unsigned int i; - for (i = 0; i < count; i++) { - decodeUInt64LE(buffer->data, &buffer->start, ui + i); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int putUInt64LEsIntoBuffer(Buffer *buffer, size_t count, const uint64_t *ui) -{ - if (!ensureAvailableSpace(buffer, sizeof(uint64_t) * count)) { - return UDS_BUFFER_ERROR; - } - - unsigned int i; - for (i = 0; i < count; i++) { - encodeUInt64LE(buffer->data, &buffer->end, ui[i]); - } - return UDS_SUCCESS; -} - diff --git a/uds/buffer.h b/uds/buffer.h deleted file mode 100644 index 22df042..0000000 --- a/uds/buffer.h +++ /dev/null @@ -1,614 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/buffer.h#2 $ - */ - -#ifndef BUFFER_H -#define BUFFER_H - -#include "common.h" - -typedef struct buffer Buffer; - -/** - * Create a buffer which wraps an existing byte array. - * - * @param bytes The bytes to wrap - * @param length The length of the buffer - * @param contentLength The length of the current contents of the buffer - * @param bufferPtr A pointer to hold the buffer - * - * @return UDS_SUCCESS or an error code - **/ -int wrapBuffer(byte *bytes, - size_t length, - size_t contentLength, - Buffer **bufferPtr) - __attribute__((warn_unused_result)); - -/** - * Create a new buffer and allocate its memory. - * - * @param length The length of the buffer - * @param bufferPtr A pointer to hold the buffer - * - * @return UDS_SUCCESS or an error code - **/ -int makeBuffer(size_t length, Buffer **bufferPtr) - __attribute__((warn_unused_result)); - -/** - * Release a buffer and, if not wrapped, free its memory. - * - * @param pBuffer Pointer to the buffer to release - **/ -void freeBuffer(Buffer **pBuffer); - -/** - * Grow a non-wrapped buffer. - * - * @param buffer The buffer to resize - * @param length The new length of the buffer - * - * @return UDS_SUCCESS or an error code - **/ -int growBuffer(Buffer *buffer, size_t length) - __attribute__((warn_unused_result)); - -/** - * Ensure that a buffer has a given amount of space available, compacting the - * buffer if necessary. 
- * - * @param buffer The buffer - * @param bytes The number of available bytes desired - * - * @return true if the requested number of bytes are now available - **/ -bool ensureAvailableSpace(Buffer *buffer, size_t bytes) - __attribute__((warn_unused_result)); - -/** - * Clear the buffer. The start position is set to zero and the end position - * is set to the buffer length. - **/ -void clearBuffer(Buffer *buffer); - -/** - * Eliminate buffer contents which have been extracted. This function copies - * any data between the start and end pointers to the beginning of the buffer, - * moves the start pointer to the beginning, and the end pointer to the end - * of the copied data. - * - * @param buffer The buffer to compact - **/ -void compactBuffer(Buffer *buffer); - -/** - * Skip forward the specified number of bytes in a buffer (advance the - * start pointer). - * - * @param buffer The buffer - * @param bytesToSkip The number of bytes to skip - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the buffer is not long - * enough to skip forward the requested number of bytes - **/ -int skipForward(Buffer *buffer, size_t bytesToSkip) - __attribute__((warn_unused_result)); - -/** - * Rewind the specified number of bytes in a buffer (back up the start - * pointer). - * - * @param buffer The buffer - * @param bytesToRewind The number of bytes to rewind - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the buffer is not long - * enough to rewind backward the requested number of bytes - **/ -int rewindBuffer(Buffer *buffer, size_t bytesToRewind) - __attribute__((warn_unused_result)); - -/** - * Return the length of the buffer. - * - * @param buffer the buffer - * - * @return the buffer length - **/ -size_t bufferLength(Buffer *buffer); - -/** - * Compute the amount of data current in the buffer. - * - * @param buffer The buffer to examine - * - * @return The number of bytes between the start and end pointers of the buffer - **/ -size_t contentLength(Buffer *buffer); - -/** - * Compute the amount of available space in this buffer. - * - * @param buffer The buffer to examine - * - * @return The number of bytes between the end pointer and the end of the buffer - **/ -size_t availableSpace(Buffer *buffer); - -/** - * Amount of buffer that has already been processed. - * - * @param buffer the buffer to examine - * - * @return The number of bytes between the beginning of the buffer and the - * start pointer. - **/ -size_t uncompactedAmount(Buffer *buffer); - -/** - * Return the amount of the buffer that is currently utilized. - * - * @param buffer the buffer to examine - * - * @return The number of bytes between the beginning of the buffer and - * the end pointer. - **/ -size_t bufferUsed(Buffer *buffer); - -/** - * Reset the end of buffer to a different position. - * - * @param buffer the buffer - * @param end the new end of the buffer - * - * @return UDS_SUCCESS unless the end is larger than can fit - **/ -int resetBufferEnd(Buffer *buffer, size_t end) - __attribute__((warn_unused_result)); - -/** - * Check whether the start of the content of a buffer matches a specified - * array of bytes. - * - * @param buffer The buffer to check - * @param data The desired data - * @param length The length of the desired data - * - * @return true if the first length bytes of the buffer's - * contents match data - **/ -bool hasSameBytes(Buffer *buffer, const byte *data, size_t length) - __attribute__((warn_unused_result)); - -/** - * Check whether two buffers have the same contents. 
- * - * @param buffer1 The first buffer - * @param buffer2 The second buffer - * - * @return true if the contents of the two buffers are the - * same - **/ -bool equalBuffers(Buffer *buffer1, Buffer *buffer2); - -/** - * Get a single byte from a buffer and advance the start pointer. - * - * @param buffer The buffer - * @param bytePtr A pointer to hold the byte - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are no bytes to - * retrieve - **/ -int getByte(Buffer *buffer, byte *bytePtr) __attribute__((warn_unused_result)); - -/** - * Get a single byte from a buffer without advancing the start pointer. - * - * @param buffer The buffer - * @param offset The offset past the start pointer of the desired byte - * @param bytePtr A pointer to hold the byte - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the offset is past the end - * of the buffer - **/ -int peekByte(Buffer *buffer, size_t offset, byte *bytePtr) - __attribute__((warn_unused_result)); - -/** - * Put a single byte into a buffer and advance the end pointer. - * - * @param buffer The buffer - * @param b The byte to put - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is no space in the buffer - **/ -int putByte(Buffer *buffer, byte b) __attribute__((warn_unused_result)); - -/** - * Get bytes out of a buffer and advance the start of the buffer past the - * copied data. - * - * @param buffer The buffer from which to copy - * @param length The number of bytes to copy - * @param destination A pointer to hold the data - * - * @return UDS_SUCCESS or an error code - **/ -int getBytesFromBuffer(Buffer *buffer, size_t length, void *destination) - __attribute__((warn_unused_result)); - -/** - * Get a pointer to the current contents of the buffer. This will be a pointer - * to the actual memory managed by the buffer. It is the caller's responsibility - * to ensure that the buffer is not modified while this pointer is in use. - * - * @param buffer The buffer from which to get the contents - * - * @return a pointer to the current contents of the buffer - **/ -byte *getBufferContents(Buffer *buffer); - -/** - * Copy bytes out of a buffer and advance the start of the buffer past the - * copied data. Memory will be allocated to hold the copy. - * - * @param buffer The buffer from which to copy - * @param length The number of bytes to copy - * @param destinationPtr A pointer to hold the copied data - * - * @return UDS_SUCCESS or an error code - **/ -int copyBytes(Buffer *buffer, size_t length, byte **destinationPtr) - __attribute__((warn_unused_result)); - -/** - * Copy bytes into a buffer and advance the end of the buffer past the - * copied data. - * - * @param buffer The buffer to copy into - * @param length The length of the data to copy - * @param source The data to copy - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the buffer does not have - * length bytes available - **/ -int putBytes(Buffer *buffer, size_t length, const void *source) - __attribute__((warn_unused_result)); - -/** - * Copy the contents of a source buffer into the target buffer. Advances the - * start of the source buffer and the end of the target buffer past the copied - * data. 
- * - * @param target The buffer to receive the copy of the data - * @param source The buffer containing the data to copy - * @param length The length of the data to copy - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the target buffer does not have - * length bytes available or if the source buffer does not have length - * bytes of content - **/ -int putBuffer(Buffer *target, Buffer *source, size_t length) - __attribute__((warn_unused_result)); - -/** - * Zero bytes in a buffer starting at the start pointer, and advance the - * end of the buffer past the zeros. - * - * @param buffer The buffer to zero - * @param length The number of bytes to zero - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if the buffer does not have - * length bytes available - **/ -int zeroBytes(Buffer *buffer, size_t length) - __attribute__((warn_unused_result)); - -/** - * Get a boolean value from a buffer and advance the start pointer. - * - * @param buffer The buffer - * @param b A pointer to hold the boolean value - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough data - * in the buffer - **/ -int getBoolean(Buffer *buffer, bool *b) __attribute__((warn_unused_result)); - -/** - * Put a boolean value into a buffer and advance the end pointer. - * - * @param buffer The buffer - * @param b The boolean to put - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is no space in the buffer - **/ -int putBoolean(Buffer *buffer, bool b) __attribute__((warn_unused_result)); - -/** - * Get a 2 byte, big endian encoded integer from a buffer and advance the - * start pointer past it. - * - * @param buffer The buffer - * @param ui A pointer to hold the integer - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 2 - * bytes available - **/ -int getUInt16BEFromBuffer(Buffer *buffer, uint16_t *ui) - __attribute__((warn_unused_result)); - -/** - * Put a 2 byte, big endian encoded integer into a buffer and advance the - * end pointer past it. - * - * @param buffer The buffer - * @param ui The integer to put - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 2 - * bytes available - **/ -int putUInt16BEIntoBuffer(Buffer *buffer, uint16_t ui) - __attribute__((warn_unused_result)); - -/** - * Get a 4 byte, big endian encoded integer from a buffer and advance the - * start pointer past it. - * - * @param buffer The buffer - * @param ui A pointer to hold the integer - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 4 - * bytes available - **/ -int getUInt32BEFromBuffer(Buffer *buffer, uint32_t *ui) - __attribute__((warn_unused_result)); - -/** - * Put a 4 byte, big endian encoded integer into a buffer and advance the - * end pointer past it. - * - * @param buffer The buffer - * @param ui The integer to put - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 4 - * bytes available - **/ -int putUInt32BEIntoBuffer(Buffer *buffer, uint32_t ui) - __attribute__((warn_unused_result)); - -/** - * Get a series of 4 byte, big endian encoded integer from a buffer and - * advance the start pointer past them. 
- * - * @param buffer The buffer - * @param count The number of integers to get - * @param ui A pointer to hold the integers - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough data - * in the buffer - **/ -int getUInt32BEsFromBuffer(Buffer *buffer, size_t count, uint32_t *ui) - __attribute__((warn_unused_result)); - -/** - * Put a series of 4 byte, big endian encoded integers into a buffer and - * advance the end pointer past them. - * - * @param buffer The buffer - * @param count The number of integers to put - * @param ui A pointer to the integers - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough space - * in the buffer - **/ -int putUInt32BEsIntoBuffer(Buffer *buffer, size_t count, const uint32_t *ui) - __attribute__((warn_unused_result)); - -/** - * Get a series of 8 byte, big endian encoded integer from a buffer and - * advance the start pointer past them. - * - * @param buffer The buffer - * @param count The number of integers to get - * @param ui A pointer to hold the integers - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough data - * in the buffer - **/ -int getUInt64BEsFromBuffer(Buffer *buffer, size_t count, uint64_t *ui) - __attribute__((warn_unused_result)); - -/** - * Put a series of 8 byte, big endian encoded integers into a buffer and - * advance the end pointer past them. - * - * @param buffer The buffer - * @param count The number of integers to put - * @param ui A pointer to the integers - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough space - * in the buffer - **/ -int putUInt64BEsIntoBuffer(Buffer *buffer, size_t count, const uint64_t *ui) - __attribute__((warn_unused_result)); - -/** - * Get a 2 byte, little endian encoded integer from a buffer and - * advance the start pointer past it. - * - * @param buffer The buffer - * @param ui A pointer to hold the integer - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 2 - * bytes available - **/ -int getUInt16LEFromBuffer(Buffer *buffer, uint16_t *ui) - __attribute__((warn_unused_result)); - -/** - * Put a 2 byte, little endian encoded integer into a buffer and advance the - * end pointer past it. - * - * @param buffer The buffer - * @param ui The integer to put - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 2 - * bytes available - **/ -int putUInt16LEIntoBuffer(Buffer *buffer, uint16_t ui) - __attribute__((warn_unused_result)); - -/** - * Get a series of 2 byte, little endian encoded integer from a buffer - * and advance the start pointer past them. - * - * @param buffer The buffer - * @param count The number of integers to get - * @param ui A pointer to hold the integers - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough data - * in the buffer - **/ -int getUInt16LEsFromBuffer(Buffer *buffer, size_t count, uint16_t *ui) - __attribute__((warn_unused_result)); - -/** - * Put a series of 2 byte, little endian encoded integers into a - * buffer and advance the end pointer past them. - * - * @param buffer The buffer - * @param count The number of integers to put - * @param ui A pointer to the integers - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough space - * in the buffer - **/ -int putUInt16LEsIntoBuffer(Buffer *buffer, size_t count, const uint16_t *ui) - __attribute__((warn_unused_result)); - -/** - * Get a 4 byte, little endian encoded integer from a buffer and advance the - * start pointer past it. 
- * - * @param buffer The buffer - * @param i A pointer to hold the integer - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 4 - * bytes available - **/ -int getInt32LEFromBuffer(Buffer *buffer, int32_t *i) - __attribute__((warn_unused_result)); - -/** - * Get a 4 byte, little endian encoded integer from a buffer and advance the - * start pointer past it. - * - * @param buffer The buffer - * @param ui A pointer to hold the integer - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 4 - * bytes available - **/ -int getUInt32LEFromBuffer(Buffer *buffer, uint32_t *ui) - __attribute__((warn_unused_result)); - -/** - * Put a 4 byte, little endian encoded integer into a buffer and advance the - * end pointer past it. - * - * @param buffer The buffer - * @param ui The integer to put - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 4 - * bytes available - **/ -int putUInt32LEIntoBuffer(Buffer *buffer, uint32_t ui) - __attribute__((warn_unused_result)); - -/** - * Get an 8 byte, little endian encoded, unsigned integer from a - * buffer and advance the start pointer past it. - * - * @param buffer The buffer - * @param ui A pointer to hold the integer - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 8 - * bytes available - **/ -int getUInt64LEFromBuffer(Buffer *buffer, uint64_t *ui) - __attribute__((warn_unused_result)); - -/** - * Put an 8 byte, little endian encoded signed integer into a buffer - * and advance the end pointer past it. - * - * @param buffer The buffer - * @param i The integer to put - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 8 - * bytes available - **/ -int putInt64LEIntoBuffer(Buffer *buffer, int64_t i) - __attribute__((warn_unused_result)); - - /** - * Put an 8 byte, little endian encoded integer into a buffer and advance the - * end pointer past it. - * - * @param buffer The buffer - * @param ui The integer to put - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there are fewer than 8 - * bytes available - **/ -int putUInt64LEIntoBuffer(Buffer *buffer, uint64_t ui) - __attribute__((warn_unused_result)); - -/** - * Get a series of 8 byte, little endian encoded integer from a buffer - * and advance the start pointer past them. - * - * @param buffer The buffer - * @param count The number of integers to get - * @param ui A pointer to hold the integers - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough data - * in the buffer - **/ -int getUInt64LEsFromBuffer(Buffer *buffer, size_t count, uint64_t *ui) - __attribute__((warn_unused_result)); - -/** - * Put a series of 8 byte, little endian encoded integers into a buffer and - * advance the end pointer past them. - * - * @param buffer The buffer - * @param count The number of integers to put - * @param ui A pointer to the integers - * - * @return UDS_SUCCESS or UDS_BUFFER_ERROR if there is not enough space - * in the buffer - **/ -int putUInt64LEsIntoBuffer(Buffer *buffer, size_t count, const uint64_t *ui) - __attribute__((warn_unused_result)); - -#endif /* BUFFER_H */ diff --git a/uds/bufferPrivate.h b/uds/bufferPrivate.h deleted file mode 100644 index 8a0f46a..0000000 --- a/uds/bufferPrivate.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/bufferPrivate.h#1 $ - */ - -#ifndef BUFFER_PRIVATE_H -#define BUFFER_PRIVATE_H - -#include "common.h" - -struct buffer { - size_t start; - size_t end; - size_t length; - byte *data; - bool wrapped; -}; - -#endif /* BUFFER_PRIVATE_H */ diff --git a/uds/bufferedReader.c b/uds/bufferedReader.c deleted file mode 100644 index b67d33d..0000000 --- a/uds/bufferedReader.c +++ /dev/null @@ -1,265 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/bufferedReader.c#5 $ - */ - -#include "bufferedReader.h" - -#include "compiler.h" -#include "ioFactory.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "numeric.h" - -#ifndef __KERNEL__ -/* - * Define sector_t. The kernel really wants us to use it. The code becomes - * ugly if we need to #ifdef every usage of sector_t. Note that the of #define - * means that even if a user mode include typedefs sector_t, it will not affect - * this module. 
- */ -#define sector_t uint64_t -#endif - -struct bufferedReader { -#ifdef __KERNEL__ - // IOFactory owning the block device - IOFactory *br_factory; - // The dm_bufio_client to read from - struct dm_bufio_client *br_client; - // The current dm_buffer - struct dm_buffer *br_buffer; - // The number of blocks that can be read from - sector_t br_limit; - // Number of the current block - sector_t br_blockNumber; -#else - // Region to read from - IORegion *br_region; - // Number of the current block - uint64_t br_blockNumber; -#endif - // Start of the buffer - byte *br_start; - // End of the data read from the buffer - byte *br_pointer; -}; - -#ifdef __KERNEL__ -/*****************************************************************************/ -static void readAhead(BufferedReader *br, sector_t blockNumber) -{ - if (blockNumber < br->br_limit) { - enum { MAX_READ_AHEAD = 4 }; - size_t readAhead = minSizeT(MAX_READ_AHEAD, br->br_limit - blockNumber); - dm_bufio_prefetch(br->br_client, blockNumber, readAhead); - } -} -#endif - -/*****************************************************************************/ -#ifdef __KERNEL__ -int makeBufferedReader(IOFactory *factory, - struct dm_bufio_client *client, - sector_t blockLimit, - BufferedReader **readerPtr) -{ - BufferedReader *reader = NULL; - int result = ALLOCATE(1, BufferedReader, "buffered reader", &reader); - if (result != UDS_SUCCESS) { - return result; - } - - *reader = (BufferedReader) { - .br_factory = factory, - .br_client = client, - .br_buffer = NULL, - .br_limit = blockLimit, - .br_blockNumber = 0, - .br_start = NULL, - .br_pointer = NULL, - }; - - readAhead(reader,0); - getIOFactory(factory); - *readerPtr = reader; - return UDS_SUCCESS; -} -#else -int makeBufferedReader(IORegion *region, BufferedReader **readerPtr) -{ - byte *data; - int result = ALLOCATE_IO_ALIGNED(UDS_BLOCK_SIZE, byte, - "buffer writer buffer", &data); - if (result != UDS_SUCCESS) { - return result; - } - - BufferedReader *reader = NULL; - result = ALLOCATE(1, BufferedReader, "buffered reader", &reader); - if (result != UDS_SUCCESS) { - FREE(data); - return result; - } - - *reader = (BufferedReader) { - .br_region = region, - .br_blockNumber = 0, - .br_start = data, - .br_pointer = NULL, - }; - - getIORegion(region); - *readerPtr = reader; - return UDS_SUCCESS; -} -#endif - -/*****************************************************************************/ -void freeBufferedReader(BufferedReader *br) -{ - if (br == NULL) { - return; - } -#ifdef __KERNEL__ - if (br->br_buffer != NULL) { - dm_bufio_release(br->br_buffer); - } - dm_bufio_client_destroy(br->br_client); - putIOFactory(br->br_factory); -#else - putIORegion(br->br_region); - FREE(br->br_start); -#endif - FREE(br); -} - -/*****************************************************************************/ -static int positionReader(BufferedReader *br, - sector_t blockNumber, - off_t offset) -{ - if ((br->br_pointer == NULL) || (blockNumber != br->br_blockNumber)) { -#ifdef __KERNEL__ - if (blockNumber >= br->br_limit) { - return UDS_OUT_OF_RANGE; - } - if (br->br_buffer != NULL) { - dm_bufio_release(br->br_buffer); - br->br_buffer = NULL; - } - struct dm_buffer *buffer = NULL; - void *data = dm_bufio_read(br->br_client, blockNumber, &buffer); - if (IS_ERR(data)) { - return -PTR_ERR(data); - } - br->br_buffer = buffer; - br->br_start = data; - if (blockNumber == br->br_blockNumber + 1) { - readAhead(br, blockNumber + 1); - } -#else - int result = readFromRegion(br->br_region, blockNumber * UDS_BLOCK_SIZE, - br->br_start, 
UDS_BLOCK_SIZE, NULL); - if (result != UDS_SUCCESS) { - logWarningWithStringError(result, "%s got readFromRegion error", - __func__); - return result; - } -#endif - } - br->br_blockNumber = blockNumber; - br->br_pointer = br->br_start + offset; - return UDS_SUCCESS; -} - -/*****************************************************************************/ -static size_t bytesRemainingInReadBuffer(BufferedReader *br) -{ - return (br->br_pointer == NULL - ? 0 - : br->br_start + UDS_BLOCK_SIZE - br->br_pointer); -} - -/*****************************************************************************/ -int readFromBufferedReader(BufferedReader *br, void *data, size_t length) -{ - byte *dp = data; - int result = UDS_SUCCESS; - while (length > 0) { - if (bytesRemainingInReadBuffer(br) == 0) { - sector_t blockNumber = br->br_blockNumber; - if (br->br_pointer != NULL) { - ++blockNumber; - } - result = positionReader(br, blockNumber, 0); - if (result != UDS_SUCCESS) { - break; - } - } - - size_t avail = bytesRemainingInReadBuffer(br); - size_t chunk = minSizeT(length, avail); - memcpy(dp, br->br_pointer, chunk); - length -= chunk; - dp += chunk; - br->br_pointer += chunk; - } - - if (((result == UDS_OUT_OF_RANGE) || (result == UDS_END_OF_FILE)) - && (dp - (byte *) data > 0)) { - result = UDS_SHORT_READ; - } - return result; -} - -/*****************************************************************************/ -int verifyBufferedData(BufferedReader *br, - const void *value, - size_t length) -{ - const byte *vp = value; - sector_t startingBlockNumber = br->br_blockNumber; - int startingOffset = br->br_pointer - br->br_start; - while (length > 0) { - if (bytesRemainingInReadBuffer(br) == 0) { - sector_t blockNumber = br->br_blockNumber; - if (br->br_pointer != NULL) { - ++blockNumber; - } - int result = positionReader(br, blockNumber, 0); - if (result != UDS_SUCCESS) { - positionReader(br, startingBlockNumber, startingOffset); - return UDS_CORRUPT_FILE; - } - } - - size_t avail = bytesRemainingInReadBuffer(br); - size_t chunk = minSizeT(length, avail); - if (memcmp(vp, br->br_pointer, chunk) != 0) { - positionReader(br, startingBlockNumber, startingOffset); - return UDS_CORRUPT_FILE; - } - length -= chunk; - vp += chunk; - br->br_pointer += chunk; - } - - return UDS_SUCCESS; -} diff --git a/uds/bufferedReader.h b/uds/bufferedReader.h deleted file mode 100644 index 4da8119..0000000 --- a/uds/bufferedReader.h +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/jasper/src/uds/bufferedReader.h#3 $ - */ - -#ifndef BUFFERED_READER_H -#define BUFFERED_READER_H 1 - -#include "common.h" - -#ifdef __KERNEL__ -struct dm_bufio_client; -struct ioFactory; -#else -struct ioRegion; -#endif - -/** - * The buffered reader allows efficient IO for IORegions, which may be - * file- or block-based. The internal buffer always reads aligned data - * from the underlying region. - **/ -typedef struct bufferedReader BufferedReader; - -#ifdef __KERNEL__ -/** - * Make a new buffered reader. - * - * @param factory The IOFactory creating the buffered reader. - * @param client The dm_bufio_client to read from. - * @param blockLimit The number of blocks that may be read. - * @param readerPtr The pointer to hold the newly allocated buffered reader - * - * @return UDS_SUCCESS or error code. - **/ -int makeBufferedReader(struct ioFactory *factory, - struct dm_bufio_client *client, - sector_t blockLimit, - BufferedReader **readerPtr) - __attribute__((warn_unused_result)); -#else -/** - * Make a new buffered reader. - * - * @param region An IORegion to read from. - * @param readerPtr The pointer to hold the newly allocated buffered reader. - * - * @return UDS_SUCCESS or error code. - **/ -int makeBufferedReader(struct ioRegion *region, BufferedReader **readerPtr) - __attribute__((warn_unused_result)); -#endif - -/** - * Free a buffered reader. - * - * @param reader The buffered reader - **/ -void freeBufferedReader(BufferedReader *reader); - -/** - * Retrieve data from a buffered reader, reading from the region when needed. - * - * @param reader The buffered reader - * @param data The buffer to read data into - * @param length The length of the data to read - * - * @return UDS_SUCCESS or an error code. - **/ -int readFromBufferedReader(BufferedReader *reader, void *data, size_t length) - __attribute__((warn_unused_result)); - -/** - * Verify that the data currently in the buffer matches the required value. - * - * @param reader The buffered reader. - * @param value The value that must match the buffer contents. - * @param length The length of the value that must match. - * - * @return UDS_SUCCESS or an error code, specifically UDS_CORRUPT_FILE - * if the required value fails to match. - * - * @note If the value matches, the matching contents are consumed. However, - * if the match fails, any buffer contents are left as is. - **/ -int verifyBufferedData(BufferedReader *reader, - const void *value, - size_t length) - __attribute__((warn_unused_result)); - -#endif // BUFFERED_READER_H diff --git a/uds/bufferedWriter.c b/uds/bufferedWriter.c deleted file mode 100644 index abfb9cf..0000000 --- a/uds/bufferedWriter.c +++ /dev/null @@ -1,301 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/jasper/src/uds/bufferedWriter.c#6 $ - */ - -#include "bufferedWriter.h" - -#include "compiler.h" -#include "errors.h" -#include "ioFactory.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "numeric.h" - - -struct bufferedWriter { -#ifdef __KERNEL__ - // IOFactory owning the block device - IOFactory *bw_factory; - // The dm_bufio_client to write to - struct dm_bufio_client *bw_client; - // The current dm_buffer - struct dm_buffer *bw_buffer; - // The number of blocks that can be written to - sector_t bw_limit; - // Number of the current block - sector_t bw_blockNumber; -#else - // Region to write to - IORegion *bw_region; - // Number of the current block - uint64_t bw_blockNumber; -#endif - // Start of the buffer - byte *bw_start; - // End of the data written to the buffer - byte *bw_pointer; - // Error code - int bw_error; - // Have writes been done? - bool bw_used; -}; - -#ifdef __KERNEL__ -/*****************************************************************************/ -__attribute__((warn_unused_result)) -int prepareNextBuffer(BufferedWriter *bw) -{ - if (bw->bw_blockNumber >= bw->bw_limit) { - bw->bw_error = UDS_OUT_OF_RANGE; - return UDS_OUT_OF_RANGE; - } - - struct dm_buffer *buffer = NULL; - void *data = dm_bufio_new(bw->bw_client, bw->bw_blockNumber, &buffer); - if (IS_ERR(data)) { - bw->bw_error = -PTR_ERR(data); - return bw->bw_error; - } - bw->bw_buffer = buffer; - bw->bw_start = data; - bw->bw_pointer = data; - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int flushPreviousBuffer(BufferedWriter *bw) -{ - if (bw->bw_buffer != NULL) { - if (bw->bw_error == UDS_SUCCESS) { - size_t avail = spaceRemainingInWriteBuffer(bw); - if (avail > 0) { - memset(bw->bw_pointer, 0, avail); - } - dm_bufio_mark_buffer_dirty(bw->bw_buffer); - } - dm_bufio_release(bw->bw_buffer); - bw->bw_buffer = NULL; - bw->bw_start = NULL; - bw->bw_pointer = NULL; - bw->bw_blockNumber++; - } - return bw->bw_error; -} -#endif - -/*****************************************************************************/ -#ifdef __KERNEL__ -int makeBufferedWriter(IOFactory *factory, - struct dm_bufio_client *client, - sector_t blockLimit, - BufferedWriter **writerPtr) -{ - BufferedWriter *writer; - int result = ALLOCATE(1, BufferedWriter, "buffered writer", &writer); - if (result != UDS_SUCCESS) { - return result; - } - - *writer = (BufferedWriter) { - .bw_factory = factory, - .bw_client = client, - .bw_buffer = NULL, - .bw_limit = blockLimit, - .bw_start = NULL, - .bw_pointer = NULL, - .bw_blockNumber = 0, - .bw_error = UDS_SUCCESS, - .bw_used = false, - }; - - getIOFactory(factory); - *writerPtr = writer; - return UDS_SUCCESS; -} -#else -int makeBufferedWriter(IORegion *region, BufferedWriter **writerPtr) -{ - byte *data; - int result = ALLOCATE_IO_ALIGNED(UDS_BLOCK_SIZE, byte, - "buffer writer buffer", &data); - if (result != UDS_SUCCESS) { - return result; - } - - BufferedWriter *writer; - result = ALLOCATE(1, BufferedWriter, "buffered writer", &writer); - if (result != UDS_SUCCESS) { - FREE(data); - return result; - } - - *writer = (BufferedWriter) { - .bw_region = region, - .bw_start = data, - .bw_pointer = data, - .bw_blockNumber = 0, - .bw_error = UDS_SUCCESS, - .bw_used = false, - }; - - getIORegion(region); - *writerPtr = writer; - return UDS_SUCCESS; -} -#endif - -/*****************************************************************************/ -void freeBufferedWriter(BufferedWriter *bw) -{ - if (bw == NULL) { - 
return; - } -#ifdef __KERNEL__ - flushPreviousBuffer(bw); - int result = -dm_bufio_write_dirty_buffers(bw->bw_client); -#else - int result = syncRegionContents(bw->bw_region); -#endif - if (result != UDS_SUCCESS) { - logWarningWithStringError(result, "%s cannot sync storage", __func__); - } -#ifdef __KERNEL__ - dm_bufio_client_destroy(bw->bw_client); - putIOFactory(bw->bw_factory); -#else - putIORegion(bw->bw_region); - FREE(bw->bw_start); -#endif - FREE(bw); -} - -/*****************************************************************************/ -static INLINE size_t spaceUsedInBuffer(BufferedWriter *bw) -{ - return bw->bw_pointer - bw->bw_start; -} - -/*****************************************************************************/ -size_t spaceRemainingInWriteBuffer(BufferedWriter *bw) -{ - return UDS_BLOCK_SIZE - spaceUsedInBuffer(bw); -} - -/*****************************************************************************/ -int writeToBufferedWriter(BufferedWriter *bw, const void *data, size_t len) -{ - if (bw->bw_error != UDS_SUCCESS) { - return bw->bw_error; - } - - const byte *dp = data; - int result = UDS_SUCCESS; - while ((len > 0) && (result == UDS_SUCCESS)) { -#ifdef __KERNEL__ - if (bw->bw_buffer == NULL) { - result = prepareNextBuffer(bw); - continue; - } -#endif - - size_t avail = spaceRemainingInWriteBuffer(bw); - size_t chunk = minSizeT(len, avail); - memcpy(bw->bw_pointer, dp, chunk); - len -= chunk; - dp += chunk; - bw->bw_pointer += chunk; - - if (spaceRemainingInWriteBuffer(bw) == 0) { - result = flushBufferedWriter(bw); - } - } - - bw->bw_used = true; - return result; -} - -/*****************************************************************************/ -int writeZerosToBufferedWriter(BufferedWriter *bw, size_t len) -{ - if (bw->bw_error != UDS_SUCCESS) { - return bw->bw_error; - } - - int result = UDS_SUCCESS; - while ((len > 0) && (result == UDS_SUCCESS)) { -#ifdef __KERNEL__ - if (bw->bw_buffer == NULL) { - result = prepareNextBuffer(bw); - continue; - } -#endif - - size_t avail = spaceRemainingInWriteBuffer(bw); - size_t chunk = minSizeT(len, avail); - memset(bw->bw_pointer, 0, chunk); - len -= chunk; - bw->bw_pointer += chunk; - - if (spaceRemainingInWriteBuffer(bw) == 0) { - result = flushBufferedWriter(bw); - } - } - - bw->bw_used = true; - return result; -} - -/*****************************************************************************/ -int flushBufferedWriter(BufferedWriter *bw) -{ - if (bw->bw_error != UDS_SUCCESS) { - return bw->bw_error; - } - -#ifdef __KERNEL__ - return flushPreviousBuffer(bw); -#else - size_t n = spaceUsedInBuffer(bw); - if (n > 0) { - int result = writeToRegion(bw->bw_region, - bw->bw_blockNumber * UDS_BLOCK_SIZE, - bw->bw_start, UDS_BLOCK_SIZE, n); - if (result != UDS_SUCCESS) { - return bw->bw_error = result; - } else { - bw->bw_pointer = bw->bw_start; - bw->bw_blockNumber++; - } - } - return UDS_SUCCESS; -#endif -} - -/*****************************************************************************/ -bool wasBufferedWriterUsed(const BufferedWriter *bw) -{ - return bw->bw_used; -} - -/*****************************************************************************/ -void noteBufferedWriterUsed(BufferedWriter *bw) -{ - bw->bw_used = true; -} diff --git a/uds/bufferedWriter.h b/uds/bufferedWriter.h deleted file mode 100644 index 8774b5b..0000000 --- a/uds/bufferedWriter.h +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/bufferedWriter.h#5 $ - */ - -#ifndef BUFFERED_WRITER_H -#define BUFFERED_WRITER_H 1 - -#include "common.h" - -#ifdef __KERNEL__ -struct dm_bufio_client; -struct ioFactory; -#else -struct ioRegion; -#endif - -typedef struct bufferedWriter BufferedWriter; - -#ifdef __KERNEL__ -/** - * Make a new buffered writer. - * - * @param factory The IOFactory creating the buffered writer - * @param client The dm_bufio_client to write to. - * @param blockLimit The number of blocks that may be written to. - * @param writerPtr The new buffered writer goes here. - * - * @return UDS_SUCCESS or an error code. - **/ -int makeBufferedWriter(struct ioFactory *factory, - struct dm_bufio_client *client, - sector_t blockLimit, - BufferedWriter **writerPtr) - __attribute__((warn_unused_result)); -#else -/** - * Make a new buffered writer. - * - * @param region The IOregion to write to. - * @param writerPtr The new buffered writer goes here. - * - * @return UDS_SUCCESS or an error code. - **/ -int makeBufferedWriter(struct ioRegion *region, BufferedWriter **writerPtr) - __attribute__((warn_unused_result)); -#endif - -/** - * Free a buffered writer, without flushing. - * - * @param [in] buffer The buffered writer object. - **/ -void freeBufferedWriter(BufferedWriter *buffer); - -/** - * Append data to buffer, writing as needed. - * - * @param buffer The buffered writer object. - * @param data The data to write. - * @param len The length of the data written. - * - * @return UDS_SUCCESS or an error code. - * The error may reflect previous attempts to write - * or flush the buffer. Once a write or flush error - * occurs it is sticky. - **/ -int writeToBufferedWriter(BufferedWriter *buffer, const void *data, size_t len) - __attribute__((warn_unused_result)); - -/** - * Zero data in the buffer, writing as needed. - * - * @param buffer The buffered writer object. - * @param len The number of zero bytes to write. - * - * @return UDS_SUCCESS or an error code. - * The error may reflect previous attempts to write - * or flush the buffer. Once a write or flush error - * occurs it is sticky. - **/ -int writeZerosToBufferedWriter(BufferedWriter *bw, size_t len) - __attribute__((warn_unused_result)); - - -/** - * Flush any partial data from the buffer. - * - * @param buffer The buffered writer object. - * - * @return UDS_SUCCESS or an error code. - * The error may reflect previous attempts to write - * or flush the buffer. Once a write or flush error - * occurs it is sticky. - **/ -int flushBufferedWriter(BufferedWriter *buffer) - __attribute__((warn_unused_result)); - -/** - * Return the size of the remaining space in the buffer (for testing) - * - * @param [in] buffer The buffered writer object. - * - * @return The number of available bytes in the buffer. 
- **/ -size_t spaceRemainingInWriteBuffer(BufferedWriter *buffer) - __attribute__((warn_unused_result)); - -/** - * Return whether the buffer was ever written to. - * - * @param buffer The buffered writer object. - * - * @return True if at least one call to writeToBufferedWriter - * was made. - **/ -bool wasBufferedWriterUsed(const BufferedWriter *buffer) - __attribute__((warn_unused_result)); - -/** - * Note the buffer has been used. - * - * @param buffer The buffered writer object. - **/ -void noteBufferedWriterUsed(BufferedWriter *buffer); - -#endif // BUFFERED_WRITER_H diff --git a/uds/cacheCounters.c b/uds/cacheCounters.c deleted file mode 100644 index 8bf7ad4..0000000 --- a/uds/cacheCounters.c +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/cacheCounters.c#1 $ - */ - -#include "cacheCounters.h" - -#include "atomicDefs.h" -#include "compiler.h" -#include "errors.h" -#include "permassert.h" -#include "stringUtils.h" -#include "uds.h" - -/**********************************************************************/ -void incrementCacheCounter(CacheCounters *counters, - int probeType, - CacheResultKind kind) -{ - CacheProbeType basicProbeType = probeType & ~CACHE_PROBE_IGNORE_FAILURE; - int result = ASSERT(basicProbeType <= CACHE_PROBE_RECORD_RETRY, - "invalid cache probe type %#x", probeType); - if (result != UDS_SUCCESS) { - return; - } - result = ASSERT(kind <= CACHE_RESULT_QUEUED, - "invalid cache probe result type %#x", kind); - if (result != UDS_SUCCESS) { - return; - } - - if (((probeType & CACHE_PROBE_IGNORE_FAILURE) != 0) - && (kind != CACHE_RESULT_HIT)) { - return; - } - - CacheCountsByKind *kindCounts; - switch (basicProbeType) { - case CACHE_PROBE_INDEX_FIRST: - kindCounts = &counters->firstTime.indexPage; - break; - case CACHE_PROBE_RECORD_FIRST: - kindCounts = &counters->firstTime.recordPage; - break; - case CACHE_PROBE_INDEX_RETRY: - kindCounts = &counters->retried.indexPage; - break; - case CACHE_PROBE_RECORD_RETRY: - kindCounts = &counters->retried.recordPage; - break; - default: - // Never used but the compiler hasn't figured that out. - return; - } - - uint64_t *myCounter; - switch (kind) { - case CACHE_RESULT_MISS: - myCounter = &kindCounts->misses; - break; - case CACHE_RESULT_QUEUED: - myCounter = &kindCounts->queued; - break; - case CACHE_RESULT_HIT: - myCounter = &kindCounts->hits; - break; - default: - // Never used but the compiler hasn't figured that out. - return; - } - // XXX Vile case makes many assumptions. Counters should be declared atomic. 
-  atomic64_inc((atomic64_t *) myCounter);
-}
diff --git a/uds/cacheCounters.h b/uds/cacheCounters.h
deleted file mode 100644
index 9029453..0000000
--- a/uds/cacheCounters.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2020 Red Hat, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- *
- * $Id: //eng/uds-releases/jasper/src/uds/cacheCounters.h#1 $
- */
-
-#ifndef CACHE_COUNTERS_H
-#define CACHE_COUNTERS_H
-
-#include "typeDefs.h"
-
-/**
- * Basic counts of hits and misses for a given type of cache probe.
- **/
-typedef struct cacheCountsByKind {
-  /** Number of hits */
-  uint64_t hits;
-  /** Number of misses */
-  uint64_t misses;
-  /** Number of probes for data already queued for read */
-  uint64_t queued;
-} CacheCountsByKind;
-
-/**
- * The various types of cache probes we care about.
- **/
-typedef enum cacheProbeType {
-  /** First attempt to look up an index page, for a given request. */
-  CACHE_PROBE_INDEX_FIRST = 0,
-  /** First attempt to look up a record page, for a given request. */
-  CACHE_PROBE_RECORD_FIRST,
-  /** Second or later attempt to look up an index page, for a given request. */
-  CACHE_PROBE_INDEX_RETRY,
-  /** Second or later attempt to look up a record page, for a given request. */
-  CACHE_PROBE_RECORD_RETRY
-} CacheProbeType;
-
-enum {
-  /** Flag bit to indicate that failures shouldn't be recorded. */
-  CACHE_PROBE_IGNORE_FAILURE = 128
-};
-
-/**
- * Result-type counts for both kinds of data pages in the page cache.
- **/
-typedef struct cacheCountsByPageType {
-  /** Hit/miss counts for index pages. */
-  CacheCountsByKind indexPage;
-  /** Hit/miss counts for record pages. */
-  CacheCountsByKind recordPage;
-} CacheCountsByPageType;
-
-/**
- * All the counters used for an entry cache.
- **/
-typedef struct cacheCounters {
-  // counters for the page cache
-  /** Hit/miss counts for the first attempt per request */
-  CacheCountsByPageType firstTime;
-  /** Hit/miss counts when a second (or later) attempt is needed */
-  CacheCountsByPageType retried;
-
-  /** Number of cache entry invalidations due to single-entry eviction */
-  uint64_t evictions;
-  /** Number of cache entry invalidations due to chapter expiration */
-  uint64_t expirations;
-
-  // counters for the sparse chapter index cache
-  /** Hit/miss counts for the sparse cache chapter probes */
-  CacheCountsByKind sparseChapters;
-  /** Hit/miss counts for the sparse cache name searches */
-  CacheCountsByKind sparseSearches;
-} CacheCounters;
-
-/**
- * Success/failure assessment of cache probe result.
- **/ -typedef enum cacheResultKind { - /** The requested entry was found in the cache */ - CACHE_RESULT_HIT, - /** The requested entry was not found in the cache */ - CACHE_RESULT_MISS, - /** The requested entry wasn't found in the cache but is queued for read */ - CACHE_RESULT_QUEUED -} CacheResultKind; - -/** - * Increment one of the cache counters. - * - * @param counters pointer to the counters - * @param probeType type of access done - * @param kind result of probe - **/ -void incrementCacheCounter(CacheCounters *counters, - int probeType, - CacheResultKind kind); - -#endif /* CACHE_COUNTERS_H */ diff --git a/uds/cachedChapterIndex.c b/uds/cachedChapterIndex.c deleted file mode 100644 index ae0a22c..0000000 --- a/uds/cachedChapterIndex.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/cachedChapterIndex.c#3 $ - */ - -#include "cachedChapterIndex.h" - -#include "memoryAlloc.h" - -/**********************************************************************/ -int initializeCachedChapterIndex(CachedChapterIndex *chapter, - const Geometry *geometry) -{ - chapter->virtualChapter = UINT64_MAX; - chapter->indexPagesCount = geometry->indexPagesPerChapter; - - int result = ALLOCATE(chapter->indexPagesCount, DeltaIndexPage, __func__, - &chapter->indexPages); - if (result != UDS_SUCCESS) { - return result; - } - - result = ALLOCATE(chapter->indexPagesCount, struct volume_page, - "sparse index VolumePages", &chapter->volumePages); - if (result != UDS_SUCCESS) { - return result; - } - - unsigned int i; - for (i = 0; i < chapter->indexPagesCount; i++) { - result = initializeVolumePage(geometry, &chapter->volumePages[i]); - if (result != UDS_SUCCESS) { - return result; - } - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -void destroyCachedChapterIndex(CachedChapterIndex *chapter) -{ - if (chapter->volumePages != NULL) { - unsigned int i; - for (i = 0; i < chapter->indexPagesCount; i++) { - destroyVolumePage(&chapter->volumePages[i]); - } - } - FREE(chapter->indexPages); - FREE(chapter->volumePages); -} - -/**********************************************************************/ -int cacheChapterIndex(CachedChapterIndex *chapter, - uint64_t virtualChapter, - const Volume *volume) -{ - // Mark the cached chapter as unused in case the update fails midway. - chapter->virtualChapter = UINT64_MAX; - - // Read all the page data and initialize the entire DeltaIndexPage array. - // (It's not safe for the zone threads to do it lazily--they'll race.) - int result = readChapterIndexFromVolume(volume, virtualChapter, - chapter->volumePages, - chapter->indexPages); - if (result != UDS_SUCCESS) { - return result; - } - - // Reset all chapter counter values to zero. 
- chapter->counters.searchHits = 0; - chapter->counters.searchMisses = 0; - chapter->counters.consecutiveMisses = 0; - - // Mark the entry as valid--it's now in the cache. - chapter->virtualChapter = virtualChapter; - chapter->skipSearch = false; - - return UDS_SUCCESS; -} - -/**********************************************************************/ -int searchCachedChapterIndex(CachedChapterIndex *chapter, - const Geometry *geometry, - const IndexPageMap *indexPageMap, - const UdsChunkName *name, - int *recordPagePtr) -{ - // Find the indexPageNumber in the chapter that would have the chunk name. - unsigned int physicalChapter - = mapToPhysicalChapter(geometry, chapter->virtualChapter); - unsigned int indexPageNumber; - int result = findIndexPageNumber(indexPageMap, name, physicalChapter, - &indexPageNumber); - if (result != UDS_SUCCESS) { - return result; - } - - return searchChapterIndexPage(&chapter->indexPages[indexPageNumber], - geometry, name, recordPagePtr); -} diff --git a/uds/cachedChapterIndex.h b/uds/cachedChapterIndex.h deleted file mode 100644 index f759d5d..0000000 --- a/uds/cachedChapterIndex.h +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/cachedChapterIndex.h#3 $ - */ - -#ifndef CACHED_CHAPTER_INDEX_H -#define CACHED_CHAPTER_INDEX_H - -#include "chapterIndex.h" -#include "common.h" -#include "compiler.h" -#include "cpu.h" -#include "geometry.h" -#include "indexPageMap.h" -#include "typeDefs.h" -#include "volume.h" -#include "volumeStore.h" - -/** - * These counters are essentially fields of the CachedChapterIndex, but are - * segregated into this structure because they are frequently modified. They - * are grouped and aligned to keep them on different cache lines from the - * chapter fields that are accessed far more often than they are updated. - **/ -struct __attribute__((aligned(CACHE_LINE_BYTES))) cachedIndexCounters { - /** the total number of search hits since this chapter was cached */ - uint64_t searchHits; - - /** the total number of search misses since this chapter was cached */ - uint64_t searchMisses; - - /** the number of consecutive search misses since the last cache hit */ - uint64_t consecutiveMisses; -}; -typedef struct cachedIndexCounters CachedIndexCounters; - -/** - * CachedChapterIndex is the structure for a cache entry, representing a - * single cached chapter index in the sparse chapter index cache. - **/ -struct __attribute__((aligned(CACHE_LINE_BYTES))) cachedChapterIndex { - /* - * The virtual chapter number of the cached chapter index. UINT64_MAX means - * this cache entry is unused. Must only be modified in the critical section - * in updateSparseCache(). 
- */
-  uint64_t virtualChapter;
-
-  /* The number of index pages in a chapter */
-  unsigned int indexPagesCount;
-
-  /*
-   * This flag is mutable between cache updates, but it rarely changes and
-   * is frequently accessed, so it groups with the immutable fields.
-   *
-   * If set, skip the chapter when searching the entire cache. This flag is
-   * just a performance optimization. If we do not see a recent change to it,
-   * it will be corrected when we pass through a memory barrier while getting
-   * the next request from the queue. So we may do one extra search of the
-   * chapter index, or miss one deduplication opportunity.
-   */
-  bool skipSearch;
-
-  // These pointers are immutable during the life of the cache. The contents
-  // of the arrays change when the cache entry is replaced.
-
-  /* pointer to a cache-aligned array of ChapterIndexPages */
-  DeltaIndexPage *indexPages;
-
-  /* pointer to an array of VolumePages containing the index pages */
-  struct volume_page *volumePages;
-
-  // The cache-aligned counters change often and are placed at the end of the
-  // structure to prevent false sharing with the more stable fields above.
-
-  /* counter values updated by the thread servicing zone zero */
-  CachedIndexCounters counters;
-};
-typedef struct cachedChapterIndex CachedChapterIndex;
-
-/**
- * Initialize a CachedChapterIndex, allocating the memory for the array of
- * ChapterIndexPages and the raw index page data. The chapter index will be
- * marked as unused (virtualChapter == UINT64_MAX).
- *
- * @param chapter the chapter index cache entry to initialize
- * @param geometry the geometry governing the volume
- **/
-int initializeCachedChapterIndex(CachedChapterIndex *chapter,
-                                 const Geometry *geometry)
-  __attribute__((warn_unused_result));
-
-/**
- * Destroy a CachedChapterIndex, freeing the memory allocated for the
- * ChapterIndexPages and raw index page data.
- *
- * @param chapter the chapter index cache entry to destroy
- **/
-void destroyCachedChapterIndex(CachedChapterIndex *chapter);
-
-/**
- * Assign a new value to the skipSearch flag of a cached chapter index.
- *
- * @param chapter the chapter index cache entry to modify
- * @param skipSearch the new value of the skipSearch flag
- **/
-static INLINE void setSkipSearch(CachedChapterIndex *chapter, bool skipSearch)
-{
-  // Explicitly check if the field is set so we don't keep dirtying the memory
-  // cache line on continued search hits.
-  if (READ_ONCE(chapter->skipSearch) != skipSearch) {
-    WRITE_ONCE(chapter->skipSearch, skipSearch);
-  }
-}
-
-/**
- * Check if a cached sparse chapter index should be skipped over in the search
- * for a chunk name. Filters out unused, invalid, disabled, and irrelevant
- * cache entries.
- *
- * @param zone the zone doing the check
- * @param chapter the cache entry search candidate
- * @param virtualChapter the virtualChapter containing a hook, or UINT64_MAX
- *                       if searching the whole cache for a non-hook
- *
- * @return true if the provided chapter index should be skipped
- **/
-static INLINE bool shouldSkipChapterIndex(const IndexZone *zone,
-                                          const CachedChapterIndex *chapter,
-                                          uint64_t virtualChapter)
-{
-  // Don't search unused entries (contents undefined) or invalid entries
-  // (the chapter is no longer the zone's view of the volume).
- if ((chapter->virtualChapter == UINT64_MAX) - || (chapter->virtualChapter < zone->oldestVirtualChapter)) { - return true; - } - - if (virtualChapter != UINT64_MAX) { - // If the caller specified a virtual chapter, only search the cache - // entry containing that chapter. - return (virtualChapter != chapter->virtualChapter); - } else { - // When searching the entire cache, save time by skipping over chapters - // that have had too many consecutive misses. - return READ_ONCE(chapter->skipSearch); - } -} - -/** - * Cache a chapter index, reading all the index pages from the volume and - * initializing the array of ChapterIndexPages in the cache entry to represent - * them. The virtualChapter field of the cache entry will be set to UINT64_MAX - * if there is any error since the remaining mutable fields will be in an - * undefined state. - * - * @param chapter the chapter index cache entry to replace - * @param virtualChapter the virtual chapter number of the index to read - * @param volume the volume containing the chapter index - * - * @return UDS_SUCCESS or an error code - **/ -int cacheChapterIndex(CachedChapterIndex *chapter, - uint64_t virtualChapter, - const Volume *volume) - __attribute__((warn_unused_result)); - -/** - * Search a single cached sparse chapter index for a chunk name, returning the - * record page number that may contain the name. - * - * @param [in] chapter the cache entry for the chapter to search - * @param [in] geometry the geometry governing the volume - * @param [in] indexPageMap the index page number map for the volume - * @param [in] name the chunk name to search for - * @param [out] recordPagePtr the record page number of a match, else - * NO_CHAPTER_INDEX_ENTRY if nothing matched - * - * @return UDS_SUCCESS or an error code - **/ -int searchCachedChapterIndex(CachedChapterIndex *chapter, - const Geometry *geometry, - const IndexPageMap *indexPageMap, - const UdsChunkName *name, - int *recordPagePtr) - __attribute__((warn_unused_result)); - -#endif /* CACHED_CHAPTER_INDEX_H */ diff --git a/uds/chapterIndex.c b/uds/chapterIndex.c deleted file mode 100644 index 5653a41..0000000 --- a/uds/chapterIndex.c +++ /dev/null @@ -1,305 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/jasper/src/uds/chapterIndex.c#5 $ - */ - -#include "chapterIndex.h" - -#include "compiler.h" -#include "errors.h" -#include "hashUtils.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "uds.h" - - -/**********************************************************************/ -int makeOpenChapterIndex(OpenChapterIndex **openChapterIndex, - const Geometry *geometry, - bool chapterIndexHeaderNativeEndian, - uint64_t volumeNonce) -{ - - int result = ALLOCATE(1, OpenChapterIndex, "open chapter index", - openChapterIndex); - if (result != UDS_SUCCESS) { - return result; - } - - // The delta index will rebalance delta lists when memory gets tight, so - // give the chapter index one extra page. - size_t memorySize - = (geometry->indexPagesPerChapter + 1) * geometry->bytesPerPage; - (*openChapterIndex)->geometry = geometry; - (*openChapterIndex)->volumeNonce = volumeNonce; - (*openChapterIndex)->headerNativeEndian = chapterIndexHeaderNativeEndian, - result = initializeDeltaIndex(&(*openChapterIndex)->deltaIndex, 1, - geometry->deltaListsPerChapter, - geometry->chapterMeanDelta, - geometry->chapterPayloadBits, memorySize); - if (result != UDS_SUCCESS) { - FREE(*openChapterIndex); - *openChapterIndex = NULL; - } - return result; -} - -/**********************************************************************/ -void freeOpenChapterIndex(OpenChapterIndex *openChapterIndex) -{ - if (openChapterIndex == NULL) { - return; - } - - - uninitializeDeltaIndex(&openChapterIndex->deltaIndex); - FREE(openChapterIndex); -} - -/**********************************************************************/ -void emptyOpenChapterIndex(OpenChapterIndex *openChapterIndex, - uint64_t virtualChapterNumber) -{ - emptyDeltaIndex(&openChapterIndex->deltaIndex); - openChapterIndex->virtualChapterNumber = virtualChapterNumber; -} - -/** - * Check whether a delta list entry reflects a successful search for a given - * address. - * - * @param entry the delta list entry from the search - * @param address the address of the desired entry - * - * @return true iff the address was found - **/ -static INLINE bool wasEntryFound(const DeltaIndexEntry *entry, - unsigned int address) -{ - return (!entry->atEnd && (entry->key == address)); -} - -/**********************************************************************/ -int putOpenChapterIndexRecord(OpenChapterIndex *openChapterIndex, - const UdsChunkName *name, - unsigned int pageNumber) -{ - const Geometry *geometry = openChapterIndex->geometry; - int result - = ASSERT_WITH_ERROR_CODE(pageNumber < geometry->recordPagesPerChapter, - UDS_INVALID_ARGUMENT, - "Page number within chapter (%u) exceeds" - " the maximum value %u", - pageNumber, geometry->recordPagesPerChapter); - if (result != UDS_SUCCESS) { - return result; - } - - DeltaIndexEntry entry; - unsigned int address = hashToChapterDeltaAddress(name, geometry); - result = getDeltaIndexEntry(&openChapterIndex->deltaIndex, - hashToChapterDeltaList(name, geometry), - address, name->name, false, &entry); - if (result != UDS_SUCCESS) { - return result; - } - bool found = wasEntryFound(&entry, address); - result = ASSERT_WITH_ERROR_CODE(!(found && entry.isCollision), - UDS_BAD_STATE, - "Chunk appears more than once in chapter %" - PRIu64, - openChapterIndex->virtualChapterNumber); - if (result != UDS_SUCCESS) { - return result; - } - return putDeltaIndexEntry(&entry, address, pageNumber, - (found ? 
name->name : NULL)); -} - -/**********************************************************************/ -int packOpenChapterIndexPage(OpenChapterIndex *openChapterIndex, - byte *memory, - unsigned int firstList, - bool lastPage, - unsigned int *numLists) -{ - DeltaIndex *deltaIndex = &openChapterIndex->deltaIndex; - const Geometry *geometry = openChapterIndex->geometry; - unsigned int removals = 0; - for (;;) { - int result = packDeltaIndexPage(deltaIndex, openChapterIndex->volumeNonce, - openChapterIndex->headerNativeEndian, - memory, geometry->bytesPerPage, - openChapterIndex->virtualChapterNumber, - firstList, numLists); - if (result != UDS_SUCCESS) { - return result; - } - if ((firstList + *numLists) == geometry->deltaListsPerChapter) { - // All lists are packed - break; - } else if (*numLists == 0) { - // The next delta list does not fit on a page. This delta list will - // be removed. - } else if (lastPage) { - /* - * This is the last page and there are lists left unpacked, but all of - * the remaining lists must fit on the page. Find a list that contains - * entries and remove the entire list. Try the first list that does not - * fit. If it is empty, we will select the last list that already fits - * and has any entries. - */ - } else { - // This page is done - break; - } - if (removals == 0) { - DeltaIndexStats stats; - getDeltaIndexStats(deltaIndex, &stats); - logWarning("The chapter index for chapter %" PRIu64 - " contains %ld entries with %ld collisions", - openChapterIndex->virtualChapterNumber, - stats.recordCount, stats.collisionCount); - } - DeltaIndexEntry entry; - int listNumber = *numLists; - do { - if (listNumber < 0) { - return UDS_OVERFLOW; - } - result = startDeltaIndexSearch(deltaIndex, firstList + listNumber--, - 0, false, &entry); - if (result != UDS_SUCCESS) { - return result; - } - result = nextDeltaIndexEntry(&entry); - if (result != UDS_SUCCESS) { - return result; - } - } while (entry.atEnd); - do { - result = removeDeltaIndexEntry(&entry); - if (result != UDS_SUCCESS) { - return result; - } - removals++; - } while (!entry.atEnd); - } - if (removals > 0) { - logWarning("To avoid chapter index page overflow in chapter %" PRIu64 - ", %u entries were removed from the chapter index", - openChapterIndex->virtualChapterNumber, removals); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int getOpenChapterIndexSize(OpenChapterIndex *openChapterIndex) -{ - DeltaIndexStats stats; - getDeltaIndexStats(&openChapterIndex->deltaIndex, &stats); - return stats.recordCount; -} - -/**********************************************************************/ -size_t getOpenChapterIndexMemoryAllocated(OpenChapterIndex *openChapterIndex) -{ - DeltaIndexStats stats; - getDeltaIndexStats(&openChapterIndex->deltaIndex, &stats); - return stats.memoryAllocated + sizeof(OpenChapterIndex); -} - -/**********************************************************************/ -int initializeChapterIndexPage(DeltaIndexPage *chapterIndexPage, - const Geometry *geometry, - byte *indexPage, - uint64_t volumeNonce) -{ - return initializeDeltaIndexPage(chapterIndexPage, volumeNonce, - geometry->chapterMeanDelta, - geometry->chapterPayloadBits, - indexPage, geometry->bytesPerPage); -} - -/**********************************************************************/ -int validateChapterIndexPage(const DeltaIndexPage *chapterIndexPage, - const Geometry *geometry) -{ - const DeltaIndex *deltaIndex = &chapterIndexPage->deltaIndex; - unsigned int first = 
chapterIndexPage->lowestListNumber; - unsigned int last = chapterIndexPage->highestListNumber; - // We walk every delta list from start to finish. - unsigned int listNumber; - for (listNumber = first; listNumber <= last; listNumber++) { - DeltaIndexEntry entry; - int result = startDeltaIndexSearch(deltaIndex, listNumber - first, 0, true, - &entry); - if (result != UDS_SUCCESS) { - return result; - } - for (;;) { - result = nextDeltaIndexEntry(&entry); - if (result != UDS_SUCCESS) { - if (result == UDS_CORRUPT_DATA) { - // A random bit stream is highly likely to arrive here when we go - // past the end of the delta list - return UDS_CORRUPT_COMPONENT; - } - return result; - } - if (entry.atEnd) { - break; - } - // Also make sure that the record page field contains a plausible value - if (getDeltaEntryValue(&entry) >= geometry->recordPagesPerChapter) { - // Do not log this as an error. It happens in normal operation when - // we are doing a rebuild but haven't written the entire volume once. - return UDS_CORRUPT_COMPONENT; - } - } - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int searchChapterIndexPage(DeltaIndexPage *chapterIndexPage, - const Geometry *geometry, - const UdsChunkName *name, - int *recordPagePtr) -{ - DeltaIndex *deltaIndex = &chapterIndexPage->deltaIndex; - unsigned int address = hashToChapterDeltaAddress(name, geometry); - unsigned int deltaListNumber = hashToChapterDeltaList(name, geometry); - unsigned int subListNumber - = deltaListNumber - chapterIndexPage->lowestListNumber;; - DeltaIndexEntry entry; - int result = getDeltaIndexEntry(deltaIndex, subListNumber, address, - name->name, true, &entry); - if (result != UDS_SUCCESS) { - return result; - } - - if (wasEntryFound(&entry, address)) { - *recordPagePtr = getDeltaEntryValue(&entry); - } else { - *recordPagePtr = NO_CHAPTER_INDEX_ENTRY; - } - return UDS_SUCCESS; -} diff --git a/uds/chapterIndex.h b/uds/chapterIndex.h deleted file mode 100644 index 4dd425b..0000000 --- a/uds/chapterIndex.h +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/chapterIndex.h#4 $ - */ - -#ifndef CHAPTER_INDEX_H -#define CHAPTER_INDEX_H 1 - -#include "deltaIndex.h" -#include "geometry.h" - -enum { - // The value returned as the record page number when an entry is not found - // in the chapter index. - NO_CHAPTER_INDEX_ENTRY = -1 -}; - -typedef struct openChapterIndex { - const Geometry *geometry; - DeltaIndex deltaIndex; - uint64_t virtualChapterNumber; - bool headerNativeEndian; - uint64_t volumeNonce; -} OpenChapterIndex; - - -/** - * Make a new open chapter index. 
- * - * @param openChapterIndex Location to hold new open chapter index pointer - * @param geometry The geometry - * @param chapterIndexHeaderNativeEndian chapter index header format - * @param volumeNonce The volume nonce. - * - * @return error code or UDS_SUCCESS - **/ -int makeOpenChapterIndex(OpenChapterIndex **openChapterIndex, - const Geometry *geometry, - bool chapterIndexHeaderNativeEndian, - uint64_t volumeNonce) - __attribute__((warn_unused_result)); - -/** - * Terminate and clean up an open chapter index. - * - * @param openChapterIndex The open chapter index to terminate - **/ -void freeOpenChapterIndex(OpenChapterIndex *openChapterIndex); - -/** - * Empty an open chapter index, and prepare it for writing a new virtual - * chapter. - * - * @param openChapterIndex The open chapter index to empty - * @param virtualChapterNumber The virtual chapter number - **/ -void emptyOpenChapterIndex(OpenChapterIndex *openChapterIndex, - uint64_t virtualChapterNumber); - -/** - * Create a new record in an open chapter index, associating a chunk name with - * the number of the record page containing the metadata for the chunk. - * - * @param openChapterIndex The open chapter index - * @param name The chunk name - * @param pageNumber The number of the record page containing the name - * - * @return UDS_SUCCESS or an error code - **/ -int putOpenChapterIndexRecord(OpenChapterIndex *openChapterIndex, - const UdsChunkName *name, - unsigned int pageNumber) - __attribute__((warn_unused_result)); - -/** - * Pack a section of an open chapter index into a chapter index page. A - * range of delta lists (starting with a specified list index) is copied - * from the open chapter index into a memory page. The number of lists - * copied onto the page is returned to the caller. - * - * @param openChapterIndex The open chapter index - * @param memory The memory page to use - * @param firstList The first delta list number to be copied - * @param lastPage If true, this is the last page of the chapter - * index and all the remaining lists must be packed - * onto this page - * @param numLists The number of delta lists that were copied - * - * @return error code or UDS_SUCCESS. On UDS_SUCCESS, the numLists - * argument contains the number of lists copied. - **/ -int packOpenChapterIndexPage(OpenChapterIndex *openChapterIndex, - byte *memory, - unsigned int firstList, - bool lastPage, - unsigned int *numLists) - __attribute__((warn_unused_result)); - -/** - * Get the number of records in an open chapter index. - * - * @param openChapterIndex The open chapter index - * - * @return The number of records - **/ -int getOpenChapterIndexSize(OpenChapterIndex *openChapterIndex) - __attribute__((warn_unused_result)); - -/** - * Get the number of bytes allocated for the open chapter index. - * - * @param openChapterIndex The open chapter index - * - * @return the number of bytes allocated - **/ -size_t getOpenChapterIndexMemoryAllocated(OpenChapterIndex *openChapterIndex); - -/** - * Make a new chapter index page, initializing it with the data from the - * given buffer. 
- * - * @param chapterIndexPage The new chapter index page - * @param geometry The geometry - * @param indexPage The memory page to use - * @param volumeNonce If non-zero, the volume nonce to verify - * - * @return UDS_SUCCESS or an error code - **/ -int initializeChapterIndexPage(DeltaIndexPage *chapterIndexPage, - const Geometry *geometry, - byte *indexPage, - uint64_t volumeNonce) - __attribute__((warn_unused_result)); - -/** - * Validate a chapter index page. This is called at rebuild time to ensure - * that the volume file contains a coherent chapter index. - * - * @param chapterIndexPage The chapter index page - * @param geometry The geometry of the volume - * - * @return The result code: - * UDS_SUCCESS for a good chapter index page - * UDS_CORRUPT_COMPONENT if the chapter index code detects invalid data - * UDS_CORRUPT_DATA if there is a problem in a delta list bit stream - * UDS_BAD_STATE if the code follows an invalid code path - **/ -int validateChapterIndexPage(const DeltaIndexPage *chapterIndexPage, - const Geometry *geometry) - __attribute__((warn_unused_result)); - -/** - * Search a chapter index page for a chunk name, returning the record page - * number that may contain the name. - * - * @param [in] chapterIndexPage The chapter index page - * @param [in] geometry The geometry of the volume - * @param [in] name The chunk name - * @param [out] recordPagePtr The record page number - * or NO_CHAPTER_INDEX_ENTRY if not found - * - * @return UDS_SUCCESS or an error code - **/ -int searchChapterIndexPage(DeltaIndexPage *chapterIndexPage, - const Geometry *geometry, - const UdsChunkName *name, - int *recordPagePtr) - __attribute__((warn_unused_result)); - -#endif /* CHAPTER_INDEX_H */ diff --git a/uds/chapterWriter.c b/uds/chapterWriter.c deleted file mode 100644 index 3a926ab..0000000 --- a/uds/chapterWriter.c +++ /dev/null @@ -1,274 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- *
- * $Id: //eng/uds-releases/jasper/src/uds/chapterWriter.c#2 $
- */
-
-#include "chapterWriter.h"
-
-#include "errors.h"
-#include "index.h"
-#include "indexCheckpoint.h"
-#include "indexComponent.h"
-#include "logger.h"
-#include "memoryAlloc.h"
-#include "openChapter.h"
-#include "threads.h"
-
-
-struct chapterWriter {
-  /* The index to which we belong */
-  Index *index;
-  /* The thread to do the writing */
-  Thread thread;
-  /* lock protecting the following fields */
-  Mutex mutex;
-  /* condition signalled on state changes */
-  CondVar cond;
-  /* Set to true to stop the thread */
-  bool stop;
-  /* The result from the most recent write */
-  int result;
-  /* The number of bytes allocated by the chapter writer */
-  size_t memoryAllocated;
-  /* The number of zones which have submitted a chapter for writing */
-  unsigned int zonesToWrite;
-  /* Open chapter index used by closeOpenChapter() */
-  OpenChapterIndex *openChapterIndex;
-  /* Collated records used by closeOpenChapter() */
-  UdsChunkRecord *collatedRecords;
-  /* The chapters to write (one per zone) */
-  OpenChapterZone *chapters[];
-};
-
-/**
- * This is the driver function for the writer thread. It loops until
- * terminated, waiting for a chapter to be provided for it to close.
- **/
-static void closeChapters(void *arg)
-{
-  ChapterWriter *writer = arg;
-  logDebug("chapter writer starting");
-  lockMutex(&writer->mutex);
-  for (;;) {
-    while (writer->zonesToWrite < writer->index->zoneCount) {
-      if (writer->stop && (writer->zonesToWrite == 0)) {
-        // We've been told to stop, and all of the zones are in the same
-        // open chapter, so we can exit now.
-        unlockMutex(&writer->mutex);
-        logDebug("chapter writer stopping");
-        return;
-      }
-      waitCond(&writer->cond, &writer->mutex);
-    }
-
-    /*
-     * Release the lock while closing a chapter. We probably don't need to do
-     * this, but it seems safer in principle. It's OK to access the chapter
-     * and chapterNumber fields without the lock since those aren't allowed to
-     * change until we're done.
-     */
-    unlockMutex(&writer->mutex);
-
-    if (writer->index->hasSavedOpenChapter) {
-      writer->index->hasSavedOpenChapter = false;
-      /*
-       * Remove the saved open chapter as that chapter is about to be written
-       * to the volume. This matters the first time we close the open chapter
-       * after loading from a clean shutdown, or after doing a clean save.
- */ - IndexComponent *oc = findIndexComponent(writer->index->state, - &OPEN_CHAPTER_INFO); - int result = discardIndexComponent(oc); - if (result == UDS_SUCCESS) { - logDebug("Discarding saved open chapter"); - } - } - - int result = closeOpenChapter(writer->chapters, - writer->index->zoneCount, - writer->index->volume, - writer->openChapterIndex, - writer->collatedRecords, - writer->index->newestVirtualChapter); - - if (result == UDS_SUCCESS) { - result = processChapterWriterCheckpointSaves(writer->index); - } - - - lockMutex(&writer->mutex); - // Note that the index is totally finished with the writing chapter - advanceActiveChapters(writer->index); - writer->result = result; - writer->zonesToWrite = 0; - broadcastCond(&writer->cond); - } -} - -/**********************************************************************/ -int makeChapterWriter(Index *index, - const struct index_version *indexVersion, - ChapterWriter **writerPtr) -{ - size_t collatedRecordsSize - = (sizeof(UdsChunkRecord) - * (1 + index->volume->geometry->recordsPerChapter)); - ChapterWriter *writer; - int result = ALLOCATE_EXTENDED(ChapterWriter, - index->zoneCount, OpenChapterZone *, - "Chapter Writer", &writer); - if (result != UDS_SUCCESS) { - return result; - } - writer->index = index; - - result = initMutex(&writer->mutex); - if (result != UDS_SUCCESS) { - FREE(writer); - return result; - } - result = initCond(&writer->cond); - if (result != UDS_SUCCESS) { - destroyMutex(&writer->mutex); - FREE(writer); - return result; - } - - // Now that we have the mutex+cond, it is safe to call freeChapterWriter. - result = allocateCacheAligned(collatedRecordsSize, "collated records", - &writer->collatedRecords); - if (result != UDS_SUCCESS) { - freeChapterWriter(writer); - return makeUnrecoverable(result); - } - result = makeOpenChapterIndex(&writer->openChapterIndex, - index->volume->geometry, - indexVersion->chapterIndexHeaderNativeEndian, - index->volume->nonce); - if (result != UDS_SUCCESS) { - freeChapterWriter(writer); - return makeUnrecoverable(result); - } - - size_t openChapterIndexMemoryAllocated - = getOpenChapterIndexMemoryAllocated(writer->openChapterIndex); - writer->memoryAllocated = (sizeof(ChapterWriter) - + index->zoneCount * sizeof(OpenChapterZone *) - + collatedRecordsSize - + openChapterIndexMemoryAllocated); - - // We're initialized, so now it's safe to start the writer thread. 
- result = createThread(closeChapters, writer, "writer", &writer->thread); - if (result != UDS_SUCCESS) { - freeChapterWriter(writer); - return makeUnrecoverable(result); - } - - *writerPtr = writer; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void freeChapterWriter(ChapterWriter *writer) -{ - if (writer == NULL) { - return; - } - - int result __attribute__((unused)) = stopChapterWriter(writer); - destroyMutex(&writer->mutex); - destroyCond(&writer->cond); - freeOpenChapterIndex(writer->openChapterIndex); - FREE(writer->collatedRecords); - FREE(writer); -} - -/**********************************************************************/ -unsigned int startClosingChapter(ChapterWriter *writer, - unsigned int zoneNumber, - OpenChapterZone *chapter) -{ - lockMutex(&writer->mutex); - unsigned int finishedZones = ++writer->zonesToWrite; - writer->chapters[zoneNumber] = chapter; - broadcastCond(&writer->cond); - unlockMutex(&writer->mutex); - - return finishedZones; -} - -/**********************************************************************/ -int finishPreviousChapter(ChapterWriter *writer, uint64_t currentChapterNumber) -{ - int result; - lockMutex(&writer->mutex); - while (writer->index->newestVirtualChapter < currentChapterNumber) { - waitCond(&writer->cond, &writer->mutex); - } - result = writer->result; - unlockMutex(&writer->mutex); - - if (result != UDS_SUCCESS) { - return logUnrecoverable(result, "Writing of previous open chapter failed"); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -void waitForIdleChapterWriter(ChapterWriter *writer) -{ - lockMutex(&writer->mutex); - while (writer->zonesToWrite > 0) { - // The chapter writer is probably writing a chapter. If it is not, it will - // soon wake up and write a chapter. - waitCond(&writer->cond, &writer->mutex); - } - unlockMutex(&writer->mutex); -} - -/**********************************************************************/ -int stopChapterWriter(ChapterWriter *writer) -{ - Thread writerThread = 0; - - lockMutex(&writer->mutex); - if (writer->thread != 0) { - writerThread = writer->thread; - writer->thread = 0; - writer->stop = true; - broadcastCond(&writer->cond); - } - int result = writer->result; - unlockMutex(&writer->mutex); - - if (writerThread != 0) { - joinThreads(writerThread); - } - - if (result != UDS_SUCCESS) { - return logUnrecoverable(result, "Writing of previous open chapter failed"); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -size_t getChapterWriterMemoryAllocated(ChapterWriter *writer) -{ - return writer->memoryAllocated; -} diff --git a/uds/chapterWriter.h b/uds/chapterWriter.h deleted file mode 100644 index 85c1f42..0000000 --- a/uds/chapterWriter.h +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/chapterWriter.h#2 $ - */ - -#ifndef CHAPTER_WRITER_H -#define CHAPTER_WRITER_H - -#include "atomicDefs.h" -#include "indexVersion.h" -#include "openChapterZone.h" - -typedef struct chapterWriter ChapterWriter; - -// This opaque declaration breaks the dependency loop with index.h -struct index; - - -/** - * Create a chapter writer and start its thread. - * - * @param index the index containing the chapters to be written - * @param indexVersion the index version parameters - * @param writerPtr pointer to hold the new writer - * - * @return UDS_SUCCESS or an error code - **/ -int makeChapterWriter(struct index *index, - const struct index_version *indexVersion, - ChapterWriter **writerPtr) - __attribute__((warn_unused_result)); - -/** - * Free a chapter writer, waiting for its thread to finish. - * - * @param writer the chapter writer to destroy - **/ -void freeChapterWriter(ChapterWriter *writer); - -/** - * Asychronously close and write a chapter by passing it to the writer - * thread. Writing won't start until all zones have submitted a chapter. - * - * @param writer the chapter writer - * @param zoneNumber the number of the zone submitting a chapter - * @param chapter the chapter to write - * - * @return The number of zones which have submitted the current chapter - **/ -unsigned int startClosingChapter(ChapterWriter *writer, - unsigned int zoneNumber, - OpenChapterZone *chapter) - __attribute__((warn_unused_result)); - -/** - * Wait for the chapter writer thread to finish closing the chapter previous - * to the one specified. - * - * @param writer the chapter writer - * @param currentChapterNumber the currentChapter number - * - * @return UDS_SUCCESS or an error code from the most recent write - * request - **/ -int finishPreviousChapter(ChapterWriter *writer, uint64_t currentChapterNumber) - __attribute__((warn_unused_result)); - - -/** - * Wait for the chapter writer thread to finish all writes to storage. - * - * @param writer the chapter writer - **/ -void waitForIdleChapterWriter(ChapterWriter *writer); - -/** - * Stop the chapter writer and wait for it to finish. - * - * @param writer the chapter writer to stop - * - * @return UDS_SUCCESS or an error code from the most recent write - * request - **/ -int stopChapterWriter(ChapterWriter *writer) - __attribute__((warn_unused_result)); - -/** - * Get the number of bytes allocated for the chapter writer. - * - * @param writer the chapter writer - * - * @return the number of bytes allocated - **/ -size_t getChapterWriterMemoryAllocated(ChapterWriter *writer); - -#endif /* CHAPTER_WRITER_H */ diff --git a/uds/common.h b/uds/common.h deleted file mode 100644 index bea27e5..0000000 --- a/uds/common.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
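
Taken together, the chapter writer functions above are driven from the index's zone threads. The following sketch shows a plausible call sequence for one zone; the "index->chapterWriter" field name, the helper itself, and the exact sequencing are assumptions made for illustration, and error handling is abbreviated.

    static int closeZoneChapter(Index *index, unsigned int zone,
                                OpenChapterZone *chapter,
                                uint64_t currentChapter)
    {
      // Wait until the previous chapter has been written before this zone
      // reuses its buffers.
      int result = finishPreviousChapter(index->chapterWriter, currentChapter);
      if (result != UDS_SUCCESS) {
        return result;
      }
      // Hand this zone's open chapter to the writer thread; the write only
      // starts once every zone has submitted its chapter.
      unsigned int zonesSubmitted
        = startClosingChapter(index->chapterWriter, zone, chapter);
      if (zonesSubmitted == index->zoneCount) {
        logDebug("zone %u was the last to submit chapter %llu",
                 zone, currentChapter);
      }
      return UDS_SUCCESS;
    }
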
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/common.h#1 $ - */ - -#ifndef COMMON_H -#define COMMON_H - -#include "stringUtils.h" -#include "typeDefs.h" -#include "uds.h" -#include "uds-block.h" - -enum { - KILOBYTE = 1024, - MEGABYTE = KILOBYTE * KILOBYTE, - GIGABYTE = KILOBYTE * MEGABYTE -}; - -typedef struct udsChunkData UdsChunkData; - -typedef struct { - UdsChunkName name; - UdsChunkData data; -} UdsChunkRecord; - -#endif /* COMMON_H */ diff --git a/uds/compiler.h b/uds/compiler.h deleted file mode 100644 index cd57590..0000000 --- a/uds/compiler.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/compiler.h#1 $ - */ - -#ifndef COMMON_COMPILER_H -#define COMMON_COMPILER_H - -#include "compilerDefs.h" - -// Count the elements in a static array while attempting to catch some type -// errors. (See http://stackoverflow.com/a/1598827 for an explanation.) -#define COUNT_OF(x) ((sizeof(x) / sizeof(0[x])) \ - / ((size_t) (!(sizeof(x) % sizeof(0[x]))))) - -#define const_container_of(ptr, type, member) \ - __extension__ ({ \ - const __typeof__(((type *)0)->member) *__mptr = (ptr); \ - (const type *)((const char *)__mptr - offsetof(type,member)); \ - }) - -// The "inline" keyword alone takes affect only when the optimization level -// is high enough. Define INLINE to force the gcc to "always inline". -#define INLINE __attribute__((always_inline)) inline - -#endif /* COMMON_COMPILER_H */ diff --git a/uds/compilerDefs.h b/uds/compilerDefs.h deleted file mode 100644 index cc81ce2..0000000 --- a/uds/compilerDefs.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
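
The COUNT_OF macro above is the usual array-length idiom with a type check folded in: 0[x] is just another spelling of x[0], and when x is accidentally a pointer rather than an array and the pointee size does not evenly divide the pointer size, sizeof(x) % sizeof(0[x]) is nonzero, the divisor collapses to zero, and the mistake surfaces as a compile-time diagnostic instead of a silently wrong count (hence "attempting to catch some type errors" rather than all of them). A minimal usage sketch, with an invented array:

    static const unsigned int primes[] = { 2, 3, 5, 7, 11 };
    unsigned int count = COUNT_OF(primes);   // 5, computed at compile time
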
- * - * $Id: //eng/uds-releases/jasper/kernelLinux/uds/compilerDefs.h#1 $ - */ - -#ifndef LINUX_KERNEL_COMPILER_DEFS_H -#define LINUX_KERNEL_COMPILER_DEFS_H - -#include - -#define __STRING(x) #x - -#endif /* LINUX_KERNEL_COMPILER_DEFS_H */ diff --git a/uds/config.c b/uds/config.c deleted file mode 100644 index a953da3..0000000 --- a/uds/config.c +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/config.c#2 $ - */ - -#include "config.h" - -#include "logger.h" -#include "memoryAlloc.h" -#include "stringUtils.h" - -/**********************************************************************/ -void freeIndexLocation(IndexLocation *loc) -{ - if (loc == NULL) { - return; - } - - FREE(loc->host); - FREE(loc->port); - FREE(loc->directory); -} - -/**********************************************************************/ -bool areUdsConfigurationsEqual(UdsConfiguration a, UdsConfiguration b) -{ - bool result = true; - if (a->recordPagesPerChapter != b->recordPagesPerChapter) { - logError("Record pages per chapter (%u) does not match (%u)", - a->recordPagesPerChapter, b->recordPagesPerChapter); - result = false; - } - if (a->chaptersPerVolume != b->chaptersPerVolume) { - logError("Chapter count (%u) does not match (%u)", - a->chaptersPerVolume, b->chaptersPerVolume); - result = false; - } - if (a->sparseChaptersPerVolume != b->sparseChaptersPerVolume) { - logError("Sparse chapter count (%u) does not match (%u)", - a->sparseChaptersPerVolume, b->sparseChaptersPerVolume); - result = false; - } - if (a->cacheChapters != b->cacheChapters) { - logError("Cache size (%u) does not match (%u)", - a->cacheChapters, b->cacheChapters); - result = false; - } - if (a->masterIndexMeanDelta != b->masterIndexMeanDelta) { - logError("Master index mean delta (%u) does not match (%u)", - a->masterIndexMeanDelta, b->masterIndexMeanDelta); - result = false; - } - if (a->bytesPerPage != b->bytesPerPage) { - logError("Bytes per page value (%u) does not match (%u)", - a->bytesPerPage, b->bytesPerPage); - result = false; - } - if (a->sparseSampleRate != b->sparseSampleRate) { - logError("Sparse sample rate (%u) does not match (%u)", - a->sparseSampleRate, b->sparseSampleRate); - result = false; - } - if (a->nonce != b->nonce) { - logError("Nonce (%llu) does not match (%llu)", - a->nonce, b->nonce); - result = false; - } - return result; -} - -/**********************************************************************/ -void logUdsConfiguration(UdsConfiguration conf) -{ - logDebug("Configuration:"); - logDebug(" Record pages per chapter: %10u", conf->recordPagesPerChapter); - logDebug(" Chapters per volume: %10u", conf->chaptersPerVolume); - logDebug(" Sparse chapters per volume: %10u", conf->sparseChaptersPerVolume); - logDebug(" Cache size 
(chapters): %10u", conf->cacheChapters); - logDebug(" Master index mean delta: %10u", conf->masterIndexMeanDelta); - logDebug(" Bytes per page: %10u", conf->bytesPerPage); - logDebug(" Sparse sample rate: %10u", conf->sparseSampleRate); - logDebug(" Nonce: %llu", conf->nonce); -} diff --git a/uds/config.h b/uds/config.h deleted file mode 100644 index f31efab..0000000 --- a/uds/config.h +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/config.h#2 $ - */ - -#ifndef CONFIG_H -#define CONFIG_H - -#include "bufferedReader.h" -#include "bufferedWriter.h" -#include "geometry.h" -#include "uds.h" - -enum { - DEFAULT_MASTER_INDEX_MEAN_DELTA = 4096, - DEFAULT_CACHE_CHAPTERS = 7, - DEFAULT_SPARSE_SAMPLE_RATE = 0 -}; - -/** - * Data that are used for configuring a new index. - **/ -struct udsConfiguration { - /** Smaller (16), Small (64) or large (256) indices */ - unsigned int recordPagesPerChapter; - /** Total number of chapters per volume */ - unsigned int chaptersPerVolume; - /** Number of sparse chapters per volume */ - unsigned int sparseChaptersPerVolume; - /** Size of the page cache, in chapters */ - unsigned int cacheChapters; - /** Frequency with which to checkpoint */ - // XXX the checkpointFrequency is not used - it is now a runtime parameter - unsigned int checkpointFrequency; - /** The master index mean delta to use */ - unsigned int masterIndexMeanDelta; - /** Size of a page, used for both record pages and index pages */ - unsigned int bytesPerPage; - /** Sampling rate for sparse indexing */ - unsigned int sparseSampleRate; - /** Index Owner's nonce */ - UdsNonce nonce; -}; - -/** - * Data that are used for a 6.01 index. - **/ -struct udsConfiguration6_01 { - /** Smaller (16), Small (64) or large (256) indices */ - unsigned int recordPagesPerChapter; - /** Total number of chapters per volume */ - unsigned int chaptersPerVolume; - /** Number of sparse chapters per volume */ - unsigned int sparseChaptersPerVolume; - /** Size of the page cache, in chapters */ - unsigned int cacheChapters; - /** Frequency with which to checkpoint */ - unsigned int checkpointFrequency; - /** The master index mean delta to use */ - unsigned int masterIndexMeanDelta; - /** Size of a page, used for both record pages and index pages */ - unsigned int bytesPerPage; - /** Sampling rate for sparse indexing */ - unsigned int sparseSampleRate; -}; - -typedef struct indexLocation { - char *host; - char *port; - char *directory; -} IndexLocation; - -/** - * A set of configuration parameters for the indexer. - **/ -typedef struct configuration Configuration; - -/** - * Construct a new indexer configuration. 
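
For a sense of how these fields fit together, a dense (non-sparse) configuration using the defaults above might be filled in as follows; every value not taken from a DEFAULT_* constant is invented for the example, not a recommendation.

    struct udsConfiguration example = {
      .recordPagesPerChapter   = 64,   // a "small" index, per the field comment
      .chaptersPerVolume       = 1024,
      .sparseChaptersPerVolume = 0,    // dense: no sparse chapters
      .cacheChapters           = DEFAULT_CACHE_CHAPTERS,           // 7
      .checkpointFrequency     = 0,    // unused; see the XXX note above
      .masterIndexMeanDelta    = DEFAULT_MASTER_INDEX_MEAN_DELTA,  // 4096
      .bytesPerPage            = 4096,
      .sparseSampleRate        = DEFAULT_SPARSE_SAMPLE_RATE,       // 0
      .nonce                   = 0,    // filled in elsewhere
    };
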
- * - * @param conf UdsConfiguration to use - * @param configPtr The new index configuration - * - * @return UDS_SUCCESS or an error code - **/ -int makeConfiguration(UdsConfiguration conf, - Configuration **configPtr) - __attribute__((warn_unused_result)); - -/** - * Clean up the configuration struct. - **/ -void freeConfiguration(Configuration *config); - -/** - * Read the index configuration from stable storage. - * - * @param reader A buffered reader. - * @param config The index configuration to overwrite. - * - * @return UDS_SUCCESS or an error code. - **/ -int readConfigContents(BufferedReader *reader, - UdsConfiguration config) - __attribute__((warn_unused_result)); - -/** - * Write the index configuration information to stable storage. - * - * @param writer A buffered writer. - * @param config The index configuration. - * - * @return UDS_SUCCESS or an error code. - **/ -int writeConfigContents(BufferedWriter *writer, - UdsConfiguration config) - __attribute__((warn_unused_result)); - -/** - * Free the memory used by an IndexLocation. - * - * @param loc index location to free - **/ -void freeIndexLocation(IndexLocation *loc); - -/** - * Compare two configurations for equality. - * - * @param a The first configuration to compare - * @param b The second configuration to compare - * - * @return true iff they are equal - **/ -bool areUdsConfigurationsEqual(UdsConfiguration a, UdsConfiguration b) - __attribute__((warn_unused_result)); - -/** - * Log a user configuration. - * - * @param conf The configuration - **/ -void logUdsConfiguration(UdsConfiguration conf); - -#endif /* CONFIG_H */ diff --git a/uds/cpu.h b/uds/cpu.h deleted file mode 100644 index 9314985..0000000 --- a/uds/cpu.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/cpu.h#1 $ - */ - -#ifndef CPU_H -#define CPU_H - -#include "compiler.h" -#include "typeDefs.h" - -/** - * The number of bytes in a CPU cache line. In the future, we'll probably need - * to move this to a processor-specific file or discover it at compilation - * time (or runtime, if sufficiently heterogeneous), but this will do for now. - * (Must be a \#define since enums are not proper compile-time constants.) - **/ -#ifdef __PPC__ -// N.B.: Some PPC processors have smaller cache lines. -#define CACHE_LINE_BYTES 128 -#elif defined(__s390x__) -#define CACHE_LINE_BYTES 256 -#elif defined(__x86_64__) || defined(__aarch64__) -#define CACHE_LINE_BYTES 64 -#else -#error "unknown cache line size" -#endif - -/** - * Minimize cache-miss latency by moving data into a CPU cache before it is - * accessed. 
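
readConfigContents() and areUdsConfigurationsEqual() above are the pieces a caller needs to check that an index found on storage matches the configuration it was asked to open. A sketch follows, assuming (as the dereferences above imply) that UdsConfiguration is a pointer typedef for struct udsConfiguration; the use of UDS_CORRUPT_COMPONENT for a mismatch is only a placeholder, not the library's actual convention.

    static int checkSavedConfig(BufferedReader *reader,
                                UdsConfiguration requested)
    {
      struct udsConfiguration saved;
      int result = readConfigContents(reader, &saved);
      if (result != UDS_SUCCESS) {
        return result;
      }
      if (!areUdsConfigurationsEqual(&saved, requested)) {
        logUdsConfiguration(&saved);
        return UDS_CORRUPT_COMPONENT;   // placeholder error code
      }
      return UDS_SUCCESS;
    }
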
- * - * @param address the address to fetch (may be invalid) - * @param forWrite must be constant at compile time--false if - * for reading, true if for writing - **/ -static INLINE void prefetchAddress(const void *address, bool forWrite) -{ - // forWrite won't won't be a constant if we are compiled with optimization - // turned off, in which case prefetching really doesn't matter. - if (__builtin_constant_p(forWrite)) { - __builtin_prefetch(address, forWrite); - } -} - -/** - * Minimize cache-miss latency by moving a range of addresses into a - * CPU cache before they are accessed. - * - * @param start the starting address to fetch (may be invalid) - * @param size the number of bytes in the address range - * @param forWrite must be constant at compile time--false if - * for reading, true if for writing - **/ -static INLINE void prefetchRange(const void *start, - unsigned int size, - bool forWrite) -{ - // Count the number of cache lines to fetch, allowing for the address range - // to span an extra cache line boundary due to address alignment. - const char *address = (const char *) start; - unsigned int offset = ((uintptr_t) address % CACHE_LINE_BYTES); - size += offset; - - unsigned int cacheLines = (1 + (size / CACHE_LINE_BYTES)); - while (cacheLines-- > 0) { - prefetchAddress(address, forWrite); - address += CACHE_LINE_BYTES; - } -} - -#endif /* CPU_H */ diff --git a/uds/deltaIndex.c b/uds/deltaIndex.c deleted file mode 100644 index 0c43e9b..0000000 --- a/uds/deltaIndex.c +++ /dev/null @@ -1,1707 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/deltaIndex.c#7 $ - */ -#include "deltaIndex.h" - -#include "bits.h" -#include "buffer.h" -#include "compiler.h" -#include "cpu.h" -#include "errors.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "stringUtils.h" -#include "typeDefs.h" -#include "uds.h" -#include "zone.h" - -/* - * A delta index is a key-value store, where each entry maps an address - * (the key) to a payload (the value). The entries are sorted by address, - * and only the delta between successive addresses is stored in the entry. - * The addresses are assumed to be uniformly distributed,and the deltas are - * therefore exponentially distributed. - * - * The entries could be stored in a single DeltaList, but for efficiency we - * use multiple DeltaLists. These lists are stored in a single chunk of - * memory managed by the DeltaMemory module. The DeltaMemory module can - * move the data around in memory, so we never keep any byte pointers into - * DeltaList memory. We only keep offsets into the memory. - * - * The delta lists are stored as bit streams. These bit streams are stored - * in little endian order, and all offsets into DeltaMemory are bit - * offsets. 
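
As a concrete illustration of prefetchRange() above, assume the 64-byte cache lines used on x86_64 or aarch64 and a 100-byte range that begins 60 bytes into a cache line: offset is 60, size is padded to 160, and the loop issues 1 + 160/64 = 3 prefetches, covering the partially used first line, one full line, and the partially used last line.
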
- * - * All entries are stored as a fixed length payload (the value) followed by a - * variable length key (the delta). Always strictly in little endian order. - * - * A collision entry is used when two block names have the same delta list - * address. A collision entry is encoded with DELTA==0, and has 256 - * extension bits containing the full block name. - * - * There is a special exception to be noted. The DELTA==0 encoding usually - * indicates a collision with the preceding entry. But for the first entry - * in any delta list there is no preceding entry, so the DELTA==0 encoding - * at the beginning of a delta list indicates a normal entry. - * - * The Huffman code is driven by 3 parameters: - * - * MINBITS This is the number of bits in the smallest code - * - * BASE This is the number of values coded using a code of length MINBITS - * - * INCR This is the number of values coded by using one additional bit. - * - * These parameters are related by: - * - * BASE + INCR == 1 << MINBITS - * - * When we create an index, we need to know the mean delta. From the mean - * delta, we compute these three parameters. The math for the Huffman code - * of an exponential distribution says that we compute: - * - * INCR = log(2) * MEAN_DELTA - * - * Then we find the smallest MINBITS so that - * - * 1 << MINBITS > INCR - * - * And then: - * - * BASE = (1 << MINBITS) - INCR - * - * Now we need a code such that - * - * - The first BASE values code using MINBITS bits - * - The next INCR values code using MINBITS+1 bits. - * - The next INCR values code using MINBITS+2 bits. - * - The next INCR values code using MINBITS+3 bits. - * - (and so on). - * - * ENCODE(DELTA): - * - * if (DELTA < BASE) { - * put DELTA in MINBITS bits; - * } else { - * T1 = (DELTA - BASE) % INCR + BASE; - * T2 = (DELTA - BASE) / INCR; - * put T1 in MINBITS bits; - * put 0 in T2 bits; - * put 1 in 1 bit; - * } - * - * DECODE(BIT_STREAM): - * - * T1 = next MINBITS bits of stream; - * if (T1 < BASE) { - * DELTA = T1; - * } else { - * Scan bits in the stream until reading a 1, - * setting T2 to the number of 0 bits read; - * DELTA = T2 * INCR + T1; - * } - * - * The bit field utilities that we use on the delta lists assume that it is - * possible to read a few bytes beyond the end of the bit field. So we - * make sure to allocates some extra bytes at the end of memory containing - * the delta lists. Look for POST_FIELD_GUARD_BYTES to find the code - * related to this. - * - * And note that the decode bit stream code includes a step that skips over - * 0 bits until the first 1 bit is found. A corrupted delta list could - * cause this step to run off the end of the delta list memory. As an - * extra protection against this happening, the guard bytes at the end - * should be set to all ones. - */ - -/** - * Constants and structures for the saved delta index. "DI" is for - * deltaIndex, and -##### is a number to increment when the format of the - * data changes. - **/ -enum { MAGIC_SIZE = 8 }; -static const char MAGIC_DI_START[] = "DI-00002"; - -struct di_header { - char magic[MAGIC_SIZE]; // MAGIC_DI_START - uint32_t zoneNumber; - uint32_t numZones; - uint32_t firstList; - uint32_t numLists; - uint64_t recordCount; - uint64_t collisionCount; -}; - -//********************************************************************** -// Methods for dealing with mutable delta list headers -//********************************************************************** - -/** - * Move the start of the delta list bit stream without moving the end. 
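
For concreteness, plugging the default mean delta of 4096 (DEFAULT_MASTER_INDEX_MEAN_DELTA in config.h) into the derivation above gives roughly INCR = log(2) * 4096, about 2839, so the smallest MINBITS with 1 << MINBITS > INCR is 12, and BASE = 4096 - 2839 = 1257 (note that BASE + INCR = 4096 = 1 << 12, as required). Deltas 0 through 1256 then cost 12 bits, the next 2839 values (1257 through 4095) cost 13 bits, the 2839 after that cost 14 bits, and so on. The implementation may round INCR slightly differently, but the shape of the code is the same.
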
- * - * @param deltaList The delta list header - * @param increment The change in the start of the delta list - **/ -static INLINE void moveDeltaListStart(DeltaList *deltaList, int increment) -{ - deltaList->startOffset += increment; - deltaList->size -= increment; -} - -/** - * Move the end of the delta list bit stream without moving the start. - * - * @param deltaList The delta list header - * @param increment The change in the end of the delta list - **/ -static INLINE void moveDeltaListEnd(DeltaList *deltaList, int increment) -{ - deltaList->size += increment; -} - -//********************************************************************** -// Methods for dealing with immutable delta list headers packed -//********************************************************************** - -// Header data used for immutable delta index pages. These data are -// followed by the delta list offset table. -typedef struct __attribute__((packed)) deltaPageHeader { - uint64_t nonce; // Externally-defined nonce - uint64_t virtualChapterNumber; // The virtual chapter number - uint16_t firstList; // Index of the first delta list on the page - uint16_t numLists; // Number of delta lists on the page -} DeltaPageHeader; - -// Immutable delta lists are packed into pages containing a header that -// encodes the delta list information into 19 bits per list (64KB bit offset) - -enum { IMMUTABLE_HEADER_SIZE = 19 }; - -/** - * Get the bit offset to the immutable delta list header - * - * @param listNumber The delta list number - * - * @return the offset of immutable delta list header - **/ -static INLINE unsigned int getImmutableHeaderOffset(unsigned int listNumber) -{ - return (sizeof(DeltaPageHeader) * CHAR_BIT - + listNumber * IMMUTABLE_HEADER_SIZE); -} - -/** - * Get the bit offset to the start of the immutable delta list bit stream - * - * @param memory The memory page containing the delta lists - * @param listNumber The delta list number - * - * @return the start of the delta list - **/ -static INLINE unsigned int getImmutableStart(const byte *memory, - unsigned int listNumber) -{ - return getField(memory, getImmutableHeaderOffset(listNumber), - IMMUTABLE_HEADER_SIZE); -} - -/** - * Set the bit offset to the start of the immutable delta list bit stream - * - * @param memory The memory page containing the delta lists - * @param listNumber The delta list number - * @param startOffset The start of the delta list - **/ -static INLINE void setImmutableStart(byte *memory, unsigned int listNumber, - unsigned int startOffset) -{ - setField(startOffset, memory, getImmutableHeaderOffset(listNumber), - IMMUTABLE_HEADER_SIZE); -} - -//********************************************************************** -// Methods for dealing with Delta List Entries -//********************************************************************** - -/** - * Decode a delta index entry delta value. The DeltaIndexEntry basically - * describes the previous list entry, and has had its offset field changed to - * point to the subsequent entry. We decode the bit stream and update the - * DeltaListEntry to describe the entry. 
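
To make the immutable page layout above concrete: the packed DeltaPageHeader is 8 + 8 + 2 + 2 = 20 bytes, so getImmutableHeaderOffset(n) is 160 + 19 * n bits. With, say, 10 delta lists on a page, the offset table holds 11 entries (one extra marking the end of the last list) and ends at bit 160 + 11 * 19 = 369, which is exactly where the first list's data is expected to begin; verifyDeltaIndexPage() below checks precisely this relationship.
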
- * - * @param deltaEntry The delta index entry - **/ -static INLINE void decodeDelta(DeltaIndexEntry *deltaEntry) -{ - const DeltaMemory *deltaZone = deltaEntry->deltaZone; - const byte *memory = deltaZone->memory; - uint64_t deltaOffset - = getDeltaEntryOffset(deltaEntry) + deltaEntry->valueBits; - const byte *addr = memory + deltaOffset / CHAR_BIT; - int offset = deltaOffset % CHAR_BIT; - uint32_t data = getUInt32LE(addr) >> offset; - addr += sizeof(uint32_t); - int keyBits = deltaZone->minBits; - unsigned int delta = data & ((1 << keyBits) - 1); - if (delta >= deltaZone->minKeys) { - data >>= keyBits; - if (data == 0) { - keyBits = sizeof(uint32_t) * CHAR_BIT - offset; - while ((data = getUInt32LE(addr)) == 0) { - addr += sizeof(uint32_t); - keyBits += sizeof(uint32_t) * CHAR_BIT; - } - } - keyBits += ffs(data); - delta += (keyBits - deltaZone->minBits - 1) * deltaZone->incrKeys; - } - deltaEntry->delta = delta; - deltaEntry->key += delta; - - // Check for a collision, a delta of zero not at the start of the list. - if (unlikely((delta == 0) && (deltaEntry->offset > 0))) { - deltaEntry->isCollision = true; - // The small duplication of this math in the two arms of this if statement - // makes a tiny but measurable difference in performance. - deltaEntry->entryBits = deltaEntry->valueBits + keyBits + COLLISION_BITS; - } else { - deltaEntry->isCollision = false; - deltaEntry->entryBits = deltaEntry->valueBits + keyBits; - } -} - -/** - * Delete bits from a delta list at the offset of the specified delta index - * entry. - * - * @param deltaEntry The delta index entry - * @param size The number of bits to delete - **/ -static void deleteBits(const DeltaIndexEntry *deltaEntry, int size) -{ - DeltaList *deltaList = deltaEntry->deltaList; - byte *memory = deltaEntry->deltaZone->memory; - // Compute how many bits are retained before and after the deleted bits - uint32_t totalSize = getDeltaListSize(deltaList); - uint32_t beforeSize = deltaEntry->offset; - uint32_t afterSize = totalSize - deltaEntry->offset - size; - - // Determine whether to add to the available space either before or after - // the delta list. We prefer to move the least amount of data. If it is - // exactly the same, try to add to the smaller amount of free space. - bool beforeFlag; - if (beforeSize < afterSize) { - beforeFlag = true; - } else if (afterSize < beforeSize) { - beforeFlag = false; - } else { - uint64_t freeBefore - = getDeltaListStart(&deltaList[0]) - getDeltaListEnd(&deltaList[-1]); - uint64_t freeAfter - = getDeltaListStart(&deltaList[1]) - getDeltaListEnd(&deltaList[ 0]); - beforeFlag = freeBefore < freeAfter; - } - - uint64_t source, destination; - uint32_t count; - if (beforeFlag) { - source = getDeltaListStart(deltaList); - destination = source + size; - moveDeltaListStart(deltaList, size); - count = beforeSize; - } else { - moveDeltaListEnd(deltaList, -size); - destination = getDeltaListStart(deltaList) + deltaEntry->offset; - source = destination + size; - count = afterSize; - } - moveBits(memory, source, memory, destination, count); -} - -/** - * Get the offset of the collision field in a DeltaIndexEntry - * - * @param entry The delta index record - * - * @return the offset of the start of the collision name - **/ -static INLINE uint64_t getCollisionOffset(const DeltaIndexEntry *entry) -{ - return (getDeltaEntryOffset(entry) + entry->entryBits - COLLISION_BITS); -} - -/** - * Encode a delta index entry delta. 
- * - * @param deltaEntry The delta index entry - **/ -static void encodeDelta(const DeltaIndexEntry *deltaEntry) -{ - const DeltaMemory *deltaZone = deltaEntry->deltaZone; - byte *memory = deltaZone->memory; - uint64_t offset = getDeltaEntryOffset(deltaEntry) + deltaEntry->valueBits; - if (deltaEntry->delta < deltaZone->minKeys) { - setField(deltaEntry->delta, memory, offset, deltaZone->minBits); - return; - } - unsigned int temp = deltaEntry->delta - deltaZone->minKeys; - unsigned int t1 = (temp % deltaZone->incrKeys) + deltaZone->minKeys; - unsigned int t2 = temp / deltaZone->incrKeys; - setField(t1, memory, offset, deltaZone->minBits); - setZero(memory, offset + deltaZone->minBits, t2); - setOne(memory, offset + deltaZone->minBits + t2, 1); -} - -/** - * Encode a delta index entry. - * - * @param deltaEntry The delta index entry - * @param value The value associated with the entry - * @param name For collision entries, the 256 bit full name. - **/ -static void encodeEntry(const DeltaIndexEntry *deltaEntry, unsigned int value, - const byte *name) -{ - byte *memory = deltaEntry->deltaZone->memory; - uint64_t offset = getDeltaEntryOffset(deltaEntry); - setField(value, memory, offset, deltaEntry->valueBits); - encodeDelta(deltaEntry); - if (name != NULL) { - setBytes(memory, getCollisionOffset(deltaEntry), name, COLLISION_BYTES); - } -} - -/** - * Insert bits into a delta list at the offset of the specified delta index - * entry. - * - * @param deltaEntry The delta index entry - * @param size The number of bits to insert - * - * @return UDS_SUCCESS or an error code - **/ -static int insertBits(DeltaIndexEntry *deltaEntry, int size) -{ - DeltaMemory *deltaZone = deltaEntry->deltaZone; - DeltaList *deltaList = deltaEntry->deltaList; - // Compute how many bits are in use before and after the inserted bits - uint32_t totalSize = getDeltaListSize(deltaList); - uint32_t beforeSize = deltaEntry->offset; - uint32_t afterSize = totalSize - deltaEntry->offset; - if ((unsigned int) (totalSize + size) > UINT16_MAX) { - deltaEntry->listOverflow = true; - deltaZone->overflowCount++; - return UDS_OVERFLOW; - } - - // Compute how many bits are available before and after the delta list - uint64_t freeBefore - = getDeltaListStart(&deltaList[0]) - getDeltaListEnd(&deltaList[-1]); - uint64_t freeAfter - = getDeltaListStart(&deltaList[1]) - getDeltaListEnd(&deltaList[ 0]); - - bool beforeFlag; - if (((unsigned int) size <= freeBefore) - && ((unsigned int) size <= freeAfter)) { - // We have enough space to use either before or after the list. Prefer - // to move the least amount of data. If it is exactly the same, try to - // take from the larger amount of free space. - if (beforeSize < afterSize) { - beforeFlag = true; - } else if (afterSize < beforeSize) { - beforeFlag = false; - } else { - beforeFlag = freeBefore > freeAfter; - } - } else if ((unsigned int) size <= freeBefore) { - // There is space before but not after - beforeFlag = true; - } else if ((unsigned int) size <= freeAfter) { - // There is space after but not before - beforeFlag = false; - } else { - // Neither of the surrounding spaces is large enough for this request, - // Extend and/or rebalance the delta list memory choosing to move the - // least amount of data. 
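
Continuing the example numbers used earlier (MINBITS = 12, BASE = 1257, INCR = 2839), encodeDelta() above behaves as follows: a delta of 900 is below BASE and is written directly in 12 bits, while a delta of 7000 gives temp = 5743, t1 = (5743 % 2839) + 1257 = 65 + 1257 = 1322 and t2 = 5743 / 2839 = 2, so the encoder writes 1322 in 12 bits, two 0 bits, and a terminating 1 bit, 15 bits in all. Decoding reverses this: t1 = 1322 is at least BASE, two 0 bits are skipped before the 1, and 2 * 2839 + 1322 = 7000.
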
- unsigned int growingIndex = deltaEntry->listNumber + 1; - beforeFlag = beforeSize < afterSize; - if (!beforeFlag) { - growingIndex++; - } - int result = extendDeltaMemory(deltaZone, growingIndex, - (size + CHAR_BIT - 1) / CHAR_BIT, true); - if (result != UDS_SUCCESS) { - return result; - } - } - - uint64_t source, destination; - uint32_t count; - if (beforeFlag) { - source = getDeltaListStart(deltaList); - destination = source - size; - moveDeltaListStart(deltaList, -size); - count = beforeSize; - } else { - moveDeltaListEnd(deltaList, size); - source = getDeltaListStart(deltaList) + deltaEntry->offset; - destination = source + size; - count = afterSize; - } - byte *memory = deltaZone->memory; - moveBits(memory, source, memory, destination, count); - return UDS_SUCCESS; -} - -/** - * Get the amount of memory to allocate for each zone - * - * @param numZones The number of zones in the index - * @param memorySize The number of bytes in memory for the index - * - * @return the number of bytes to allocate for a single zone - **/ -static INLINE size_t getZoneMemorySize(unsigned int numZones, - size_t memorySize) -{ - size_t zoneSize = memorySize / numZones; - // Round the size up so that each zone is a multiple of 64K in size. - enum { ALLOC_BOUNDARY = 64 * KILOBYTE }; - return (zoneSize + ALLOC_BOUNDARY - 1) & -ALLOC_BOUNDARY; -} - -/** - * Validate delta index parameters - * - * @param meanDelta The mean delta value - * @param numPayloadBits The number of bits in the payload or value - **/ -static bool invalidParameters(unsigned int meanDelta, - unsigned int numPayloadBits) -{ - const unsigned int minDelta = 10; - const unsigned int maxDelta = 1 << MAX_FIELD_BITS; - if ((meanDelta < minDelta) || (meanDelta > maxDelta)) { - logWarning("error initializing delta index: " - "meanDelta (%u) is not in the range %u to %u", - meanDelta, minDelta, maxDelta); - return true; - } - if (numPayloadBits > MAX_FIELD_BITS) { - logWarning("error initializing delta index: Too many payload bits (%u)", - numPayloadBits); - return true; - } - return false; -} - -/** - * Set a delta index entry to be a collision - * - * @param deltaEntry The delta index entry - **/ -static void setCollision(DeltaIndexEntry *deltaEntry) -{ - deltaEntry->isCollision = true; - deltaEntry->entryBits += COLLISION_BITS; -} - -/** - * Set the delta in a delta index entry. 
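
A quick example of the rounding in getZoneMemorySize() above: a memorySize of 1,000,000 bytes split across 3 zones gives a zoneSize of 333,333; adding ALLOC_BOUNDARY - 1 and masking with -ALLOC_BOUNDARY (which clears the low 16 bits, since ALLOC_BOUNDARY is 64 * KILOBYTE = 65,536) rounds that up to 6 * 65,536 = 393,216 bytes per zone.
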
- * - * @param deltaEntry The delta index entry - * @param delta The new delta - **/ -static void setDelta(DeltaIndexEntry *deltaEntry, unsigned int delta) -{ - const DeltaMemory *deltaZone = deltaEntry->deltaZone; - deltaEntry->delta = delta; - int keyBits = (deltaZone->minBits - + ((deltaZone->incrKeys - deltaZone->minKeys + delta) - / deltaZone->incrKeys)); - deltaEntry->entryBits = deltaEntry->valueBits + keyBits; -} - -//********************************************************************** -// External functions declared in deltaIndex.h -//********************************************************************** - -int initializeDeltaIndex(DeltaIndex *deltaIndex, unsigned int numZones, - unsigned int numLists, unsigned int meanDelta, - unsigned int numPayloadBits, size_t memorySize) -{ - size_t memSize = getZoneMemorySize(numZones, memorySize); - if (invalidParameters(meanDelta, numPayloadBits)) { - return UDS_INVALID_ARGUMENT; - } - - int result = ALLOCATE(numZones, DeltaMemory, "Delta Index Zones", - &deltaIndex->deltaZones); - if (result != UDS_SUCCESS) { - return result; - } - - deltaIndex->numZones = numZones; - deltaIndex->numLists = numLists; - deltaIndex->listsPerZone = (numLists + numZones - 1) / numZones; - deltaIndex->isMutable = true; - deltaIndex->tag = 'm'; - - unsigned int z; - for (z = 0; z < numZones; z++) { - unsigned int firstListInZone = z * deltaIndex->listsPerZone; - unsigned int numListsInZone = deltaIndex->listsPerZone; - if (z == numZones - 1) { - /* - * The last zone gets fewer lists if numZones doesn't evenly divide - * numLists. We'll have an underflow if the assertion below doesn't - * hold. (And it turns out that the assertion is equivalent to - * numZones <= 1 + (numLists / numZones) + (numLists % numZones) - * in the case that numZones doesn't evenly divide numlists. - * If numLists >= numZones * numZones, then the above inequality - * will always hold.) - */ - if (deltaIndex->numLists <= firstListInZone) { - uninitializeDeltaIndex(deltaIndex); - return logErrorWithStringError(UDS_INVALID_ARGUMENT, - "%u delta-lists not enough for %u zones", - numLists, numZones); - } - numListsInZone = deltaIndex->numLists - firstListInZone; - } - int result = initializeDeltaMemory(&deltaIndex->deltaZones[z], memSize, - firstListInZone, numListsInZone, - meanDelta, numPayloadBits); - if (result != UDS_SUCCESS) { - uninitializeDeltaIndex(deltaIndex); - return result; - } - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -static bool verifyDeltaIndexPage(uint64_t nonce, - uint16_t numLists, - uint64_t expectedNonce, - byte *memory, - size_t memSize) -{ - // Verify the nonce. A mismatch here happens in normal operation when we are - // doing a rebuild but haven't written the entire volume once. - if (nonce != expectedNonce) { - return false; - } - - // Verify that the number of delta lists can fit in the page. - if (numLists > - (memSize - sizeof(DeltaPageHeader)) * CHAR_BIT / IMMUTABLE_HEADER_SIZE) { - return false; - } - - // Verify that the first delta list is immediately after the last delta list - // header. - if (getImmutableStart(memory, 0) != getImmutableHeaderOffset(numLists + 1)) { - return false; - } - - // Verify that the lists are in the correct order. - unsigned int i; - for (i = 0; i < numLists; i++) { - if (getImmutableStart(memory, i) > getImmutableStart(memory, i + 1)) { - return false; - } - } - - // Verify that the last list ends on the page, and that there is room for the - // post-field guard bits. 
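
The zone partitioning in initializeDeltaIndex() above can be seen with a small example: 1000 delta lists over 3 zones gives listsPerZone = (1000 + 2) / 3 = 334, so zones 0 and 1 each own 334 lists while the last zone owns the remaining 1000 - 668 = 332; the assertion discussed in the comment holds because 1000 is greater than 668, the first list of the last zone.
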
- if (getImmutableStart(memory, numLists) - > (memSize - POST_FIELD_GUARD_BYTES) * CHAR_BIT) { - return false; - } - - // Verify that the guard bytes are correctly set to all ones. - for (i = 0; i < POST_FIELD_GUARD_BYTES; i++) { - byte guardByte = memory[memSize - POST_FIELD_GUARD_BYTES + i]; - if (guardByte != (byte) ~0) { - return false; - } - } - - // All verifications passed. - return true; -} - -/**********************************************************************/ -int initializeDeltaIndexPage(DeltaIndexPage *deltaIndexPage, - uint64_t expectedNonce, - unsigned int meanDelta, - unsigned int numPayloadBits, - byte *memory, - size_t memSize) -{ - const DeltaPageHeader *header = (const DeltaPageHeader *) memory; - - if (invalidParameters(meanDelta, numPayloadBits)) { - return UDS_INVALID_ARGUMENT; - } - - // First assume that the header is little endian - uint64_t nonce = getUInt64LE((const byte *) &header->nonce); - uint64_t vcn = getUInt64LE((const byte *) &header->virtualChapterNumber); - uint16_t firstList = getUInt16LE((const byte *) &header->firstList); - uint16_t numLists = getUInt16LE((const byte *) &header->numLists); - if (!verifyDeltaIndexPage(nonce, numLists, expectedNonce, memory, memSize)) { - // That failed, so try big endian - nonce = getUInt64BE((const byte *) &header->nonce); - vcn = getUInt64BE((const byte *) &header->virtualChapterNumber); - firstList = getUInt16BE((const byte *) &header->firstList); - numLists = getUInt16BE((const byte *) &header->numLists); - if (!verifyDeltaIndexPage(nonce, numLists, expectedNonce, memory, - memSize)) { - // Also failed. Do not log this as an error. It happens in normal - // operation when we are doing a rebuild but haven't written the entire - // volume once. - return UDS_CORRUPT_COMPONENT; - } - } - - deltaIndexPage->deltaIndex.deltaZones = &deltaIndexPage->deltaMemory; - deltaIndexPage->deltaIndex.numZones = 1; - deltaIndexPage->deltaIndex.numLists = numLists; - deltaIndexPage->deltaIndex.listsPerZone = numLists; - deltaIndexPage->deltaIndex.isMutable = false; - deltaIndexPage->deltaIndex.tag = 'p'; - deltaIndexPage->virtualChapterNumber = vcn; - deltaIndexPage->lowestListNumber = firstList; - deltaIndexPage->highestListNumber = firstList + numLists - 1; - - initializeDeltaMemoryPage(&deltaIndexPage->deltaMemory, (byte *) memory, - memSize, numLists, meanDelta, numPayloadBits); - return UDS_SUCCESS; -} - -/**********************************************************************/ -void uninitializeDeltaIndex(DeltaIndex *deltaIndex) -{ - if (deltaIndex != NULL) { - unsigned int z; - for (z = 0; z < deltaIndex->numZones; z++) { - uninitializeDeltaMemory(&deltaIndex->deltaZones[z]); - } - FREE(deltaIndex->deltaZones); - memset(deltaIndex, 0, sizeof(DeltaIndex)); - } -} - -/**********************************************************************/ -void emptyDeltaIndex(const DeltaIndex *deltaIndex) -{ - unsigned int z; - for (z = 0; z < deltaIndex->numZones; z++) { - emptyDeltaLists(&deltaIndex->deltaZones[z]); - } -} - -/**********************************************************************/ -void emptyDeltaIndexZone(const DeltaIndex *deltaIndex, unsigned int zoneNumber) -{ - emptyDeltaLists(&deltaIndex->deltaZones[zoneNumber]); -} - -/**********************************************************************/ -int packDeltaIndexPage(const DeltaIndex *deltaIndex, - uint64_t headerNonce, - bool headerNativeEndian, - byte *memory, - size_t memSize, - uint64_t virtualChapterNumber, - unsigned int firstList, - unsigned int *numLists) -{ - if 
(!deltaIndex->isMutable) { - return logErrorWithStringError(UDS_BAD_STATE, - "Cannot pack an immutable index"); - } - if (deltaIndex->numZones != 1) { - return logErrorWithStringError(UDS_BAD_STATE, - "Cannot pack a delta index page when the" - " index has %u zones", - deltaIndex->numZones); - } - if (firstList > deltaIndex->numLists) { - return logErrorWithStringError(UDS_BAD_STATE, - "Cannot pack a delta index page when the" - " first list (%u) is larger than the number" - " of lists (%u)", - firstList, deltaIndex->numLists); - } - - const DeltaMemory *deltaZone = &deltaIndex->deltaZones[0]; - DeltaList *deltaLists = &deltaZone->deltaLists[firstList + 1]; - unsigned int maxLists = deltaIndex->numLists - firstList; - - // Compute how many lists will fit on the page - int numBits = memSize * CHAR_BIT; - // Subtract the size of the fixed header and 1 delta list offset - numBits -= getImmutableHeaderOffset(1); - // Subtract the guard bytes of memory so that allow us to freely read a - // short distance past the end of any byte we are interested in. - numBits -= POST_FIELD_GUARD_BYTES * CHAR_BIT; - if (numBits < IMMUTABLE_HEADER_SIZE) { - // This page is too small to contain even one empty delta list - return logErrorWithStringError(UDS_OVERFLOW, - "Chapter Index Page of %zu bytes is too" - " small", - memSize); - } - - unsigned int nLists = 0; - while (nLists < maxLists) { - // Each list requires 1 delta list offset and the list data - int bits = IMMUTABLE_HEADER_SIZE + getDeltaListSize(&deltaLists[nLists]); - if (bits > numBits) { - break; - } - nLists++; - numBits -= bits; - } - *numLists = nLists; - - // Construct the page header - DeltaPageHeader *header = (DeltaPageHeader *) memory; - if (headerNativeEndian) { - header->nonce = headerNonce; - header->virtualChapterNumber = virtualChapterNumber; - header->firstList = firstList; - header->numLists = nLists; - } else { - storeUInt64LE((byte *) &header->nonce, headerNonce); - storeUInt64LE((byte *) &header->virtualChapterNumber, - virtualChapterNumber); - storeUInt16LE((byte *) &header->firstList, firstList); - storeUInt16LE((byte *) &header->numLists, nLists); - } - - // Construct the delta list offset table, making sure that the memory - // page is large enough. - unsigned int offset = getImmutableHeaderOffset(nLists + 1); - setImmutableStart(memory, 0, offset); - unsigned int i; - for (i = 0; i < nLists; i++) { - offset += getDeltaListSize(&deltaLists[i]); - setImmutableStart(memory, i + 1, offset); - } - - // Copy the delta list data onto the memory page - for (i = 0; i < nLists; i++) { - DeltaList *deltaList = &deltaLists[i]; - moveBits(deltaZone->memory, getDeltaListStart(deltaList), memory, - getImmutableStart(memory, i), getDeltaListSize(deltaList)); - } - - // Set all the bits in the guard bytes. Do not use the bit field - // utilities. 
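
The space accounting in packDeltaIndexPage() above works out as follows for a 4096-byte page: the starting budget is 32,768 bits, minus getImmutableHeaderOffset(1) = 179 bits for the fixed header plus one offset entry, minus 8 bits for each of the POST_FIELD_GUARD_BYTES guard bytes; each list admitted after that consumes its own 19-bit offset entry plus its data bits, and the loop stops at the first list that no longer fits.
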
- memset(memory + memSize - POST_FIELD_GUARD_BYTES, ~0, - POST_FIELD_GUARD_BYTES); - return UDS_SUCCESS; -} - - -/**********************************************************************/ -void setDeltaIndexTag(DeltaIndex *deltaIndex, byte tag) -{ - deltaIndex->tag = tag; - unsigned int z; - for (z = 0; z < deltaIndex->numZones; z++) { - deltaIndex->deltaZones[z].tag = tag; - } -} - -/**********************************************************************/ -__attribute__((warn_unused_result)) -static int decodeDeltaIndexHeader(Buffer *buffer, struct di_header *header) -{ - int result = getBytesFromBuffer(buffer, MAGIC_SIZE, &header->magic); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt32LEFromBuffer(buffer, &header->zoneNumber); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt32LEFromBuffer(buffer, &header->numZones); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt32LEFromBuffer(buffer, &header->firstList); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt32LEFromBuffer(buffer, &header->numLists); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt64LEFromBuffer(buffer, &header->recordCount); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt64LEFromBuffer(buffer, &header->collisionCount); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, - "%zu bytes decoded of %zu expected", - bufferLength(buffer) - contentLength(buffer), - bufferLength(buffer)); - return result; -} - -/**********************************************************************/ -__attribute__((warn_unused_result)) -static int readDeltaIndexHeader(BufferedReader *reader, - struct di_header *header) -{ - Buffer *buffer; - - int result = makeBuffer(sizeof(*header), &buffer); - if (result != UDS_SUCCESS) { - return result; - } - result = readFromBufferedReader(reader, getBufferContents(buffer), - bufferLength(buffer)); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return logWarningWithStringError(result, - "failed to read delta index header"); - } - result = resetBufferEnd(buffer, bufferLength(buffer)); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return result; - } - result = decodeDeltaIndexHeader(buffer, header); - freeBuffer(&buffer); - return result; -} - -/**********************************************************************/ -int startRestoringDeltaIndex(const DeltaIndex *deltaIndex, - BufferedReader **bufferedReaders, - int numReaders) -{ - if (!deltaIndex->isMutable) { - return logErrorWithStringError(UDS_BAD_STATE, - "Cannot restore to an immutable index"); - } - if (numReaders <= 0) { - return logWarningWithStringError(UDS_INVALID_ARGUMENT, - "No delta index files"); - } - - unsigned int numZones = numReaders; - if (numZones > MAX_ZONES) { - return logErrorWithStringError(UDS_INVALID_ARGUMENT, - "zone count %u must not exceed MAX_ZONES", - numZones); - } - - unsigned long recordCount = 0; - unsigned long collisionCount = 0; - unsigned int firstList[MAX_ZONES]; - unsigned int numLists[MAX_ZONES]; - BufferedReader *reader[MAX_ZONES]; - bool zoneFlags[MAX_ZONES] = { false, }; - - // Read the header from each file, and make sure we have a matching set - unsigned int z; - for (z = 0; z < numZones; z++) { - struct di_header header; - int result = readDeltaIndexHeader(bufferedReaders[z], &header); - if (result != UDS_SUCCESS) { - return logWarningWithStringError(result, - "failed to read delta index header"); - } - if 
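
From decodeDeltaIndexHeader() above, the saved per-zone header is exactly 40 bytes: the 8-byte magic "DI-00002", then zoneNumber, numZones, firstList and numLists as little-endian 32-bit values, then recordCount and collisionCount as little-endian 64-bit values (8 + 16 + 16 = 40).
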
(memcmp(header.magic, MAGIC_DI_START, MAGIC_SIZE) != 0) { - return logWarningWithStringError(UDS_CORRUPT_COMPONENT, - "delta index file has bad magic" - " number"); - } - if (numZones != header.numZones) { - return logWarningWithStringError(UDS_CORRUPT_COMPONENT, - "delta index files contain mismatched" - " zone counts (%u,%u)", - numZones, header.numZones); - } - if (header.zoneNumber >= numZones) { - return logWarningWithStringError(UDS_CORRUPT_COMPONENT, - "delta index files contains zone %u of" - " %u zones", - header.zoneNumber, numZones); - } - if (zoneFlags[header.zoneNumber]) { - return logWarningWithStringError(UDS_CORRUPT_COMPONENT, - "delta index files contain two of zone" - " %u", - header.zoneNumber); - } - reader[header.zoneNumber] = bufferedReaders[z]; - firstList[header.zoneNumber] = header.firstList; - numLists[header.zoneNumber] = header.numLists; - zoneFlags[header.zoneNumber] = true; - recordCount += header.recordCount; - collisionCount += header.collisionCount; - } - unsigned int listNext = 0; - for (z = 0; z < numZones; z++) { - if (firstList[z] != listNext) { - return logWarningWithStringError(UDS_CORRUPT_COMPONENT, - "delta index file for zone %u starts" - " with list %u instead of list %u", - z, firstList[z], listNext); - } - listNext += numLists[z]; - } - if (listNext != deltaIndex->numLists) { - return logWarningWithStringError(UDS_CORRUPT_COMPONENT, - "delta index files contain %u delta lists" - " instead of %u delta lists", - listNext, deltaIndex->numLists); - } - if (collisionCount > recordCount) { - return logWarningWithStringError(UDS_CORRUPT_COMPONENT, - "delta index files contain %ld collisions" - " and %ld records", - collisionCount, recordCount); - } - - emptyDeltaIndex(deltaIndex); - deltaIndex->deltaZones[0].recordCount = recordCount; - deltaIndex->deltaZones[0].collisionCount = collisionCount; - - // Read the delta list sizes from the files, and distribute each of them - // to proper zone - for (z = 0; z < numZones; z++) { - unsigned int i; - for (i = 0; i < numLists[z]; i++) { - byte deltaListSizeData[sizeof(uint16_t)]; - int result = readFromBufferedReader(reader[z], deltaListSizeData, - sizeof(deltaListSizeData)); - if (result != UDS_SUCCESS) { - return logWarningWithStringError(result, - "failed to read delta index size"); - } - uint16_t deltaListSize = getUInt16LE(deltaListSizeData); - unsigned int listNumber = firstList[z] + i; - unsigned int zoneNumber = getDeltaIndexZone(deltaIndex, listNumber); - const DeltaMemory *deltaZone = &deltaIndex->deltaZones[zoneNumber]; - listNumber -= deltaZone->firstList; - deltaZone->deltaLists[listNumber + 1].size = deltaListSize; - } - } - - // Prepare each zone to start receiving the delta list data - for (z = 0; z < deltaIndex->numZones; z++) { - int result = startRestoringDeltaMemory(&deltaIndex->deltaZones[z]); - if (result != UDS_SUCCESS) { - return result; - } - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -bool isRestoringDeltaIndexDone(const DeltaIndex *deltaIndex) -{ - unsigned int z; - for (z = 0; z < deltaIndex->numZones; z++) { - if (!areDeltaMemoryTransfersDone(&deltaIndex->deltaZones[z])) { - return false; - } - } - return true; -} - -/**********************************************************************/ -int restoreDeltaListToDeltaIndex(const DeltaIndex *deltaIndex, - const DeltaListSaveInfo *dlsi, - const byte data[DELTA_LIST_MAX_BYTE_COUNT]) -{ - // Make sure the data are intended for this delta list. 
Do not - // log an error, as this may be valid data for another delta index. - if (dlsi->tag != deltaIndex->tag) { - return UDS_CORRUPT_COMPONENT; - } - - if (dlsi->index >= deltaIndex->numLists) { - return logWarningWithStringError(UDS_CORRUPT_COMPONENT, - "invalid delta list number %u of %u", - dlsi->index, deltaIndex->numLists); - } - - unsigned int zoneNumber = getDeltaIndexZone(deltaIndex, dlsi->index); - return restoreDeltaList(&deltaIndex->deltaZones[zoneNumber], dlsi, data); -} - -/**********************************************************************/ -void abortRestoringDeltaIndex(const DeltaIndex *deltaIndex) -{ - unsigned int z; - for (z = 0; z < deltaIndex->numZones; z++) { - abortRestoringDeltaMemory(&deltaIndex->deltaZones[z]); - } -} - -/**********************************************************************/ -__attribute__((warn_unused_result)) -static int encodeDeltaIndexHeader(Buffer *buffer, struct di_header *header) -{ - int result = putBytes(buffer, MAGIC_SIZE, MAGIC_DI_START); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt32LEIntoBuffer(buffer, header->zoneNumber); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt32LEIntoBuffer(buffer, header->numZones); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt32LEIntoBuffer(buffer, header->firstList); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt32LEIntoBuffer(buffer, header->numLists); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt64LEIntoBuffer(buffer, header->recordCount); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt64LEIntoBuffer(buffer, header->collisionCount); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_LOG_ONLY(contentLength(buffer) == sizeof(*header), - "%zu bytes encoded of %zu expected", - contentLength(buffer), sizeof(*header)); - - return result; -} - -/**********************************************************************/ -int startSavingDeltaIndex(const DeltaIndex *deltaIndex, - unsigned int zoneNumber, - BufferedWriter *bufferedWriter) -{ - DeltaMemory *deltaZone = &deltaIndex->deltaZones[zoneNumber]; - struct di_header header; - memcpy(header.magic, MAGIC_DI_START, MAGIC_SIZE); - header.zoneNumber = zoneNumber; - header.numZones = deltaIndex->numZones; - header.firstList = deltaZone->firstList; - header.numLists = deltaZone->numLists; - header.recordCount = deltaZone->recordCount; - header.collisionCount = deltaZone->collisionCount; - - Buffer *buffer; - int result = makeBuffer(sizeof(struct di_header), &buffer); - if (result != UDS_SUCCESS) { - return result; - } - result = encodeDeltaIndexHeader(buffer, &header); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return result; - } - result = writeToBufferedWriter(bufferedWriter, getBufferContents(buffer), - contentLength(buffer)); - freeBuffer(&buffer); - if (result != UDS_SUCCESS) { - return logWarningWithStringError(result, - "failed to write delta index header"); - } - - unsigned int i; - for (i = 0; i < deltaZone->numLists; i++) { - uint16_t deltaListSize = getDeltaListSize(&deltaZone->deltaLists[i + 1]); - byte data[2]; - storeUInt16LE(data, deltaListSize); - result = writeToBufferedWriter(bufferedWriter, data, sizeof(data)); - if (result != UDS_SUCCESS) { - return logWarningWithStringError(result, - "failed to write delta list size"); - } - } - - startSavingDeltaMemory(deltaZone, bufferedWriter); - return UDS_SUCCESS; -} - 
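
Putting startSavingDeltaIndex() above together, each zone's save stream is therefore the 40-byte header described earlier, followed by one little-endian 16-bit size per delta list owned by the zone, followed by the delta list data itself, which startSavingDeltaMemory() transfers separately; the per-list and per-memory terms in computeDeltaIndexSaveBytes() below account for exactly these pieces.
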
-/**********************************************************************/ -bool isSavingDeltaIndexDone(const DeltaIndex *deltaIndex, - unsigned int zoneNumber) -{ - return areDeltaMemoryTransfersDone(&deltaIndex->deltaZones[zoneNumber]); -} - -/**********************************************************************/ -int finishSavingDeltaIndex(const DeltaIndex *deltaIndex, - unsigned int zoneNumber) -{ - return finishSavingDeltaMemory(&deltaIndex->deltaZones[zoneNumber]); -} - -/**********************************************************************/ -int abortSavingDeltaIndex(const DeltaIndex *deltaIndex, - unsigned int zoneNumber) -{ - abortSavingDeltaMemory(&deltaIndex->deltaZones[zoneNumber]); - return UDS_SUCCESS; -} - -/**********************************************************************/ -size_t computeDeltaIndexSaveBytes(unsigned int numLists, size_t memorySize) -{ - // The exact amount of memory used depends upon the number of zones. - // Compute the maximum potential memory size. - size_t maxMemSize = memorySize; - unsigned int numZones; - for (numZones = 1; numZones <= MAX_ZONES; numZones++) { - size_t memSize = getZoneMemorySize(numZones, memorySize); - if (memSize > maxMemSize) { - maxMemSize = memSize; - } - } - // Saving a delta index requires a header ... - return (sizeof(struct di_header) - // ... plus a DeltaListSaveInfo per delta list - // plus an extra byte per delta list ... - + numLists * (sizeof(DeltaListSaveInfo) + 1) - // ... plus the delta list memory - + maxMemSize); -} - -/**********************************************************************/ -int validateDeltaIndex(const DeltaIndex *deltaIndex) -{ - unsigned int z; - for (z = 0; z < deltaIndex->numZones; z++) { - int result = validateDeltaLists(&deltaIndex->deltaZones[z]); - if (result != UDS_SUCCESS) { - return result; - } - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int assertNotAtEnd(const DeltaIndexEntry *deltaEntry, int errorCode) -{ - return ASSERT_WITH_ERROR_CODE(!deltaEntry->atEnd, errorCode, - "operation is invalid because the list entry " - "is at the end of the delta list"); -} - -/**********************************************************************/ -static void prefetchDeltaList(const DeltaMemory *deltaZone, - const DeltaList *deltaList) -{ - const byte *memory = deltaZone->memory; - const byte *addr = &memory[getDeltaListStart(deltaList) / CHAR_BIT]; - unsigned int size = getDeltaListSize(deltaList) / CHAR_BIT; - prefetchRange(addr, size, false); -} - -/**********************************************************************/ -int startDeltaIndexSearch(const DeltaIndex *deltaIndex, - unsigned int listNumber, unsigned int key, - bool readOnly, DeltaIndexEntry *deltaEntry) -{ - int result - = ASSERT_WITH_ERROR_CODE((listNumber < deltaIndex->numLists), - UDS_CORRUPT_DATA, - "Delta list number (%u) is out of range (%u)", - listNumber, deltaIndex->numLists); - if (result != UDS_SUCCESS) { - return result; - } - - unsigned int zoneNumber = getDeltaIndexZone(deltaIndex, listNumber); - DeltaMemory *deltaZone = &deltaIndex->deltaZones[zoneNumber]; - listNumber -= deltaZone->firstList; - result = ASSERT_WITH_ERROR_CODE((listNumber < deltaZone->numLists), - UDS_CORRUPT_DATA, - "Delta list number (%u)" - " is out of range (%u) for zone (%u)", - listNumber, deltaZone->numLists, zoneNumber); - if (result != UDS_SUCCESS) { - return result; - } - - DeltaList *deltaList; - if (deltaIndex->isMutable) { - deltaList = 
&deltaZone->deltaLists[listNumber + 1]; - if (!readOnly) { - // Here is the lazy writing of the index for a checkpoint - lazyFlushDeltaList(deltaZone, listNumber); - } - } else { - // Translate the immutable delta list header into a temporary full - // delta list header - deltaList = &deltaEntry->tempDeltaList; - deltaList->startOffset = getImmutableStart(deltaZone->memory, listNumber); - unsigned int endOffset = getImmutableStart(deltaZone->memory, - listNumber + 1); - deltaList->size = endOffset - deltaList->startOffset; - deltaList->saveKey = 0; - deltaList->saveOffset = 0; - } - - if (key > deltaList->saveKey) { - deltaEntry->key = deltaList->saveKey; - deltaEntry->offset = deltaList->saveOffset; - } else { - deltaEntry->key = 0; - deltaEntry->offset = 0; - if (key == 0) { - // This usually means we're about to walk the entire delta list, so get - // all of it into the CPU cache. - prefetchDeltaList(deltaZone, deltaList); - } - } - - deltaEntry->atEnd = false; - deltaEntry->deltaZone = deltaZone; - deltaEntry->deltaList = deltaList; - deltaEntry->entryBits = 0; - deltaEntry->isCollision = false; - deltaEntry->listNumber = listNumber; - deltaEntry->listOverflow = false; - deltaEntry->valueBits = deltaZone->valueBits; - return UDS_SUCCESS; -} - -/**********************************************************************/ -__attribute__((__noinline__)) -int nextDeltaIndexEntry(DeltaIndexEntry *deltaEntry) -{ - int result = assertNotAtEnd(deltaEntry, UDS_BAD_STATE); - if (result != UDS_SUCCESS) { - return result; - } - - const DeltaList *deltaList = deltaEntry->deltaList; - deltaEntry->offset += deltaEntry->entryBits; - unsigned int size = getDeltaListSize(deltaList); - if (unlikely(deltaEntry->offset >= size)) { - deltaEntry->atEnd = true; - deltaEntry->delta = 0; - deltaEntry->isCollision = false; - return ASSERT_WITH_ERROR_CODE((deltaEntry->offset == size), - UDS_CORRUPT_DATA, - "next offset past end of delta list"); - } - - decodeDelta(deltaEntry); - - unsigned int nextOffset = deltaEntry->offset + deltaEntry->entryBits; - if (nextOffset > size) { - // This is not an assertion because validateChapterIndexPage() wants to - // handle this error. 
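    // A chapter index page read from the volume can be corrupt, so this case
    // is reported as UDS_CORRUPT_DATA for the caller to handle rather than
    // treated as a programming error.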
- logWarning("Decoded past the end of the delta list"); - return UDS_CORRUPT_DATA; - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int rememberDeltaIndexOffset(const DeltaIndexEntry *deltaEntry) -{ - int result = ASSERT(!deltaEntry->isCollision, "entry is not a collision"); - if (result != UDS_SUCCESS) { - return result; - } - - DeltaList *deltaList = deltaEntry->deltaList; - deltaList->saveKey = deltaEntry->key - deltaEntry->delta; - deltaList->saveOffset = deltaEntry->offset; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int getDeltaIndexEntry(const DeltaIndex *deltaIndex, unsigned int listNumber, - unsigned int key, const byte *name, bool readOnly, - DeltaIndexEntry *deltaEntry) -{ - int result = startDeltaIndexSearch(deltaIndex, listNumber, key, readOnly, - deltaEntry); - if (result != UDS_SUCCESS) { - return result; - } - do { - result = nextDeltaIndexEntry(deltaEntry); - if (result != UDS_SUCCESS) { - return result; - } - } while (!deltaEntry->atEnd && (key > deltaEntry->key)); - - result = rememberDeltaIndexOffset(deltaEntry); - if (result != UDS_SUCCESS) { - return result; - } - - if (!deltaEntry->atEnd && (key == deltaEntry->key)) { - DeltaIndexEntry collisionEntry; - collisionEntry = *deltaEntry; - for (;;) { - result = nextDeltaIndexEntry(&collisionEntry); - if (result != UDS_SUCCESS) { - return result; - } - if (collisionEntry.atEnd || !collisionEntry.isCollision) { - break; - } - byte collisionName[COLLISION_BYTES]; - getBytes(deltaEntry->deltaZone->memory, - getCollisionOffset(&collisionEntry), collisionName, - COLLISION_BYTES); - if (memcmp(collisionName, name, COLLISION_BYTES) == 0) { - *deltaEntry = collisionEntry; - break; - } - } - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int getDeltaEntryCollision(const DeltaIndexEntry *deltaEntry, byte *name) -{ - int result = assertNotAtEnd(deltaEntry, UDS_BAD_STATE); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_WITH_ERROR_CODE(deltaEntry->isCollision, UDS_BAD_STATE, - "Cannot get full block name from a" - " non-collision delta index entry"); - if (result != UDS_SUCCESS) { - return result; - } - - getBytes(deltaEntry->deltaZone->memory, getCollisionOffset(deltaEntry), - name, COLLISION_BYTES); - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int assertMutableEntry(const DeltaIndexEntry *deltaEntry) -{ - return ASSERT_WITH_ERROR_CODE(deltaEntry->deltaList - != &deltaEntry->tempDeltaList, - UDS_BAD_STATE, - "delta index is mutable"); -} - -/**********************************************************************/ -int setDeltaEntryValue(const DeltaIndexEntry *deltaEntry, unsigned int value) -{ - int result = assertMutableEntry(deltaEntry); - if (result != UDS_SUCCESS) { - return result; - } - result = assertNotAtEnd(deltaEntry, UDS_BAD_STATE); - if (result != UDS_SUCCESS) { - return result; - } - - result = ASSERT_WITH_ERROR_CODE(((value & ((1 << deltaEntry->valueBits) - 1)) - == value), UDS_INVALID_ARGUMENT, - "Value (%u) being set in a delta index is " - "too large (must fit in %u bits)", - value, deltaEntry->valueBits); - if (result != UDS_SUCCESS) { - return result; - } - - setField(value, deltaEntry->deltaZone->memory, - getDeltaEntryOffset(deltaEntry), deltaEntry->valueBits); - return UDS_SUCCESS; -} - -/**********************************************************************/ 
-int putDeltaIndexEntry(DeltaIndexEntry *deltaEntry, unsigned int key, - unsigned int value, const byte *name) -{ - int result = assertMutableEntry(deltaEntry); - if (result != UDS_SUCCESS) { - return result; - } - if (deltaEntry->isCollision) { - /* - * The caller wants us to insert a collision entry onto a collision - * entry. This happens when we find a collision and attempt to add the - * name again to the index. This is normally a fatal error unless we - * are replaying a closed chapter while we are rebuilding a master - * index. - */ - return UDS_DUPLICATE_NAME; - } - - if (deltaEntry->offset < deltaEntry->deltaList->saveOffset) { - // The saved entry offset is after the new entry and will no longer be - // valid, so replace it with the insertion point. - result = rememberDeltaIndexOffset(deltaEntry); - if (result != UDS_SUCCESS) { - return result; - } - } - - if (name != NULL) { - // We are inserting a collision entry which is placed after this entry - result = assertNotAtEnd(deltaEntry, UDS_BAD_STATE); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT((key == deltaEntry->key), - "incorrect key for collision entry"); - if (result != UDS_SUCCESS) { - return result; - } - - deltaEntry->offset += deltaEntry->entryBits; - setDelta(deltaEntry, 0); - setCollision(deltaEntry); - result = insertBits(deltaEntry, deltaEntry->entryBits); - } else if (deltaEntry->atEnd) { - // We are inserting a new entry at the end of the delta list - result = ASSERT((key >= deltaEntry->key), "key past end of list"); - if (result != UDS_SUCCESS) { - return result; - } - - setDelta(deltaEntry, key - deltaEntry->key); - deltaEntry->key = key; - deltaEntry->atEnd = false; - result = insertBits(deltaEntry, deltaEntry->entryBits); - } else { - // We are inserting a new entry which requires the delta in the - // following entry to be updated. - result = ASSERT((key < deltaEntry->key), "key precedes following entry"); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT((key >= deltaEntry->key - deltaEntry->delta), - "key effects following entry's delta"); - if (result != UDS_SUCCESS) { - return result; - } - - int oldEntrySize = deltaEntry->entryBits; - DeltaIndexEntry nextEntry = *deltaEntry; - unsigned int nextValue = getDeltaEntryValue(&nextEntry); - setDelta(deltaEntry, key - (deltaEntry->key - deltaEntry->delta)); - deltaEntry->key = key; - setDelta(&nextEntry, nextEntry.key - key); - nextEntry.offset += deltaEntry->entryBits; - // The 2 new entries are always bigger than the 1 entry we are replacing - int additionalSize - = deltaEntry->entryBits + nextEntry.entryBits - oldEntrySize; - result = insertBits(deltaEntry, additionalSize); - if (result != UDS_SUCCESS) { - return result; - } - encodeEntry(&nextEntry, nextValue, NULL); - } - if (result != UDS_SUCCESS) { - return result; - } - encodeEntry(deltaEntry, value, name); - - DeltaMemory *deltaZone = deltaEntry->deltaZone; - deltaZone->recordCount++; - deltaZone->collisionCount += deltaEntry->isCollision ? 
1 : 0; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int removeDeltaIndexEntry(DeltaIndexEntry *deltaEntry) -{ - int result = assertMutableEntry(deltaEntry); - if (result != UDS_SUCCESS) { - return result; - } - - DeltaIndexEntry nextEntry = *deltaEntry; - result = nextDeltaIndexEntry(&nextEntry); - if (result != UDS_SUCCESS) { - return result; - } - - DeltaMemory *deltaZone = deltaEntry->deltaZone; - - if (deltaEntry->isCollision) { - // This is a collision entry, so just remove it - deleteBits(deltaEntry, deltaEntry->entryBits); - nextEntry.offset = deltaEntry->offset; - deltaZone->collisionCount -= 1; - } else if (nextEntry.atEnd) { - // This entry is at the end of the list, so just remove it - deleteBits(deltaEntry, deltaEntry->entryBits); - nextEntry.key -= deltaEntry->delta; - nextEntry.offset = deltaEntry->offset; - } else { - // The delta in the next entry needs to be updated. - unsigned int nextValue = getDeltaEntryValue(&nextEntry); - int oldSize = deltaEntry->entryBits + nextEntry.entryBits; - if (nextEntry.isCollision) { - // The next record is a collision. It needs to be rewritten as a - // non-collision with a larger delta. - nextEntry.isCollision = false; - deltaZone->collisionCount -= 1; - } - setDelta(&nextEntry, deltaEntry->delta + nextEntry.delta); - nextEntry.offset = deltaEntry->offset; - // The 1 new entry is always smaller than the 2 entries we are replacing - deleteBits(deltaEntry, oldSize - nextEntry.entryBits); - encodeEntry(&nextEntry, nextValue, NULL); - } - deltaZone->recordCount--; - deltaZone->discardCount++; - *deltaEntry = nextEntry; - - DeltaList *deltaList = deltaEntry->deltaList; - if (deltaEntry->offset < deltaList->saveOffset) { - // The saved entry offset is after the entry we just removed and it - // will no longer be valid. We must force the next search to start at - // the beginning. 
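    // Zeroing saveKey and saveOffset makes startDeltaIndexSearch() scan from
    // the head of the list instead of resuming from the stale saved position.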
- deltaList->saveKey = 0; - deltaList->saveOffset = 0; - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -unsigned int getDeltaIndexZoneFirstList(const DeltaIndex *deltaIndex, - unsigned int zoneNumber) -{ - return deltaIndex->deltaZones[zoneNumber].firstList; -} - -/**********************************************************************/ -unsigned int getDeltaIndexZoneNumLists(const DeltaIndex *deltaIndex, - unsigned int zoneNumber) -{ - return deltaIndex->deltaZones[zoneNumber].numLists; -} - -/**********************************************************************/ -uint64_t getDeltaIndexZoneDlistBitsUsed(const DeltaIndex *deltaIndex, - unsigned int zoneNumber) -{ - uint64_t bitCount = 0; - const DeltaMemory *deltaZone = &deltaIndex->deltaZones[zoneNumber]; - unsigned int i; - for (i = 0; i < deltaZone->numLists; i++) { - bitCount += getDeltaListSize(&deltaZone->deltaLists[i + 1]); - } - return bitCount; -} - -/**********************************************************************/ -uint64_t getDeltaIndexDlistBitsUsed(const DeltaIndex *deltaIndex) -{ - uint64_t bitCount = 0; - unsigned int z; - for (z = 0; z < deltaIndex->numZones; z++) { - bitCount += getDeltaIndexZoneDlistBitsUsed(deltaIndex, z); - } - return bitCount; -} - -/**********************************************************************/ -uint64_t getDeltaIndexDlistBitsAllocated(const DeltaIndex *deltaIndex) -{ - uint64_t byteCount = 0; - unsigned int z; - for (z = 0; z < deltaIndex->numZones; z++) { - const DeltaMemory *deltaZone = &deltaIndex->deltaZones[z]; - byteCount += deltaZone->size; - } - return byteCount * CHAR_BIT; -} - -/**********************************************************************/ -void getDeltaIndexStats(const DeltaIndex *deltaIndex, DeltaIndexStats *stats) -{ - memset(stats, 0, sizeof(DeltaIndexStats)); - stats->memoryAllocated = deltaIndex->numZones * sizeof(DeltaMemory); - unsigned int z; - for (z = 0; z < deltaIndex->numZones; z++) { - const DeltaMemory *deltaZone = &deltaIndex->deltaZones[z]; - stats->memoryAllocated += getDeltaMemoryAllocated(deltaZone); - stats->rebalanceTime += deltaZone->rebalanceTime; - stats->rebalanceCount += deltaZone->rebalanceCount; - stats->recordCount += deltaZone->recordCount; - stats->collisionCount += deltaZone->collisionCount; - stats->discardCount += deltaZone->discardCount; - stats->overflowCount += deltaZone->overflowCount; - stats->numLists += deltaZone->numLists; - } -} - -/**********************************************************************/ -unsigned int getDeltaIndexPageCount(unsigned int numEntries, - unsigned int numLists, - unsigned int meanDelta, - unsigned int numPayloadBits, - size_t bytesPerPage) -{ - // Compute the number of bits needed for all the entries - size_t bitsPerIndex - = getDeltaMemorySize(numEntries, meanDelta, numPayloadBits); - // Compute the number of bits needed for a single delta list - unsigned int bitsPerDeltaList = bitsPerIndex / numLists; - // Adjust the bits per index, adding the immutable delta list headers - bitsPerIndex += numLists * IMMUTABLE_HEADER_SIZE; - // Compute the number of usable bits on an immutable index page - unsigned int bitsPerPage - = (bytesPerPage - sizeof(DeltaPageHeader)) * CHAR_BIT; - // Adjust the bits per page, taking away one immutable delta list header - // and one delta list representing internal fragmentation - bitsPerPage -= IMMUTABLE_HEADER_SIZE + bitsPerDeltaList; - // Now compute the number of pages needed - return (bitsPerIndex + 
bitsPerPage - 1) / bitsPerPage; -} - -/**********************************************************************/ -void logDeltaIndexEntry(DeltaIndexEntry *deltaEntry) -{ - logRatelimit(logInfo, "List 0x%X Key 0x%X Offset 0x%X%s%s ListSize 0x%X%s", - deltaEntry->listNumber, deltaEntry->key, deltaEntry->offset, - deltaEntry->atEnd ? " end" : "", - deltaEntry->isCollision ? " collision" : "", - getDeltaListSize(deltaEntry->deltaList), - deltaEntry->listOverflow ? " overflow" : ""); - deltaEntry->listOverflow = false; -} diff --git a/uds/deltaIndex.h b/uds/deltaIndex.h deleted file mode 100644 index af2d762..0000000 --- a/uds/deltaIndex.h +++ /dev/null @@ -1,595 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/deltaIndex.h#4 $ - */ - -#ifndef DELTAINDEX_H -#define DELTAINDEX_H 1 - -#include "compiler.h" -#include "deltaMemory.h" - -enum { - // the number of extra bytes and bits needed to store a collision entry - COLLISION_BYTES = UDS_CHUNK_NAME_SIZE, - COLLISION_BITS = COLLISION_BYTES * CHAR_BIT -}; - -typedef struct deltaIndex { - DeltaMemory *deltaZones; // The zones - unsigned int numZones; // The number of zones - unsigned int numLists; // The number of delta lists - unsigned int listsPerZone; // Lists per zone (last zone can be smaller) - bool isMutable; // True if this index is mutable - byte tag; // Tag belonging to this delta index -} DeltaIndex; - -/* - * A DeltaIndexPage describes a single page of a chapter index. The deltaIndex - * field allows the page to be treated as an immutable DeltaIndex. We use the - * deltaMemory field to treat the chapter index page as a single zone index, - * and without the need to do an additional memory allocation. - */ - -typedef struct deltaIndexPage { - DeltaIndex deltaIndex; - // These values are loaded from the DeltaPageHeader - unsigned int lowestListNumber; - unsigned int highestListNumber; - uint64_t virtualChapterNumber; - // This structure describes the single zone of a delta index page. - DeltaMemory deltaMemory; -} DeltaIndexPage; - -/* - * Notes on the DeltaIndexEntries: - * - * The fields documented as "public" can be read by any code that uses a - * DeltaIndex. The fields documented as "private" carry information - * between DeltaIndex method calls and should not be used outside the - * DeltaIndex module. - * - * (1) The DeltaIndexEntry is used like an iterator when searching a delta - * list. - * - * (2) And it is also the result of a successful search and can be used to - * refer to the element found by the search. - * - * (3) And it is also the result of an unsuccessful search and can be used - * to refer to the insertion point for a new record. 
- * - * (4) If atEnd==true, the DeltaListEntry can only be used as the insertion - * point for a new record at the end of the list. - * - * (5) If atEnd==false and isCollision==true, the DeltaListEntry fields - * refer to a collision entry in the list, and the DeltaListEntry can - * be used a a reference to this entry. - * - * (6) If atEnd==false and isCollision==false, the DeltaListEntry fields - * refer to a non-collision entry in the list. Such DeltaListEntries - * can be used as a reference to a found entry, or an insertion point - * for a non-collision entry before this entry, or an insertion point - * for a collision entry that collides with this entry. - */ - -typedef struct deltaIndexEntry { - // Public fields - unsigned int key; // The key for this entry - bool atEnd; // We are after the last entry in the list - bool isCollision; // This record is a collision - // Private fields (but DeltaIndex_t1 cheats and looks at them) - bool listOverflow; // This delta list overflowed - unsigned short valueBits; // The number of bits used for the value - unsigned short entryBits; // The number of bits used for the entire entry - DeltaMemory *deltaZone; // The delta index zone - DeltaList *deltaList; // The delta list containing the entry, - unsigned int listNumber; // The delta list number - uint32_t offset; // Bit offset of this entry within the list - unsigned int delta; // The delta between this and previous entry - DeltaList tempDeltaList; // Temporary delta list for immutable indices -} DeltaIndexEntry; - -typedef struct { - size_t memoryAllocated; // Number of bytes allocated - RelTime rebalanceTime; // The time spent rebalancing - int rebalanceCount; // Number of memory rebalances - long recordCount; // The number of records in the index - long collisionCount; // The number of collision records - long discardCount; // The number of records removed - long overflowCount; // The number of UDS_OVERFLOWs detected - unsigned int numLists; // The number of delta lists -} DeltaIndexStats; - -/** - * Initialize a delta index. - * - * @param deltaIndex The delta index to initialize - * @param numZones The number of zones in the index - * @param numLists The number of delta lists in the index - * @param meanDelta The mean delta value - * @param numPayloadBits The number of bits in the payload or value - * @param memorySize The number of bytes in memory for the index - * - * @return error code or UDS_SUCCESS - **/ -int initializeDeltaIndex(DeltaIndex *deltaIndex, unsigned int numZones, - unsigned int numLists, unsigned int meanDelta, - unsigned int numPayloadBits, size_t memorySize) - __attribute__((warn_unused_result)); - -/** - * Initialize an immutable delta index page. - * - * @param deltaIndexPage The delta index page to initialize - * @param expectedNonce If non-zero, the expected nonce. - * @param meanDelta The mean delta value - * @param numPayloadBits The number of bits in the payload or value - * @param memory The memory page - * @param memSize The size of the memory page - * - * @return error code or UDS_SUCCESS - **/ -int initializeDeltaIndexPage(DeltaIndexPage *deltaIndexPage, - uint64_t expectedNonce, - unsigned int meanDelta, - unsigned int numPayloadBits, - byte *memory, - size_t memSize) - __attribute__((warn_unused_result)); - -/** - * Uninitialize a delta index. - * - * @param deltaIndex The delta index to uninitialize - **/ -void uninitializeDeltaIndex(DeltaIndex *deltaIndex); - -/** - * Empty the delta index. - * - * @param deltaIndex The delta index being emptied. 
- **/ -void emptyDeltaIndex(const DeltaIndex *deltaIndex); - -/** - * Empty a zone of the delta index. - * - * @param deltaIndex The delta index - * @param zoneNumber The zone being emptied - **/ -void emptyDeltaIndexZone(const DeltaIndex *deltaIndex, - unsigned int zoneNumber); - -/** - * Pack delta lists from a mutable delta index into an immutable delta index - * page. A range of delta lists (starting with a specified list index) is - * copied from the mutable delta index into a memory page used in the immutable - * index. The number of lists copied onto the page is returned to the caller. - * - * @param deltaIndex The delta index being converted - * @param headerNonce The header nonce to store - * @param headerNativeEndian If true, write native endian header - * @param memory The memory page to use - * @param memSize The size of the memory page - * @param virtualChapterNumber The virtual chapter number - * @param firstList The first delta list number to be copied - * @param numLists The number of delta lists that were copied - * - * @return error code or UDS_SUCCESS. On UDS_SUCCESS, the numLists - * argument contains the number of lists copied. - **/ -int packDeltaIndexPage(const DeltaIndex *deltaIndex, - uint64_t headerNonce, - bool headerNativeEndian, - byte *memory, - size_t memSize, - uint64_t virtualChapterNumber, - unsigned int firstList, - unsigned int *numLists) - __attribute__((warn_unused_result)); - - -/** - * Set the tag value used when saving and/or restoring a delta index. - * - * @param deltaIndex The delta index - * @param tag The tag value - **/ -void setDeltaIndexTag(DeltaIndex *deltaIndex, byte tag); - -/** - * Start restoring a delta index from an input stream. - * - * @param deltaIndex The delta index to read into - * @param bufferedReaders The buffered readers to read the delta index from - * @param numReaders The number of buffered readers - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -int startRestoringDeltaIndex(const DeltaIndex *deltaIndex, - BufferedReader **bufferedReaders, int numReaders) - __attribute__((warn_unused_result)); - -/** - * Have all the data been read while restoring a delta index from an - * input stream? - * - * @param deltaIndex The delta index - * - * @return true if all the data are read - **/ -bool isRestoringDeltaIndexDone(const DeltaIndex *deltaIndex); - -/** - * Restore a saved delta list - * - * @param deltaIndex The delta index - * @param dlsi The DeltaListSaveInfo describing the delta list - * @param data The saved delta list bit stream - * - * @return error code or UDS_SUCCESS - **/ -int restoreDeltaListToDeltaIndex(const DeltaIndex *deltaIndex, - const DeltaListSaveInfo *dlsi, - const byte data[DELTA_LIST_MAX_BYTE_COUNT]) - __attribute__((warn_unused_result)); - -/** - * Abort restoring a delta index from an input stream. - * - * @param deltaIndex The delta index - **/ -void abortRestoringDeltaIndex(const DeltaIndex *deltaIndex); - -/** - * Start saving a delta index zone to a buffered output stream. - * - * @param deltaIndex The delta index - * @param zoneNumber The zone number - * @param bufferedWriter The index state component being written - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -int startSavingDeltaIndex(const DeltaIndex *deltaIndex, - unsigned int zoneNumber, - BufferedWriter *bufferedWriter) - __attribute__((warn_unused_result)); - -/** - * Have all the data been written while saving a delta index zone to an - * output stream? 
If the answer is yes, it is still necessary to call - * finishSavingDeltaIndex(), which will return quickly. - * - * @param deltaIndex The delta index - * @param zoneNumber The zone number - * - * @return true if all the data are written - **/ -bool isSavingDeltaIndexDone(const DeltaIndex *deltaIndex, - unsigned int zoneNumber); - -/** - * Finish saving a delta index zone to an output stream. Force the writing - * of all of the remaining data. If an error occurred asynchronously - * during the save operation, it will be returned here. - * - * @param deltaIndex The delta index - * @param zoneNumber The zone number - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -int finishSavingDeltaIndex(const DeltaIndex *deltaIndex, - unsigned int zoneNumber) - __attribute__((warn_unused_result)); - -/** - * Abort saving a delta index zone to an output stream. If an error - * occurred asynchronously during the save operation, it will be dropped. - * - * @param deltaIndex The delta index - * @param zoneNumber The zone number - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -int abortSavingDeltaIndex(const DeltaIndex *deltaIndex, - unsigned int zoneNumber) - __attribute__((warn_unused_result)); - -/** - * Compute the number of bytes required to save a delta index - * - * @param numLists The number of delta lists in the index - * @param memorySize The number of bytes in memory for the index - * - * @return numBytes The number of bytes required to save the master index - **/ -size_t computeDeltaIndexSaveBytes(unsigned int numLists, size_t memorySize) - __attribute__((warn_unused_result)); - -/** - * Validate the delta index - * - * @param deltaIndex The delta index - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -int validateDeltaIndex(const DeltaIndex *deltaIndex) - __attribute__((warn_unused_result)); - -/** - * Prepare to search for an entry in the specified delta list. - * - *
This is always the first routine to be called when dealing with delta - * index entries. It is always followed by calls to nextDeltaIndexEntry to - * iterate through a delta list. The fields of the DeltaIndexEntry argument - * will be set up for iteration, but will not contain an entry from the list. - * - * @param deltaIndex The delta index to search - * @param listNumber The delta list number - * @param key First delta list key that the caller is interested in - * @param readOnly True if this is a read-only operation - * @param iterator The index entry being used to search through the list - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -int startDeltaIndexSearch(const DeltaIndex *deltaIndex, - unsigned int listNumber, unsigned int key, - bool readOnly, DeltaIndexEntry *iterator) - __attribute__((warn_unused_result)); - -/** - * Find the next entry in the specified delta list - * - * @param deltaEntry Info about an entry, which is updated to describe the - * following entry - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -int nextDeltaIndexEntry(DeltaIndexEntry *deltaEntry) - __attribute__((warn_unused_result)); - -/** - * Remember the position of a delta index entry, so that we can use it when - * starting the next search. - * - * @param deltaEntry Info about an entry found during a search. This should - * be the first entry that matches the key exactly (i.e. - * not a collision entry), or the first entry with a key - * greater than the entry sought for. - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -int rememberDeltaIndexOffset(const DeltaIndexEntry *deltaEntry) - __attribute__((warn_unused_result)); - -/** - * Find the delta index entry, or the insertion point for a delta index - * entry. - * - * @param deltaIndex The delta index to search - * @param listNumber The delta list number - * @param key The key field being looked for - * @param name The 256 bit full name - * @param readOnly True if this is a read-only index search - * @param deltaEntry Updated to describe the entry being looked for - * - * @return UDS_SUCCESS or an error code - **/ -int getDeltaIndexEntry(const DeltaIndex *deltaIndex, unsigned int listNumber, - unsigned int key, const byte *name, bool readOnly, - DeltaIndexEntry *deltaEntry) - __attribute__((warn_unused_result)); - -/** - * Get the full name from a collision DeltaIndexEntry - * - * @param deltaEntry The delta index record - * @param name The 256 bit full name - * - * @return UDS_SUCCESS or an error code - **/ -int getDeltaEntryCollision(const DeltaIndexEntry *deltaEntry, byte *name) - __attribute__((warn_unused_result)); - -/** - * Get the bit offset into delta memory of a delta index entry. - * - * @param deltaEntry The delta index entry - * - * @return the bit offset into delta memory - **/ -static INLINE uint64_t getDeltaEntryOffset(const DeltaIndexEntry *deltaEntry) -{ - return getDeltaListStart(deltaEntry->deltaList) + deltaEntry->offset; -} - -/** - * Get the number of bits used to encode the entry key (the delta). - * - * @param entry The delta index record - * - * @return the number of bits used to encode the key - **/ -static INLINE unsigned int getDeltaEntryKeyBits(const DeltaIndexEntry *entry) -{ - /* - * Derive keyBits by subtracting the sizes of the other two fields from the - * total. We don't actually use this for encoding/decoding, so it doesn't - * need to be super-fast. We save time where it matters by not storing it. 
- */ - return (entry->entryBits - entry->valueBits - - (entry->isCollision ? COLLISION_BITS : 0)); -} - -/** - * Get the value field of the DeltaIndexEntry - * - * @param deltaEntry The delta index record - * - * @return the value - **/ -static INLINE unsigned int getDeltaEntryValue(const DeltaIndexEntry *deltaEntry) -{ - return getField(deltaEntry->deltaZone->memory, - getDeltaEntryOffset(deltaEntry), deltaEntry->valueBits); -} - -/** - * Set the value field of the DeltaIndexEntry - * - * @param deltaEntry The delta index record - * @param value The new value - * - * @return UDS_SUCCESS or an error code - **/ -int setDeltaEntryValue(const DeltaIndexEntry *deltaEntry, unsigned int value) - __attribute__((warn_unused_result)); - -/** - * Create a new entry in the delta index - * - * @param deltaEntry The delta index entry that indicates the insertion point - * for the new record. For a collision entry, this is the - * non-collision entry that the new entry collides with. - * For a non-collision entry, this new entry is inserted - * before the specified entry. - * @param key The key field - * @param value The value field - * @param name For collision entries, the 256 bit full name; - * Otherwise null - * - * @return UDS_SUCCESS or an error code - **/ -int putDeltaIndexEntry(DeltaIndexEntry *deltaEntry, unsigned int key, - unsigned int value, const byte *name) - __attribute__((warn_unused_result)); - -/** - * Remove an existing delta index entry, and advance to the next entry in - * the delta list. - * - * @param deltaEntry On call the delta index record to remove. After - * returning, the following entry in the delta list. - * - * @return UDS_SUCCESS or an error code - **/ -int removeDeltaIndexEntry(DeltaIndexEntry *deltaEntry) - __attribute__((warn_unused_result)); - -/** - * Map a delta list number to a delta zone number - * - * @param deltaIndex The delta index - * @param listNumber The delta list number - * - * @return the zone number containing the delta list - **/ -static INLINE unsigned int getDeltaIndexZone(const DeltaIndex *deltaIndex, - unsigned int listNumber) -{ - return listNumber / deltaIndex->listsPerZone; -} - -/** - * Get the first delta list number in a zone - * - * @param deltaIndex The delta index - * @param zoneNumber The zone number - * - * @return the first delta list index in the zone - **/ -unsigned int getDeltaIndexZoneFirstList(const DeltaIndex *deltaIndex, - unsigned int zoneNumber); - -/** - * Get the number of delta lists in a zone - * - * @param deltaIndex The delta index - * @param zoneNumber The zone number - * - * @return the number of delta lists in the zone - **/ -unsigned int getDeltaIndexZoneNumLists(const DeltaIndex *deltaIndex, - unsigned int zoneNumber); - -/** - * Get the number of bytes used for master index entries in a zone - * - * @param deltaIndex The delta index - * @param zoneNumber The zone number - * - * @return The number of bits in use - **/ -uint64_t getDeltaIndexZoneDlistBitsUsed(const DeltaIndex *deltaIndex, - unsigned int zoneNumber) - __attribute__((warn_unused_result)); - -/** - * Get the number of bytes used for master index entries. - * - * @param deltaIndex The delta index - * - * @return The number of bits in use - **/ -uint64_t getDeltaIndexDlistBitsUsed(const DeltaIndex *deltaIndex) - __attribute__((warn_unused_result)); - -/** - * Get the number of bytes allocated for master index entries. 
- * - * @param deltaIndex The delta index - * - * @return The number of bits allocated - **/ -uint64_t getDeltaIndexDlistBitsAllocated(const DeltaIndex *deltaIndex) - __attribute__((warn_unused_result)); - -/** - * Get the delta index statistics. - * - * @param deltaIndex The delta index - * @param stats The statistics - **/ -void getDeltaIndexStats(const DeltaIndex *deltaIndex, DeltaIndexStats *stats); - -/** - * Get the number of pages needed for an immutable delta index. - * - * @param numEntries The number of entries in the index - * @param numLists The number of delta lists - * @param meanDelta The mean delta value - * @param numPayloadBits The number of bits in the payload or value - * @param bytesPerPage The number of bytes in a page - * - * @return the number of pages needed for the index - **/ -unsigned int getDeltaIndexPageCount(unsigned int numEntries, - unsigned int numLists, - unsigned int meanDelta, - unsigned int numPayloadBits, - size_t bytesPerPage); - -/** - * Log a delta index entry, and any error conditions related to the entry. - * - * @param deltaEntry The delta index entry. - **/ -void logDeltaIndexEntry(DeltaIndexEntry *deltaEntry); - -#endif /* DELTAINDEX_H */ diff --git a/uds/deltaMemory.c b/uds/deltaMemory.c deleted file mode 100644 index 2b30714..0000000 --- a/uds/deltaMemory.c +++ /dev/null @@ -1,720 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/deltaMemory.c#3 $ - */ -#include "deltaMemory.h" - -#include "bits.h" -#include "buffer.h" -#include "compiler.h" -#include "errors.h" -#include "hashUtils.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "timeUtils.h" -#include "typeDefs.h" -#include "uds.h" - -/* - * The DeltaMemory structure manages the memory that stores delta lists. - * - * The "mutable" form of DeltaMemory is used for the master index and for - * an open chapter index. The "immutable" form of DeltaMemory is used for - * regular chapter indices. 
- */ - -// This is the number of guard bits that are needed in the tail guard list -enum { GUARD_BITS = POST_FIELD_GUARD_BYTES * CHAR_BIT }; - -/** - * Get the offset of the first byte that a delta list bit stream resides in - * - * @param deltaList The delta list - * - * @return the number byte offset - **/ -static INLINE uint64_t getDeltaListByteStart(const DeltaList *deltaList) -{ - return getDeltaListStart(deltaList) / CHAR_BIT; -} - -/** - * Get the actual number of bytes that a delta list bit stream resides in - * - * @param deltaList The delta list - * - * @return the number of bytes - **/ -static INLINE uint16_t getDeltaListByteSize(const DeltaList *deltaList) -{ - uint16_t startBitOffset = getDeltaListStart(deltaList) % CHAR_BIT; - uint16_t bitSize = getDeltaListSize(deltaList); - return ((unsigned int) startBitOffset + bitSize + CHAR_BIT - 1) / CHAR_BIT; -} - -/** - * Get the number of bytes in the delta lists headers. - * - * @param numLists The number of delta lists - * - * @return the number of bytes in the delta lists headers - **/ -static INLINE size_t getSizeOfDeltaLists(unsigned int numLists) -{ - return (numLists + 2) * sizeof(DeltaList); -} - -/** - * Get the size of the flags array (in bytes) - * - * @param numLists The number of delta lists - * - * @return the number of bytes for an array that has one bit per delta - * list, plus the necessary guard bytes. - **/ -static INLINE size_t getSizeOfFlags(unsigned int numLists) -{ - return (numLists + CHAR_BIT - 1) / CHAR_BIT + POST_FIELD_GUARD_BYTES; -} - -/** - * Get the number of bytes of scratch memory for the delta lists. - * - * @param numLists The number of delta lists - * - * @return the number of bytes of scratch memory for the delta lists - **/ -static INLINE size_t getSizeOfTempOffsets(unsigned int numLists) -{ - return (numLists + 2) * sizeof(uint64_t); -} - -/**********************************************************************/ - -/** - * Clear the transfers flags. - * - * @param deltaMemory The delta memory - **/ -static void clearTransferFlags(DeltaMemory *deltaMemory) -{ - memset(deltaMemory->flags, 0, getSizeOfFlags(deltaMemory->numLists)); - deltaMemory->numTransfers = 0; - deltaMemory->transferStatus = UDS_SUCCESS; -} - -/**********************************************************************/ - -/** - * Set the transfer flags for delta lists that are not empty, and count how - * many there are. - * - * @param deltaMemory The delta memory - **/ -static void flagNonEmptyDeltaLists(DeltaMemory *deltaMemory) -{ - clearTransferFlags(deltaMemory); - unsigned int i; - for (i = 0; i < deltaMemory->numLists; i++) { - if (getDeltaListSize(&deltaMemory->deltaLists[i + 1]) > 0) { - setOne(deltaMemory->flags, i, 1); - deltaMemory->numTransfers++; - } - } -} - -/**********************************************************************/ -void emptyDeltaLists(DeltaMemory *deltaMemory) -{ - // Zero all the delta list headers - DeltaList *deltaLists = deltaMemory->deltaLists; - memset(deltaLists, 0, getSizeOfDeltaLists(deltaMemory->numLists)); - - /* - * Initialize delta lists to be empty. We keep 2 extra delta list - * descriptors, one before the first real entry and one after so that we - * don't need to bounds check the array access when calculating - * preceeding and following gap sizes. - * - * Because the delta list headers were zeroed, the head guard list is - * already at offset zero and size zero. 
- * - * The end guard list contains guard bytes so that the bit field - * utilities can safely read past the end of any byte we are interested - * in. - */ - uint64_t numBits = (uint64_t) deltaMemory->size * CHAR_BIT; - deltaLists[deltaMemory->numLists + 1].startOffset = numBits - GUARD_BITS; - deltaLists[deltaMemory->numLists + 1].size = GUARD_BITS; - - // Set all the bits in the end guard list. Do not use the bit field - // utilities. - memset(deltaMemory->memory + deltaMemory->size - POST_FIELD_GUARD_BYTES, - ~0, POST_FIELD_GUARD_BYTES); - - // Evenly space out the real delta lists. The sizes are already zero, so - // we just need to set the starting offsets. - uint64_t spacing = (numBits - GUARD_BITS) / deltaMemory->numLists; - uint64_t offset = spacing / 2; - unsigned int i; - for (i = 1; i <= deltaMemory->numLists; i++) { - deltaLists[i].startOffset = offset; - offset += spacing; - } - - // Update the statistics - deltaMemory->discardCount += deltaMemory->recordCount; - deltaMemory->recordCount = 0; - deltaMemory->collisionCount = 0; -} - -/**********************************************************************/ -/** - * Compute the Huffman coding parameters for the given mean delta - * - * @param meanDelta The mean delta value - * @param minBits The number of bits in the minimal key code - * @param minKeys The number of keys used in a minimal code - * @param incrKeys The number of keys used for another code bit - **/ -static void computeCodingConstants(unsigned int meanDelta, - unsigned short *minBits, - unsigned int *minKeys, - unsigned int *incrKeys) -{ - // We want to compute the rounded value of log(2) * meanDelta. Since we - // cannot always use floating point, use a really good integer approximation. - *incrKeys = (836158UL * meanDelta + 603160UL) / 1206321UL; - *minBits = computeBits(*incrKeys + 1); - *minKeys = (1 << *minBits) - *incrKeys; -} - -/**********************************************************************/ -/** - * Rebalance a range of delta lists within memory. - * - * @param deltaMemory A delta memory structure - * @param first The first delta list index - * @param last The last delta list index - **/ -static void rebalanceDeltaMemory(const DeltaMemory *deltaMemory, - unsigned int first, unsigned int last) -{ - if (first == last) { - DeltaList *deltaList = &deltaMemory->deltaLists[first]; - uint64_t newStart = deltaMemory->tempOffsets[first]; - // We need to move only one list, and we know it is safe to do so - if (getDeltaListStart(deltaList) != newStart) { - // Compute the first source byte - uint64_t source = getDeltaListByteStart(deltaList); - // Update the delta list location - deltaList->startOffset = newStart; - // Now use the same computation to locate the first destination byte - uint64_t destination = getDeltaListByteStart(deltaList); - memmove(deltaMemory->memory + destination, deltaMemory->memory + source, - getDeltaListByteSize(deltaList)); - } - } else { - // There is more than one list. Divide the problem in half, and use - // recursive calls to process each half. Note that after this - // computation, first <= middle, and middle < last. - unsigned int middle = (first + last) / 2; - const DeltaList *deltaList = &deltaMemory->deltaLists[middle]; - uint64_t newStart = deltaMemory->tempOffsets[middle]; - // The direction that our middle list is moving determines which half - // of the problem must be processed first. 
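    // Recursing first on the half that lies in the direction the middle list
    // is moving ensures that every memmove writes into space that has already
    // been vacated.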
- if (newStart > getDeltaListStart(deltaList)) { - rebalanceDeltaMemory(deltaMemory, middle + 1, last); - rebalanceDeltaMemory(deltaMemory, first, middle); - } else { - rebalanceDeltaMemory(deltaMemory, first, middle); - rebalanceDeltaMemory(deltaMemory, middle + 1, last); - } - } -} - -/**********************************************************************/ -int initializeDeltaMemory(DeltaMemory *deltaMemory, size_t size, - unsigned int firstList, unsigned int numLists, - unsigned int meanDelta, unsigned int numPayloadBits) -{ - if (numLists == 0) { - return logWarningWithStringError(UDS_INVALID_ARGUMENT, - "cannot initialize delta memory with 0 " - "delta lists"); - } - byte *memory = NULL; - int result = ALLOCATE(size, byte, "delta list", &memory); - if (result != UDS_SUCCESS) { - return result; - } - uint64_t *tempOffsets = NULL; - result = ALLOCATE(numLists + 2, uint64_t, "delta list temp", - &tempOffsets); - if (result != UDS_SUCCESS) { - FREE(memory); - return result; - } - byte *flags = NULL; - result = ALLOCATE(getSizeOfFlags(numLists), byte, "delta list flags", - &flags); - if (result != UDS_SUCCESS) { - FREE(memory); - FREE(tempOffsets); - return result; - } - - computeCodingConstants(meanDelta, &deltaMemory->minBits, - &deltaMemory->minKeys, &deltaMemory->incrKeys); - deltaMemory->valueBits = numPayloadBits; - deltaMemory->memory = memory; - deltaMemory->deltaLists = NULL; - deltaMemory->tempOffsets = tempOffsets; - deltaMemory->flags = flags; - deltaMemory->bufferedWriter = NULL; - deltaMemory->size = size; - deltaMemory->rebalanceTime = 0; - deltaMemory->rebalanceCount = 0; - deltaMemory->recordCount = 0; - deltaMemory->collisionCount = 0; - deltaMemory->discardCount = 0; - deltaMemory->overflowCount = 0; - deltaMemory->firstList = firstList; - deltaMemory->numLists = numLists; - deltaMemory->numTransfers = 0; - deltaMemory->transferStatus = UDS_SUCCESS; - deltaMemory->tag = 'm'; - - // Allocate the delta lists. 
- result = ALLOCATE(deltaMemory->numLists + 2, DeltaList, - "delta lists", &deltaMemory->deltaLists); - if (result != UDS_SUCCESS) { - uninitializeDeltaMemory(deltaMemory); - return result; - } - - emptyDeltaLists(deltaMemory); - return UDS_SUCCESS; -} - -/**********************************************************************/ -void uninitializeDeltaMemory(DeltaMemory *deltaMemory) -{ - FREE(deltaMemory->flags); - deltaMemory->flags = NULL; - FREE(deltaMemory->tempOffsets); - deltaMemory->tempOffsets = NULL; - FREE(deltaMemory->deltaLists); - deltaMemory->deltaLists = NULL; - FREE(deltaMemory->memory); - deltaMemory->memory = NULL; -} - -/**********************************************************************/ -void initializeDeltaMemoryPage(DeltaMemory *deltaMemory, byte *memory, - size_t size, unsigned int numLists, - unsigned int meanDelta, - unsigned int numPayloadBits) -{ - computeCodingConstants(meanDelta, &deltaMemory->minBits, - &deltaMemory->minKeys, &deltaMemory->incrKeys); - deltaMemory->valueBits = numPayloadBits; - deltaMemory->memory = memory; - deltaMemory->deltaLists = NULL; - deltaMemory->tempOffsets = NULL; - deltaMemory->flags = NULL; - deltaMemory->bufferedWriter = NULL; - deltaMemory->size = size; - deltaMemory->rebalanceTime = 0; - deltaMemory->rebalanceCount = 0; - deltaMemory->recordCount = 0; - deltaMemory->collisionCount = 0; - deltaMemory->discardCount = 0; - deltaMemory->overflowCount = 0; - deltaMemory->firstList = 0; - deltaMemory->numLists = numLists; - deltaMemory->numTransfers = 0; - deltaMemory->transferStatus = UDS_SUCCESS; - deltaMemory->tag = 'p'; -} - -/**********************************************************************/ -bool areDeltaMemoryTransfersDone(const DeltaMemory *deltaMemory) -{ - return deltaMemory->numTransfers == 0; -} - -/**********************************************************************/ -int startRestoringDeltaMemory(DeltaMemory *deltaMemory) -{ - // Extend and balance memory to receive the delta lists - int result = extendDeltaMemory(deltaMemory, 0, 0, false); - if (result != UDS_SUCCESS) { - return UDS_SUCCESS; - } - - // The tail guard list needs to be set to ones - DeltaList *deltaList = &deltaMemory->deltaLists[deltaMemory->numLists + 1]; - setOne(deltaMemory->memory, getDeltaListStart(deltaList), - getDeltaListSize(deltaList)); - - flagNonEmptyDeltaLists(deltaMemory); - return UDS_SUCCESS; -} - -/**********************************************************************/ -__attribute__((warn_unused_result)) -static int readDeltaListSaveInfo(BufferedReader *reader, - DeltaListSaveInfo *dlsi) -{ - byte buffer[sizeof(DeltaListSaveInfo)]; - int result = readFromBufferedReader(reader, buffer, sizeof(buffer)); - if (result != UDS_SUCCESS) { - return result; - } - dlsi->tag = buffer[0]; - dlsi->bitOffset = buffer[1]; - dlsi->byteCount = getUInt16LE(&buffer[2]); - dlsi->index = getUInt32LE(&buffer[4]); - return result; -} - -/**********************************************************************/ -int readSavedDeltaList(DeltaListSaveInfo *dlsi, - byte data[DELTA_LIST_MAX_BYTE_COUNT], - BufferedReader *bufferedReader) -{ - int result = readDeltaListSaveInfo(bufferedReader, dlsi); - if (result == UDS_END_OF_FILE) { - return UDS_END_OF_FILE; - } - if (result != UDS_SUCCESS) { - return logWarningWithStringError(result, "failed to read delta list data"); - } - if ((dlsi->bitOffset >= CHAR_BIT) - || (dlsi->byteCount > DELTA_LIST_MAX_BYTE_COUNT)) { - return logWarningWithStringError(UDS_CORRUPT_COMPONENT, - "corrupt delta list data"); - } - if 
(dlsi->tag == 'z') { - return UDS_END_OF_FILE; - } - result = readFromBufferedReader(bufferedReader, data, dlsi->byteCount); - if (result != UDS_SUCCESS) { - return logWarningWithStringError(result, "failed to read delta list data"); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int restoreDeltaList(DeltaMemory *deltaMemory, const DeltaListSaveInfo *dlsi, - const byte data[DELTA_LIST_MAX_BYTE_COUNT]) -{ - unsigned int listNumber = dlsi->index - deltaMemory->firstList; - if (listNumber >= deltaMemory->numLists) { - return logWarningWithStringError(UDS_CORRUPT_COMPONENT, - "invalid delta list number %u not in" - " range [%u,%u)", - dlsi->index, deltaMemory->firstList, - deltaMemory->firstList - + deltaMemory->numLists); - } - - if (getField(deltaMemory->flags, listNumber, 1) == 0) { - return logWarningWithStringError(UDS_CORRUPT_COMPONENT, - "unexpected delta list number %u", - dlsi->index); - } - - DeltaList *deltaList = &deltaMemory->deltaLists[listNumber + 1]; - uint16_t bitSize = getDeltaListSize(deltaList); - unsigned int byteCount - = ((unsigned int) dlsi->bitOffset + bitSize + CHAR_BIT - 1) / CHAR_BIT; - if (dlsi->byteCount != byteCount) { - return logWarningWithStringError(UDS_CORRUPT_COMPONENT, - "unexpected delta list size %u != %u", - dlsi->byteCount, byteCount); - } - - moveBits(data, dlsi->bitOffset, deltaMemory->memory, - getDeltaListStart(deltaList), bitSize); - setZero(deltaMemory->flags, listNumber, 1); - deltaMemory->numTransfers--; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void abortRestoringDeltaMemory(DeltaMemory *deltaMemory) -{ - clearTransferFlags(deltaMemory); - emptyDeltaLists(deltaMemory); -} - -/**********************************************************************/ -void startSavingDeltaMemory(DeltaMemory *deltaMemory, - BufferedWriter *bufferedWriter) -{ - flagNonEmptyDeltaLists(deltaMemory); - deltaMemory->bufferedWriter = bufferedWriter; -} - -/**********************************************************************/ -int finishSavingDeltaMemory(DeltaMemory *deltaMemory) -{ - unsigned int i; - for (i = 0; - !areDeltaMemoryTransfersDone(deltaMemory) - && (i < deltaMemory->numLists); - i++) { - lazyFlushDeltaList(deltaMemory, i); - } - if (deltaMemory->numTransfers > 0) { - deltaMemory->transferStatus - = logWarningWithStringError(UDS_CORRUPT_DATA, - "Not all delta lists written"); - } - deltaMemory->bufferedWriter = NULL; - return deltaMemory->transferStatus; -} - -/**********************************************************************/ -void abortSavingDeltaMemory(DeltaMemory *deltaMemory) -{ - clearTransferFlags(deltaMemory); - deltaMemory->bufferedWriter = NULL; -} - -/**********************************************************************/ -__attribute__((warn_unused_result)) -static int writeDeltaListSaveInfo(BufferedWriter *bufferedWriter, - DeltaListSaveInfo *dlsi) -{ - byte buffer[sizeof(DeltaListSaveInfo)]; - buffer[0] = dlsi->tag; - buffer[1] = dlsi->bitOffset; - storeUInt16LE(&buffer[2], dlsi->byteCount); - storeUInt32LE(&buffer[4], dlsi->index); - return writeToBufferedWriter(bufferedWriter, buffer, sizeof(buffer)); -} - -/**********************************************************************/ -void flushDeltaList(DeltaMemory *deltaMemory, unsigned int flushIndex) -{ - ASSERT_LOG_ONLY((getField(deltaMemory->flags, flushIndex, 1) != 0), - "flush bit is set"); - setZero(deltaMemory->flags, flushIndex, 1); - deltaMemory->numTransfers--; 
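  // Build the save-info record that precedes this list's bit stream in the
  // output, then write both through the buffered writer.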
- - DeltaList *deltaList = &deltaMemory->deltaLists[flushIndex + 1]; - DeltaListSaveInfo dlsi; - dlsi.tag = deltaMemory->tag; - dlsi.bitOffset = getDeltaListStart(deltaList) % CHAR_BIT; - dlsi.byteCount = getDeltaListByteSize(deltaList); - dlsi.index = deltaMemory->firstList + flushIndex; - - int result = writeDeltaListSaveInfo(deltaMemory->bufferedWriter, &dlsi); - if (result != UDS_SUCCESS) { - if (deltaMemory->transferStatus == UDS_SUCCESS) { - logWarningWithStringError(result, "failed to write delta list memory"); - deltaMemory->transferStatus = result; - } - } - result = writeToBufferedWriter(deltaMemory->bufferedWriter, - deltaMemory->memory - + getDeltaListByteStart(deltaList), - dlsi.byteCount); - if (result != UDS_SUCCESS) { - if (deltaMemory->transferStatus == UDS_SUCCESS) { - logWarningWithStringError(result, "failed to write delta list memory"); - deltaMemory->transferStatus = result; - } - } -} - -/**********************************************************************/ -int writeGuardDeltaList(BufferedWriter *bufferedWriter) -{ - DeltaListSaveInfo dlsi; - dlsi.tag = 'z'; - dlsi.bitOffset = 0; - dlsi.byteCount = 0; - dlsi.index = 0; - int result = writeToBufferedWriter(bufferedWriter, (const byte *) &dlsi, - sizeof(DeltaListSaveInfo)); - if (result != UDS_SUCCESS) { - logWarningWithStringError(result, "failed to write guard delta list"); - } - return result; -} - -/**********************************************************************/ -int extendDeltaMemory(DeltaMemory *deltaMemory, unsigned int growingIndex, - size_t growingSize, bool doCopy) -{ - if (!isMutable(deltaMemory)) { - return logErrorWithStringError(UDS_BAD_STATE, - "Attempt to read into an immutable delta" - " list memory"); - } - - AbsTime startTime = currentTime(CLOCK_MONOTONIC); - - // Calculate the amount of space that is in use. Include the space that - // has a planned use. - DeltaList *deltaLists = deltaMemory->deltaLists; - size_t usedSpace = growingSize; - unsigned int i; - for (i = 0; i <= deltaMemory->numLists + 1; i++) { - usedSpace += getDeltaListByteSize(&deltaLists[i]); - } - - if (deltaMemory->size < usedSpace) { - return UDS_OVERFLOW; - } - - // Compute the new offsets of the delta lists - size_t spacing = (deltaMemory->size - usedSpace) / deltaMemory->numLists; - deltaMemory->tempOffsets[0] = 0; - for (i = 0; i <= deltaMemory->numLists; i++) { - deltaMemory->tempOffsets[i + 1] = (deltaMemory->tempOffsets[i] - + getDeltaListByteSize(&deltaLists[i]) - + spacing); - deltaMemory->tempOffsets[i] *= CHAR_BIT; - deltaMemory->tempOffsets[i] - += getDeltaListStart(&deltaLists[i]) % CHAR_BIT; - if (i == 0) { - deltaMemory->tempOffsets[i + 1] -= spacing / 2; - } - if (i + 1 == growingIndex) { - deltaMemory->tempOffsets[i + 1] += growingSize; - } - } - deltaMemory->tempOffsets[deltaMemory->numLists + 1] - = (deltaMemory->size * CHAR_BIT - - getDeltaListSize(&deltaLists[deltaMemory->numLists + 1])); - // When we rebalance the delta list, we will include the end guard list - // in the rebalancing. It contains the end guard data, which must be - // copied. 
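  // When doCopy is false, as in startRestoringDeltaMemory(), no list data have
  // been loaded yet, so the new offsets can simply be assigned without moving
  // any bytes.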
- if (doCopy) { - rebalanceDeltaMemory(deltaMemory, 1, deltaMemory->numLists + 1); - AbsTime endTime = currentTime(CLOCK_MONOTONIC); - deltaMemory->rebalanceCount++; - deltaMemory->rebalanceTime += timeDifference(endTime, startTime); - } else { - for (i = 1; i <= deltaMemory->numLists + 1; i++) { - deltaLists[i].startOffset = deltaMemory->tempOffsets[i]; - } - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int validateDeltaLists(const DeltaMemory *deltaMemory) -{ - // Validate the delta index fields set by restoring a delta index - if (deltaMemory->collisionCount > deltaMemory->recordCount) { - return logWarningWithStringError(UDS_BAD_STATE, - "delta index contains more collisions" - " (%ld) than records (%ld)", - deltaMemory->collisionCount, - deltaMemory->recordCount); - } - - // Validate the delta lists - DeltaList *deltaLists = deltaMemory->deltaLists; - if (getDeltaListStart(&deltaLists[0]) != 0) { - return logWarningWithStringError(UDS_BAD_STATE, - "the head guard delta list does not start" - " at 0: %llu", - getDeltaListStart(&deltaLists[0])); - } - uint64_t numBits = getDeltaListEnd(&deltaLists[deltaMemory->numLists + 1]); - if (numBits != deltaMemory->size * CHAR_BIT) { - return logWarningWithStringError(UDS_BAD_STATE, - "the tail guard delta list does not end " - "at end of allocated memory: %" PRIu64 - " != %zd", - numBits, deltaMemory->size * CHAR_BIT); - } - int numGuardBits = getDeltaListSize(&deltaLists[deltaMemory->numLists + 1]); - if (numGuardBits < GUARD_BITS) { - return logWarningWithStringError(UDS_BAD_STATE, - "the tail guard delta list does not " - "contain sufficient guard bits: %d < %d", - numGuardBits, GUARD_BITS); - } - unsigned int i; - for (i = 0; i <= deltaMemory->numLists + 1; i++) { - if (getDeltaListStart(&deltaLists[i]) > getDeltaListEnd(&deltaLists[i])) { - return logWarningWithStringError(UDS_BAD_STATE, - "invalid delta list %u: [%" PRIu64 - ", %llu)", - i, - getDeltaListStart(&deltaLists[i]), - getDeltaListEnd(&deltaLists[i])); - } - if (i > deltaMemory->numLists) { - // The rest of the checks do not apply to the tail guard list - continue; - } - if (getDeltaListEnd(&deltaLists[i]) - > getDeltaListStart(&deltaLists[i + 1])) { - return logWarningWithStringError(UDS_BAD_STATE, - "delta lists %u and %u overlap: %" - PRIu64 " > %llu", - i, i + 1, - getDeltaListEnd(&deltaLists[i]), - getDeltaListStart(&deltaLists[i + 1])); - } - if (i == 0) { - // The rest of the checks do not apply to the head guard list - continue; - } - if (deltaLists[i].saveOffset > getDeltaListSize(&deltaLists[i])) { - return logWarningWithStringError(UDS_BAD_STATE, - "delta lists %u saved offset is larger" - " than the list: %u > %u", - i, deltaLists[i].saveOffset, - getDeltaListSize(&deltaLists[i])); - } - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -size_t getDeltaMemoryAllocated(const DeltaMemory *deltaMemory) -{ - return (deltaMemory->size - + getSizeOfDeltaLists(deltaMemory->numLists) - + getSizeOfFlags(deltaMemory->numLists) - + getSizeOfTempOffsets(deltaMemory->numLists)); -} - -/**********************************************************************/ -size_t getDeltaMemorySize(unsigned long numEntries, unsigned int meanDelta, - unsigned int numPayloadBits) -{ - unsigned short minBits; - unsigned int incrKeys, minKeys; - computeCodingConstants(meanDelta, &minBits, &minKeys, &incrKeys); - // On average, each delta is encoded into about minBits+1.5 bits. 
- return (numEntries * (numPayloadBits + minBits + 1) + numEntries / 2); -} diff --git a/uds/deltaMemory.h b/uds/deltaMemory.h deleted file mode 100644 index 1ffb3fd..0000000 --- a/uds/deltaMemory.h +++ /dev/null @@ -1,370 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/deltaMemory.h#1 $ - */ - -#ifndef DELTAMEMORY_H -#define DELTAMEMORY_H 1 - -#include "bits.h" -#include "bufferedReader.h" -#include "bufferedWriter.h" -#include "compiler.h" -#include "cpu.h" -#include "timeUtils.h" - -/* - * We encode the delta list information into 16 bytes per list. - * - * Because the master index has 1 million delta lists, each byte of header - * information ends up costing us 1MB. We have an incentive to keep the - * size down. - * - * The master index delta list memory is currently about 780MB in size, - * which is more than 6 gigabits. Therefore we need at least 33 bits to - * address the master index memory and we use the uint64_t type. - * - * The master index delta lists have 256 entries of about 24 bits each, - * which is 6K bits. The index needs 13 bits to represent the size of a - * delta list and we use the uint16_t type. - */ - -typedef struct deltaList { - uint64_t startOffset; // The offset of the delta list start within memory - uint16_t size; // The number of bits in the delta list - uint16_t saveOffset; // Where the last search "found" the key - unsigned int saveKey; // The key for the record just before saveOffset. 
-} DeltaList; - -typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) deltaMemory { - byte *memory; // The delta list memory - DeltaList *deltaLists; // The delta list headers - uint64_t *tempOffsets; // Temporary starts of delta lists - byte *flags; // Transfer flags - BufferedWriter *bufferedWriter; // Buffered writer for saving an index - size_t size; // The size of delta list memory - RelTime rebalanceTime; // The time spent rebalancing - int rebalanceCount; // Number of memory rebalances - unsigned short valueBits; // The number of bits of value - unsigned short minBits; // The number of bits in the minimal key code - unsigned int minKeys; // The number of keys used in a minimal code - unsigned int incrKeys; // The number of keys used for another code bit - long recordCount; // The number of records in the index - long collisionCount; // The number of collision records - long discardCount; // The number of records removed - long overflowCount; // The number of UDS_OVERFLOWs detected - unsigned int firstList; // The index of the first delta list - unsigned int numLists; // The number of delta lists - unsigned int numTransfers; // Number of transfer flags that are set - int transferStatus; // Status of the transfers in progress - byte tag; // Tag belonging to this delta index -} DeltaMemory; - -typedef struct deltaListSaveInfo { - uint8_t tag; // Tag identifying which delta index this list is in - uint8_t bitOffset; // Bit offset of the start of the list data - uint16_t byteCount; // Number of bytes of list data - uint32_t index; // The delta list number within the delta index -} DeltaListSaveInfo; - -// The maximum size of a single delta list (in bytes). We add guard bytes -// to this because such a buffer can be used with moveBits. -enum { DELTA_LIST_MAX_BYTE_COUNT = ((UINT16_MAX + CHAR_BIT) / CHAR_BIT - + POST_FIELD_GUARD_BYTES) }; - -/** - * Initialize delta list memory. - * - * @param deltaMemory A delta memory structure - * @param size The initial size of the memory array - * @param firstList The index of the first delta list - * @param numLists The number of delta lists - * @param meanDelta The mean delta - * @param numPayloadBits The number of payload bits - * - * @return error code or UDS_SUCCESS - **/ -int initializeDeltaMemory(DeltaMemory *deltaMemory, size_t size, - unsigned int firstList, unsigned int numLists, - unsigned int meanDelta, unsigned int numPayloadBits) - __attribute__((warn_unused_result)); - -/** - * Uninitialize delta list memory. - * - * @param deltaMemory A delta memory structure - **/ -void uninitializeDeltaMemory(DeltaMemory *deltaMemory); - -/** - * Initialize delta list memory to refer to a cached page. - * - * @param deltaMemory A delta memory structure - * @param memory The memory page - * @param size The size of the memory page - * @param numLists The number of delta lists - * @param meanDelta The mean delta - * @param numPayloadBits The number of payload bits - **/ -void initializeDeltaMemoryPage(DeltaMemory *deltaMemory, byte *memory, - size_t size, unsigned int numLists, - unsigned int meanDelta, - unsigned int numPayloadBits); - -/** - * Empty the delta lists. - * - * @param deltaMemory The delta memory - **/ -void emptyDeltaLists(DeltaMemory *deltaMemory); - -/** - * Is there a delta list memory save or restore in progress? 
- * - * @param deltaMemory A delta memory structure - * - * @return true if there are no delta lists that need to be saved or - * restored - **/ -bool areDeltaMemoryTransfersDone(const DeltaMemory *deltaMemory); - -/** - * Start restoring delta list memory from a file descriptor - * - * @param deltaMemory A delta memory structure - * - * @return error code or UDS_SUCCESS - **/ -int startRestoringDeltaMemory(DeltaMemory *deltaMemory) - __attribute__((warn_unused_result)); - -/** - * Read a saved delta list from a file descriptor - * - * @param dlsi The DeltaListSaveInfo describing the delta list - * @param data The saved delta list bit stream - * @param bufferedReader The buffered reader to read the delta list from - * - * @return error code or UDS_SUCCESS - * or UDS_END_OF_FILE at end of the data stream - **/ -int readSavedDeltaList(DeltaListSaveInfo *dlsi, - byte data[DELTA_LIST_MAX_BYTE_COUNT], - BufferedReader *bufferedReader) - __attribute__((warn_unused_result)); - -/** - * Restore a saved delta list - * - * @param deltaMemory A delta memory structure - * @param dlsi The DeltaListSaveInfo describing the delta list - * @param data The saved delta list bit stream - * - * @return error code or UDS_SUCCESS - **/ -int restoreDeltaList(DeltaMemory *deltaMemory, const DeltaListSaveInfo *dlsi, - const byte data[DELTA_LIST_MAX_BYTE_COUNT]) - __attribute__((warn_unused_result)); - -/** - * Abort restoring delta list memory from an input stream. - * - * @param deltaMemory A delta memory structure - **/ -void abortRestoringDeltaMemory(DeltaMemory *deltaMemory); - -/** - * Start saving delta list memory to a buffered output stream - * - * @param deltaMemory A delta memory structure - * @param bufferedWriter The index state component being written - **/ -void startSavingDeltaMemory(DeltaMemory *deltaMemory, - BufferedWriter *bufferedWriter); - -/** - * Finish saving delta list memory to an output stream. Force the writing - * of all of the remaining data. If an error occurred asynchronously - * during the save operation, it will be returned here. - * - * @param deltaMemory A delta memory structure - * - * @return error code or UDS_SUCCESS - **/ -int finishSavingDeltaMemory(DeltaMemory *deltaMemory) - __attribute__((warn_unused_result)); - -/** - * Abort saving delta list memory to an output stream. If an error - * occurred asynchronously during the save operation, it will be dropped. - * - * @param deltaMemory A delta memory structure - **/ -void abortSavingDeltaMemory(DeltaMemory *deltaMemory); - -/** - * Flush a delta list to an output stream - * - * @param deltaMemory A delta memory structure - * @param flushIndex Index of the delta list that may need to be flushed. - **/ -void flushDeltaList(DeltaMemory *deltaMemory, unsigned int flushIndex); - -/** - * Write a guard delta list to mark the end of the saved data - * - * @param bufferedWriter The buffered writer to write the guard delta list to - * - * @return error code or UDS_SUCCESS - **/ -int writeGuardDeltaList(BufferedWriter *bufferedWriter) - __attribute__((warn_unused_result)); - -/** - * Extend the memory used by the delta lists and rebalance the lists in the - * new chunk. - * - *
The delta memory contains N delta lists, which are guarded by two - * empty delta lists. The valid delta lists are numbered 1 to N, and the - * guards are numbered 0 and (N+1); - * - *
When the delta lista are bit streams, it is possible that the tail - * of list J and the head of list (J+1) are in the same byte. In this case - * oldOffsets[j]+sizes[j]==oldOffset[j]-1. We handle this correctly. - * - * @param deltaMemory A delta memory structure - * @param growingIndex Index of the delta list that needs additional space - * left before it (from 1 to N+1). - * @param growingSize Number of additional bytes needed before growingIndex - * @param doCopy True to copy the data, False to just balance the space - * - * @return UDS_SUCCESS or an error code - **/ -int extendDeltaMemory(DeltaMemory *deltaMemory, unsigned int growingIndex, - size_t growingSize, bool doCopy) - __attribute__((warn_unused_result)); - -/** - * Validate the delta list headers. - * - * @param deltaMemory A delta memory structure - * - * @return UDS_SUCCESS or an error code - **/ -int validateDeltaLists(const DeltaMemory *deltaMemory) - __attribute__((warn_unused_result)); - -/** - * Get the number of bytes allocated for delta index entries and any - * associated overhead. - * - * @param deltaMemory A delta memory structure - * - * @return The number of bytes allocated - **/ -size_t getDeltaMemoryAllocated(const DeltaMemory *deltaMemory); - -/** - * Get the expected number of bits used in a delta index - * - * @param numEntries The number of index entries - * @param meanDelta The mean delta value - * @param numPayloadBits The number of bits in the payload or value - * - * @return The expected size of a delta index in bits - **/ -size_t getDeltaMemorySize(unsigned long numEntries, unsigned int meanDelta, - unsigned int numPayloadBits) - __attribute__((warn_unused_result)); - -/** - * Get the bit offset to the start of the delta list bit stream - * - * @param deltaList The delta list header - * - * @return the start of the delta list - **/ -static INLINE uint64_t getDeltaListStart(const DeltaList *deltaList) -{ - return deltaList->startOffset; -} - -/** - * Get the number of bits in a delta list bit stream - * - * @param deltaList The delta list header - * - * @return the size of the delta list - **/ -static INLINE uint16_t getDeltaListSize(const DeltaList *deltaList) -{ - return deltaList->size; -} - -/** - * Get the bit offset to the end of the delta list bit stream - * - * @param deltaList The delta list header - * - * @return the end of the delta list - **/ -static INLINE uint64_t getDeltaListEnd(const DeltaList *deltaList) -{ - return getDeltaListStart(deltaList) + getDeltaListSize(deltaList); -} - -/** - * Identify mutable vs. immutable delta memory - * - * Mutable delta memory contains delta lists that can be modified, and is - * initialized using initializeDeltaMemory(). - * - * Immutable delta memory contains packed delta lists, cannot be modified, - * and is initialized using initializeDeltaMemoryPage(). - * - * For mutable delta memory, all of the following expressions are true. - * And for immutable delta memory, all of the following expressions are - * false. - * deltaLists != NULL - * tempOffsets != NULL - * flags != NULL - * - * @param deltaMemory A delta memory structure - * - * @return true if the delta memory is mutable - **/ -static INLINE bool isMutable(const DeltaMemory *deltaMemory) -{ - return deltaMemory->deltaLists != NULL; -} - -/** - * Lazily flush a delta list to an output stream - * - * @param deltaMemory A delta memory structure - * @param flushIndex Index of the delta list that may need to be flushed. 
- **/ -static INLINE void lazyFlushDeltaList(DeltaMemory *deltaMemory, - unsigned int flushIndex) -{ - if (getField(deltaMemory->flags, flushIndex, 1) != 0) { - flushDeltaList(deltaMemory, flushIndex); - } -} -#endif /* DELTAMEMORY_H */ diff --git a/uds/errors.c b/uds/errors.c deleted file mode 100644 index 5aab19e..0000000 --- a/uds/errors.c +++ /dev/null @@ -1,383 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/errors.c#11 $ - */ - -#include "errors.h" - -#include "common.h" -#include "permassert.h" -#include "stringUtils.h" - -#ifdef __KERNEL__ -#include -#endif - -static const struct errorInfo successful = { "UDS_SUCCESS", "Success" }; - -#ifdef __KERNEL__ -static const char *const messageTable[] = { - [EPERM] = "Operation not permitted", - [ENOENT] = "No such file or directory", - [ESRCH] = "No such process", - [EINTR] = "Interrupted system call", - [EIO] = "Input/output error", - [ENXIO] = "No such device or address", - [E2BIG] = "Argument list too long", - [ENOEXEC] = "Exec format error", - [EBADF] = "Bad file descriptor", - [ECHILD] = "No child processes", - [EAGAIN] = "Resource temporarily unavailable", - [ENOMEM] = "Cannot allocate memory", - [EACCES] = "Permission denied", - [EFAULT] = "Bad address", - [ENOTBLK] = "Block device required", - [EBUSY] = "Device or resource busy", - [EEXIST] = "File exists", - [EXDEV] = "Invalid cross-device link", - [ENODEV] = "No such device", - [ENOTDIR] = "Not a directory", - [EISDIR] = "Is a directory", - [EINVAL] = "Invalid argument", - [ENFILE] = "Too many open files in system", - [EMFILE] = "Too many open files", - [ENOTTY] = "Inappropriate ioctl for device", - [ETXTBSY] = "Text file busy", - [EFBIG] = "File too large", - [ENOSPC] = "No space left on device", - [ESPIPE] = "Illegal seek", - [EROFS] = "Read-only file system", - [EMLINK] = "Too many links", - [EPIPE] = "Broken pipe", - [EDOM] = "Numerical argument out of domain", - [ERANGE] = "Numerical result out of range" -}; -#endif - -static const struct errorInfo errorList[] = { - { "UDS_UNINITIALIZED", "UDS library is not initialized" }, - { "UDS_SHUTTINGDOWN", "UDS library is shutting down" }, - { "UDS_EMODULE_LOAD", "Could not load modules" }, - { "UDS_ENOTHREADS", "Could not create a new thread" }, - { "UDS_NOCONTEXT", "Could not find the requested library context" }, - { "UDS_DISABLED", "UDS library context is disabled" }, - { "UDS_CORRUPT_COMPONENT", "Corrupt saved component" }, - { "UDS_UNKNOWN_ERROR", "Unknown error" }, - { "UDS_UNUSED_CODE_8", "Unused error code 8" }, - { "UDS_UNUSED_CODE_9", "Unused error code 9" }, - { "UDS_UNSUPPORTED_VERSION", "Unsupported version" }, - { "UDS_NO_INDEXSESSION", "Index session not known" }, - { "UDS_CORRUPT_DATA", "Index data in memory is corrupt" }, - { "UDS_SHORT_READ", 
"Could not read requested number of bytes" }, - { "UDS_UNUSED_CODE_14", "Unused error code 14" }, - { "UDS_RESOURCE_LIMIT_EXCEEDED", "Internal resource limits exceeded" }, - { "UDS_VOLUME_OVERFLOW", "Memory overflow due to storage failure" }, - { "UDS_UNUSED_CODE_17", "Unused error code 17" }, - { "UDS_UNUSED_CODE_18", "Unused error code 18" }, - { "UDS_UNUSED_CODE_19", "Unused error code 19" }, - { "UDS_CONF_PTR_REQUIRED", "A configuration pointer is required" }, - { "UDS_INDEX_STATS_PTR_REQUIRED", "An index stats pointer is required" }, - { "UDS_CONTEXT_STATS_PTR_REQUIRED", "A context stats pointer is required" }, - { "UDS_UNUSED_CODE_23", "Unused error code 23" }, - { "UDS_UNUSED_CODE_24", "Unused error code 24" }, - { "UDS_UNUSED_CODE_25", "Unused error code 25" }, - { "UDS_UNUSED_CODE_26", "Unused error code 26" }, - { "UDS_UNUSED_CODE_27", "Unused error code 27" }, - { "UDS_INVALID_MEMORY_SIZE", - "Configured memory too small or unsupported size" }, - { "UDS_UNUSED_CODE_29", "Unused error code 29" }, - { "UDS_INDEX_NAME_REQUIRED", "An index name is required" }, - { "UDS_CONF_REQUIRED", "A configuration is required" }, - { "UDS_UNUSED_CODE_32", "Unused error code 32" }, - { "UDS_UNUSED_CODE_33", "Unused error code 33" }, - { "UDS_UNUSED_CODE_34", "Unused error code 34" }, - { "UDS_UNUSED_CODE_35", "Unused error code 35" }, - { "UDS_UNUSED_CODE_36", "Unused error code 36" }, - { "UDS_NO_INDEX", "No index found" }, - { "UDS_BAD_CHECKPOINT_FREQUENCY", "Checkpoint frequency out of range" }, - { "UDS_WRONG_INDEX_CONFIG", "Wrong type of index configuration" }, - { "UDS_UNUSED_CODE_40", "Unused error code 40" }, - { "UDS_UNUSED_CODE_41", "Unused error code 41" }, - { "UDS_UNUSED_CODE_42", "Unused error code 42" }, - { "UDS_UNUSED_CODE_43", "Unused error code 43" }, - { "UDS_END_OF_FILE", "Unexpected end of file" }, - { "UDS_INDEX_NOT_SAVED_CLEANLY", "Index not saved cleanly" }, - { "UDS_UNUSED_CODE_46", "Unused error code 46" }, - { "UDS_INSUFFICIENT_INDEX_SPACE", "Insufficient index space" }, - { "UDS_UNUSED_CODE_48", "Unused error code 48" }, - { "UDS_UNUSED_CODE_49", "Unused error code 49" }, - { "UDS_SUSPENDED", "Index suspended"}, - { "UDS_UNUSED_CODE_51", "Unused error code 51" }, - { "UDS_INDEXSESSION_IN_USE", "Index session in use"}, - { "UDS_CALLBACK_REQUIRED", "A callback function is required"}, - { "UDS_INVALID_OPERATION_TYPE", "Invalid type of request operation"}, -}; - -static const struct errorInfo internalErrorList[] = { - { "UDS_INTERNAL_UNUSED_0", "Unused internal error 0" }, - { "UDS_OVERFLOW", "Index overflow" }, - { "UDS_INTERNAL_UNUSED_2", "Unused internal error 2" }, - { "UDS_INVALID_ARGUMENT", "Invalid argument passed to internal routine" }, - { "UDS_BAD_STATE", "UDS data structures are in an invalid state" }, - { "UDS_DUPLICATE_NAME", - "Attempt to enter the same name into a delta index twice" }, - { "UDS_UNEXPECTED_RESULT", "Unexpected result from internal routine" }, - { "UDS_INJECTED_ERROR", "Injected error" }, - { "UDS_ASSERTION_FAILED", "Assertion failed" }, - { "UDS_INTERNAL_UNUSED_9", "Unused internal error 9" }, - { "UDS_QUEUED", "Request queued" }, - { "UDS_INTERNAL_UNUSED_11", "Unused internal error 11" }, - { "UDS_INTERNAL_UNUSED_12", "Unused internal error 12" }, - { "UDS_BUFFER_ERROR", "Buffer error" }, - { "UDS_INTERNAL_UNUSED_14", "Unused internal error 14" }, - { "UDS_INTERNAL_UNUSED_15", "Unused internal error 15" }, - { "UDS_NO_DIRECTORY", "Expected directory is missing" }, - { "UDS_CHECKPOINT_INCOMPLETE", "Checkpoint not completed" }, - { 
"UDS_INTERNAL_UNUSED_18", "Unused internal error 18" }, - { "UDS_INTERNAL_UNUSED_19", "Unused internal error 19" }, - { "UDS_ALREADY_REGISTERED", "Error range already registered" }, - { "UDS_BAD_IO_DIRECTION", "Bad I/O direction" }, - { "UDS_INCORRECT_ALIGNMENT", "Offset not at block alignment" }, - { "UDS_OUT_OF_RANGE", "Cannot access data outside specified limits" }, -}; - -typedef struct errorBlock { - const char *name; - int base; - int last; - int max; - const ErrorInfo *infos; -} ErrorBlock; - -enum { - MAX_ERROR_BLOCKS = 6 // needed for testing -}; - -static struct errorInformation { - int allocated; - int count; - ErrorBlock blocks[MAX_ERROR_BLOCKS]; -} registeredErrors = { - .allocated = MAX_ERROR_BLOCKS, - .count = 2, - .blocks = { - { - .name = "UDS Error", - .base = UDS_ERROR_CODE_BASE, - .last = UDS_ERROR_CODE_LAST, - .max = UDS_ERROR_CODE_BLOCK_END, - .infos = errorList, - }, - { - .name = "UDS Internal Error", - .base = UDS_INTERNAL_ERROR_CODE_BASE, - .last = UDS_INTERNAL_ERROR_CODE_LAST, - .max = UDS_INTERNAL_ERROR_CODE_BLOCK_END, - .infos = internalErrorList, - } - } -}; - -/** - * Fetch the error info (if any) for the error number. - * - * @param errnum the error number - * @param infoPtr the place to store the info for this error (if known), - * otherwise set to NULL - * - * @return the name of the error block (if known), NULL othersise - **/ -static const char *getErrorInfo(int errnum, const ErrorInfo **infoPtr) -{ - - if (errnum == UDS_SUCCESS) { - if (infoPtr != NULL) { - *infoPtr = &successful; - } - return NULL; - } - - ErrorBlock *block; - for (block = registeredErrors.blocks; - block < registeredErrors.blocks + registeredErrors.count; - ++block) { - if ((errnum >= block->base) && (errnum < block->last)) { - if (infoPtr != NULL) { - *infoPtr = block->infos + (errnum - block->base); - } - return block->name; - } else if ((errnum >= block->last) && (errnum < block->max)) { - if (infoPtr != NULL) { - *infoPtr = NULL; - } - return block->name; - } - } - if (infoPtr != NULL) { - *infoPtr = NULL; - } - return NULL; -} - -/** - * Return string describing a system error message - * - * @param errnum System error number - * @param buf Buffer that can be used to contain the return value - * @param buflen Length of the buffer - * - * @return The error string, which may be a string constant or may be - * returned in the buf argument - **/ -#ifdef __KERNEL__ -static const char *systemStringError(int errnum, char *buf, size_t buflen) -{ - const char *errorString = NULL; - if ((errnum > 0) && (errnum < COUNT_OF(messageTable))) { - errorString = messageTable[errnum]; - } - - size_t len = ((errorString == NULL) - ? 
snprintf(buf, buflen, "Unknown error %d", errnum) - : snprintf(buf, buflen, "%s", errorString)); - if (len < buflen) { - return buf; - } - - buf[0] = '\0'; - return "System error"; -} -#else -static INLINE const char *systemStringError(int errnum, char *buf, - size_t buflen) -{ - return strerror_r(errnum, buf, buflen); -} -#endif - -/*****************************************************************************/ -const char *stringError(int errnum, char *buf, size_t buflen) -{ - if (buf == NULL) { - return NULL; - } - - char *buffer = buf; - char *bufEnd = buf + buflen; - - if (isUnrecoverable(errnum)) { - buffer = appendToBuffer(buffer, bufEnd, "Unrecoverable error: "); - errnum = sansUnrecoverable(errnum); - } - - const ErrorInfo *info = NULL; - const char *blockName = getErrorInfo(errnum, &info); - - if (blockName != NULL) { - if (info != NULL) { - buffer = appendToBuffer(buffer, bufEnd, - "%s: %s", blockName, info->message); - } else { - buffer = appendToBuffer(buffer, bufEnd, - "Unknown %s %d", blockName, errnum); - } - } else if (info != NULL) { - buffer = appendToBuffer(buffer, bufEnd, "%s", info->message); - } else { - const char *tmp = systemStringError(errnum, buffer, bufEnd - buffer); - if (tmp != buffer) { - buffer = appendToBuffer(buffer, bufEnd, "%s", tmp); - } else { - buffer += strlen(tmp); - } - } - return buf; -} - -/*****************************************************************************/ -const char *stringErrorName(int errnum, char *buf, size_t buflen) -{ - errnum = sansUnrecoverable(errnum); - - char *buffer = buf; - char *bufEnd = buf + buflen; - - const ErrorInfo *info = NULL; - const char *blockName = getErrorInfo(errnum, &info); - - if (blockName != NULL) { - if (info != NULL) { - buffer = appendToBuffer(buffer, bufEnd, "%s", info->name); - } else { - buffer = appendToBuffer(buffer, bufEnd, "%s %d", blockName, errnum); - } - } else if (info != NULL) { - buffer = appendToBuffer(buffer, bufEnd, "%s", info->name); - } else { - const char *tmp = systemStringError(errnum, buffer, bufEnd - buffer); - if (tmp != buffer) { - buffer = appendToBuffer(buffer, bufEnd, "%s", tmp); - } else { - buffer += strlen(tmp); - } - } - return buf; -} - -/*****************************************************************************/ -int registerErrorBlock(const char *blockName, - int firstError, - int lastReservedError, - const ErrorInfo *infos, - size_t infoSize) -{ - int result = ASSERT(firstError < lastReservedError, - "bad error block range"); - if (result != UDS_SUCCESS) { - return result; - } - - if (registeredErrors.count == registeredErrors.allocated) { - // could reallocate and grow, but should never happen - return UDS_OVERFLOW; - } - - ErrorBlock *block; - for (block = registeredErrors.blocks; - block < registeredErrors.blocks + registeredErrors.count; - ++block) { - if (strcmp(blockName, block->name) == 0) { - return UDS_DUPLICATE_NAME; - } - // check for overlap in error ranges - if ((firstError < block->max) && (lastReservedError > block->base)) { - return UDS_ALREADY_REGISTERED; - } - } - - registeredErrors.blocks[registeredErrors.count++] = (ErrorBlock) { - .name = blockName, - .base = firstError, - .last = firstError + (infoSize / sizeof(ErrorInfo)), - .max = lastReservedError, - .infos = infos - }; - - return UDS_SUCCESS; -} diff --git a/uds/errors.h b/uds/errors.h deleted file mode 100644 index faccd5a..0000000 --- a/uds/errors.h +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/errors.h#4 $ - */ - -#ifndef ERRORS_H -#define ERRORS_H - -#include "compiler.h" -#include "typeDefs.h" -#include "uds-error.h" - -enum udsInternalErrorCodes { - /** Used as a base value for reporting internal errors */ - UDS_INTERNAL_ERROR_CODE_BASE = 66560, - /** Unused */ - UDS_INTERNAL_UNUSED_0 = UDS_INTERNAL_ERROR_CODE_BASE + 0, - /** Index overflow */ - UDS_OVERFLOW = UDS_INTERNAL_ERROR_CODE_BASE + 1, - /** Unused */ - UDS_INTERNAL_UNUSED_2 = UDS_INTERNAL_ERROR_CODE_BASE + 2, - /** Invalid argument passed to internal routine */ - UDS_INVALID_ARGUMENT = UDS_INTERNAL_ERROR_CODE_BASE + 3, - /** UDS data structures are in an invalid state */ - UDS_BAD_STATE = UDS_INTERNAL_ERROR_CODE_BASE + 4, - /** Attempt to enter the same name into an internal structure twice */ - UDS_DUPLICATE_NAME = UDS_INTERNAL_ERROR_CODE_BASE + 5, - /** An internal protocol violation between system components */ - UDS_UNEXPECTED_RESULT = UDS_INTERNAL_ERROR_CODE_BASE + 6, - /** An error created by test case processing */ - UDS_INJECTED_ERROR = UDS_INTERNAL_ERROR_CODE_BASE + 7, - /** An assertion failed */ - UDS_ASSERTION_FAILED = UDS_INTERNAL_ERROR_CODE_BASE + 8, - /** Unused */ - UDS_INTERNAL_UNUSED_9 = UDS_INTERNAL_ERROR_CODE_BASE + 9, - /** Not an actual error, but reporting that the result will be delayed */ - UDS_QUEUED = UDS_INTERNAL_ERROR_CODE_BASE + 10, - /** Unused */ - UDS_INTERNAL_UNUSED_11 = UDS_INTERNAL_ERROR_CODE_BASE + 11, - /** Unused */ - UDS_INTERNAL_UNUSED_12 = UDS_INTERNAL_ERROR_CODE_BASE + 12, - /** A problem has occured with a Buffer */ - UDS_BUFFER_ERROR = UDS_INTERNAL_ERROR_CODE_BASE + 13, - /** Unused */ - UDS_INTERNAL_UNUSED_14 = UDS_INTERNAL_ERROR_CODE_BASE + 14, - /** Unused */ - UDS_INTERNAL_UNUSED_15 = UDS_INTERNAL_ERROR_CODE_BASE + 15, - /** No directory was found where one was expected */ - UDS_NO_DIRECTORY = UDS_INTERNAL_ERROR_CODE_BASE + 16, - /** Checkpoint not completed */ - UDS_CHECKPOINT_INCOMPLETE = UDS_INTERNAL_ERROR_CODE_BASE + 17, - /** Unused */ - UDS_INTERNAL_UNUSED_18 = UDS_INTERNAL_ERROR_CODE_BASE + 18, - /** Unused */ - UDS_INTERNAL_UNUSED_19 = UDS_INTERNAL_ERROR_CODE_BASE + 19, - /** This error range has already been registered */ - UDS_ALREADY_REGISTERED = UDS_INTERNAL_ERROR_CODE_BASE + 20, - /** Either read-only or write-only */ - UDS_BAD_IO_DIRECTION = UDS_INTERNAL_ERROR_CODE_BASE + 21, - /** Cannot do I/O at this offset */ - UDS_INCORRECT_ALIGNMENT = UDS_INTERNAL_ERROR_CODE_BASE + 22, - /** Attempt to read or write data outside the bounds established for it */ - UDS_OUT_OF_RANGE = UDS_INTERNAL_ERROR_CODE_BASE + 23, - /** One more than the last UDS_INTERNAL error code */ - UDS_INTERNAL_ERROR_CODE_LAST, - /** One more than the last error this block will ever use */ - 
UDS_INTERNAL_ERROR_CODE_BLOCK_END = UDS_INTERNAL_ERROR_CODE_BASE + 440 -}; - -enum { - ERRBUF_SIZE = 128 // default size for buffer passed to stringError -}; - -// Error attributes - or into top half of error code -enum { UDS_UNRECOVERABLE = (1 << 17) }; - -const char *stringError(int errnum, char *buf, size_t buflen); -const char *stringErrorName(int errnum, char *buf, size_t buflen); - -/* - * Identify that an result code is a successful result. - * - * @param result A result code - * - * @return true if the result represents a success. - */ -__attribute__((warn_unused_result)) -static INLINE bool isSuccessful(int result) -{ - return (result == UDS_SUCCESS) || (result == UDS_QUEUED); -} - -/* - * Identify that an result code has been marked unrecoverable. - * - * @param result A result code - * - * @return true if the result has been marked unrecoverable. - */ -__attribute__((warn_unused_result)) -static INLINE bool isUnrecoverable(int result) -{ - return (result & UDS_UNRECOVERABLE) != 0; -} - -/* - * Mark a result code as unrecoverable. - * - * @param result A result code - * - * @return the result code with the unrecoverable marker added - */ -__attribute__((warn_unused_result)) -static INLINE int makeUnrecoverable(int result) -{ - return isSuccessful(result) ? result : (result | UDS_UNRECOVERABLE); -} - -/* - * Remove the unrecoverable marker from a result code. - * - * @param result A result code - * - * @return the result code with the unrecoverable marker removed - */ -__attribute__((warn_unused_result)) -static INLINE int sansUnrecoverable(int result) -{ - return result & ~UDS_UNRECOVERABLE; -} - -typedef struct errorInfo { - const char *name; - const char *message; -} ErrorInfo; - -/** - * Register an error code block for stringError and stringErrorName. - * - * @param blockName the name of the block of error codes - * @param firstError the first error code in the block - * @param lastReservedError one past the highest possible error in the bloc - * @param infos a pointer to the error info array for the block - * @param infoSize the size of the error info array, which - * determines the last actual error for which - * information is available - * - * @return a success or error code, particularly UDS_DUPLICATE_NAME if the - * block name is already present, or UDS_ALREADY_REGISTERED if a - * block with the specified error code is present - **/ -int registerErrorBlock(const char *blockName, - int firstError, - int lastReservedError, - const ErrorInfo *infos, - size_t infoSize); - -/** - * Return the first error between result1 and result2. - * - * @param result1 A success or error code. - * @param result2 A success or error code. - * - * @return result1 if that is an error, else result2 - **/ -static INLINE int firstError(int result1, int result2) -{ - return result1 == UDS_SUCCESS ? result2 : result1; -} - -#endif /* ERRORS_H */ diff --git a/uds/geometry.c b/uds/geometry.c deleted file mode 100644 index 6d8cfa6..0000000 --- a/uds/geometry.c +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/geometry.c#3 $ - */ - -#include "geometry.h" - -#include "deltaIndex.h" -#include "errors.h" -#include "hashUtils.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "uds.h" - -/**********************************************************************/ -static int initializeGeometry(Geometry *geometry, - size_t bytesPerPage, - unsigned int recordPagesPerChapter, - unsigned int chaptersPerVolume, - unsigned int sparseChaptersPerVolume) -{ - int result = ASSERT_WITH_ERROR_CODE(bytesPerPage >= BYTES_PER_RECORD, - UDS_BAD_STATE, - "page is smaller than a record: %zu", - bytesPerPage); - if (result != UDS_SUCCESS) { - return result; - } - - result = ASSERT_WITH_ERROR_CODE(chaptersPerVolume > sparseChaptersPerVolume, - UDS_INVALID_ARGUMENT, - "sparse chapters per volume (%u) must be less" - " than chapters per volume (%u)", - sparseChaptersPerVolume, - chaptersPerVolume); - if (result != UDS_SUCCESS) { - return result; - } - - geometry->bytesPerPage = bytesPerPage; - geometry->recordPagesPerChapter = recordPagesPerChapter; - geometry->chaptersPerVolume = chaptersPerVolume; - geometry->sparseChaptersPerVolume = sparseChaptersPerVolume; - geometry->denseChaptersPerVolume = - chaptersPerVolume - sparseChaptersPerVolume; - - // Calculate the number of records in a page, chapter, and volume. - geometry->recordsPerPage = bytesPerPage / BYTES_PER_RECORD; - geometry->recordsPerChapter - = geometry->recordsPerPage * recordPagesPerChapter; - geometry->recordsPerVolume - = (unsigned long) geometry->recordsPerChapter * chaptersPerVolume; - geometry->openChapterLoadRatio = DEFAULT_OPEN_CHAPTER_LOAD_RATIO; - - // Initialize values for delta chapter indexes. - geometry->chapterMeanDelta = 1 << DEFAULT_CHAPTER_MEAN_DELTA_BITS; - geometry->chapterPayloadBits = computeBits(recordPagesPerChapter - 1); - // We want 1 delta list for every 64 records in the chapter. The "| 077" - // ensures that the chapterDeltaListBits computation does not underflow. - geometry->chapterDeltaListBits - = computeBits((geometry->recordsPerChapter - 1) | 077) - 6; - geometry->deltaListsPerChapter = 1 << geometry->chapterDeltaListBits; - // We need enough address bits to achieve the desired mean delta. - geometry->chapterAddressBits - = (DEFAULT_CHAPTER_MEAN_DELTA_BITS - geometry->chapterDeltaListBits - + computeBits(geometry->recordsPerChapter - 1)); - // Let the delta index code determine how many pages are needed for the index - geometry->indexPagesPerChapter - = getDeltaIndexPageCount(geometry->recordsPerChapter, - geometry->deltaListsPerChapter, - geometry->chapterMeanDelta, - geometry->chapterPayloadBits, - bytesPerPage); - - // Now that we have the size of a chapter index, we can calculate the - // space used by chapters and volumes. 
- geometry->pagesPerChapter - = geometry->indexPagesPerChapter + recordPagesPerChapter; - geometry->pagesPerVolume = geometry->pagesPerChapter * chaptersPerVolume; - geometry->headerPagesPerVolume = 1; - geometry->bytesPerVolume = bytesPerPage * - (geometry->pagesPerVolume + geometry->headerPagesPerVolume); - geometry->bytesPerChapter = bytesPerPage * geometry->pagesPerChapter; - - return UDS_SUCCESS; -} - -/**********************************************************************/ -int makeGeometry(size_t bytesPerPage, - unsigned int recordPagesPerChapter, - unsigned int chaptersPerVolume, - unsigned int sparseChaptersPerVolume, - Geometry **geometryPtr) -{ - Geometry *geometry; - int result = ALLOCATE(1, Geometry, "geometry", &geometry); - if (result != UDS_SUCCESS) { - return result; - } - result = initializeGeometry(geometry, bytesPerPage, recordPagesPerChapter, - chaptersPerVolume, sparseChaptersPerVolume); - if (result != UDS_SUCCESS) { - freeGeometry(geometry); - return result; - } - - *geometryPtr = geometry; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int copyGeometry(Geometry *source, Geometry **geometryPtr) -{ - return makeGeometry(source->bytesPerPage, - source->recordPagesPerChapter, - source->chaptersPerVolume, - source->sparseChaptersPerVolume, - geometryPtr); -} - -/**********************************************************************/ -void freeGeometry(Geometry *geometry) -{ - FREE(geometry); -} - -/**********************************************************************/ -uint64_t mapToVirtualChapterNumber(Geometry *geometry, - uint64_t newestVirtualChapter, - unsigned int physicalChapter) -{ - unsigned int newestPhysicalChapter - = mapToPhysicalChapter(geometry, newestVirtualChapter); - uint64_t virtualChapter - = newestVirtualChapter - newestPhysicalChapter + physicalChapter; - if (physicalChapter > newestPhysicalChapter) { - virtualChapter -= geometry->chaptersPerVolume; - } - return virtualChapter; -} - -/**********************************************************************/ -bool hasSparseChapters(const Geometry *geometry, - uint64_t oldestVirtualChapter, - uint64_t newestVirtualChapter) -{ - return (isSparse(geometry) - && ((newestVirtualChapter - oldestVirtualChapter + 1) - > geometry->denseChaptersPerVolume)); -} - -/**********************************************************************/ -bool isChapterSparse(const Geometry *geometry, - uint64_t oldestVirtualChapter, - uint64_t newestVirtualChapter, - uint64_t virtualChapterNumber) -{ - return (hasSparseChapters(geometry, oldestVirtualChapter, - newestVirtualChapter) - && ((virtualChapterNumber + geometry->denseChaptersPerVolume) - <= newestVirtualChapter)); -} - -/**********************************************************************/ -bool areSamePhysicalChapter(const Geometry *geometry, - uint64_t chapter1, - uint64_t chapter2) -{ - return ((chapter1 % geometry->chaptersPerVolume) - == (chapter2 % geometry->chaptersPerVolume)); -} diff --git a/uds/geometry.h b/uds/geometry.h deleted file mode 100644 index 47f771d..0000000 --- a/uds/geometry.h +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/geometry.h#3 $ - */ - -#ifndef GEOMETRY_H -#define GEOMETRY_H 1 - -#include "compiler.h" -#include "typeDefs.h" -#include "uds.h" -#include "uds-block.h" - -/** - * Geometry defines constants and a record that parameterize the layout of an - * Albireo index volume. - * - *
An index volume is divided into a fixed number of fixed-size - * chapters, each consisting of a fixed number of fixed-size - * pages. The volume layout is defined by two assumptions and four - * parameters. The assumptions (constants) are that index records are - * 64 bytes (32-byte block name plus 32-byte metadata) and that open - * chapter index hash slots are one byte long. The four parameters are - * the number of bytes in a page, the number of chapters in a volume, - * the number of record pages in a chapter, and the number of chapters - * that are sparse. From these parameters, we derive the rest of the - * layout and derived properties, ranging from the number of pages in - * a chapter to the number of records in the volume. - * - *
The default geometry is 64 KByte pages, 1024 chapters, 256 - * record pages in a chapter, and zero sparse chapters. This will - * allow us to store 2^28 entries (indexing 1TB of 4K blocks) in an - * approximately 16.5 MByte volume using fourteen index pages in each - * chapter. - **/ -typedef struct geometry { - /** Length of a page in a chapter, in bytes */ - size_t bytesPerPage; - /** Number of record pages in a chapter */ - unsigned int recordPagesPerChapter; - /** Number of (total) chapters in a volume */ - unsigned int chaptersPerVolume; - /** Number of sparsely-indexed chapters in a volume */ - unsigned int sparseChaptersPerVolume; - /** Number of bits used to determine delta list numbers */ - unsigned int chapterDeltaListBits; - - // These are derived properties, expressed as fields for convenience. - /** Total number of pages in a volume, excluding header */ - unsigned int pagesPerVolume; - /** Total number of header pages per volume */ - unsigned int headerPagesPerVolume; - /** Total number of bytes in a volume, including header */ - size_t bytesPerVolume; - /** Total number of bytes in a chapter */ - size_t bytesPerChapter; - /** Number of pages in a chapter */ - unsigned int pagesPerChapter; - /** Number of index pages in a chapter index */ - unsigned int indexPagesPerChapter; - /** The minimum ratio of hash slots to records in an open chapter */ - unsigned int openChapterLoadRatio; - /** Number of records that fit on a page */ - unsigned int recordsPerPage; - /** Number of records that fit in a chapter */ - unsigned int recordsPerChapter; - /** Number of records that fit in a volume */ - uint64_t recordsPerVolume; - /** Number of deltaLists per chapter index */ - unsigned int deltaListsPerChapter; - /** Mean delta in chapter indexes */ - unsigned int chapterMeanDelta; - /** Number of bits needed for record page numbers */ - unsigned int chapterPayloadBits; - /** Number of bits used to compute addresses for chapter delta lists */ - unsigned int chapterAddressBits; - /** Number of densely-indexed chapters in a volume */ - unsigned int denseChaptersPerVolume; -} Geometry; - -enum { - /* The number of bytes in a record (name + metadata) */ - BYTES_PER_RECORD = (UDS_CHUNK_NAME_SIZE + UDS_MAX_BLOCK_DATA_SIZE), - - /* The default length of a page in a chapter, in bytes */ - DEFAULT_BYTES_PER_PAGE = 1024 * BYTES_PER_RECORD, - - /* The default maximum number of records per page */ - DEFAULT_RECORDS_PER_PAGE = DEFAULT_BYTES_PER_PAGE / BYTES_PER_RECORD, - - /** The default number of record pages in a chapter */ - DEFAULT_RECORD_PAGES_PER_CHAPTER = 256, - - /** The default number of record pages in a chapter for a small index */ - SMALL_RECORD_PAGES_PER_CHAPTER = 64, - - /** The default number of chapters in a volume */ - DEFAULT_CHAPTERS_PER_VOLUME = 1024, - - /** The default number of sparsely-indexed chapters in a volume */ - DEFAULT_SPARSE_CHAPTERS_PER_VOLUME = 0, - - /** The log2 of the default mean delta */ - DEFAULT_CHAPTER_MEAN_DELTA_BITS = 16, - - /** The log2 of the number of delta lists in a large chapter */ - DEFAULT_CHAPTER_DELTA_LIST_BITS = 12, - - /** The log2 of the number of delta lists in a small chapter */ - SMALL_CHAPTER_DELTA_LIST_BITS = 10, - - /** The default min ratio of slots to records in an open chapter */ - DEFAULT_OPEN_CHAPTER_LOAD_RATIO = 2, - - /** Checkpoint every n chapters written. 
Default is to not checkpoint */ - DEFAULT_CHECKPOINT_FREQUENCY = 0 -}; - -/** - * Allocate and initialize all fields of a volume geometry using the - * specified layout parameters. - * - * @param bytesPerPage The length of a page in a chapter, in bytes - * @param recordPagesPerChapter The number of pages in a chapter - * @param chaptersPerVolume The number of chapters in a volume - * @param sparseChaptersPerVolume The number of sparse chapters in a volume - * @param geometryPtr A pointer to hold the new geometry - * - * @return UDS_SUCCESS or an error code - **/ -int makeGeometry(size_t bytesPerPage, - unsigned int recordPagesPerChapter, - unsigned int chaptersPerVolume, - unsigned int sparseChaptersPerVolume, - Geometry **geometryPtr) - __attribute__((warn_unused_result)); - -/** - * Allocate a new geometry and initialize it with the same parameters as an - * existing geometry. - * - * @param source The geometry record to copy - * @param geometryPtr A pointer to hold the new geometry - * - * @return UDS_SUCCESS or an error code - **/ -int copyGeometry(Geometry *source, - Geometry **geometryPtr) - __attribute__((warn_unused_result)); - -/** - * Clean up a geometry and its memory. - * - * @param geometry The geometry record to free - **/ -void freeGeometry(Geometry *geometry); - -/** - * Map a virtual chapter number to a physical chapter number - * - * @param geometry The geometry - * @param virtualChapter The virtual chapter number - * - * @return the corresponding physical chapter number - **/ -__attribute__((warn_unused_result)) -static INLINE unsigned int mapToPhysicalChapter(const Geometry *geometry, - uint64_t virtualChapter) -{ - return (virtualChapter % geometry->chaptersPerVolume); -} - -/** - * Convert a physical chapter number to its current virtual chapter number. - * - * @param geometry The geometry - * @param newestVirtualChapter The number of the newest virtual chapter - * @param physicalChapter The physical chapter number to convert - * - * @return The current virtual chapter number of the physical chapter - * in question - **/ -uint64_t mapToVirtualChapterNumber(Geometry *geometry, - uint64_t newestVirtualChapter, - unsigned int physicalChapter); - -/** - * Check whether this geometry is for a sparse index. - * - * @param geometry The geometry to check - * - * @return true if this geometry has sparse chapters - **/ -__attribute__((warn_unused_result)) -static INLINE bool isSparse(const Geometry *geometry) -{ - return (geometry->sparseChaptersPerVolume > 0); -} - -/** - * Check whether any sparse chapters have been filled. - * - * @param geometry The geometry of the index - * @param oldestVirtualChapter The number of the oldest chapter in the - * index - * @param newestVirtualChapter The number of the newest chapter in the - * index - * - * @return true if the index has filled at least one sparse chapter - **/ -bool hasSparseChapters(const Geometry *geometry, - uint64_t oldestVirtualChapter, - uint64_t newestVirtualChapter) - __attribute__((warn_unused_result)); - -/** - * Check whether a chapter is sparse or dense. 
- * - * @param geometry The geometry of the index containing the chapter - * @param oldestVirtualChapter The number of the oldest chapter in the index - * @param newestVirtualChapter The number of the newest chapter in the index - * @param virtualChapterNumber The number of the chapter to check - * - * @return true if the chapter is sparse - **/ -bool isChapterSparse(const Geometry *geometry, - uint64_t oldestVirtualChapter, - uint64_t newestVirtualChapter, - uint64_t virtualChapterNumber) - __attribute__((warn_unused_result)); - -/** - * Check whether two virtual chapter numbers correspond to the same - * physical chapter. - * - * @param geometry The geometry of the index - * @param chapter1 The first chapter to compare - * @param chapter2 The second chapter to compare - * - * @return true if both chapters correspond to the same - * physical chapter - **/ -bool areSamePhysicalChapter(const Geometry *geometry, - uint64_t chapter1, - uint64_t chapter2) - __attribute__((warn_unused_result)); - -#endif /* GEOMETRY_H */ diff --git a/uds/hashUtils.c b/uds/hashUtils.c deleted file mode 100644 index 45b2c81..0000000 --- a/uds/hashUtils.c +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/hashUtils.c#2 $ - */ - -#include "hashUtils.h" - -#include "errors.h" -#include "logger.h" -#include "permassert.h" -#include "stringUtils.h" -#include "uds.h" - -/** - * Convert a byte string to the hex representation. - * - * @param data binary data to convert - * @param dataLen length of binary data - * @param hex target to write hex string into - * @param hexLen capacity of target string - * - * @return UDS_SUCCESS, - * or UDS_INVALID_ARGUMENT if hexLen - * is too short. 
- **/ -static int dataToHex(const unsigned char *data, size_t dataLen, - char *hex, size_t hexLen) -{ - if (hexLen < 2 * dataLen + 1) { - return logWarningWithStringError(UDS_INVALID_ARGUMENT, - "hex data incorrect size"); - } - size_t i; - for (i = 0; i < dataLen; ++i) { - int rc = fixedSprintf(__func__, &hex[2 * i], hexLen - (2 * i), - UDS_INVALID_ARGUMENT, "%02X", data[i]); - - if (rc != UDS_SUCCESS) { - return rc; - } - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int chunkNameToHex(const UdsChunkName *chunkName, - char *hexData, size_t hexDataLen) -{ - return dataToHex(chunkName->name, UDS_CHUNK_NAME_SIZE, - hexData, hexDataLen); -} - -/**********************************************************************/ -int chunkDataToHex(const UdsChunkData *chunkData, - char *hexData, size_t hexDataLen) -{ - return dataToHex(chunkData->data, UDS_MAX_BLOCK_DATA_SIZE, - hexData, hexDataLen); -} - -/**********************************************************************/ -unsigned int computeBits(unsigned int maxValue) -{ - // __builtin_clz() counts leading (high-order) zero bits, so if - // we ever need this to be fast, under GCC we can do: - // return ((maxValue == 0) ? 0 : (32 - __builtin_clz(maxValue))); - - unsigned int bits = 0; - while (maxValue > 0) { - maxValue >>= 1; - bits++; - } - return bits; -} - -/**********************************************************************/ -void hashUtilsCompileTimeAssertions(void) -{ - STATIC_ASSERT((UDS_CHUNK_NAME_SIZE % sizeof(uint64_t)) == 0); - STATIC_ASSERT(UDS_CHUNK_NAME_SIZE == 16); -} diff --git a/uds/hashUtils.h b/uds/hashUtils.h deleted file mode 100644 index 2d6d0a8..0000000 --- a/uds/hashUtils.h +++ /dev/null @@ -1,231 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/hashUtils.h#1 $ - */ - -#ifndef HASH_UTILS_H -#define HASH_UTILS_H 1 - -#include "compiler.h" -#include "common.h" -#include "geometry.h" -#include "numeric.h" -#include "uds.h" - -// How various portions of a hash are apportioned. Size dependent. -enum { - MASTER_INDEX_BYTES_OFFSET = 0, // size 8 - CHAPTER_INDEX_BYTES_OFFSET = 8, // size 6 - SAMPLE_BYTES_OFFSET = 14, // size 2 - MASTER_INDEX_BYTES_COUNT = 8, - CHAPTER_INDEX_BYTES_COUNT = 6, - SAMPLE_BYTES_COUNT = 2, -}; - -/** - * Extract the portion of a block name used by the chapter index. 
- * - * @param name The block name - * - * @return The chapter index bytes - **/ -static INLINE uint64_t extractChapterIndexBytes(const UdsChunkName *name) -{ - // Get the high order 16 bits, then the low order 32 bits - uint64_t bytes - = (uint64_t) getUInt16BE(&name->name[CHAPTER_INDEX_BYTES_OFFSET]) << 32; - bytes |= getUInt32BE(&name->name[CHAPTER_INDEX_BYTES_OFFSET + 2]); - return bytes; -} - -/** - * Extract the portion of a block name used by the master index. - * - * @param name The block name - * - * @return The master index portion of the block name - **/ -static INLINE uint64_t extractMasterIndexBytes(const UdsChunkName *name) -{ - return getUInt64BE(&name->name[MASTER_INDEX_BYTES_OFFSET]); -} - -/** - * Extract the portion of a block name used for sparse sampling. - * - * @param name The block name - * - * @return The sparse sample portion of the block name - **/ -static INLINE uint32_t extractSamplingBytes(const UdsChunkName *name) -{ - return getUInt16BE(&name->name[SAMPLE_BYTES_OFFSET]); -} - -/** - * For a given block, find the chapter delta list to use - * - * @param name The block name to hash - * @param geometry The geometry to use - * - * @return The chapter delta list where we expect to find the given blockname - **/ -static INLINE unsigned int hashToChapterDeltaList(const UdsChunkName *name, - const Geometry *geometry) -{ - return (unsigned int) ((extractChapterIndexBytes(name) - >> geometry->chapterAddressBits) - & ((1 << geometry->chapterDeltaListBits) - 1)); -} - -/** - * For a given block, find the chapter delta address to use - * - * @param name The block name to hash - * @param geometry The geometry to use - * - * @return The chapter delta address to use - **/ -static INLINE unsigned int hashToChapterDeltaAddress(const UdsChunkName *name, - const Geometry *geometry) -{ - return (unsigned int) (extractChapterIndexBytes(name) - & ((1 << geometry->chapterAddressBits) - 1)); -} - -/** - * For a given block name, find the slot in the open chapter hash table - * where it is expected to reside. - * - * @param name The block name to hash - * @param slotCount The size of the hash table - * - * @return the record number in the index page where we expect to find - # the given blockname - **/ -static INLINE unsigned int nameToHashSlot(const UdsChunkName *name, - unsigned int slotCount) -{ - return (unsigned int) (extractChapterIndexBytes(name) % slotCount); -} - -/** - * Convert a chunk name to hex to make it more readable. - * - * @param chunkName The chunk name - * @param hexData The resulting hexdata from the given chunk name - * @param hexDataLen The capacity of hexData - * - * @return UDS_SUCCESS, - * or UDS_INVALID_ARGUMENT if hexDataLen - * is too short. - **/ -int chunkNameToHex(const UdsChunkName *chunkName, - char *hexData, - size_t hexDataLen) - __attribute__((warn_unused_result)); - -/** - * Convert chunk data to hex to make it more readable. - * - * @param chunkData The chunk data - * @param hexData The resulting hexdata from the given chunk data - * @param hexDataLen The capacity of hexData - * - * @return UDS_SUCCESS, - * or UDS_INVALID_ARGUMENT if hexDataLen - * is too short. - **/ -int chunkDataToHex(const UdsChunkData *chunkData, - char *hexData, - size_t hexDataLen) - __attribute__((warn_unused_result)); - -/** - * Compute the number of bits required to store a field with the given - * maximum value. 
- * - * @param maxValue The maximum value of the field - * - * @return the number of bits required - **/ -unsigned int computeBits(unsigned int maxValue) - __attribute__((warn_unused_result)); - -/** - * FOR TESTING. Set the portion of a block name used by the chapter index. - * - * @param name The block name - * @param value The value to store - **/ -static INLINE void setChapterIndexBytes(UdsChunkName *name, uint64_t value) -{ - // Store the high order bytes, then the low-order bytes - storeUInt16BE(&name->name[CHAPTER_INDEX_BYTES_OFFSET], - (uint16_t)(value >> 32)); - storeUInt32BE(&name->name[CHAPTER_INDEX_BYTES_OFFSET + 2], - (uint32_t)value); -} - -/** - * FOR TESTING. Set the bits used to find a chapter delta list - * - * @param name The block name - * @param geometry The geometry to use - * @param value The value to store - **/ -static INLINE void setChapterDeltaListBits(UdsChunkName *name, - const Geometry *geometry, - uint64_t value) -{ - uint64_t deltaAddress = hashToChapterDeltaAddress(name, geometry); - deltaAddress |= value << geometry->chapterAddressBits; - setChapterIndexBytes(name, deltaAddress); -} - -/** - * FOR TESTING. Set the portion of a block name used by the master index. - * - * @param name The block name - * @param val The value to store - **/ -static INLINE void setMasterIndexBytes(UdsChunkName *name, uint64_t val) -{ - storeUInt64BE(&name->name[MASTER_INDEX_BYTES_OFFSET], val); -} - -/** - * Set the portion of a block name used for sparse sampling. - * - * @param name The block name - * @param value The value to store - **/ -static INLINE void setSamplingBytes(UdsChunkName *name, uint32_t value) -{ - storeUInt16BE(&name->name[SAMPLE_BYTES_OFFSET], (uint16_t)value); -} - -/** - * Special function wrapper required for compile-time assertions. This - * function will fail to compile if UDS_CHUNK_NAME_SIZE is not an integer - * multiple of 8. - **/ -void hashUtilsCompileTimeAssertions(void); - -#endif /* HASH_UTILS_H */ diff --git a/uds/index.c b/uds/index.c deleted file mode 100644 index a84d50f..0000000 --- a/uds/index.c +++ /dev/null @@ -1,908 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/index.c#15 $ - */ - -#include "index.h" - -#include "hashUtils.h" -#include "indexCheckpoint.h" -#include "indexInternals.h" -#include "logger.h" - -static const uint64_t NO_LAST_CHECKPOINT = UINT_MAX; - - -/** - * Replay an index which was loaded from a checkpoint. - * - * @param index The index to replay - * @param lastCheckpointChapter The number of the chapter where the - * last checkpoint was made - * - * @return UDS_SUCCESS or an error code. 
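/*
 * Illustrative sketch, not the project's STATIC_ASSERT macro: it shows why a
 * function like hashUtilsCompileTimeAssertions() above exists. Pre-C11
 * compile-time checks are commonly written as a declaration that is only
 * legal when the condition holds, and such a statement needs a function
 * (never called at runtime) to live in. CHUNK_NAME_SIZE is a local example
 * constant standing in for UDS_CHUNK_NAME_SIZE.
 */
#include <stdint.h>

#define CHUNK_NAME_SIZE 16

/* A negative array size is a compile-time error, so this only compiles
   when expr is true. */
#define COMPILE_TIME_CHECK(expr) ((void) sizeof(char[(expr) ? 1 : -1]))

void chunk_name_compile_time_assertions(void)
{
  COMPILE_TIME_CHECK((CHUNK_NAME_SIZE % sizeof(uint64_t)) == 0);
  COMPILE_TIME_CHECK(CHUNK_NAME_SIZE == 16);
}

/* With C11, the same checks can live at file scope instead: */
_Static_assert((CHUNK_NAME_SIZE % sizeof(uint64_t)) == 0,
               "name size must be a whole number of 64-bit words");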
- **/ -static int replayIndexFromCheckpoint(Index *index, - uint64_t lastCheckpointChapter) -{ - // Find the volume chapter boundaries - uint64_t lowestVCN, highestVCN; - bool isEmpty = false; - IndexLookupMode oldLookupMode = index->volume->lookupMode; - index->volume->lookupMode = LOOKUP_FOR_REBUILD; - int result = findVolumeChapterBoundaries(index->volume, &lowestVCN, - &highestVCN, &isEmpty); - index->volume->lookupMode = oldLookupMode; - if (result != UDS_SUCCESS) { - return logFatalWithStringError(result, - "cannot replay index: " - "unknown volume chapter boundaries"); - } - if (lowestVCN > highestVCN) { - logFatal("cannot replay index: no valid chapters exist"); - return UDS_CORRUPT_COMPONENT; - } - - if (isEmpty) { - // The volume is empty, so the index should also be empty - if (index->newestVirtualChapter != 0) { - logFatal("cannot replay index from empty volume"); - return UDS_CORRUPT_COMPONENT; - } - return UDS_SUCCESS; - } - - unsigned int chaptersPerVolume = index->volume->geometry->chaptersPerVolume; - index->oldestVirtualChapter = lowestVCN; - index->newestVirtualChapter = highestVCN + 1; - if (index->newestVirtualChapter == lowestVCN + chaptersPerVolume) { - // skip the chapter shadowed by the open chapter - index->oldestVirtualChapter++; - } - - uint64_t firstReplayChapter = lastCheckpointChapter; - if (firstReplayChapter < index->oldestVirtualChapter) { - firstReplayChapter = index->oldestVirtualChapter; - } - return replayVolume(index, firstReplayChapter); -} - -/**********************************************************************/ -static int loadIndex(Index *index, bool allowReplay) -{ - bool replayRequired = false; - - int result = loadIndexState(index->state, &replayRequired); - if (result != UDS_SUCCESS) { - return result; - } - - if (replayRequired && !allowReplay) { - return logErrorWithStringError( - UDS_INDEX_NOT_SAVED_CLEANLY, - "index not saved cleanly: open chapter missing"); - } - - uint64_t lastCheckpointChapter - = ((index->lastCheckpoint != NO_LAST_CHECKPOINT) - ? index->lastCheckpoint : 0); - - logInfo("loaded index from chapter %llu through chapter %llu", - index->oldestVirtualChapter, lastCheckpointChapter); - - if (replayRequired) { - result = replayIndexFromCheckpoint(index, lastCheckpointChapter); - if (result != UDS_SUCCESS) { - return result; - } - } - - unsigned int i; - for (i = 0; i < index->zoneCount; i++) { - setActiveChapters(index->zones[i]); - } - - index->loadedType = replayRequired ? 
LOAD_REPLAY : LOAD_LOAD; - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int rebuildIndex(Index *index) -{ - // Find the volume chapter boundaries - uint64_t lowestVCN, highestVCN; - bool isEmpty = false; - IndexLookupMode oldLookupMode = index->volume->lookupMode; - index->volume->lookupMode = LOOKUP_FOR_REBUILD; - int result = findVolumeChapterBoundaries(index->volume, &lowestVCN, - &highestVCN, &isEmpty); - index->volume->lookupMode = oldLookupMode; - if (result != UDS_SUCCESS) { - return logFatalWithStringError(result, - "cannot rebuild index: " - "unknown volume chapter boundaries"); - } - if (lowestVCN > highestVCN) { - logFatal("cannot rebuild index: no valid chapters exist"); - return UDS_CORRUPT_COMPONENT; - } - - if (isEmpty) { - index->newestVirtualChapter = index->oldestVirtualChapter = 0; - } else { - unsigned int numChapters = index->volume->geometry->chaptersPerVolume; - index->newestVirtualChapter = highestVCN + 1; - index->oldestVirtualChapter = lowestVCN; - if (index->newestVirtualChapter - == (index->oldestVirtualChapter + numChapters)) { - // skip the chapter shadowed by the open chapter - index->oldestVirtualChapter++; - } - } - - if ((index->newestVirtualChapter - index->oldestVirtualChapter) > - index->volume->geometry->chaptersPerVolume) { - return logFatalWithStringError(UDS_CORRUPT_COMPONENT, - "cannot rebuild index: " - "volume chapter boundaries too large"); - } - - setMasterIndexOpenChapter(index->masterIndex, 0); - if (isEmpty) { - index->loadedType = LOAD_EMPTY; - return UDS_SUCCESS; - } - - result = replayVolume(index, index->oldestVirtualChapter); - if (result != UDS_SUCCESS) { - return result; - } - - unsigned int i; - for (i = 0; i < index->zoneCount; i++) { - setActiveChapters(index->zones[i]); - } - - index->loadedType = LOAD_REBUILD; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int makeIndex(IndexLayout *layout, - const Configuration *config, - const struct uds_parameters *userParams, - unsigned int zoneCount, - LoadType loadType, - IndexLoadContext *loadContext, - Index **newIndex) -{ - Index *index; - int result = allocateIndex(layout, config, userParams, zoneCount, loadType, - &index); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, "could not allocate index"); - } - - index->loadContext = loadContext; - - uint64_t nonce = getVolumeNonce(layout); - result = makeMasterIndex(config, zoneCount, nonce, &index->masterIndex); - if (result != UDS_SUCCESS) { - freeIndex(index); - return logErrorWithStringError(result, "could not make master index"); - } - - result = addIndexStateComponent(index->state, MASTER_INDEX_INFO, NULL, - index->masterIndex); - if (result != UDS_SUCCESS) { - freeIndex(index); - return result; - } - - result = addIndexStateComponent(index->state, &INDEX_PAGE_MAP_INFO, - index->volume->indexPageMap, NULL); - if (result != UDS_SUCCESS) { - freeIndex(index); - return result; - } - - result = makeChapterWriter(index, getIndexVersion(layout), - &index->chapterWriter); - if (result != UDS_SUCCESS) { - freeIndex(index); - return result; - } - - if ((loadType == LOAD_LOAD) || (loadType == LOAD_REBUILD)) { - if (!index->existed) { - freeIndex(index); - return UDS_NO_INDEX; - } - result = loadIndex(index, loadType == LOAD_REBUILD); - switch (result) { - case UDS_SUCCESS: - break; - case ENOMEM: - // We should not try a rebuild for this error. 
- logErrorWithStringError(result, "index could not be loaded"); - break; - default: - logErrorWithStringError(result, "index could not be loaded"); - if (loadType == LOAD_REBUILD) { - result = rebuildIndex(index); - if (result != UDS_SUCCESS) { - logErrorWithStringError(result, "index could not be rebuilt"); - } - } - break; - } - } else { - index->loadedType = LOAD_CREATE; - discardIndexStateData(index->state); - } - - if (result != UDS_SUCCESS) { - freeIndex(index); - return logUnrecoverable(result, "fatal error in makeIndex"); - } - - if (index->loadContext != NULL) { - lockMutex(&index->loadContext->mutex); - index->loadContext->status = INDEX_READY; - // If we get here, suspend is meaningless, but notify any thread trying - // to suspend us so it doesn't hang. - broadcastCond(&index->loadContext->cond); - unlockMutex(&index->loadContext->mutex); - } - - index->hasSavedOpenChapter = index->loadedType == LOAD_LOAD; - *newIndex = index; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void freeIndex(Index *index) -{ - if (index == NULL) { - return; - } - freeChapterWriter(index->chapterWriter); - - if (index->masterIndex != NULL) { - freeMasterIndex(index->masterIndex); - } - releaseIndex(index); -} - -/**********************************************************************/ -int saveIndex(Index *index) -{ - waitForIdleChapterWriter(index->chapterWriter); - int result = finishCheckpointing(index); - if (result != UDS_SUCCESS) { - logInfo("save index failed"); - return result; - } - beginSave(index, false, index->newestVirtualChapter); - - result = saveIndexState(index->state); - if (result != UDS_SUCCESS) { - logInfo("save index failed"); - index->lastCheckpoint = index->prevCheckpoint; - } else { - index->hasSavedOpenChapter = true; - logInfo("finished save (vcn %llu)", index->lastCheckpoint); - } - return result; -} - -/** - * Get the zone for a request. - * - * @param index The index - * @param request The request - * - * @return The zone for the request - **/ -static IndexZone *getRequestZone(Index *index, Request *request) -{ - return index->zones[request->zoneNumber]; -} - -/** - * Search an index zone. This function is only correct for LRU. - * - * @param zone The index zone to query. - * @param request The request originating the query. - * - * @return UDS_SUCCESS or an error code - **/ -static int searchIndexZone(IndexZone *zone, Request *request) -{ - MasterIndexRecord record; - int result = getMasterIndexRecord(zone->index->masterIndex, - &request->chunkName, &record); - if (result != UDS_SUCCESS) { - return result; - } - - bool found = false; - if (record.isFound) { - result = getRecordFromZone(zone, request, &found, record.virtualChapter); - if (result != UDS_SUCCESS) { - return result; - } - if (found) { - request->location = computeIndexRegion(zone, record.virtualChapter); - } - } - - /* - * If a record has overflowed a chapter index in more than one chapter - * (or overflowed in one chapter and collided with an existing record), - * it will exist as a collision record in the master index, but we won't - * find it in the volume. This case needs special handling. 
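/*
 * Illustrative sketch, not from the UDS sources: a simplified restatement of
 * the recovery policy visible in makeIndex() above. A saved index is loaded
 * if possible; most load failures fall back to a full rebuild, but ENOMEM
 * never does, since rebuilding cannot help when memory is exhausted. The
 * try_load()/try_rebuild() stubs and the SUCCESS/FAILED codes are local
 * stand-ins, not UDS functions.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

enum { SUCCESS = 0, FAILED = -1 };

static int try_load(void)    { return FAILED; }   /* pretend the load failed */
static int try_rebuild(void) { return SUCCESS; }

static int open_index(bool allowRebuild)
{
  int result = try_load();
  switch (result) {
  case SUCCESS:
    break;
  case ENOMEM:
    /* Out of memory: report it, but do not attempt a rebuild. */
    fprintf(stderr, "index could not be loaded\n");
    break;
  default:
    fprintf(stderr, "index could not be loaded\n");
    if (allowRebuild) {
      result = try_rebuild();
      if (result != SUCCESS) {
        fprintf(stderr, "index could not be rebuilt\n");
      }
    }
    break;
  }
  return result;
}

int main(void)
{
  return (open_index(true) == SUCCESS) ? 0 : 1;
}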
- */ - bool overflowRecord = (record.isFound && record.isCollision && !found); - uint64_t chapter = zone->newestVirtualChapter; - if (found || overflowRecord) { - if ((request->action == REQUEST_QUERY) - && (!request->update || overflowRecord)) { - /* This is a query without update, or with nothing to update */ - return UDS_SUCCESS; - } - - if (record.virtualChapter != chapter) { - /* - * Update the master index to reference the new chapter for the block. - * If the record had been deleted or dropped from the chapter index, it - * will be back. - */ - result = setMasterIndexRecordChapter(&record, chapter); - } else if (request->action != REQUEST_UPDATE) { - /* The record is already in the open chapter, so we're done */ - return UDS_SUCCESS; - } - } else { - // The record wasn't in the master index, so check whether the name - // is in a cached sparse chapter. - if (!isMasterIndexSample(zone->index->masterIndex, &request->chunkName) - && isSparse(zone->index->volume->geometry)) { - // Passing UINT64_MAX triggers a search of the entire sparse cache. - result = searchSparseCacheInZone(zone, request, UINT64_MAX, &found); - if (result != UDS_SUCCESS) { - return result; - } - - if (found) { - request->location = LOC_IN_SPARSE; - } - } - - if (request->action == REQUEST_QUERY) { - if (!found || !request->update) { - // This is a query without update or for a new record, so we're done. - return UDS_SUCCESS; - } - } - - /* - * Add a new entry to the master index referencing the open chapter. - * This needs to be done both for new records, and for records from - * cached sparse chapters. - */ - result = putMasterIndexRecord(&record, chapter); - } - - if (result == UDS_OVERFLOW) { - /* - * The master index encountered a delta list overflow. The condition - * was already logged. We will go on without adding the chunk to the - * open chapter. - */ - return UDS_SUCCESS; - } - - if (result != UDS_SUCCESS) { - return result; - } - - UdsChunkData *metadata; - if (!found || (request->action == REQUEST_UPDATE)) { - // This is a new record or we're updating an existing record. - metadata = &request->newMetadata; - } else { - // This is a duplicate, so move the record to the open chapter (for LRU). - metadata = &request->oldMetadata; - } - return putRecordInZone(zone, request, metadata); -} - -/**********************************************************************/ -static int removeFromIndexZone(IndexZone *zone, Request *request) -{ - MasterIndexRecord record; - int result = getMasterIndexRecord(zone->index->masterIndex, - &request->chunkName, &record); - if (result != UDS_SUCCESS) { - return result; - } - - if (!record.isFound) { - // The name does not exist in master index, so there is nothing to remove. - return UDS_SUCCESS; - } - - if (!record.isCollision) { - // Non-collision records are hints, so resolve the name in the chapter. - bool found; - int result = getRecordFromZone(zone, request, &found, - record.virtualChapter); - if (result != UDS_SUCCESS) { - return result; - } - - if (!found) { - // The name does not exist in the chapter, so there is nothing to remove. - return UDS_SUCCESS; - } - } - - request->location = computeIndexRegion(zone, record.virtualChapter); - - /* - * Delete the master index entry for the named record only. Note that a - * later search might later return stale advice if there is a colliding name - * in the same chapter, but it's a very rare case (1 in 2^21). 
- */ - result = removeMasterIndexRecord(&record); - if (result != UDS_SUCCESS) { - return result; - } - - // If the record is in the open chapter, we must remove it or mark it - // deleted to avoid trouble if the record is added again later. - if (request->location == LOC_IN_OPEN_CHAPTER) { - bool hashExists = false; - removeFromOpenChapter(zone->openChapter, &request->chunkName, &hashExists); - result = ASSERT(hashExists, "removing record not found in open chapter"); - if (result != UDS_SUCCESS) { - return result; - } - } - - return UDS_SUCCESS; -} - -/** - * Simulate the creation of a sparse cache barrier message by the triage - * queue, and the later execution of that message in an index zone. - * - * If the index receiving the request is multi-zone or dense, this function - * does nothing. This simulation is an optimization for single-zone sparse - * indexes. It also supports unit testing of indexes without routers and - * queues. - * - * @param zone the index zone responsible for the index request - * @param request the index request about to be executed - * - * @return UDS_SUCCESS always - **/ -static int simulateIndexZoneBarrierMessage(IndexZone *zone, Request *request) -{ - // Do nothing unless this is a single-zone sparse index. - if ((zone->index->zoneCount > 1) - || !isSparse(zone->index->volume->geometry)) { - return UDS_SUCCESS; - } - - // Check if the index request is for a sampled name in a sparse chapter. - uint64_t sparseVirtualChapter = triageIndexRequest(zone->index, request); - if (sparseVirtualChapter == UINT64_MAX) { - // Not indexed, not a hook, or in a chapter that is still dense, which - // means there should be no change to the sparse chapter index cache. - return UDS_SUCCESS; - } - - /* - * The triage queue would have generated and enqueued a barrier message - * preceding this request, which we simulate by directly invoking the - * execution hook for an equivalent message. - */ - BarrierMessageData barrier = { .virtualChapter = sparseVirtualChapter }; - return executeSparseCacheBarrierMessage(zone, &barrier); -} - -/**********************************************************************/ -static int dispatchIndexZoneRequest(IndexZone *zone, Request *request) -{ - if (!request->requeued) { - // Single-zone sparse indexes don't have a triage queue to generate cache - // barrier requests, so see if we need to synthesize a barrier. - int result = simulateIndexZoneBarrierMessage(zone, request); - if (result != UDS_SUCCESS) { - return result; - } - } - - // Set the default location. It will be overwritten if we find the chunk. 
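/*
 * Illustrative sketch, not from the UDS sources: the decision made by
 * simulateIndexZoneBarrierMessage() above. A barrier is synthesized only for
 * a single-zone sparse index, and only when triage maps the request's name to
 * a sparse chapter; UINT64_MAX means "no barrier needed". The triage() and
 * execute_barrier() stubs are local stand-ins for the UDS functions.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t triage(const char *name)
{
  (void)name;
  return 42;      /* pretend every name hashes to sparse chapter 42 */
}

static void execute_barrier(uint64_t chapter)
{
  printf("cache sparse chapter %llu before the lookup\n",
         (unsigned long long)chapter);
}

static void maybe_simulate_barrier(unsigned int zoneCount, bool sparse,
                                   const char *name)
{
  if (zoneCount > 1 || !sparse) {
    return;       /* multi-zone or dense: the triage queue (or nothing)
                     handles this case */
  }
  uint64_t chapter = triage(name);
  if (chapter == UINT64_MAX) {
    return;       /* not a hook, or still in a dense chapter */
  }
  execute_barrier(chapter);
}

int main(void)
{
  maybe_simulate_barrier(1, true, "some-chunk-name");
  return 0;
}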
- request->location = LOC_UNAVAILABLE; - - int result; - switch (request->action) { - case REQUEST_INDEX: - case REQUEST_UPDATE: - case REQUEST_QUERY: - result = makeUnrecoverable(searchIndexZone(zone, request)); - break; - - case REQUEST_DELETE: - result = makeUnrecoverable(removeFromIndexZone(zone, request)); - break; - - default: - result = logWarningWithStringError(UDS_INVALID_ARGUMENT, - "attempted to execute invalid action:" - " %d", - request->action); - break; - } - - return result; -} - -/**********************************************************************/ -int dispatchIndexRequest(Index *index, Request *request) -{ - return dispatchIndexZoneRequest(getRequestZone(index, request), request); -} - -/**********************************************************************/ -static int rebuildIndexPageMap(Index *index, uint64_t vcn) -{ - Geometry *geometry = index->volume->geometry; - unsigned int chapter = mapToPhysicalChapter(geometry, vcn); - unsigned int expectedListNumber = 0; - unsigned int indexPageNumber; - for (indexPageNumber = 0; - indexPageNumber < geometry->indexPagesPerChapter; - indexPageNumber++) { - DeltaIndexPage *chapterIndexPage; - int result = getPage(index->volume, chapter, indexPageNumber, - CACHE_PROBE_INDEX_FIRST, NULL, &chapterIndexPage); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, - "failed to read index page %u" - " in chapter %u", - indexPageNumber, chapter); - } - unsigned int lowestDeltaList = chapterIndexPage->lowestListNumber; - unsigned int highestDeltaList = chapterIndexPage->highestListNumber; - if (lowestDeltaList != expectedListNumber) { - return logErrorWithStringError(UDS_CORRUPT_DATA, - "chapter %u index page %u is corrupt", - chapter, indexPageNumber); - } - result = updateIndexPageMap(index->volume->indexPageMap, vcn, chapter, - indexPageNumber, highestDeltaList); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, - "failed to update chapter %u index page" - " %u", - chapter, indexPageNumber); - } - expectedListNumber = highestDeltaList + 1; - } - return UDS_SUCCESS; -} - -/** - * Add an entry to the master index when rebuilding. - * - * @param index The index to query. - * @param name The block name of interest. - * @param virtualChapter The virtual chapter number to write to the - * master index - * @param willBeSparseChapter True if this entry will be in the sparse portion - * of the index at the end of rebuilding - * - * @return UDS_SUCCESS or an error code - **/ -static int replayRecord(Index *index, - const UdsChunkName *name, - uint64_t virtualChapter, - bool willBeSparseChapter) -{ - if (willBeSparseChapter && !isMasterIndexSample(index->masterIndex, name)) { - // This entry will be in a sparse chapter after the rebuild completes, - // and it is not a sample, so just skip over it. - return UDS_SUCCESS; - } - - MasterIndexRecord record; - int result = getMasterIndexRecord(index->masterIndex, name, &record); - if (result != UDS_SUCCESS) { - return result; - } - - bool updateRecord; - if (record.isFound) { - if (record.isCollision) { - if (record.virtualChapter == virtualChapter) { - /* The record is already correct, so we don't need to do anything */ - return UDS_SUCCESS; - } - updateRecord = true; - } else if (record.virtualChapter == virtualChapter) { - /* - * There is a master index entry pointing to the current - * chapter, but we don't know if it is for the same name as the - * one we are currently working on or not. For now, we're just - * going to assume that it isn't. 
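/*
 * Illustrative sketch, not from the UDS sources: the continuity check
 * performed by rebuildIndexPageMap() above. Across the index pages of one
 * chapter, each page must begin with the delta list immediately after the
 * last list covered by the previous page, otherwise the chapter is treated
 * as corrupt. The page[] table of (lowest, highest) list numbers is invented
 * example data.
 */
#include <stdio.h>

struct page_range { unsigned int lowest, highest; };

int main(void)
{
  const struct page_range page[] = {
    { 0, 127 }, { 128, 255 }, { 256, 511 },   /* contiguous coverage */
  };
  unsigned int expected = 0;
  for (unsigned int i = 0; i < sizeof(page) / sizeof(page[0]); i++) {
    if (page[i].lowest != expected) {
      fprintf(stderr, "index page %u is corrupt\n", i);
      return 1;
    }
    expected = page[i].highest + 1;           /* next page picks up here */
  }
  printf("all index pages cover lists 0..%u\n", expected - 1);
  return 0;
}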
This will create one extra - * collision record if there was a deleted record in the current - * chapter. - */ - updateRecord = false; - } else { - /* - * If we're rebuilding, we don't normally want to go to disk to see if - * the record exists, since we will likely have just read the record from - * disk (i.e. we know it's there). The exception to this is when we - * already find an entry in the master index that has a different chapter. - * In this case, we need to search that chapter to determine if the - * master index entry was for the same record or a different one. - */ - result = searchVolumePageCache(index->volume, NULL, name, - record.virtualChapter, NULL, - &updateRecord); - if (result != UDS_SUCCESS) { - return result; - } - } - } else { - updateRecord = false; - } - - if (updateRecord) { - /* - * Update the master index to reference the new chapter for the block. - * If the record had been deleted or dropped from the chapter index, it - * will be back. - */ - result = setMasterIndexRecordChapter(&record, virtualChapter); - } else { - /* - * Add a new entry to the master index referencing the open - * chapter. This should be done regardless of whether we are a brand - * new record or a sparse record, i.e. one that doesn't exist in the - * index but does on disk, since for a sparse record, we would want to - * un-sparsify if it did exist. - */ - result = putMasterIndexRecord(&record, virtualChapter); - } - - if ((result == UDS_DUPLICATE_NAME) || (result == UDS_OVERFLOW)) { - /* Ignore duplicate record and delta list overflow errors */ - return UDS_SUCCESS; - } - - return result; -} - -/**********************************************************************/ -void beginSave(Index *index, bool checkpoint, uint64_t openChapterNumber) -{ - index->prevCheckpoint = index->lastCheckpoint; - index->lastCheckpoint = ((openChapterNumber == 0) - ? NO_LAST_CHECKPOINT - : openChapterNumber - 1); - - const char *what = (checkpoint ? "checkpoint" : "save"); - logInfo("beginning %s (vcn %llu)", what, index->lastCheckpoint); -} - -/** - * Suspend the index if necessary and wait for a signal to resume. - * - * @param index The index to replay - * - * @return true if the replay should terminate - **/ -static bool checkForSuspend(Index *index) -{ - if (index->loadContext == NULL) { - return false; - } - - lockMutex(&index->loadContext->mutex); - if (index->loadContext->status != INDEX_SUSPENDING) { - unlockMutex(&index->loadContext->mutex); - return false; - } - - // Notify that we are suspended and wait for the resume. - index->loadContext->status = INDEX_SUSPENDED; - broadcastCond(&index->loadContext->cond); - - while ((index->loadContext->status != INDEX_OPENING) - && (index->loadContext->status != INDEX_FREEING)) { - waitCond(&index->loadContext->cond, &index->loadContext->mutex); - } - - bool retVal = (index->loadContext->status == INDEX_FREEING); - unlockMutex(&index->loadContext->mutex); - return retVal; -} - -/**********************************************************************/ -int replayVolume(Index *index, uint64_t fromVCN) -{ - int result; - uint64_t uptoVCN = index->newestVirtualChapter; - logInfo("Replaying volume from chapter %llu through chapter %" - PRIu64, - fromVCN, uptoVCN); - setMasterIndexOpenChapter(index->masterIndex, uptoVCN); - setMasterIndexOpenChapter(index->masterIndex, fromVCN); - - /* - * At least two cases to deal with here! 
- * - index loaded but replaying from lastCheckpoint; maybe full, maybe not - * - index failed to load, full rebuild - * Starts empty, then dense-only, then dense-plus-sparse. - * Need to sparsify while processing individual chapters. - */ - IndexLookupMode oldLookupMode = index->volume->lookupMode; - index->volume->lookupMode = LOOKUP_FOR_REBUILD; - /* - * Go through each record page of each chapter and add the records back to - * the master index. This should not cause anything to be written to either - * the open chapter or on disk volume. Also skip the on disk chapter - * corresponding to upto, as this would have already been - * purged from the master index when the chapter was opened. - * - * Also, go through each index page for each chapter and rebuild the - * index page map. - */ - const Geometry *geometry = index->volume->geometry; - uint64_t oldIPMupdate = getLastUpdate(index->volume->indexPageMap); - uint64_t vcn; - for (vcn = fromVCN; vcn < uptoVCN; ++vcn) { - if (checkForSuspend(index)) { - logInfo("Replay interrupted by index shutdown at chapter %llu", vcn); - return UDS_SHUTTINGDOWN; - } - - bool willBeSparseChapter = isChapterSparse(geometry, fromVCN, uptoVCN, - vcn); - unsigned int chapter = mapToPhysicalChapter(geometry, vcn); - prefetchVolumePages(&index->volume->volumeStore, - mapToPhysicalPage(geometry, chapter, 0), - geometry->pagesPerChapter); - setMasterIndexOpenChapter(index->masterIndex, vcn); - result = rebuildIndexPageMap(index, vcn); - if (result != UDS_SUCCESS) { - index->volume->lookupMode = oldLookupMode; - return logErrorWithStringError(result, - "could not rebuild index page map for" - " chapter %u", - chapter); - } - - unsigned int j; - for (j = 0; j < geometry->recordPagesPerChapter; j++) { - unsigned int recordPageNumber = geometry->indexPagesPerChapter + j; - byte *recordPage; - result = getPage(index->volume, chapter, recordPageNumber, - CACHE_PROBE_RECORD_FIRST, &recordPage, NULL); - if (result != UDS_SUCCESS) { - index->volume->lookupMode = oldLookupMode; - return logUnrecoverable(result, "could not get page %d", - recordPageNumber); - } - unsigned int k; - for (k = 0; k < geometry->recordsPerPage; k++) { - const byte *nameBytes = recordPage + (k * BYTES_PER_RECORD); - - UdsChunkName name; - memcpy(&name.name, nameBytes, UDS_CHUNK_NAME_SIZE); - - result = replayRecord(index, &name, vcn, willBeSparseChapter); - if (result != UDS_SUCCESS) { - char hexName[(2 * UDS_CHUNK_NAME_SIZE) + 1]; - if (chunkNameToHex(&name, hexName, sizeof(hexName)) != UDS_SUCCESS) { - strncpy(hexName, "", sizeof(hexName)); - } - index->volume->lookupMode = oldLookupMode; - return logUnrecoverable(result, - "could not find block %s during rebuild", - hexName); - } - } - } - } - index->volume->lookupMode = oldLookupMode; - - // We also need to reap the chapter being replaced by the open chapter - setMasterIndexOpenChapter(index->masterIndex, uptoVCN); - - uint64_t newIPMupdate = getLastUpdate(index->volume->indexPageMap); - - if (newIPMupdate != oldIPMupdate) { - logInfo("replay changed index page map update from %llu to %llu", - oldIPMupdate, newIPMupdate); - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -void getIndexStats(Index *index, UdsIndexStats *counters) -{ - uint64_t cwAllocated = getChapterWriterMemoryAllocated(index->chapterWriter); - // We're accessing the master index while not on a zone thread, but that's - // safe to do when acquiring statistics. 
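/*
 * Illustrative sketch, not from the UDS sources: the suspend handshake that
 * checkForSuspend() performs once per chapter inside the replayVolume() loop
 * above, written against plain pthreads rather than the project's threads.h
 * wrappers. The status names are local stand-ins for the INDEX_* states.
 */
#include <pthread.h>
#include <stdbool.h>

enum load_status { RUNNING, SUSPENDING, SUSPENDED, RESUMING, FREEING };

struct load_context {
  pthread_mutex_t  mutex;
  pthread_cond_t   cond;
  enum load_status status;
};

/* Returns true if the caller should abandon the replay entirely. */
bool check_for_suspend(struct load_context *ctx)
{
  pthread_mutex_lock(&ctx->mutex);
  if (ctx->status != SUSPENDING) {
    pthread_mutex_unlock(&ctx->mutex);
    return false;                     /* nothing asked us to stop */
  }

  /* Acknowledge the suspend and wake whoever requested it. */
  ctx->status = SUSPENDED;
  pthread_cond_broadcast(&ctx->cond);

  /* Park until told either to resume replaying or to shut down. */
  while (ctx->status != RESUMING && ctx->status != FREEING) {
    pthread_cond_wait(&ctx->cond, &ctx->mutex);
  }
  bool shutting_down = (ctx->status == FREEING);
  pthread_mutex_unlock(&ctx->mutex);
  return shutting_down;
}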
- MasterIndexStats denseStats, sparseStats; - getMasterIndexStats(index->masterIndex, &denseStats, &sparseStats); - - counters->entriesIndexed = (denseStats.recordCount - + sparseStats.recordCount); - counters->memoryUsed = ((uint64_t) denseStats.memoryAllocated - + (uint64_t) sparseStats.memoryAllocated - + (uint64_t) getCacheSize(index->volume) - + cwAllocated); - counters->collisions = (denseStats.collisionCount - + sparseStats.collisionCount); - counters->entriesDiscarded = (denseStats.discardCount - + sparseStats.discardCount); - counters->checkpoints = getCheckpointCount(index->checkpoint); -} - -/**********************************************************************/ -void advanceActiveChapters(Index *index) -{ - index->newestVirtualChapter++; - if (areSamePhysicalChapter(index->volume->geometry, - index->newestVirtualChapter, - index->oldestVirtualChapter)) { - index->oldestVirtualChapter++; - } -} - -/**********************************************************************/ -uint64_t triageIndexRequest(Index *index, Request *request) -{ - MasterIndexTriage triage; - lookupMasterIndexName(index->masterIndex, &request->chunkName, &triage); - if (!triage.inSampledChapter) { - // Not indexed or not a hook. - return UINT64_MAX; - } - - IndexZone *zone = getRequestZone(index, request); - if (!isZoneChapterSparse(zone, triage.virtualChapter)) { - return UINT64_MAX; - } - - // XXX Optimize for a common case by remembering the chapter from the most - // recent barrier message and skipping this chapter if is it the same. - - // Return the sparse chapter number to trigger the barrier messages. - return triage.virtualChapter; -} diff --git a/uds/index.h b/uds/index.h deleted file mode 100644 index d2bc805..0000000 --- a/uds/index.h +++ /dev/null @@ -1,218 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/index.h#3 $ - */ - -#ifndef INDEX_H -#define INDEX_H - -#include "chapterWriter.h" -#include "indexLayout.h" -#include "indexSession.h" -#include "indexZone.h" -#include "loadType.h" -#include "masterIndexOps.h" -#include "volume.h" - - -/** - * Index checkpoint state private to indexCheckpoint.c. - **/ -typedef struct indexCheckpoint IndexCheckpoint; - -typedef struct index { - bool existed; - bool hasSavedOpenChapter; - LoadType loadedType; - IndexLoadContext *loadContext; - IndexLayout *layout; - IndexState *state; - MasterIndex *masterIndex; - Volume *volume; - unsigned int zoneCount; - IndexZone **zones; - - /* - * ATTENTION!!! - * The meaning of the next two fields has changed. - * - * They now represent the oldest and newest chapters only at load time, - * and when the index is quiescent. At other times, they may lag individual - * zones' views of the index depending upon the progress made by the chapter - * writer. 
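/*
 * Illustrative sketch, not from the UDS sources: the window arithmetic behind
 * advanceActiveChapters() above. Virtual chapter numbers grow without bound,
 * while the physical chapter is the virtual number modulo chaptersPerVolume;
 * once the newest virtual chapter would land on the same physical slot as the
 * oldest, the oldest must advance too. chaptersPerVolume = 8 is an invented
 * example.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  const uint64_t chaptersPerVolume = 8;
  uint64_t oldest = 0, newest = 0;

  for (int i = 0; i < 12; i++) {
    newest++;
    if ((newest % chaptersPerVolume) == (oldest % chaptersPerVolume)) {
      oldest++;                      /* newest is about to overwrite oldest */
    }
    printf("newest %llu oldest %llu\n",
           (unsigned long long)newest, (unsigned long long)oldest);
  }
  return 0;
}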
- */ - uint64_t oldestVirtualChapter; - uint64_t newestVirtualChapter; - - uint64_t lastCheckpoint; - uint64_t prevCheckpoint; - ChapterWriter *chapterWriter; - - // checkpoint state used by indexCheckpoint.c - IndexCheckpoint *checkpoint; -} Index; - -/** - * Construct a new index from the given configuration. - * - * @param layout The index layout - * @param config The configuration to use - * @param userParams The index session parameters. If NULL, the default - * session parameters will be used. - * @param zoneCount The number of zones for this index to use - * @param loadType How to create the index: it can be create only, allow - * loading from files, and allow rebuilding from the volume - * @param loadContext The load context to use - * @param newIndex A pointer to hold a pointer to the new index - * - * @return UDS_SUCCESS or an error code - **/ -int makeIndex(IndexLayout *layout, - const Configuration *config, - const struct uds_parameters *userParams, - unsigned int zoneCount, - LoadType loadType, - IndexLoadContext *loadContext, - Index **newIndex) - __attribute__((warn_unused_result)); - -/** - * Save an index. - * - * Before saving an index and while saving an index, the caller must ensure - * that there are no index requests in progress. - * - * Some users follow saveIndex immediately with a freeIndex. But some tests - * use the IndexLayout to modify the saved index. The Index will then have - * some cached information that does not reflect these updates. - * - * @param index The index to save - * - * @return UDS_SUCCESS if successful - **/ -int saveIndex(Index *index) __attribute__((warn_unused_result)); - -/** - * Clean up the index and its memory. - * - * @param index The index to destroy. - **/ -void freeIndex(Index *index); - -/** - * Perform the index operation specified by the action field of a UDS request. - * - * For UDS API requests, this searches the index for the chunk name in the - * request. If the chunk name is already present in the index, the location - * field of the request will be set to the IndexRegion where it was found. If - * the action is not DELETE, the oldMetadata field of the request will also be - * filled in with the prior metadata for the name. - * - * If the API request action is: - * - * REQUEST_INDEX, a record will be added to the open chapter with the - * metadata in the request for new records, and the existing metadata for - * existing records - * - * REQUEST_UPDATE, a record will be added to the open chapter with the - * metadata in the request - * - * REQUEST_QUERY, if the update flag is set in the request, any record - * found will be moved to the open chapter. In all other cases the contents - * of the index will remain unchanged. - * - * REQUEST_REMOVE, the any entry with the name will removed from the index - * - * For non-API requests, no chunk name search is involved. - * - * @param index The index - * @param request The originating request - * - * @return UDS_SUCCESS, UDS_QUEUED, or an error code - **/ -int dispatchIndexRequest(Index *index, Request *request) - __attribute__((warn_unused_result)); - -/** - * Internal helper to prepare the index for saving. - * - * @param index the index - * @param checkpoint whether the save is a checkpoint - * @param openChapterNumber the virtual chapter number of the open chapter - **/ -void beginSave(Index *index, bool checkpoint, uint64_t openChapterNumber); - -/** - * Replay the volume file to repopulate the master index. 
- * - * @param index The index - * @param fromVCN The virtual chapter to start replaying - * - * @return UDS_SUCCESS if successful - **/ -int replayVolume(Index *index, uint64_t fromVCN) - __attribute__((warn_unused_result)); - -/** - * Gather statistics from the master index, volume, and cache. - * - * @param index The index - * @param counters the statistic counters for the index - **/ -void getIndexStats(Index *index, UdsIndexStats *counters); - -/** - * Set lookup state for this index. Disabling lookups means assume - * all records queried are new (intended for debugging uses, e.g., - * albfill). - * - * @param index The index - * @param enabled The new lookup state - **/ -void setIndexLookupState(Index *index, bool enabled); - -/** - * Advance the newest virtual chapter. If this will overwrite the oldest - * virtual chapter, advance that also. - * - * @param index The index to advance - **/ -void advanceActiveChapters(Index *index); - -/** - * Triage an index request, deciding whether it requires that a sparse cache - * barrier message precede it. - * - * This resolves the chunk name in the request in the master index, - * determining if it is a hook or not, and if a hook, what virtual chapter (if - * any) it might be found in. If a virtual chapter is found, it checks whether - * that chapter appears in the sparse region of the index. If all these - * conditions are met, the (sparse) virtual chapter number is returned. In all - * other cases it returns UINT64_MAX. - * - * @param index the index that will process the request - * @param request the index request containing the chunk name to triage - * - * @return the sparse chapter number for the sparse cache barrier message, or - * UINT64_MAX if the request does not require a barrier - **/ -uint64_t triageIndexRequest(Index *index, Request *request) - __attribute__((warn_unused_result)); - -#endif /* INDEX_H */ diff --git a/uds/indexCheckpoint.c b/uds/indexCheckpoint.c deleted file mode 100644 index 9c803b6..0000000 --- a/uds/indexCheckpoint.c +++ /dev/null @@ -1,377 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/indexCheckpoint.c#2 $ - */ - -#include "indexCheckpoint.h" - -#include "errors.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "threads.h" -#include "typeDefs.h" - -/** - * index checkpointState values - * - * @note The order of these values is significant, - * see indexState.c doIndexStateCheckpointInZone(). - **/ -typedef enum checkpointState { - NOT_CHECKPOINTING, - CHECKPOINT_IN_PROGRESS, - CHECKPOINT_ABORTING -} CheckpointState; - -/** - * Private structure which tracks checkpointing. 
- **/ -struct indexCheckpoint { - Mutex mutex; // covers this group of fields - uint64_t chapter; // vcn of the starting chapter - CheckpointState state; // is checkpoint in progress or aborting - unsigned int zonesBusy; // count of zones not yet done - unsigned int frequency; // number of chapters between checkpoints - uint64_t checkpoints; // number of checkpoints this session -}; - -/** - * Enum return value of indexCheckpointTrigger function. - **/ -typedef enum indexCheckpointTriggerValue { - ICTV_IDLE, //< no checkpointing right now - ICTV_START, //< start a new checkpoint now - ICTV_CONTINUE, //< continue checkpointing if needed - ICTV_FINISH, //< finish checkpointing, next time will start new cycle - ICTV_ABORT //< immediately abort checkpointing -} IndexCheckpointTriggerValue; - -typedef int CheckpointFunction(Index *index, unsigned int zone); - -// These functions are called while holding the checkpoint->mutex but are -// expected to release it. -// -static CheckpointFunction doCheckpointStart; -static CheckpointFunction doCheckpointProcess; -static CheckpointFunction doCheckpointFinish; -static CheckpointFunction doCheckpointAbort; - -CheckpointFunction *const checkpointFuncs[] = { - NULL, - doCheckpointStart, - doCheckpointProcess, - doCheckpointFinish, - doCheckpointAbort -}; - -/**********************************************************************/ -int makeIndexCheckpoint(Index *index) -{ - IndexCheckpoint *checkpoint; - int result - = ALLOCATE(1, IndexCheckpoint, "IndexCheckpoint", &checkpoint); - if (result != UDS_SUCCESS) { - return result; - } - - result = initMutex(&checkpoint->mutex); - if (result != UDS_SUCCESS) { - FREE(checkpoint); - return result; - } - - checkpoint->checkpoints = 0; - - index->checkpoint = checkpoint; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void freeIndexCheckpoint(IndexCheckpoint *checkpoint) -{ - if (checkpoint != NULL) { - destroyMutex(&checkpoint->mutex); - FREE(checkpoint); - } -} - -/**********************************************************************/ -unsigned int getIndexCheckpointFrequency(IndexCheckpoint *checkpoint) -{ - lockMutex(&checkpoint->mutex); - unsigned int frequency = checkpoint->frequency; - unlockMutex(&checkpoint->mutex); - return frequency; -} - -/**********************************************************************/ -unsigned int setIndexCheckpointFrequency(IndexCheckpoint *checkpoint, - unsigned int frequency) -{ - lockMutex(&checkpoint->mutex); - unsigned int oldFrequency = checkpoint->frequency; - checkpoint->frequency = frequency; - unlockMutex(&checkpoint->mutex); - return oldFrequency; -} - -/**********************************************************************/ -uint64_t getCheckpointCount(IndexCheckpoint *checkpoint) -{ - return checkpoint->checkpoints; -} - -/**********************************************************************/ -static IndexCheckpointTriggerValue -getCheckpointAction(IndexCheckpoint *checkpoint, - uint64_t virtualChapter) -{ - if (checkpoint->frequency == 0) { - return ICTV_IDLE; - } - unsigned int value = virtualChapter % checkpoint->frequency; - if (checkpoint->state == CHECKPOINT_ABORTING) { - return ICTV_ABORT; - } else if (checkpoint->state == CHECKPOINT_IN_PROGRESS) { - if (value == checkpoint->frequency - 1) { - return ICTV_FINISH; - } else { - return ICTV_CONTINUE; - } - } else { - if (value == 0) { - return ICTV_START; - } else { - return ICTV_IDLE; - } - } -} - 
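/*
 * Illustrative sketch, not from the UDS sources: a standalone restatement of
 * the decision in getCheckpointAction() above, useful for seeing the cadence.
 * With a frequency of N chapters, a checkpoint starts on chapters where
 * vcn % N == 0, continues in between, and finishes on vcn % N == N - 1. The
 * enum names are local stand-ins for the ICTV_* values.
 */
#include <stdint.h>
#include <stdio.h>

enum state   { NOT_CHECKPOINTING, IN_PROGRESS, ABORTING };
enum trigger { IDLE, START, CONTINUE, FINISH, ABORT };

static enum trigger checkpoint_action(enum state state, unsigned int frequency,
                                      uint64_t vcn)
{
  if (frequency == 0) {
    return IDLE;                          /* checkpointing is disabled */
  }
  unsigned int value = (unsigned int)(vcn % frequency);
  if (state == ABORTING) {
    return ABORT;
  }
  if (state == IN_PROGRESS) {
    return (value == frequency - 1) ? FINISH : CONTINUE;
  }
  return (value == 0) ? START : IDLE;
}

int main(void)
{
  static const char *names[] = { "idle", "start", "continue", "finish", "abort" };
  enum state state = NOT_CHECKPOINTING;
  for (uint64_t vcn = 0; vcn < 8; vcn++) {
    enum trigger t = checkpoint_action(state, 4, vcn);
    printf("chapter %llu -> %s\n", (unsigned long long)vcn, names[t]);
    if (t == START || t == CONTINUE) { state = IN_PROGRESS; }
    if (t == FINISH || t == ABORT)   { state = NOT_CHECKPOINTING; }
  }
  return 0;
}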
-/**********************************************************************/ -int processCheckpointing(Index *index, - unsigned int zone, - uint64_t newVirtualChapter) -{ - IndexCheckpoint *checkpoint = index->checkpoint; - lockMutex(&checkpoint->mutex); - - IndexCheckpointTriggerValue ictv - = getCheckpointAction(checkpoint, newVirtualChapter); - - if (ictv == ICTV_START) { - checkpoint->chapter = newVirtualChapter; - } - - CheckpointFunction *func = checkpointFuncs[ictv]; - if (func == NULL) { - // nothing to do in idle state - unlockMutex(&checkpoint->mutex); - return UDS_SUCCESS; - } - - return (*func)(index, zone); -} - -/**********************************************************************/ -int processChapterWriterCheckpointSaves(Index *index) -{ - IndexCheckpoint *checkpoint = index->checkpoint; - - int result = UDS_SUCCESS; - - lockMutex(&checkpoint->mutex); - if (checkpoint->state == CHECKPOINT_IN_PROGRESS) { - result = - performIndexStateCheckpointChapterSynchronizedSaves(index->state); - - if (result != UDS_SUCCESS) { - checkpoint->state = CHECKPOINT_ABORTING; - logInfo("checkpoint failed"); - index->lastCheckpoint = index->prevCheckpoint; - } - } - - unlockMutex(&checkpoint->mutex); - return result; -} - -/** - * Helper function used to abort checkpoint if an error has occurred. - * - * @param index the index - * @param result the error result - * - * @return result - **/ -static int abortCheckpointing(Index *index, int result) -{ - if (index->checkpoint->state != NOT_CHECKPOINTING) { - index->checkpoint->state = CHECKPOINT_ABORTING; - logInfo("checkpoint failed"); - index->lastCheckpoint = index->prevCheckpoint; - } - return result; -} - -/**********************************************************************/ -int finishCheckpointing(Index *index) -{ - IndexCheckpoint *checkpoint = index->checkpoint; - - int result = processChapterWriterCheckpointSaves(index); - if (result != UDS_SUCCESS) { - return result; - } - - lockMutex(&checkpoint->mutex); - - unsigned int z; - for (z = 0; z < index->zoneCount; ++z) { - if (checkpoint->state != CHECKPOINT_IN_PROGRESS) { - break; - } - result = doCheckpointFinish(index, z); - // reacquire mutex released by doCheckpointFinish - lockMutex(&checkpoint->mutex); - if (result != UDS_SUCCESS) { - break; - } - } - - if ((result == UDS_SUCCESS) && - (checkpoint->state == CHECKPOINT_IN_PROGRESS)) { - result = finishIndexStateCheckpoint(index->state); - if (result == UDS_SUCCESS) { - checkpoint->state = NOT_CHECKPOINTING; - } - } - - unlockMutex(&checkpoint->mutex); - return result; -} - -/** - * Starts an incremental checkpoint. - * - * Called by the first zone to finish a chapter which starts a checkpoint. 
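/*
 * Illustrative sketch, not from the UDS sources: the locking convention
 * documented for the checkpointFuncs table and used by processCheckpointing()
 * above. The dispatcher takes the mutex, picks a handler by trigger value,
 * and every non-NULL handler is responsible for releasing that mutex before
 * it returns (typically so the slow work can run outside the lock). All names
 * here are local; only the convention is being illustrated.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Handler entered with `lock` held; it must unlock before returning. */
static int handle_start(void)
{
  puts("starting a checkpoint");
  pthread_mutex_unlock(&lock);        /* release before the slow work */
  puts("doing incremental work outside the lock");
  return 0;
}

typedef int handler_fn(void);
static handler_fn *const handlers[] = { NULL, handle_start };

static int dispatch(unsigned int trigger)
{
  pthread_mutex_lock(&lock);
  handler_fn *fn = handlers[trigger];
  if (fn == NULL) {
    pthread_mutex_unlock(&lock);      /* idle: nothing to hand the lock to */
    return 0;
  }
  return (*fn)();                     /* fn owns the unlock from here on */
}

int main(void)
{
  dispatch(0);
  dispatch(1);
  return 0;
}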
- * - * @param index the index - * @param zone the zone number - * - * @return UDS_SUCCESS or an error code - **/ -static int doCheckpointStart(Index *index, unsigned int zone) -{ - IndexCheckpoint *checkpoint = index->checkpoint; - beginSave(index, true, checkpoint->chapter); - int result = startIndexStateCheckpoint(index->state); - if (result != UDS_SUCCESS) { - logErrorWithStringError(result, "cannot start index checkpoint"); - index->lastCheckpoint = index->prevCheckpoint; - unlockMutex(&checkpoint->mutex); - return result; - } - - checkpoint->state = CHECKPOINT_IN_PROGRESS; - checkpoint->zonesBusy = index->zoneCount; - - return doCheckpointProcess(index, zone); -} - -/**********************************************************************/ -static int doCheckpointProcess(Index *index, unsigned int zone) -{ - IndexCheckpoint *checkpoint = index->checkpoint; - unlockMutex(&checkpoint->mutex); - CompletionStatus status = CS_NOT_COMPLETED; - int result = performIndexStateCheckpointInZone(index->state, zone, &status); - if (result != UDS_SUCCESS) { - lockMutex(&checkpoint->mutex); - logErrorWithStringError(result, "cannot continue index checkpoint"); - result = abortCheckpointing(index, result); - unlockMutex(&checkpoint->mutex); - } else if (status == CS_JUST_COMPLETED) { - lockMutex(&checkpoint->mutex); - if (--checkpoint->zonesBusy == 0) { - checkpoint->checkpoints += 1; - logInfo("finished checkpoint"); - result = finishIndexStateCheckpoint(index->state); - if (result != UDS_SUCCESS) { - logErrorWithStringError(result, "%s checkpoint finish failed", - __func__); - } - checkpoint->state = NOT_CHECKPOINTING; - } - unlockMutex(&checkpoint->mutex); - } - return result; -} - -/**********************************************************************/ -static int doCheckpointAbort(Index *index, unsigned int zone) -{ - IndexCheckpoint *checkpoint = index->checkpoint; - CompletionStatus status = CS_NOT_COMPLETED; - int result = abortIndexStateCheckpointInZone(index->state, zone, &status); - if (result != UDS_SUCCESS) { - logErrorWithStringError(result, "cannot abort index checkpoint"); - } else if (status == CS_JUST_COMPLETED) { - if (--checkpoint->zonesBusy == 0) { - logInfo("aborted checkpoint"); - result = abortIndexStateCheckpoint(index->state); - if (result != UDS_SUCCESS) { - logErrorWithStringError(result, "checkpoint abort failed"); - } - checkpoint->state = NOT_CHECKPOINTING; - } - } - unlockMutex(&checkpoint->mutex); - - return result; -} - -/**********************************************************************/ -static int doCheckpointFinish(Index *index, unsigned int zone) -{ - IndexCheckpoint *checkpoint = index->checkpoint; - CompletionStatus status = CS_NOT_COMPLETED; - unlockMutex(&checkpoint->mutex); - int result = finishIndexStateCheckpointInZone(index->state, zone, &status); - if (result != UDS_SUCCESS) { - logErrorWithStringError(result, "cannot finish index checkpoint"); - lockMutex(&checkpoint->mutex); - result = abortCheckpointing(index, result); - unlockMutex(&checkpoint->mutex); - } else if (status == CS_JUST_COMPLETED) { - lockMutex(&checkpoint->mutex); - if (--checkpoint->zonesBusy == 0) { - checkpoint->checkpoints += 1; - logInfo("finished checkpoint"); - result = finishIndexStateCheckpoint(index->state); - if (result != UDS_SUCCESS) { - logErrorWithStringError(result, "%s checkpoint finish failed", - __func__); - } - checkpoint->state = NOT_CHECKPOINTING; - } - unlockMutex(&checkpoint->mutex); - } - return result; -} diff --git a/uds/indexCheckpoint.h 
b/uds/indexCheckpoint.h deleted file mode 100644 index 02d2936..0000000 --- a/uds/indexCheckpoint.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/indexCheckpoint.h#1 $ - */ - -#ifndef INDEX_CHECKPOINT_H -#define INDEX_CHECKPOINT_H - -#include "index.h" - -/** - * Construct and initialize the checkpoint sub-structure of an index. - * - * @param index the index receive the new checkpoint structure. - * - * @return UDS_SUCCESS or an error code - **/ -int makeIndexCheckpoint(Index *index) __attribute__((warn_unused_result)); - -/** - * Free the checkpoint sub-structure of an index. - * - * @param checkpoint the structure to free - **/ -void freeIndexCheckpoint(IndexCheckpoint *checkpoint); - -/** - * Get the current checkpointing frequency of an index. - * - * @param checkpoint the checkpoint state of the index - * - * @return the number of chapters between checkpoints - **/ -unsigned int getIndexCheckpointFrequency(IndexCheckpoint *checkpoint) - __attribute__((warn_unused_result)); - -/** - * Set checkpointing frequency for the index. - * - * @param checkpoint the checkpoint state of the index - * @param frequency The new checkpointing frequency - * - * @return the old checkpointing frequency - **/ -unsigned int setIndexCheckpointFrequency(IndexCheckpoint *checkpoint, - unsigned int frequency); - -/** - * Gets the number of checkpoints completed during the lifetime of this index - * - * @param checkpoint the checkpoint state of the index - * - * @return the number of checkpoints completed - **/ -uint64_t getCheckpointCount(IndexCheckpoint *checkpoint) - __attribute__((warn_unused_result)); - -/** - * If incremental checkpointing is in progress, finish it. - * - * @param index The index - * - * @return UDS_SUCCESS or an error code - * - * @note This function is called automatically during normal operation; - * its presence here is for tests that expect checkpointing to - * have completed at some point in their logic. It is not an - * error to call this function if checkpointing is not in - * progress, it silently returns success. - **/ -int finishCheckpointing(Index *index) __attribute__((warn_unused_result)); - -/** - * Process one zone's incremental checkpoint operation. Automatically - * starts, processes, and finishes a checkpoint over multiple invocations - * as successive chapters are closed and written. - * - * Uses its own mutex to serialize the starting and finishing or aborting, - * but allows parallel execution of the incremental progress. - * - * @param index The index to checkpoint - * @param zone The current zone number - * @param newVirtualChapter The number of the chapter which the calling - * zone has just opened - * - * @return UDS_SUCCESS or an error code. 
- **/ -int processCheckpointing(Index *index, - unsigned int zone, - uint64_t newVirtualChapter) - __attribute__((warn_unused_result)); - -/** - * Process saves done outside any zone by the chapter writer. - * - * Grabs the mutex associated with processCheckpointing(). - * - * @param index The index to process. - * - * @return UDS_SUCCESS or an error code. - **/ -int processChapterWriterCheckpointSaves(Index *index) - __attribute__((warn_unused_result)); - -#endif // INDEX_CHECKPOINT_H diff --git a/uds/indexComponent.c b/uds/indexComponent.c deleted file mode 100644 index c932b8d..0000000 --- a/uds/indexComponent.c +++ /dev/null @@ -1,745 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/indexComponent.c#8 $ - */ - -#include "indexComponent.h" - -#include "compiler.h" -#include "errors.h" -#include "indexLayout.h" -#include "indexState.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "typeDefs.h" - -/*****************************************************************************/ -int makeIndexComponent(IndexState *state, - const IndexComponentInfo *info, - unsigned int zoneCount, - void *data, - void *context, - IndexComponent **componentPtr) -{ - if ((info == NULL) || (info->name == NULL)) { - return logErrorWithStringError(UDS_INVALID_ARGUMENT, - "invalid component or directory specified"); - } - if (info->loader == NULL) { - return logErrorWithStringError(UDS_INVALID_ARGUMENT, - "no .loader function specified " - "for component %s", - info->name); - } - if ((info->saver == NULL) && (info->incremental == NULL)) { - return logErrorWithStringError(UDS_INVALID_ARGUMENT, - "neither .saver function nor .incremental " - "function specified for component %s", - info->name); - } - - IndexComponent *component = NULL; - int result = ALLOCATE(1, IndexComponent, "index component", &component); - if (result != UDS_SUCCESS) { - return result; - } - - component->componentData = data; - component->context = context; - component->info = info; - component->numZones = info->multiZone ? 
zoneCount : 1; - component->state = state; - component->writeZones = NULL; - *componentPtr = component; - return UDS_SUCCESS; -} - -/*****************************************************************************/ -static void freeWriteZones(IndexComponent *component) -{ - if (component->writeZones != NULL) { - unsigned int z; - for (z = 0; z < component->numZones; ++z) { - WriteZone *wz = component->writeZones[z]; - if (wz == NULL) { - continue; - } - freeBufferedWriter(wz->writer); - FREE(wz); - } - FREE(component->writeZones); - component->writeZones = NULL; - } -} - -/*****************************************************************************/ -void freeIndexComponent(IndexComponent **componentPtr) -{ - if (componentPtr == NULL) { - return; - } - IndexComponent *component = *componentPtr; - if (component == NULL) { - return; - } - *componentPtr = NULL; - - freeWriteZones(component); - FREE(component); -} - -/** - * Destroy, deallocate, and expunge a read portal. - * - * @param readPortal the readzone array - **/ -static void freeReadPortal(ReadPortal *readPortal) -{ - if (readPortal == NULL) { - return; - } - unsigned int z; - for (z = 0; z < readPortal->zones; ++z) { - if (readPortal->readers[z] != NULL) { - freeBufferedReader(readPortal->readers[z]); - } - } - FREE(readPortal->readers); - FREE(readPortal); -} - -/*****************************************************************************/ -int getBufferedReaderForPortal(ReadPortal *portal, - unsigned int part, - BufferedReader **readerPtr) -{ - if (part >= portal->zones) { - return logErrorWithStringError(UDS_INVALID_ARGUMENT, - "%s: cannot access zone %u of %u", - __func__, part, portal->zones); - } - IndexComponent *component = portal->component; - if (component->info->ioStorage && (portal->readers[part] == NULL)) { - int result = openStateBufferedReader(component->state, - component->info->kind, part, - &portal->readers[part]); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, - "%s: cannot make buffered reader " - "for zone %u", __func__, part); - } - } - *readerPtr = portal->readers[part]; - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int readIndexComponent(IndexComponent *component) -{ - ReadPortal *portal; - int result = ALLOCATE(1, ReadPortal, "index component read portal", &portal); - if (result != UDS_SUCCESS) { - return result; - } - int readZones = component->state->loadZones; - result = ALLOCATE(readZones, BufferedReader *, "read zone buffered readers", - &portal->readers); - if (result != UDS_SUCCESS) { - FREE(portal); - return result; - } - - portal->component = component; - portal->zones = readZones; - result = (*component->info->loader)(portal); - freeReadPortal(portal); - return result; -} - -/** - * Determine the writeZone structure for the specified component and zone. 
- * - * @param [in] component the index component - * @param [in] zone the zone number - * @param [out] writeZonePtr the resulting write zone instance - * - * @return UDS_SUCCESS or an error code - **/ -static int resolveWriteZone(const IndexComponent *component, - unsigned int zone, - WriteZone **writeZonePtr) -{ - int result = ASSERT(writeZonePtr != NULL, - "output parameter is null"); - if (result != UDS_SUCCESS) { - return result; - } - - if (component->writeZones == NULL) { - return logErrorWithStringError(UDS_BAD_STATE, - "cannot resolve index component write zone:" - " not allocated"); - } - - if (zone >= component->numZones) { - return logErrorWithStringError(UDS_INVALID_ARGUMENT, - "cannot resolve index component write zone:" - " zone out of range"); - } - *writeZonePtr = component->writeZones[zone]; - return UDS_SUCCESS; -} - -/** - * Non-incremental save function used to emulate a regular save - * using an incremental save function as a basis. - * - * @param component the index component - * @param writer the buffered writer - * @param zone the zone number - * - * @return UDS_SUCCESS or an error code - **/ -static int indexComponentSaverIncrementalWrapper(IndexComponent *component, - BufferedWriter *writer, - unsigned int zone) -{ - IncrementalWriter incrFunc = component->info->incremental; - bool completed = false; - - int result = (*incrFunc)(component, writer, zone, IWC_START, &completed); - if (result != UDS_SUCCESS) { - return result; - } - - if (!completed) { - result = (*incrFunc)(component, writer, zone, IWC_FINISH, &completed); - if (result != UDS_SUCCESS) { - return result; - } - } - - result = flushBufferedWriter(writer); - if (result != UDS_SUCCESS) { - return result; - } - - return UDS_SUCCESS; -} - -/** - * Specify that writing to a specific zone file has finished. - * - * If a syncer has been registered with the index component, the file - * descriptor will be enqueued upon it for fsyncing and closing. - * If not, or if the enqueue fails, the file will be fsynced and closed - * immediately. - * - * @param writeZone the index component write zone - * - * @return UDS_SUCCESS or an error code - **/ -static int doneWithZone(WriteZone *writeZone) -{ - const IndexComponent *component = writeZone->component; - if (writeZone->writer != NULL) { - int result = flushBufferedWriter(writeZone->writer); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, - "cannot flush buffered writer for " - "%s component (zone %u)", - component->info->name, writeZone->zone); - } - } - return UDS_SUCCESS; -} - -/** - * Construct the array of WriteZone instances for this component. - * - * @param component the index component - * - * @return UDS_SUCCESS or an error code - * - * If this is a multizone component, each zone will be fully defined, - * otherwise zone 0 stands in for the single state file. 
- **/ -static int makeWriteZones(IndexComponent *component) -{ - unsigned int z; - if (component->writeZones != NULL) { - // just reinitialize states - for (z = 0; z < component->numZones; ++z) { - WriteZone *wz = component->writeZones[z]; - wz->phase = IWC_IDLE; - } - return UDS_SUCCESS; - } - - int result = ALLOCATE(component->numZones, WriteZone *, - "index component write zones", &component->writeZones); - if (result != UDS_SUCCESS) { - return result; - } - - for (z = 0; z < component->numZones; ++z) { - result = ALLOCATE(1, WriteZone, "plain write zone", - &component->writeZones[z]); - if (result != UDS_SUCCESS) { - freeWriteZones(component); - return result; - } - *component->writeZones[z] = (WriteZone) { - .component = component, - .phase = IWC_IDLE, - .zone = z, - }; - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -static int openBufferedWriters(IndexComponent *component) -{ - int result = UDS_SUCCESS; - WriteZone **wzp; - for (wzp = component->writeZones; - wzp < component->writeZones + component->numZones; - ++wzp) { - WriteZone *wz = *wzp; - wz->phase = IWC_START; - - result = ASSERT(wz->writer == NULL, "write zone writer already exists"); - if (result != UDS_SUCCESS) { - return result; - } - - if (component->info->ioStorage) { - int result = openStateBufferedWriter(component->state, - component->info->kind, wz->zone, - &wz->writer); - if (result != UDS_SUCCESS) { - return result; - } - } - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -static int startIndexComponentSave(IndexComponent *component) -{ - int result = makeWriteZones(component); - if (result != UDS_SUCCESS) { - return result; - } - - result = openBufferedWriters(component); - if (result != UDS_SUCCESS) { - return result; - } - - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int startIndexComponentIncrementalSave(IndexComponent *component) -{ - return startIndexComponentSave(component); -} - -/*****************************************************************************/ -int writeIndexComponent(IndexComponent *component) -{ - Saver saver = component->info->saver; - if ((saver == NULL) && (component->info->incremental != NULL)) { - saver = indexComponentSaverIncrementalWrapper; - } - - int result = startIndexComponentSave(component); - if (result != UDS_SUCCESS) { - return result; - } - - unsigned int z; - for (z = 0; z < component->numZones; ++z) { - WriteZone *writeZone = component->writeZones[z]; - - result = (*saver)(component, writeZone->writer, z); - if (result != UDS_SUCCESS) { - break; - } - - result = doneWithZone(writeZone); - if (result != UDS_SUCCESS) { - break; - } - - freeBufferedWriter(writeZone->writer); - writeZone->writer = NULL; - } - - if (result != UDS_SUCCESS) { - freeWriteZones(component); - return logErrorWithStringError(result, "index component write failed"); - } - - return UDS_SUCCESS; -} - -/** - * Close a specific buffered writer in a component write zone. 
- * - * @param writeZone the write zone - * - * @return UDS_SUCCESS or an error code - * - * @note closing a buffered writer causes its file descriptor to be - * passed to doneWithZone - **/ -static int closeBufferedWriter(WriteZone *writeZone) -{ - if (writeZone->writer == NULL) { - return UDS_SUCCESS; - } - - int result = doneWithZone(writeZone); - freeBufferedWriter(writeZone->writer); - writeZone->writer = NULL; - - return result; -} - -/** - * Faux incremental saver function for index components which only define - * a simple saver. Conforms to IncrementalWriter signature. - * - * @param [in] component the index component - * @param [in] writer the buffered writer that does the output - * @param [in] zone the zone number - * @param [in] command the incremental writer command - * @param [out] completed if non-NULL, set to whether the save is complete - * - * @return UDS_SUCCESS or an error code - * - * @note This wrapper always calls the non-incremental saver when - * the IWC_START command is issued, and always reports that - * the save is complete unless the saver failed. - **/ -static int wrapSaverAsIncremental(IndexComponent *component, - BufferedWriter *writer, - unsigned int zone, - IncrementalWriterCommand command, - bool *completed) -{ - int result = UDS_SUCCESS; - - if ((command >= IWC_START) && (command <= IWC_FINISH)) { - result = (*component->info->saver)(component, writer, zone); - if ((result == UDS_SUCCESS) && (writer != NULL)) { - noteBufferedWriterUsed(writer); - } - } - if ((result == UDS_SUCCESS) && (completed != NULL)) { - *completed = true; - } - return result; -} - -/** - * Return the appropriate incremental writer function depending on - * the component's type and whether this is the first zone. - * - * @param component the index component - * - * @return the correct IncrementalWriter function to use, or - * NULL signifying no progress can be made at this time. 
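/*
 * Editor's note: the sketch below is hypothetical and is not part of the
 * original indexComponent.c. It only illustrates the command protocol the
 * surrounding code defines: an IncrementalWriter is driven with one
 * IWC_START step, some number of IWC_CONTINUE steps (normally one per
 * chapter, via performIndexComponentZoneSave()), and an IWC_FINISH step to
 * force completion. The name driveIncrementalWriter and the maxSteps bound
 * are placeholders introduced for this example, and getIncrementalWriter()
 * (defined just below) is assumed to be visible here.
 */
static int driveIncrementalWriter(IndexComponent *component,
                                  BufferedWriter *writer,
                                  unsigned int    zone,
                                  unsigned int    maxSteps)
{
  IncrementalWriter incrFunc = getIncrementalWriter(component);
  bool done = false;

  /* Begin the incremental save for this zone. */
  int result = (*incrFunc)(component, writer, zone, IWC_START, &done);

  /* Make forward progress one step at a time, as checkpointing does. */
  while ((result == UDS_SUCCESS) && !done && (maxSteps-- > 0)) {
    result = (*incrFunc)(component, writer, zone, IWC_CONTINUE, &done);
  }

  /* Force completion, as finishIndexComponentZoneSave() does. */
  if ((result == UDS_SUCCESS) && !done) {
    result = (*incrFunc)(component, writer, zone, IWC_FINISH, &done);
  }

  /* On error the caller would normally issue IWC_ABORT for this zone. */
  return result;
}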
- **/ -static IncrementalWriter getIncrementalWriter(IndexComponent *component) -{ - IncrementalWriter incrFunc = component->info->incremental; - - if (incrFunc == NULL) { - incrFunc = &wrapSaverAsIncremental; - } - - return incrFunc; -} - -/*****************************************************************************/ -int performIndexComponentZoneSave(IndexComponent *component, - unsigned int zone, - CompletionStatus *completed) -{ - CompletionStatus comp = CS_NOT_COMPLETED; - - WriteZone *wz = NULL; - int result = resolveWriteZone(component, zone, &wz); - if (result != UDS_SUCCESS) { - return result; - } - - if (wz->phase == IWC_IDLE) { - comp = CS_COMPLETED_PREVIOUSLY; - } else if (wz->phase == IWC_DONE) { - comp = CS_JUST_COMPLETED; - wz->phase = IWC_IDLE; - } else if (!component->info->chapterSync) { - bool done = false; - IncrementalWriter incrFunc = getIncrementalWriter(component); - int result = (*incrFunc)(component, wz->writer, zone, wz->phase, &done); - if (result != UDS_SUCCESS) { - if (wz->phase == IWC_ABORT) { - wz->phase = IWC_IDLE; - } else { - wz->phase = IWC_ABORT; - } - return result; - } - if (done) { - comp = CS_JUST_COMPLETED; - wz->phase = IWC_IDLE; - } else if (wz->phase == IWC_START) { - wz->phase = IWC_CONTINUE; - } - } - - if (completed != NULL) { - *completed = comp; - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int performIndexComponentChapterWriterSave(IndexComponent *component) -{ - WriteZone *wz = NULL; - int result = resolveWriteZone(component, 0, &wz); - if (result != UDS_SUCCESS) { - return result; - } - - if ((wz->phase != IWC_IDLE) && (wz->phase != IWC_DONE)) { - bool done = false; - IncrementalWriter incrFunc = getIncrementalWriter(component); - int result = ASSERT(incrFunc != NULL, "no writer function"); - if (result != UDS_SUCCESS) { - return result; - } - result = (*incrFunc)(component, wz->writer, 0, wz->phase, &done); - if (result != UDS_SUCCESS) { - if (wz->phase == IWC_ABORT) { - wz->phase = IWC_IDLE; - } else { - wz->phase = IWC_ABORT; - } - return result; - } - if (done) { - wz->phase = IWC_DONE; - } else if (wz->phase == IWC_START) { - wz->phase = IWC_CONTINUE; - } - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int finishIndexComponentZoneSave(IndexComponent *component, - unsigned int zone, - CompletionStatus *completed) -{ - WriteZone *wz = NULL; - int result = resolveWriteZone(component, zone, &wz); - if (result != UDS_SUCCESS) { - return result; - } - - CompletionStatus comp; - switch (wz->phase) { - case IWC_IDLE: - comp = CS_COMPLETED_PREVIOUSLY; - break; - - case IWC_DONE: - comp = CS_JUST_COMPLETED; - break; - - default: - comp = CS_NOT_COMPLETED; - } - - IncrementalWriter incrFunc = getIncrementalWriter(component); - if ((wz->phase >= IWC_START) && (wz->phase < IWC_ABORT)) { - bool done = false; - int result = (*incrFunc)(component, wz->writer, zone, IWC_FINISH, &done); - if (result != UDS_SUCCESS) { - wz->phase = IWC_ABORT; - return result; - } - if (!done) { - logWarning("finish incremental save did not complete for %s zone %u", - component->info->name, zone); - return UDS_CHECKPOINT_INCOMPLETE; - } - wz->phase = IWC_IDLE; - comp = CS_JUST_COMPLETED; - } - - if (completed != NULL) { - *completed = comp; - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int finishIndexComponentIncrementalSave(IndexComponent *component) -{ - unsigned int 
zone; - for (zone = 0; zone < component->numZones; ++zone) { - WriteZone *wz = component->writeZones[zone]; - IncrementalWriter incrFunc = getIncrementalWriter(component); - if ((wz->phase != IWC_IDLE) && (wz->phase != IWC_DONE)) { - // Note: this is only safe if no other threads are currently processing - // this particular index - bool done = false; - int result = (*incrFunc)(component, wz->writer, zone, IWC_FINISH, &done); - if (result != UDS_SUCCESS) { - return result; - } - if (!done) { - logWarning("finishing incremental save did not complete for %s zone %u", - component->info->name, zone); - return UDS_UNEXPECTED_RESULT; - } - wz->phase = IWC_IDLE; - } - - if ((wz->writer != NULL) && !wasBufferedWriterUsed(wz->writer)) { - return logErrorWithStringError(UDS_CHECKPOINT_INCOMPLETE, - "component %s zone %u did not get written", - component->info->name, zone); - } - - int result = closeBufferedWriter(wz); - if (result != UDS_SUCCESS) { - return result; - } - } - - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int abortIndexComponentZoneSave(IndexComponent *component, - unsigned int zone, - CompletionStatus *status) -{ - WriteZone *wz = NULL; - int result = resolveWriteZone(component, zone, &wz); - if (result != UDS_SUCCESS) { - return result; - } - - CompletionStatus comp = CS_COMPLETED_PREVIOUSLY; - - IncrementalWriter incrFunc = getIncrementalWriter(component); - if ((wz->phase != IWC_IDLE) && (wz->phase != IWC_DONE)) { - result = (*incrFunc)(component, wz->writer, zone, IWC_ABORT, NULL); - wz->phase = IWC_IDLE; - if (result != UDS_SUCCESS) { - return result; - } - comp = CS_JUST_COMPLETED; - } - - if (status != NULL) { - *status = comp; - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int abortIndexComponentIncrementalSave(IndexComponent *component) -{ - int result = UDS_SUCCESS; - unsigned int zone; - for (zone = 0; zone < component->numZones; ++zone) { - WriteZone *wz = component->writeZones[zone]; - IncrementalWriter incrFunc = getIncrementalWriter(component); - if ((wz->phase != IWC_IDLE) && (wz->phase != IWC_DONE)) { - // Note: this is only safe if no other threads are currently processing - // this particular index - result = (*incrFunc)(component, wz->writer, zone, IWC_ABORT, NULL); - wz->phase = IWC_IDLE; - if (result != UDS_SUCCESS) { - return result; - } - } - - int result = closeBufferedWriter(wz); - if (result != UDS_SUCCESS) { - return result; - } - } - - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int discardIndexComponent(IndexComponent *component) -{ - if (!component->info->ioStorage) { - return UDS_INVALID_ARGUMENT; - } - - unsigned int numZones = 0; - unsigned int saveSlot = 0; - int result = findLatestIndexSaveSlot(component->state->layout, &numZones, - &saveSlot); - if (result != UDS_SUCCESS) { - return result; - } - - unsigned int oldSaveSlot = component->state->saveSlot; - component->state->saveSlot = saveSlot; - - unsigned int z; - for (z = 0; z < numZones; ++z) { - BufferedWriter *writer; - int result = openStateBufferedWriter(component->state, - component->info->kind, z, &writer); - if (result != UDS_SUCCESS) { - break; - } - result = writeZerosToBufferedWriter(writer, UDS_BLOCK_SIZE); - if (result != UDS_SUCCESS) { - break; - } - result = flushBufferedWriter(writer); - if (result != UDS_SUCCESS) { - break; - } - freeBufferedWriter(writer); - } - - component->state->saveSlot 
= oldSaveSlot; - return result; -} diff --git a/uds/indexComponent.h b/uds/indexComponent.h deleted file mode 100644 index 22066b1..0000000 --- a/uds/indexComponent.h +++ /dev/null @@ -1,363 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/indexComponent.h#5 $ - */ - -#ifndef INDEX_COMPONENT_H -#define INDEX_COMPONENT_H 1 - -#include "common.h" - -#include "bufferedReader.h" -#include "bufferedWriter.h" -#include "compiler.h" -#include "regionIdentifiers.h" - -typedef enum completionStatus { - CS_NOT_COMPLETED, // operation has not completed - CS_JUST_COMPLETED, // operation just completed - CS_COMPLETED_PREVIOUSLY // operation completed previously -} CompletionStatus; - -typedef struct readPortal { - struct indexComponent *component; - BufferedReader **readers; - unsigned int zones; -} ReadPortal; - -/** - * Prototype for functions which can load an index component from its - * saved state. - * - * @param portal A component portal which can be used to load the - * specified component. - * @return UDS_SUCCESS or an error code - **/ -typedef int (*Loader)(ReadPortal *portal); - -/** - * Prototype for functions which can save an index component. - * - * @param component The index component. - * @param writer A buffered writer. - * @param zone The zone number. - * - * @return UDS_SUCCESS or an error code - **/ -typedef int (*Saver)(struct indexComponent *component, - BufferedWriter *writer, - unsigned int zone); - -/** - * Command code used by IncrementalWriter function protocol. - **/ -typedef enum incrementalWriterCommand { - IWC_START, //< start an incremental save - IWC_CONTINUE, //< continue an incremental save - IWC_FINISH, //< force finish of incremental save - IWC_ABORT, //< abort incremental save - IWC_IDLE = -1,//< not a command, used internally to signify not in progress - IWC_DONE = -2 //< not a command, used internally to signify async completion -} IncrementalWriterCommand; - -typedef struct writeZone { - struct indexComponent *component; - IncrementalWriterCommand phase; - BufferedWriter *writer; - unsigned int zone; -} WriteZone; - -/** - * @param [in] component The index component. - * @param [in] writer A buffered writer. - * @param [in] zone The zone number (0 for non-multi-zone). - * @param [in] command The incremental writer command. - * @param [out] completed If non-NULL, set to whether save is done. - * - * @return UDS_SUCCESS or an error code - **/ -typedef int (*IncrementalWriter)(struct indexComponent *component, - BufferedWriter *writer, - unsigned int zone, - IncrementalWriterCommand command, - bool *completed); - -/** - * The structure describing how to load or save an index component. - * At least one of saver or incremental must be specified. 
- **/ -typedef struct indexComponentInfo { - RegionKind kind; // Region kind - const char *name; // The name of the component (for logging) - bool saveOnly; // Used for saves but not checkpoints - bool chapterSync; // Saved by the chapter writer - bool multiZone; // Does this component have multiple zones? - bool ioStorage; // Do we do I/O directly to storage? - Loader loader; // The function load this component - Saver saver; // The function to store this component - IncrementalWriter incremental; // The function for incremental writing -} IndexComponentInfo; - -/** - * The structure representing a savable (and loadable) part of an index. - **/ -typedef struct indexComponent { - const IndexComponentInfo *info; // IndexComponentInfo specification - void *componentData; // The object to load or save - void *context; // The context used to load or save - struct indexState *state; // The index state - unsigned int numZones; // Number of zones in write portal - WriteZone **writeZones; // State for writing component -} IndexComponent; - -/** - * Make an index component - * - * @param state The index state in which this component instance - * shall reside. - * @param info The component info specification for this component. - * @param zoneCount How many active zones are in use. - * @param data Component-specific data. - * @param context Component-specific context. - * @param componentPtr Where to store the resulting component. - * - * @return UDS_SUCCESS or an error code - **/ -int makeIndexComponent(struct indexState *state, - const IndexComponentInfo *info, - unsigned int zoneCount, - void *data, - void *context, - IndexComponent **componentPtr) - __attribute__((warn_unused_result)); - -/** - * Destroy and index component. - * - * @param componentPtr A pointer to the component to be freed. - **/ -void freeIndexComponent(IndexComponent **componentPtr); - -/** - * Return the index component name for this component. - **/ -static INLINE const char *indexComponentName(IndexComponent *component) -{ - return component->info->name; -} - -/** - * Return the index component data for this component. - **/ -static INLINE void *indexComponentData(IndexComponent *component) -{ - return component->componentData; -} - -/** - * Return the index component context for this component. - **/ -static INLINE void *indexComponentContext(IndexComponent *component) -{ - return component->context; -} - -/** - * Determine whether this component may be skipped for a checkpoint. - * - * @param component the component, - * - * @return whether the component may be skipped - **/ -static INLINE bool skipIndexComponentOnCheckpoint(IndexComponent *component) -{ - return component->info->saveOnly; -} - -/** - * Determine whether actual saving during a checkpoint should be - * invoked by the chapter writer thread. - **/ -static INLINE bool -deferIndexComponentCheckpointToChapterWriter(IndexComponent *component) -{ - return component->info->chapterSync; -} - -/** - * Determine whether a replay is required if component is missing. - * - * @param component the component - * - * @return whether the component is final (that is, contains shutdown state) - **/ -static INLINE bool -missingIndexComponentRequiresReplay(IndexComponent *component) -{ - return component->info->saveOnly; -} - -/** - * Read a component's state. - * - * @param component The component to read. - * - * @return UDS_SUCCESS, an error code from reading, or UDS_INVALID_ARGUMENT - * if the component is NULL. 
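/*
 * Editor's note: hypothetical example, not taken from the original sources,
 * of how an IndexComponentInfo table describes a component to the machinery
 * declared in this header. The names MY_COMPONENT_INFO, loadMyComponent and
 * saveMyComponent, and the RegionKind value, are placeholders; real tables
 * of this shape are INDEX_STATE_INFO and OPEN_CHAPTER_INFO elsewhere in the
 * source tree.
 */
static int loadMyComponent(ReadPortal *portal);
static int saveMyComponent(struct indexComponent *component,
                           BufferedWriter        *writer,
                           unsigned int           zone);

static const IndexComponentInfo MY_COMPONENT_INFO = {
  .kind        = RL_KIND_INDEX,  /* placeholder RegionKind from regionIdentifiers.h */
  .name        = "my component",
  .saveOnly    = false,          /* participates in checkpoints as well as saves */
  .chapterSync = false,          /* not written by the chapter writer thread */
  .multiZone   = false,          /* zone 0 stands in for the whole component */
  .ioStorage   = true,           /* reads and writes go directly to storage */
  .loader      = loadMyComponent,
  .saver       = saveMyComponent,
  .incremental = NULL,           /* a plain saver suffices; see wrapSaverAsIncremental() */
};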
- **/ -int readIndexComponent(IndexComponent *component) - __attribute__((warn_unused_result)); - -/** - * Write a state file. - * - * @param component The component to write - * - * @return UDS_SUCCESS, an error code from writing, or UDS_INVALID_ARGUMENT - * if the component is NULL. - **/ -int writeIndexComponent(IndexComponent *component) - __attribute__((warn_unused_result)); - -/** - * Start an incremental save for this component (all zones). - * - * @param [in] component The index component. - * - * @return UDS_SUCCESS or an error code. - **/ -int startIndexComponentIncrementalSave(IndexComponent *component) - __attribute__((warn_unused_result)); - -/** - * Perform an incremental save for a component in a particular zone. - * - * @param [in] component The index component. - * @param [in] zone The zone number. - * @param [out] completed Pointer to hold completion status result. - * - * @return UDS_SUCCESS or an error code. - * - * @note If an incremental save is not supported, a regular - * save will be performed if this is the first call in zone 0. - **/ - int performIndexComponentZoneSave(IndexComponent *component, - unsigned int zone, - CompletionStatus *completed) - __attribute__((warn_unused_result)); - -/** - * Perform an incremental save for a non-multizone component synchronized - * with the chapter writer. - * - * @param component The index component. - **/ -int performIndexComponentChapterWriterSave(IndexComponent *component) - __attribute__((warn_unused_result)); - -/** - * Force the completion of an incremental save currently in progress in - * a particular zone. - * - * @param [in] component The index component. - * @param [in] zone The zone number. - * @param [out] completed Pointer to hold completion status result. - * - * @return UDS_SUCCESS or an error code. - **/ -int finishIndexComponentZoneSave(IndexComponent *component, - unsigned int zone, - CompletionStatus *completed) - __attribute__((warn_unused_result)); - -/** - * Force the completion of an incremental save in all zones and complete - * the overal save. - * - * @param [in] component The index component. - * - * @return UDS_SUCCESS or an error code. - * - * @note If all zones call finishIndexComponentZoneSave first, only - * the common non-index-related completion code is required, - * which protects access to the index data structures from the - * invoking thread. - **/ -int finishIndexComponentIncrementalSave(IndexComponent *component) - __attribute__((warn_unused_result)); - -/** - * Abort the incremental save currently in progress in a particular zone. - * - * @param [in] component The index component. - * @param [in] zone The zone number. - * @param [out] completed Pointer to hold completion status result. - * - * @return UDS_SUCCESS or an error code. - * - * @note "Completed" in this case means completed or aborted. - * Once any zone calls this function the entire save is - * useless unless every zone indicates CS_COMPLETED_PREVIOUSLY. - **/ -int abortIndexComponentZoneSave(IndexComponent *component, - unsigned int zone, - CompletionStatus *completed) - __attribute__((warn_unused_result)); - -/** - * Abort an incremental save currently in progress - * - * @param [in] component The index component. - * - * @return UDS_SUCCESS or an error code. - * - * @note If all zones call abortIndexComponentZoneSave first, only - * the common non-index-related completion code is required, - * which protects access to the index data structures from the - * invoking thread. 
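/*
 * Editor's note: hypothetical sketch, not part of the original header, of the
 * call sequence these declarations describe: start an incremental save, let
 * every zone make progress, then force completion, or abort on error. The
 * wrapper name checkpointComponent and its simplified error handling are
 * placeholders for this example only.
 */
static int checkpointComponent(IndexComponent *component)
{
  int result = startIndexComponentIncrementalSave(component);
  if (result != UDS_SUCCESS) {
    return result;
  }

  unsigned int zone;
  for (zone = 0; zone < component->numZones; zone++) {
    CompletionStatus status = CS_NOT_COMPLETED;
    result = performIndexComponentZoneSave(component, zone, &status);
    if (result != UDS_SUCCESS) {
      /* Abandon this save; the previously written save remains usable. */
      int abortResult = abortIndexComponentIncrementalSave(component);
      return (abortResult == UDS_SUCCESS) ? result : abortResult;
    }
  }

  /* Force any zones that are still in progress to complete. */
  return finishIndexComponentIncrementalSave(component);
}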
- **/ -int abortIndexComponentIncrementalSave(IndexComponent *component) - __attribute__((warn_unused_result)); - -/** - * Remove or invalidate component state. - * - * @param component The component whose file is to be removed. If NULL - * no action is taken. - **/ -__attribute__((warn_unused_result)) -int discardIndexComponent(IndexComponent *component); - -/** - * Get a buffered reader for the specified component part. - * - * @param [in] portal The component portal. - * @param [in] part The component ordinal number. - * @param [out] readerPtr Where to put the buffered reader. - * - * @return UDS_SUCCESS or an error code. - * - * @note the reader is managed by the component portal - **/ -__attribute__((warn_unused_result)) -int getBufferedReaderForPortal(ReadPortal *portal, - unsigned int part, - BufferedReader **readerPtr); - -#endif /* INDEX_COMPONENT_H */ diff --git a/uds/indexConfig.c b/uds/indexConfig.c deleted file mode 100644 index 7ef86f2..0000000 --- a/uds/indexConfig.c +++ /dev/null @@ -1,288 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/jasper/src/uds/indexConfig.c#2 $ - */ - -#include "indexConfig.h" - -#include "buffer.h" -#include "logger.h" -#include "memoryAlloc.h" - -static const byte INDEX_CONFIG_MAGIC[] = "ALBIC"; -static const byte INDEX_CONFIG_VERSION[] = "06.02"; -static const byte INDEX_CONFIG_VERSION_6_01[] = "06.01"; - -enum { - INDEX_CONFIG_MAGIC_LENGTH = sizeof(INDEX_CONFIG_MAGIC) - 1, - INDEX_CONFIG_VERSION_LENGTH = sizeof(INDEX_CONFIG_VERSION) - 1 -}; - -/**********************************************************************/ -__attribute__((warn_unused_result)) -static int decodeIndexConfig(Buffer *buffer, UdsConfiguration config) -{ - int result = getUInt32LEFromBuffer(buffer, &config->recordPagesPerChapter); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt32LEFromBuffer(buffer, &config->chaptersPerVolume); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt32LEFromBuffer(buffer, &config->sparseChaptersPerVolume); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt32LEFromBuffer(buffer, &config->cacheChapters); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt32LEFromBuffer(buffer, &config->checkpointFrequency); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt32LEFromBuffer(buffer, &config->masterIndexMeanDelta); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt32LEFromBuffer(buffer, &config->bytesPerPage); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt32LEFromBuffer(buffer, &config->sparseSampleRate); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt64LEFromBuffer(buffer, &config->nonce); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, - "%zu bytes decoded of %zu expected", - bufferLength(buffer) - contentLength(buffer), - bufferLength(buffer)); - if (result != UDS_SUCCESS) { - result = UDS_CORRUPT_COMPONENT; - } - return result; -} - -/**********************************************************************/ -static int readVersion(BufferedReader *reader, - UdsConfiguration conf, - const char **versionPtr) -{ - byte buffer[INDEX_CONFIG_VERSION_LENGTH]; - int result = readFromBufferedReader(reader, buffer, - INDEX_CONFIG_VERSION_LENGTH); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, "cannot read index config version"); - } - if (memcmp(INDEX_CONFIG_VERSION, buffer, INDEX_CONFIG_VERSION_LENGTH) == 0) { - Buffer *buffer; - result = makeBuffer(sizeof(*conf), &buffer); - if (result != UDS_SUCCESS) { - return result; - } - result = readFromBufferedReader(reader, getBufferContents(buffer), - bufferLength(buffer)); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return logErrorWithStringError(result, "cannot read config data"); - } - clearBuffer(buffer); - result = decodeIndexConfig(buffer, conf); - freeBuffer(&buffer); - if (result != UDS_SUCCESS) { - return result; - } - if (versionPtr != NULL) { - *versionPtr = "current"; - } - return result; - } else if (memcmp(INDEX_CONFIG_VERSION_6_01, buffer, - INDEX_CONFIG_VERSION_LENGTH) == 0) { - struct udsConfiguration6_01 oldConf; - result = readFromBufferedReader(reader, &oldConf, sizeof(oldConf)); - if (result != UDS_SUCCESS) { - logErrorWithStringError(result, - "failed to read version 6.01 config file"); - return result; - } - conf->recordPagesPerChapter = oldConf.recordPagesPerChapter; - conf->chaptersPerVolume = oldConf.chaptersPerVolume; - conf->sparseChaptersPerVolume 
= oldConf.sparseChaptersPerVolume; - conf->cacheChapters = oldConf.cacheChapters; - conf->checkpointFrequency = oldConf.checkpointFrequency; - conf->masterIndexMeanDelta = oldConf.masterIndexMeanDelta; - conf->bytesPerPage = oldConf.bytesPerPage; - conf->sparseSampleRate = oldConf.sparseSampleRate; - conf->nonce = 0; - if (versionPtr != NULL) { - *versionPtr = "6.01"; - } - return UDS_UNSUPPORTED_VERSION; - } - - return logErrorWithStringError(UDS_CORRUPT_COMPONENT, - "unsupported configuration version: '%.*s'", - INDEX_CONFIG_VERSION_LENGTH, buffer); -} - -/**********************************************************************/ -int readConfigContents(BufferedReader *reader, - UdsConfiguration config) -{ - int result = verifyBufferedData(reader, INDEX_CONFIG_MAGIC, - INDEX_CONFIG_MAGIC_LENGTH); - if (result != UDS_SUCCESS) { - return result; - } - - const char *version = NULL; - result = readVersion(reader, config, &version); - if (result != UDS_SUCCESS) { - if (result == UDS_UNSUPPORTED_VERSION) { - logNoticeWithStringError(result, "Found index config version %s", - version); - } else { - logErrorWithStringError(result, "Failed to read index config"); - } - } - return result; -} - -/**********************************************************************/ -__attribute__((warn_unused_result)) -static int encodeIndexConfig(Buffer *buffer, UdsConfiguration config) -{ - int result = putUInt32LEIntoBuffer(buffer, config->recordPagesPerChapter); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt32LEIntoBuffer(buffer, config->chaptersPerVolume); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt32LEIntoBuffer(buffer, config->sparseChaptersPerVolume); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt32LEIntoBuffer(buffer, config->cacheChapters); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt32LEIntoBuffer(buffer, config-> checkpointFrequency); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt32LEIntoBuffer(buffer, config->masterIndexMeanDelta); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt32LEIntoBuffer(buffer, config->bytesPerPage); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt32LEIntoBuffer(buffer, config->sparseSampleRate); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt64LEIntoBuffer(buffer, config->nonce); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_LOG_ONLY(contentLength(buffer) == sizeof(*config), - "%zu bytes encoded, of %zu expected", - contentLength(buffer), sizeof(*config)); - return result; -} - -/**********************************************************************/ -int writeConfigContents(BufferedWriter *writer, - UdsConfiguration config) -{ - int result = writeToBufferedWriter(writer, INDEX_CONFIG_MAGIC, - INDEX_CONFIG_MAGIC_LENGTH); - if (result != UDS_SUCCESS) { - return result; - } - result = writeToBufferedWriter(writer, INDEX_CONFIG_VERSION, - INDEX_CONFIG_VERSION_LENGTH); - if (result != UDS_SUCCESS) { - return result; - } - Buffer *buffer; - result = makeBuffer(sizeof(*config), &buffer); - if (result != UDS_SUCCESS) { - return result; - } - result = encodeIndexConfig(buffer, config); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return result; - } - result = writeToBufferedWriter(writer, getBufferContents(buffer), - contentLength(buffer)); - freeBuffer(&buffer); - return result; -} - -/**********************************************************************/ -int 
makeConfiguration(UdsConfiguration conf, Configuration **configPtr) -{ - *configPtr = NULL; - if (conf == NULL) { - return logErrorWithStringError(UDS_CONF_REQUIRED, - "received an invalid config"); - } - - Configuration *config; - int result = ALLOCATE(1, Configuration, "configuration", &config); - if (result != UDS_SUCCESS) { - return result; - } - - result = makeGeometry(conf->bytesPerPage, - conf->recordPagesPerChapter, - conf->chaptersPerVolume, - conf->sparseChaptersPerVolume, - &config->geometry); - if (result != UDS_SUCCESS) { - freeConfiguration(config); - return result; - } - - config->sparseSampleRate = conf->sparseSampleRate; - config->cacheChapters = conf->cacheChapters; - config->masterIndexMeanDelta = conf->masterIndexMeanDelta; - - *configPtr = config; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void freeConfiguration(Configuration *config) -{ - if (config != NULL) { - freeGeometry(config->geometry); - FREE(config); - } -} diff --git a/uds/indexConfig.h b/uds/indexConfig.h deleted file mode 100644 index dab3d6a..0000000 --- a/uds/indexConfig.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/indexConfig.h#2 $ - */ - -#ifndef INDEX_CONFIG_H -#define INDEX_CONFIG_H 1 - -#include "config.h" -#include "geometry.h" - -/** - * A set of configuration parameters for the indexer. - **/ -struct configuration { - /* Parameters for the volume */ - - /* The volume layout */ - Geometry *geometry; - - /* Size of the page cache and sparse chapter index cache, in chapters */ - unsigned int cacheChapters; - - /** Parameters for the master index */ - - /* The mean delta for the master index */ - unsigned int masterIndexMeanDelta; - - /* Sampling rate for sparse indexing */ - unsigned int sparseSampleRate; -}; - -#endif /* INDEX_CONFIG_H */ diff --git a/uds/indexInternals.c b/uds/indexInternals.c deleted file mode 100644 index 48268c7..0000000 --- a/uds/indexInternals.c +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/indexInternals.c#7 $ - */ - -#include "indexInternals.h" - -#include "errors.h" -#include "indexCheckpoint.h" -#include "indexStateData.h" -#include "indexZone.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "openChapter.h" -#include "request.h" -#include "stringUtils.h" -#include "threads.h" -#include "typeDefs.h" -#include "volume.h" -#include "zone.h" - -static const unsigned int MAX_COMPONENT_COUNT = 4; - -/**********************************************************************/ -int allocateIndex(IndexLayout *layout, - const Configuration *config, - const struct uds_parameters *userParams, - unsigned int zoneCount, - LoadType loadType, - Index **newIndex) -{ - unsigned int checkpoint_frequency - = userParams == NULL ? 0 : userParams->checkpoint_frequency; - if (checkpoint_frequency >= config->geometry->chaptersPerVolume) { - return UDS_BAD_CHECKPOINT_FREQUENCY; - } - - Index *index; - int result = ALLOCATE(1, Index, "index", &index); - if (result != UDS_SUCCESS) { - return result; - } - - index->existed = (loadType != LOAD_CREATE); - index->hasSavedOpenChapter = true; - index->loadedType = LOAD_UNDEFINED; - - result = makeIndexCheckpoint(index); - if (result != UDS_SUCCESS) { - freeIndex(index); - return result; - } - setIndexCheckpointFrequency(index->checkpoint, checkpoint_frequency); - - getIndexLayout(layout, &index->layout); - index->zoneCount = zoneCount; - - result = ALLOCATE(index->zoneCount, IndexZone *, "zones", - &index->zones); - if (result != UDS_SUCCESS) { - freeIndex(index); - return result; - } - - result = makeIndexState(layout, index->zoneCount, MAX_COMPONENT_COUNT, - &index->state); - if (result != UDS_SUCCESS) { - freeIndex(index); - return result; - } - - result = addIndexStateComponent(index->state, &INDEX_STATE_INFO, index, - NULL); - if (result != UDS_SUCCESS) { - freeIndex(index); - return result; - } - - result = makeVolume(config, index->layout, userParams, - VOLUME_CACHE_DEFAULT_MAX_QUEUED_READS, index->zoneCount, - &index->volume); - if (result != UDS_SUCCESS) { - freeIndex(index); - return result; - } - index->volume->lookupMode = LOOKUP_NORMAL; - - unsigned int i; - for (i = 0; i < index->zoneCount; i++) { - result = makeIndexZone(index, i); - if (result != UDS_SUCCESS) { - freeIndex(index); - return logErrorWithStringError(result, "Could not create index zone"); - } - } - - result = addIndexStateComponent(index->state, &OPEN_CHAPTER_INFO, index, - NULL); - if (result != UDS_SUCCESS) { - freeIndex(index); - return logErrorWithStringError(result, "Could not create open chapter"); - } - - *newIndex = index; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void releaseIndex(Index *index) -{ - if (index == NULL) { - return; - } - - if (index->zones != NULL) { - unsigned int i; - for (i = 0; i < index->zoneCount; i++) { - freeIndexZone(index->zones[i]); - } - FREE(index->zones); - } - - freeVolume(index->volume); - - freeIndexState(&index->state); - freeIndexCheckpoint(index->checkpoint); - putIndexLayout(&index->layout); - FREE(index); -} diff --git a/uds/indexInternals.h b/uds/indexInternals.h deleted file mode 100644 index 16cb56a..0000000 --- a/uds/indexInternals.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2020 Red 
Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/indexInternals.h#3 $ - */ - -#ifndef INDEX_INTERNALS_H -#define INDEX_INTERNALS_H - -#include "index.h" -#include "loadType.h" -#include "request.h" - -/** - * Construct a new index from the given configuration. - * - * @param layout The index layout to use - * @param config The configuration to use - * @param userParams The index session parameters. If NULL, the default - * session parameters will be used. - * @param zoneCount The number of zones for this index to use - * @param loadType How to create the index: it can be create only, allow - * loading from files, and allow rebuilding from the volume - * @param newIndex A pointer to hold a pointer to the new index - * - * @return UDS_SUCCESS or an error code - **/ -int allocateIndex(IndexLayout *layout, - const Configuration *config, - const struct uds_parameters *userParams, - unsigned int zoneCount, - LoadType loadType, - Index **newIndex) - __attribute__((warn_unused_result)); - -/** - * Clean up the index and its memory. - * - * @param index The index to destroy. - **/ -void releaseIndex(Index *index); - -#endif /* INDEX_INTERNALS_H */ diff --git a/uds/indexLayout.c b/uds/indexLayout.c deleted file mode 100644 index cb019ff..0000000 --- a/uds/indexLayout.c +++ /dev/null @@ -1,2409 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/indexLayout.c#19 $ - */ - -#include "indexLayout.h" - -#include "buffer.h" -#include "compiler.h" -#include "config.h" -#include "indexConfig.h" -#include "layoutRegion.h" -#include "logger.h" -#include "masterIndexOps.h" -#include "memoryAlloc.h" -#include "nonce.h" -#include "openChapter.h" - -/* - * Overall layout of an index on disk: - * - * The layout is divided into a number of fixed-size regions, the sizes of - * which are computed when the index is created. Every header and region - * begins on 4K block boundary. Save regions are further sub-divided into - * regions of their own. - * - * Each region has a kind and an instance number. 
Some kinds only have one - * instance and therefore use RL_SOLE_INSTANCE (-1) as the instance number. - * The RL_KIND_INDEX uses instances to represent sub-indices, where used. - * A save region can either hold a checkpoint or a clean shutdown (determined - * by the type). The instances determine which available save slot is used. - * The RL_KIND_MASTER_INDEX uses instances to record which zone is being saved. - * - * +-+-+--------+--------+--------+-----+--- -+-+ - * | | | I N D E X 0 101, 0 | ... | | - * |H|C+--------+--------+--------+-----+--- -+S| - * |D|f| Volume | Save | Save | | |e| - * |R|g| Region | Region | Region | ... | ... |a| - * | | | 201 -1 | 202 0 | 202 1 | | |l| - * +-+-+--------+--------+--------+-----+--- -+-+ - * - * The header contains the encoded regional layout table as well as - * the saved index configuration record. The sub-index regions and their - * subdivisions are maintained in the same table. - * - * There are at least two save regions per sub-index to preserve the old - * state should the saving of a state be incomplete. They are used in - * a round-robin fashion. - * - * Anatomy of a save region: - * - * +-+-----+------+------+-----+ -+-----+ - * |H| IPM | MI | MI | | | OC | - * |D| | zone | zone | ... | | | - * |R| 301 | 302 | 302 | | | 303 | - * | | -1 | 0 | 1 | | | -1 | - * +-+-----+------+------+-----+ -+-----+ - * - * Every region header has a type (and version). In save regions, - * the open chapter only appears in RL_TYPE_SAVE not RL_TYPE_CHECKPOINT, - * although the same space is reserved for both. - * - * The header contains the encoded regional layout table as well as the - * index state record for that save or checkpoint. Each save or checkpoint - * has a unique generation number and nonce which is used to seed the - * checksums of those regions. - */ - -typedef struct indexSaveData_v1 { - uint64_t timestamp; // ms since epoch... - uint64_t nonce; - uint32_t version; // 1 - uint32_t unused__; -} IndexSaveData; - -typedef struct indexSaveLayout { - LayoutRegion indexSave; - LayoutRegion header; - unsigned int numZones; - LayoutRegion indexPageMap; - LayoutRegion freeSpace; - LayoutRegion *masterIndexZones; - LayoutRegion *openChapter; - IndexSaveType saveType; - IndexSaveData saveData; - Buffer *indexStateBuffer; - bool read; - bool written; -} IndexSaveLayout; - -typedef struct subIndexLayout { - LayoutRegion subIndex; - uint64_t nonce; - LayoutRegion volume; - IndexSaveLayout *saves; -} SubIndexLayout; - -typedef struct superBlockData_v1 { - byte magicLabel[32]; - byte nonceInfo[32]; - uint64_t nonce; - uint32_t version; // 2 - uint32_t blockSize; // for verification - uint16_t numIndexes; // 1 - uint16_t maxSaves; - uint64_t openChapterBlocks; - uint64_t pageMapBlocks; -} SuperBlockData; - -struct indexLayout { - IOFactory *factory; - off_t offset; - struct index_version indexVersion; - SuperBlockData super; - LayoutRegion header; - LayoutRegion config; - SubIndexLayout index; - LayoutRegion seal; - uint64_t totalBlocks; - int refCount; -}; - -/** - * Structure used to compute single file layout sizes. - * - * Note that the masterIndexBlocks represent all zones and are sized for - * the maximum number of blocks that would be needed regardless of the number - * of zones (up to the maximum value) that are used at run time. - * - * Similarly, the number of saves is sized for the minimum safe value - * assuming checkpointing is enabled, since that is also a run-time parameter. 
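/*
 * Editor's note: hypothetical, self-contained illustration (not part of the
 * original file) of the block arithmetic that computeSizes() performs below.
 * Every byte count used here is a made-up placeholder; only the round-up rule
 * of blockCount() and the formulas
 *   saveBlocks     = 1 + masterIndexBlocks + pageMapBlocks + openChapterBlocks
 *   subIndexBlocks = volumeBlocks + numSaves * saveBlocks
 *   totalBlocks    = 3 + subIndexBlocks
 * come from the code that follows.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t roundUpToBlocks(uint64_t bytes, uint32_t blockSize)
{
  /* Same rounding as blockCount(): any partial block counts as a whole one. */
  return (bytes / blockSize) + ((bytes % blockSize > 0) ? 1 : 0);
}

int main(void)
{
  const uint32_t blockSize         = 4096;                    /* UDS_BLOCK_SIZE */
  const unsigned int numSaves      = 2 + 0;                   /* 2 + numCheckpoints */
  const uint64_t volumeBlocks      = roundUpToBlocks(UINT64_C(1) << 30, blockSize);
  const uint64_t masterIndexBlocks = UINT64_C(2048);          /* placeholder */
  const uint64_t pageMapBlocks     = roundUpToBlocks(64 * 1024, blockSize);
  const uint64_t openChapterBlocks = roundUpToBlocks(256 * 1024, blockSize);

  const uint64_t saveBlocks     = 1 + masterIndexBlocks + pageMapBlocks + openChapterBlocks;
  const uint64_t subIndexBlocks = volumeBlocks + (numSaves * saveBlocks);
  const uint64_t totalBlocks    = 3 + subIndexBlocks;

  printf("hypothetical index size: %" PRIu64 " bytes\n", totalBlocks * blockSize);
  return 0;
}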
- **/ -typedef struct saveLayoutSizes { - Configuration config; // this is a captive copy - Geometry geometry; // this is a captive copy - unsigned int numSaves; // per sub-index - size_t blockSize; // in bytes - uint64_t volumeBlocks; // per sub-index - uint64_t masterIndexBlocks; // per save - uint64_t pageMapBlocks; // per save - uint64_t openChapterBlocks; // per save - uint64_t saveBlocks; // per sub-index - uint64_t subIndexBlocks; // per sub-index - uint64_t totalBlocks; // for whole layout -} SaveLayoutSizes; - -enum { - INDEX_STATE_BUFFER_SIZE = 512, - MAX_SAVES = 5, -}; - -static const byte SINGLE_FILE_MAGIC_1[32] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*"; -enum { - SINGLE_FILE_MAGIC_1_LENGTH = sizeof(SINGLE_FILE_MAGIC_1), -}; - -static int reconstituteSingleFileLayout(IndexLayout *layout, - SuperBlockData *super, - RegionTable *table, - uint64_t firstBlock) - __attribute__((warn_unused_result)); -static int writeIndexSaveLayout(IndexLayout *layout, IndexSaveLayout *isl) - __attribute__((warn_unused_result)); - -/*****************************************************************************/ -static INLINE uint64_t blockCount(uint64_t bytes, uint32_t blockSize) -{ - uint64_t blocks = bytes / blockSize; - if (bytes % blockSize > 0) { - ++blocks; - } - return blocks; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int computeSizes(SaveLayoutSizes *sls, - const UdsConfiguration config, - size_t blockSize, - unsigned int numCheckpoints) -{ - if (config->bytesPerPage % blockSize != 0) { - return logErrorWithStringError(UDS_INCORRECT_ALIGNMENT, - "page size not a multiple of block size"); - } - - Configuration *cfg = NULL; - int result = makeConfiguration(config, &cfg); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, "cannot compute layout size"); - } - - memset(sls, 0, sizeof(*sls)); - - // internalize the configuration and geometry... 
- - sls->geometry = *cfg->geometry; - sls->config = *cfg; - sls->config.geometry = &sls->geometry; - - freeConfiguration(cfg); - - sls->numSaves = 2 + numCheckpoints; - sls->blockSize = blockSize; - sls->volumeBlocks = sls->geometry.bytesPerVolume / blockSize; - - result = computeMasterIndexSaveBlocks(&sls->config, blockSize, - &sls->masterIndexBlocks); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, "cannot compute index save size"); - } - - sls->pageMapBlocks = - blockCount(computeIndexPageMapSaveSize(&sls->geometry), blockSize); - sls->openChapterBlocks = - blockCount(computeSavedOpenChapterSize(&sls->geometry), blockSize); - sls->saveBlocks = 1 + (sls->masterIndexBlocks + - sls->pageMapBlocks + sls->openChapterBlocks); - sls->subIndexBlocks = sls->volumeBlocks + (sls->numSaves * sls->saveBlocks); - sls->totalBlocks = 3 + sls->subIndexBlocks; - - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int udsComputeIndexSize(const UdsConfiguration config, - unsigned int numCheckpoints, - uint64_t *indexSize) -{ - SaveLayoutSizes sizes; - int result = computeSizes(&sizes, config, UDS_BLOCK_SIZE, numCheckpoints); - if (result != UDS_SUCCESS) { - return result; - } - - if (indexSize != NULL) { - *indexSize = sizes.totalBlocks * sizes.blockSize; - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int openLayoutReader(IndexLayout *layout, - LayoutRegion *lr, - BufferedReader **readerPtr) -{ - off_t start = lr->startBlock * layout->super.blockSize; - size_t size = lr->numBlocks * layout->super.blockSize; - return openBufferedReader(layout->factory, start, size, readerPtr); -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int openLayoutWriter(IndexLayout *layout, - LayoutRegion *lr, - BufferedWriter **writerPtr) -{ - off_t start = lr->startBlock * layout->super.blockSize; - size_t size = lr->numBlocks * layout->super.blockSize; - return openBufferedWriter(layout->factory, start, size, writerPtr); -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int decodeIndexSaveData(Buffer *buffer, IndexSaveData *saveData) -{ - int result = getUInt64LEFromBuffer(buffer, &saveData->timestamp); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt64LEFromBuffer(buffer, &saveData->nonce); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt32LEFromBuffer(buffer, &saveData->version); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt32LEFromBuffer(buffer, &saveData->unused__); - if (result != UDS_SUCCESS) { - return result; - } - // The unused padding has to be zeroed for correct nonce calculation - if (saveData->unused__ != 0) { - return UDS_CORRUPT_COMPONENT; - } - result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, - "%zu bytes decoded of %zu expected", - bufferLength(buffer), sizeof(*saveData)); - if (result != UDS_SUCCESS) { - return UDS_CORRUPT_COMPONENT; - } - return result; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int decodeRegionHeader(Buffer *buffer, RegionHeader *header) -{ - int result = getUInt64LEFromBuffer(buffer, &header->magic); - if (result != UDS_SUCCESS) { - return result; - } - result = 
getUInt64LEFromBuffer(buffer, &header->regionBlocks); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt16LEFromBuffer(buffer, &header->type); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt16LEFromBuffer(buffer, &header->version); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt16LEFromBuffer(buffer, &header->numRegions); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt16LEFromBuffer(buffer, &header->payload); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, - "%zu bytes decoded of %zu expected", - bufferLength(buffer), sizeof(*header)); - if (result != UDS_SUCCESS) { - return UDS_CORRUPT_COMPONENT; - } - return result; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int decodeLayoutRegion(Buffer *buffer, LayoutRegion *region) -{ - size_t cl1 = contentLength(buffer); - - int result = getUInt64LEFromBuffer(buffer, ®ion->startBlock); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt64LEFromBuffer(buffer, ®ion->numBlocks); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt32LEFromBuffer(buffer, ®ion->checksum); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt16LEFromBuffer(buffer, ®ion->kind); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt16LEFromBuffer(buffer, ®ion->instance); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_LOG_ONLY(cl1 - contentLength(buffer) == sizeof(*region), - "%zu bytes decoded, of %zu expected", - cl1 - contentLength(buffer), sizeof(*region)); - if (result != UDS_SUCCESS) { - return UDS_CORRUPT_COMPONENT; - } - return result; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int loadRegionTable(BufferedReader *reader, RegionTable **tablePtr) -{ - Buffer *buffer; - int result = makeBuffer(sizeof(RegionHeader), &buffer); - if (result != UDS_SUCCESS) { - return result; - } - result = readFromBufferedReader(reader, getBufferContents(buffer), - bufferLength(buffer)); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return logErrorWithStringError(result, "cannot read region table header"); - } - result = resetBufferEnd(buffer, bufferLength(buffer)); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return result; - } - RegionHeader header; - result = decodeRegionHeader(buffer, &header); - freeBuffer(&buffer); - if (result != UDS_SUCCESS) { - return result; - } - if (header.magic != REGION_MAGIC) { - return UDS_NO_INDEX; - } - if (header.version != 1) { - return logErrorWithStringError(UDS_UNSUPPORTED_VERSION, - "unknown region table version %" PRIu16, - header.version); - } - - RegionTable *table; - result = ALLOCATE_EXTENDED(RegionTable, header.numRegions, LayoutRegion, - "single file layout region table", &table); - if (result != UDS_SUCCESS) { - return result; - } - - table->header = header; - result = makeBuffer(header.numRegions * sizeof(LayoutRegion), &buffer); - if (result != UDS_SUCCESS) { - FREE(table); - return result; - } - result = readFromBufferedReader(reader, getBufferContents(buffer), - bufferLength(buffer)); - if (result != UDS_SUCCESS) { - FREE(table); - freeBuffer(&buffer); - return logErrorWithStringError(UDS_CORRUPT_COMPONENT, - "cannot read region table layouts"); - } - result = resetBufferEnd(buffer, bufferLength(buffer)); - if 
(result != UDS_SUCCESS) { - FREE(table); - freeBuffer(&buffer); - return result; - } - unsigned int i; - for (i = 0; i < header.numRegions; i++){ - result = decodeLayoutRegion(buffer, &table->regions[i]); - if (result != UDS_SUCCESS) { - FREE(table); - freeBuffer(&buffer); - return result; - } - } - freeBuffer(&buffer); - *tablePtr = table; - return UDS_SUCCESS; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int decodeSuperBlockData(Buffer *buffer, SuperBlockData *super) -{ - int result = getBytesFromBuffer(buffer, 32, super->magicLabel); - if (result != UDS_SUCCESS) { - return result; - } - result = getBytesFromBuffer(buffer, 32, super->nonceInfo); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt64LEFromBuffer(buffer, &super->nonce); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt32LEFromBuffer(buffer, &super->version); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt32LEFromBuffer(buffer, &super->blockSize); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt16LEFromBuffer(buffer, &super->numIndexes); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt16LEFromBuffer(buffer, &super->maxSaves); - if (result != UDS_SUCCESS) { - return result; - } - result = skipForward(buffer, 4); // aligment - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt64LEFromBuffer(buffer, &super->openChapterBlocks); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt64LEFromBuffer(buffer, &super->pageMapBlocks); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, - "%zu bytes decoded of %zu expected", - bufferLength(buffer), sizeof(*super)); - if (result != UDS_SUCCESS) { - return UDS_CORRUPT_COMPONENT; - } - return result; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int readSuperBlockData(BufferedReader *reader, - SuperBlockData *super, - size_t savedSize) -{ - if (savedSize != sizeof(SuperBlockData)) { - return logErrorWithStringError(UDS_CORRUPT_COMPONENT, - "unexpected super block data size %zu", - savedSize); - } - - if (sizeof(super->magicLabel) != SINGLE_FILE_MAGIC_1_LENGTH) { - return logErrorWithStringError(UDS_CORRUPT_COMPONENT, - "super block magic label size incorrect"); - } - - Buffer *buffer; - int result = makeBuffer(savedSize, &buffer); - if (result != UDS_SUCCESS) { - return result; - } - result = readFromBufferedReader(reader, getBufferContents(buffer), - bufferLength(buffer)); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return logErrorWithStringError(result, "cannot read region table header"); - } - result = resetBufferEnd(buffer, bufferLength(buffer)); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return result; - } - result = decodeSuperBlockData(buffer, super); - freeBuffer(&buffer); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, "cannot read super block data"); - } - - if (memcmp(super->magicLabel, SINGLE_FILE_MAGIC_1, - SINGLE_FILE_MAGIC_1_LENGTH) != 0) { - return logErrorWithStringError(UDS_CORRUPT_COMPONENT, - "unknown superblock magic label"); - } - - if ((super->version < SUPER_VERSION_MINIMUM) - || (super->version > SUPER_VERSION_MAXIMUM)) { - return logErrorWithStringError(UDS_UNSUPPORTED_VERSION, - "unknown superblock version number %" - PRIu32, - super->version); - } - - // 
We dropped the usage of multiple subindices before we ever ran UDS code in - // the kernel. We do not have code that will handle multiple subindices. - if (super->numIndexes != 1) { - return logErrorWithStringError(UDS_CORRUPT_COMPONENT, - "invalid subindex count %" PRIu32, - super->numIndexes); - } - - if (generateMasterNonce(super->nonceInfo, sizeof(super->nonceInfo)) != - super->nonce) - { - return logErrorWithStringError(UDS_CORRUPT_COMPONENT, - "inconsistent superblock nonce"); - } - - return UDS_SUCCESS; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int allocateSingleFileParts(IndexLayout *layout, - SuperBlockData *super) -{ - int result = ALLOCATE(super->maxSaves, IndexSaveLayout, __func__, - &layout->index.saves); - if (result != UDS_SUCCESS) { - return result; - } - - return UDS_SUCCESS; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int loadSuperBlock(IndexLayout *layout, - size_t blockSize, - uint64_t firstBlock, - BufferedReader *reader) -{ - RegionTable *table = NULL; - int result = loadRegionTable(reader, &table); - if (result != UDS_SUCCESS) { - return result; - } - - if (table->header.type != RH_TYPE_SUPER) { - FREE(table); - return logErrorWithStringError(UDS_CORRUPT_COMPONENT, - "not a superblock region table"); - } - - SuperBlockData superBlockData; - result = readSuperBlockData(reader, &superBlockData, table->header.payload); - if (result != UDS_SUCCESS) { - FREE(table); - return logErrorWithStringError(result, "unknown superblock format"); - } - - if (superBlockData.blockSize != blockSize) { - FREE(table); - return logErrorWithStringError(UDS_WRONG_INDEX_CONFIG, - "superblock saved blockSize %" PRIu32 - " differs from supplied blockSize %zu", - superBlockData.blockSize, blockSize); - } - initializeIndexVersion(&layout->indexVersion, superBlockData.version); - - result = allocateSingleFileParts(layout, &superBlockData); - if (result != UDS_SUCCESS) { - FREE(table); - return result; - } - - result = reconstituteSingleFileLayout(layout, &superBlockData, table, - firstBlock); - FREE(table); - return result; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int readIndexSaveData(BufferedReader *reader, - IndexSaveData *saveData, - size_t savedSize, - Buffer **bufferPtr) -{ - int result = UDS_SUCCESS; - if (savedSize == 0) { - memset(saveData, 0, sizeof(*saveData)); - } else { - if (savedSize < sizeof(IndexSaveData)) { - return logErrorWithStringError(UDS_CORRUPT_COMPONENT, - "unexpected index save data size %zu", - savedSize); - } - - Buffer *buffer; - result = makeBuffer(sizeof(*saveData), &buffer); - if (result != UDS_SUCCESS) { - return result; - } - result = readFromBufferedReader(reader, getBufferContents(buffer), - bufferLength(buffer)); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return logErrorWithStringError(result, "cannot read index save data"); - } - result = resetBufferEnd(buffer, bufferLength(buffer)); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return result; - } - - result = decodeIndexSaveData(buffer, saveData); - freeBuffer(&buffer); - if (result != UDS_SUCCESS) { - return result; - } - - savedSize -= sizeof(IndexSaveData); - - if (saveData->version > 1) { - return logErrorWithStringError(UDS_UNSUPPORTED_VERSION, - "unkown index save verion number %" - PRIu32, - saveData->version); - } - 
- if (savedSize > INDEX_STATE_BUFFER_SIZE) { - return logErrorWithStringError(UDS_CORRUPT_COMPONENT, - "unexpected index state buffer size %zu", - savedSize); - } - } - - Buffer *buffer = NULL; - - if (saveData->version != 0) { - result = makeBuffer(INDEX_STATE_BUFFER_SIZE, &buffer); - if (result != UDS_SUCCESS) { - return result; - } - - if (savedSize > 0) { - result = readFromBufferedReader(reader, getBufferContents(buffer), - savedSize); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return result; - } - result = resetBufferEnd(buffer, savedSize); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return result; - } - } - } - - *bufferPtr = buffer; - return UDS_SUCCESS; -} - -/*****************************************************************************/ - -typedef struct { - LayoutRegion *nextRegion; - LayoutRegion *lastRegion; - uint64_t nextBlock; - int result; -} RegionIterator; - -/*****************************************************************************/ -__attribute__((format(printf, 2, 3))) -static void iterError(RegionIterator *iter, const char *fmt, ...) -{ - va_list args; - va_start(args, fmt); - int r = vLogWithStringError(LOG_ERR, UDS_UNEXPECTED_RESULT, fmt, args); - va_end(args); - if (iter->result == UDS_SUCCESS) { - iter->result = r; - } -} - -/** - * Set the next layout region in the layout according to a region table - * iterator, unless the iterator already contains an error - * - * @param expect whether to record an error or return false - * @param lr the layout region field to set - * @param iter the region iterator, which also holds the cumulative - * result - * @param numBlocks if non-zero, the expected number of blocks - * @param kind the expected kind of the region - * @param instance the expected instance number of the region - * - * @return true if we meet expectations, false if we do not - **/ -static bool expectLayout(bool expect, - LayoutRegion *lr, - RegionIterator *iter, - uint64_t numBlocks, - RegionKind kind, - unsigned int instance) -{ - if (iter->result != UDS_SUCCESS) { - return false; - } - - if (iter->nextRegion == iter->lastRegion) { - if (expect) { - iterError(iter, "ran out of layout regions in region table"); - } - return false; - } - - if (iter->nextRegion->startBlock != iter->nextBlock) { - iterError(iter, "layout region not at expected offset"); - return false; - } - - if (iter->nextRegion->kind != kind) { - if (expect) { - iterError(iter, "layout region has incorrect kind"); - } - return false; - } - - if (iter->nextRegion->instance != instance) { - iterError(iter, "layout region has incorrect instance"); - return false; - } - - if (numBlocks > 0 && iter->nextRegion->numBlocks != numBlocks) { - iterError(iter, "layout region size is incorrect"); - return false; - } - - if (lr != NULL) { - *lr = *iter->nextRegion; - } - - iter->nextBlock += iter->nextRegion->numBlocks; - iter->nextRegion++; - return true; -} - -/*****************************************************************************/ -static void setupLayout(LayoutRegion *lr, - uint64_t *nextAddrPtr, - uint64_t regionSize, - unsigned int kind, - unsigned int instance) -{ - *lr = (LayoutRegion) { - .startBlock = *nextAddrPtr, - .numBlocks = regionSize, - .checksum = 0, - .kind = kind, - .instance = instance, - }; - *nextAddrPtr += regionSize; -} - -/*****************************************************************************/ -static void populateIndexSaveLayout(IndexSaveLayout *isl, - SuperBlockData *super, - unsigned int numZones, - IndexSaveType saveType) -{ - 
uint64_t nextBlock = isl->indexSave.startBlock; - - setupLayout(&isl->header, &nextBlock, 1, RL_KIND_HEADER, RL_SOLE_INSTANCE); - setupLayout(&isl->indexPageMap, &nextBlock, super->pageMapBlocks, - RL_KIND_INDEX_PAGE_MAP, RL_SOLE_INSTANCE); - - uint64_t blocksAvail = (isl->indexSave.numBlocks - - (nextBlock - isl->indexSave.startBlock) - - super->openChapterBlocks); - - if (numZones > 0) { - uint64_t miBlockCount = blocksAvail / numZones; - unsigned int z; - for (z = 0; z < numZones; ++z) { - LayoutRegion *miz = &isl->masterIndexZones[z]; - setupLayout(miz, &nextBlock, miBlockCount, RL_KIND_MASTER_INDEX, z); - } - } - if (saveType == IS_SAVE && isl->openChapter != NULL) { - setupLayout(isl->openChapter, &nextBlock, super->openChapterBlocks, - RL_KIND_OPEN_CHAPTER, RL_SOLE_INSTANCE); - } - setupLayout(&isl->freeSpace, &nextBlock, - (isl->indexSave.numBlocks - - (nextBlock - isl->indexSave.startBlock)), - RL_KIND_SCRATCH, RL_SOLE_INSTANCE); -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int reconstructIndexSave(IndexSaveLayout *isl, - IndexSaveData *saveData, - SuperBlockData *super, - RegionTable *table) -{ - isl->numZones = 0; - isl->saveData = *saveData; - isl->read = false; - isl->written = false; - - if (table->header.type == RH_TYPE_SAVE) { - isl->saveType = IS_SAVE; - } else if (table->header.type == RH_TYPE_CHECKPOINT) { - isl->saveType = IS_CHECKPOINT; - } else { - isl->saveType = NO_SAVE; - } - - if ((table->header.numRegions == 0) || - ((table->header.numRegions == 1) && - (table->regions[0].kind == RL_KIND_SCRATCH))) - { - populateIndexSaveLayout(isl, super, 0, NO_SAVE); - return UDS_SUCCESS; - } - - RegionIterator iter = { - .nextRegion = table->regions, - .lastRegion = table->regions + table->header.numRegions, - .nextBlock = isl->indexSave.startBlock, - .result = UDS_SUCCESS, - }; - - expectLayout(true, &isl->header, &iter, 1, RL_KIND_HEADER, RL_SOLE_INSTANCE); - expectLayout(true, &isl->indexPageMap, &iter, 0, - RL_KIND_INDEX_PAGE_MAP, RL_SOLE_INSTANCE); - unsigned int n = 0; - RegionIterator tmpIter; - for (tmpIter = iter; - expectLayout(false, NULL, &tmpIter, 0, RL_KIND_MASTER_INDEX, n); - ++n) - ; - isl->numZones = n; - - int result = UDS_SUCCESS; - - if (isl->numZones > 0) { - result = ALLOCATE(n, LayoutRegion, "master index layout regions", - &isl->masterIndexZones); - if (result != UDS_SUCCESS) { - return result; - } - } - - if (isl->saveType == IS_SAVE) { - result = ALLOCATE(1, LayoutRegion, "open chapter layout region", - &isl->openChapter); - if (result != UDS_SUCCESS) { - FREE(isl->masterIndexZones); - return result; - } - } - - unsigned int z; - for (z = 0; z < isl->numZones; ++z) { - expectLayout(true, &isl->masterIndexZones[z], &iter, 0, - RL_KIND_MASTER_INDEX, z); - } - if (isl->saveType == IS_SAVE) { - expectLayout(true, isl->openChapter, &iter, 0, - RL_KIND_OPEN_CHAPTER, RL_SOLE_INSTANCE); - } - if (!expectLayout(false, &isl->freeSpace, &iter, 0, - RL_KIND_SCRATCH, RL_SOLE_INSTANCE)) - { - isl->freeSpace = (LayoutRegion) { - .startBlock = iter.nextBlock, - .numBlocks = (isl->indexSave.startBlock + - isl->indexSave.numBlocks) - iter.nextBlock, - .checksum = 0, - .kind = RL_KIND_SCRATCH, - .instance = RL_SOLE_INSTANCE, - }; - iter.nextBlock = isl->freeSpace.startBlock + isl->freeSpace.numBlocks; - } - - if (iter.result != UDS_SUCCESS) { - return iter.result; - } - if (iter.nextRegion != iter.lastRegion) { - return logErrorWithStringError(UDS_UNEXPECTED_RESULT, - "expected %ld 
additional regions", - iter.lastRegion - iter.nextRegion); - } - if (iter.nextBlock != isl->indexSave.startBlock + isl->indexSave.numBlocks) { - return logErrorWithStringError(UDS_UNEXPECTED_RESULT, - "index save layout table incomplete"); - } - - return UDS_SUCCESS; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int loadIndexSave(IndexSaveLayout *isl, - SuperBlockData *super, - BufferedReader *reader, - unsigned int saveId) -{ - RegionTable *table = NULL; - int result = loadRegionTable(reader, &table); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, - "cannot read index 0 save %u header", - saveId); - } - - if (table->header.regionBlocks != isl->indexSave.numBlocks) { - uint64_t regionBlocks = table->header.regionBlocks; - FREE(table); - return logErrorWithStringError(UDS_CORRUPT_COMPONENT, - "unexpected index 0 save %u " - "region block count %llu", - saveId, regionBlocks); - } - - if (table->header.type != RH_TYPE_SAVE && - table->header.type != RH_TYPE_CHECKPOINT && - table->header.type != RH_TYPE_UNSAVED) - { - unsigned int type = table->header.type; - FREE(table); - return logErrorWithStringError(UDS_CORRUPT_COMPONENT, "unexpected" - " index 0 save %u header type %u", - saveId, type); - } - - IndexSaveData indexSaveData; - result = readIndexSaveData(reader, &indexSaveData, table->header.payload, - &isl->indexStateBuffer); - if (result != UDS_SUCCESS) { - FREE(table); - return logErrorWithStringError(result, - "unknown index 0 save %u data format", - saveId); - } - - result = reconstructIndexSave(isl, &indexSaveData, super, table); - FREE(table); - - if (result != UDS_SUCCESS) { - freeBuffer(&isl->indexStateBuffer); - return logErrorWithStringError(result, - "cannot reconstruct index 0 save %u", - saveId); - } - isl->read = true; - return UDS_SUCCESS; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int loadSubIndexRegions(IndexLayout *layout) -{ - unsigned int j; - for (j = 0; j < layout->super.maxSaves; ++j) { - IndexSaveLayout *isl = &layout->index.saves[j]; - - BufferedReader *reader; - int result = openLayoutReader(layout, &isl->indexSave, &reader); - if (result != UDS_SUCCESS) { - logErrorWithStringError(result, "cannot get reader for index 0 save %u", - j); - while (j-- > 0) { - IndexSaveLayout *isl = &layout->index.saves[j]; - FREE(isl->masterIndexZones); - FREE(isl->openChapter); - freeBuffer(&isl->indexStateBuffer); - } - return result; - } - - result = loadIndexSave(isl, &layout->super, reader, j); - freeBufferedReader(reader); - if (result != UDS_SUCCESS) { - while (j-- > 0) { - IndexSaveLayout *isl = &layout->index.saves[j]; - FREE(isl->masterIndexZones); - FREE(isl->openChapter); - freeBuffer(&isl->indexStateBuffer); - } - return result; - } - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -static int loadIndexLayout(IndexLayout *layout) -{ - BufferedReader *reader; - int result = openBufferedReader(layout->factory, layout->offset, - UDS_BLOCK_SIZE, &reader); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, "unable to read superblock"); - } - - result = loadSuperBlock(layout, UDS_BLOCK_SIZE, - layout->offset / UDS_BLOCK_SIZE, reader); - freeBufferedReader(reader); - if (result != UDS_SUCCESS) { - FREE(layout->index.saves); - layout->index.saves = NULL; - return result; - } - - result = 
loadSubIndexRegions(layout); - if (result != UDS_SUCCESS) { - FREE(layout->index.saves); - layout->index.saves = NULL; - return result; - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -static void generateSuperBlockData(size_t blockSize, - unsigned int maxSaves, - uint64_t openChapterBlocks, - uint64_t pageMapBlocks, - SuperBlockData *super) -{ - memset(super, 0, sizeof(*super)); - memcpy(super->magicLabel, SINGLE_FILE_MAGIC_1, SINGLE_FILE_MAGIC_1_LENGTH); - createUniqueNonceData(super->nonceInfo, sizeof(super->nonceInfo)); - - super->nonce = generateMasterNonce(super->nonceInfo, - sizeof(super->nonceInfo)); - super->version = SUPER_VERSION_CURRENT; - super->blockSize = blockSize; - super->numIndexes = 1; - super->maxSaves = maxSaves; - super->openChapterBlocks = openChapterBlocks; - super->pageMapBlocks = pageMapBlocks; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int resetIndexSaveLayout(IndexSaveLayout *isl, - uint64_t *nextBlockPtr, - uint64_t saveBlocks, - uint64_t pageMapBlocks, - unsigned int instance) -{ - uint64_t startBlock = *nextBlockPtr; - - if (isl->masterIndexZones) { - FREE(isl->masterIndexZones); - } - if (isl->openChapter) { - FREE(isl->openChapter); - } - if (isl->indexStateBuffer) { - freeBuffer(&isl->indexStateBuffer); - } - memset(isl, 0, sizeof(*isl)); - isl->saveType = NO_SAVE; - setupLayout(&isl->indexSave, &startBlock, saveBlocks, RL_KIND_SAVE, - instance); - setupLayout(&isl->header, nextBlockPtr, 1, RL_KIND_HEADER, - RL_SOLE_INSTANCE); - setupLayout(&isl->indexPageMap, nextBlockPtr, pageMapBlocks, - RL_KIND_INDEX_PAGE_MAP, RL_SOLE_INSTANCE); - uint64_t remaining = startBlock - *nextBlockPtr; - setupLayout(&isl->freeSpace, nextBlockPtr, remaining, RL_KIND_SCRATCH, - RL_SOLE_INSTANCE); - // number of zones is a save-time parameter - // presence of open chapter is a save-time parameter - return UDS_SUCCESS; -} - -/*****************************************************************************/ -static void defineSubIndexNonce(SubIndexLayout *sil, - uint64_t masterNonce, - unsigned int indexId) -{ - struct subIndexNonceData { - uint64_t offset; - uint16_t indexId; - }; - byte buffer[sizeof(struct subIndexNonceData)] = { 0 }; - size_t offset = 0; - encodeUInt64LE(buffer, &offset, sil->subIndex.startBlock); - encodeUInt16LE(buffer, &offset, indexId); - sil->nonce = generateSecondaryNonce(masterNonce, buffer, sizeof(buffer)); - if (sil->nonce == 0) { - sil->nonce = generateSecondaryNonce(~masterNonce + 1, - buffer, sizeof(buffer)); - } -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int setupSubIndex(SubIndexLayout *sil, - uint64_t *nextBlockPtr, - SaveLayoutSizes *sls, - unsigned int instance, - uint64_t masterNonce) -{ - uint64_t startBlock = *nextBlockPtr; - - setupLayout(&sil->subIndex, &startBlock, sls->subIndexBlocks, - RL_KIND_INDEX, instance); - setupLayout(&sil->volume, nextBlockPtr, sls->volumeBlocks, - RL_KIND_VOLUME, RL_SOLE_INSTANCE); - unsigned int i; - for (i = 0; i < sls->numSaves; ++i) { - int result = resetIndexSaveLayout(&sil->saves[i], nextBlockPtr, - sls->saveBlocks, sls->pageMapBlocks, i); - if (result != UDS_SUCCESS) { - return result; - } - } - - if (startBlock != *nextBlockPtr) { - return logErrorWithStringError(UDS_UNEXPECTED_RESULT, - "sub index layout regions don't agree"); - } - - defineSubIndexNonce(sil, 
masterNonce, instance); - return UDS_SUCCESS; -} - -/*****************************************************************************/ -/** - * Initialize a single file layout using the save layout sizes specified. - * - * @param layout the layout to initialize - * @param offset the offset in bytes from the start of the backing storage - * @param size the size in bytes of the backing storage - * @param sls a populated SaveLayoutSizes object - * - * @return UDS_SUCCESS or an error code, potentially - * UDS_INSUFFICIENT_INDEX_SPACE if the size of the backing store - * is not sufficient for the index configuration, - * UDS_BAD_INDEX_ALIGNMENT if the offset specified does not - * align properly with the index block and page sizes] - * various other errors - **/ -__attribute__((warn_unused_result)) -static int initSingleFileLayout(IndexLayout *layout, - uint64_t offset, - uint64_t size, - SaveLayoutSizes *sls) -{ - layout->totalBlocks = sls->totalBlocks; - - if (size < sls->totalBlocks * sls->blockSize) { - return logErrorWithStringError(UDS_INSUFFICIENT_INDEX_SPACE, - "not enough space for index as configured"); - } - - generateSuperBlockData(sls->blockSize, sls->numSaves, sls->openChapterBlocks, - sls->pageMapBlocks, &layout->super); - initializeIndexVersion(&layout->indexVersion, SUPER_VERSION_CURRENT); - - int result = allocateSingleFileParts(layout, &layout->super); - if (result != UDS_SUCCESS) { - return result; - } - - uint64_t nextBlock = offset / sls->blockSize; - - setupLayout(&layout->header, &nextBlock, 1, RL_KIND_HEADER, - RL_SOLE_INSTANCE); - setupLayout(&layout->config, &nextBlock, 1, RL_KIND_CONFIG, - RL_SOLE_INSTANCE); - result = setupSubIndex(&layout->index, &nextBlock, sls, 0, - layout->super.nonce); - if (result != UDS_SUCCESS) { - return result; - } - setupLayout(&layout->seal, &nextBlock, 1, RL_KIND_SEAL, RL_SOLE_INSTANCE); - if (nextBlock * sls->blockSize > offset + size) { - return logErrorWithStringError(UDS_UNEXPECTED_RESULT, - "layout does not fit as expected"); - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -static void expectSubIndex(SubIndexLayout *sil, - RegionIterator *iter, - SuperBlockData *super, - unsigned int instance) -{ - if (iter->result != UDS_SUCCESS) { - return; - } - - uint64_t startBlock = iter->nextBlock; - - expectLayout(true, &sil->subIndex, iter, 0, RL_KIND_INDEX, instance); - - uint64_t endBlock = iter->nextBlock; - iter->nextBlock = startBlock; - - expectLayout(true, &sil->volume, iter, 0, RL_KIND_VOLUME, RL_SOLE_INSTANCE); - - unsigned int i; - for (i = 0; i < super->maxSaves; ++i) { - IndexSaveLayout *isl = &sil->saves[i]; - expectLayout(true, &isl->indexSave, iter, 0, RL_KIND_SAVE, i); - } - - if (iter->nextBlock != endBlock) { - iterError(iter, "sub index region does not span all saves"); - } - - defineSubIndexNonce(sil, super->nonce, instance); -} - -/*****************************************************************************/ - -/** - * Initialize a single file layout from the region table and super block data - * stored in stable storage. 
- * - * @param layout the layout to initialize - * @param region the IO region for this layout - * @param super the super block data read from the superblock - * @param table the region table read from the superblock - * @param firstBlock the first block number in the region - * - * @return UDS_SUCCESS or an error code - **/ -__attribute__((warn_unused_result)) -static int reconstituteSingleFileLayout(IndexLayout *layout, - SuperBlockData *super, - RegionTable *table, - uint64_t firstBlock) -{ - layout->super = *super; - layout->totalBlocks = table->header.regionBlocks; - - RegionIterator iter = { - .nextRegion = table->regions, - .lastRegion = table->regions + table->header.numRegions, - .nextBlock = firstBlock, - .result = UDS_SUCCESS - }; - - expectLayout(true, &layout->header, &iter, 1, RL_KIND_HEADER, - RL_SOLE_INSTANCE); - expectLayout(true, &layout->config, &iter, 1, RL_KIND_CONFIG, - RL_SOLE_INSTANCE); - expectSubIndex(&layout->index, &iter, &layout->super, 0); - expectLayout(true, &layout->seal, &iter, 1, RL_KIND_SEAL, RL_SOLE_INSTANCE); - - if (iter.result != UDS_SUCCESS) { - return iter.result; - } - - if (iter.nextBlock != firstBlock + layout->totalBlocks) { - return logErrorWithStringError(UDS_UNEXPECTED_RESULT, - "layout table does not span total blocks"); - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int saveSubIndexRegions(IndexLayout *layout) -{ - SubIndexLayout *sil = &layout->index; - unsigned int j; - for (j = 0; j < layout->super.maxSaves; ++j) { - IndexSaveLayout *isl = &sil->saves[j]; - int result = writeIndexSaveLayout(layout, isl); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, - "unable to format index %u save 0 layout", - j); - } - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int makeSingleFileRegionTable(IndexLayout *layout, - unsigned int *numRegionsPtr, - RegionTable **tablePtr) -{ - unsigned int numRegions = - 1 + // header - 1 + // config - 1 + // index - 1 + // volume - layout->super.maxSaves + // saves - 1; // seal - - RegionTable *table; - int result = ALLOCATE_EXTENDED(RegionTable, numRegions, LayoutRegion, - "layout region table", &table); - if (result != UDS_SUCCESS) { - return result; - } - - LayoutRegion *lr = &table->regions[0]; - *lr++ = layout->header; - *lr++ = layout->config; - SubIndexLayout *sil = &layout->index; - *lr++ = sil->subIndex; - *lr++ = sil->volume; - unsigned int j; - for (j = 0; j < layout->super.maxSaves; ++j) { - *lr++ = sil->saves[j].indexSave; - } - *lr++ = layout->seal; - - result = ASSERT((lr == &table->regions[numRegions]), - "incorrect number of regions"); - if (result != UDS_SUCCESS) { - return result; - } - - *numRegionsPtr = numRegions; - *tablePtr = table; - return UDS_SUCCESS; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int encodeIndexSaveData(Buffer *buffer, IndexSaveData *saveData) -{ - int result = putUInt64LEIntoBuffer(buffer, saveData->timestamp); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt64LEIntoBuffer(buffer, saveData->nonce); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt32LEIntoBuffer(buffer, saveData->version); - if (result != UDS_SUCCESS) { - return result; - } - result = zeroBytes(buffer, 4); /* padding */ - if (result != 
UDS_SUCCESS) { - return result; - } - result = ASSERT_LOG_ONLY(contentLength(buffer) == sizeof *saveData, - "%zu bytes encoded of %zu expected", - contentLength(buffer), sizeof(*saveData)); - return result; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int encodeRegionHeader(Buffer *buffer, RegionHeader *header) -{ - size_t startingLength = contentLength(buffer); - int result = putUInt64LEIntoBuffer(buffer, REGION_MAGIC); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt64LEIntoBuffer(buffer, header->regionBlocks); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt16LEIntoBuffer(buffer, header->type); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt16LEIntoBuffer(buffer, header->version); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt16LEIntoBuffer(buffer, header->numRegions); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt16LEIntoBuffer(buffer, header->payload); - if (result != UDS_SUCCESS) { - return result; - } - result - = ASSERT_LOG_ONLY(contentLength(buffer) - startingLength == sizeof(*header), - "%zu bytes encoded, of %zu expected", - contentLength(buffer) - startingLength, sizeof(*header)); - return result; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int encodeLayoutRegion(Buffer *buffer, LayoutRegion *region) -{ - size_t startingLength = contentLength(buffer); - int result = putUInt64LEIntoBuffer(buffer, region->startBlock); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt64LEIntoBuffer(buffer, region->numBlocks); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt32LEIntoBuffer(buffer, region->checksum); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt16LEIntoBuffer(buffer, region->kind); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt16LEIntoBuffer(buffer, region->instance); - if (result != UDS_SUCCESS) { - return result; - } - result - = ASSERT_LOG_ONLY(contentLength(buffer) - startingLength == sizeof(*region), - "%zu bytes encoded, of %zu expected", - contentLength(buffer) - startingLength, sizeof(*region)); - return result; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int encodeSuperBlockData(Buffer *buffer, SuperBlockData *super) -{ - int result = putBytes(buffer, 32, &super->magicLabel); - if (result != UDS_SUCCESS) { - return result; - } - result = putBytes(buffer, 32, &super->nonceInfo); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt64LEIntoBuffer(buffer, super->nonce); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt32LEIntoBuffer(buffer, super->version); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt32LEIntoBuffer(buffer, super->blockSize); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt16LEIntoBuffer(buffer, super->numIndexes); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt16LEIntoBuffer(buffer, super->maxSaves); - if (result != UDS_SUCCESS) { - return result; - } - result = zeroBytes(buffer, 4); // aligment - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt64LEIntoBuffer(buffer, super->openChapterBlocks); - if (result != UDS_SUCCESS) { - return result; - } - result = 
putUInt64LEIntoBuffer(buffer, super->pageMapBlocks); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_LOG_ONLY(contentLength(buffer) == sizeof(SuperBlockData), - "%zu bytes encoded, of %zu expected", - contentLength(buffer), sizeof(SuperBlockData)); - return result; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int writeSingleFileHeader(IndexLayout *layout, - RegionTable *table, - unsigned int numRegions, - BufferedWriter *writer) -{ - table->header = (RegionHeader) { - .magic = REGION_MAGIC, - .regionBlocks = layout->totalBlocks, - .type = RH_TYPE_SUPER, - .version = 1, - .numRegions = numRegions, - .payload = sizeof(layout->super), - }; - - size_t tableSize = sizeof(RegionTable) + numRegions * sizeof(LayoutRegion); - - Buffer *buffer; - int result = makeBuffer(tableSize, &buffer); - if (result != UDS_SUCCESS) { - return result; - } - - result = encodeRegionHeader(buffer, &table->header); - - unsigned int i; - for (i = 0; i < numRegions; i++) { - if (result == UDS_SUCCESS) { - result = encodeLayoutRegion(buffer, &table->regions[i]); - } - } - - if (result == UDS_SUCCESS) { - result = writeToBufferedWriter(writer, getBufferContents(buffer), - contentLength(buffer)); - } - freeBuffer(&buffer); - if (result != UDS_SUCCESS) { - return result; - } - - result = makeBuffer(sizeof(layout->super), &buffer); - if (result != UDS_SUCCESS) { - return result; - } - - result = encodeSuperBlockData(buffer, &layout->super); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return result; - } - - result = writeToBufferedWriter(writer, getBufferContents(buffer), - contentLength(buffer)); - freeBuffer(&buffer); - if (result != UDS_SUCCESS) { - return result; - } - return flushBufferedWriter(writer); -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int saveSingleFileConfiguration(IndexLayout *layout) -{ - int result = saveSubIndexRegions(layout); - if (result != UDS_SUCCESS) { - return result; - } - - RegionTable *table; - unsigned int numRegions; - result = makeSingleFileRegionTable(layout, &numRegions, &table); - if (result != UDS_SUCCESS) { - return result; - } - - BufferedWriter *writer = NULL; - result = openLayoutWriter(layout, &layout->header, &writer); - if (result != UDS_SUCCESS) { - FREE(table); - return result; - } - - result = writeSingleFileHeader(layout, table, numRegions, writer); - FREE(table); - freeBufferedWriter(writer); - - return result; -} - -/*****************************************************************************/ -void putIndexLayout(IndexLayout **layoutPtr) -{ - if (layoutPtr == NULL) { - return; - } - IndexLayout *layout = *layoutPtr; - *layoutPtr = NULL; - if ((layout == NULL) || (--layout->refCount > 0)) { - return; - } - - SubIndexLayout *sil = &layout->index; - if (sil->saves != NULL) { - unsigned int j; - for (j = 0; j < layout->super.maxSaves; ++j) { - IndexSaveLayout *isl = &sil->saves[j]; - FREE(isl->masterIndexZones); - FREE(isl->openChapter); - freeBuffer(&isl->indexStateBuffer); - } - } - FREE(sil->saves); - - if (layout->factory != NULL) { - putIOFactory(layout->factory); - } - FREE(layout); -} - -/*****************************************************************************/ -void getIndexLayout(IndexLayout *layout, IndexLayout **layoutPtr) -{ - ++layout->refCount; - *layoutPtr = layout; -} - -/*****************************************************************************/ 
-const struct index_version *getIndexVersion(IndexLayout *layout) -{ - return &layout->indexVersion; -} - -/*****************************************************************************/ -int writeIndexConfig(IndexLayout *layout, UdsConfiguration config) -{ - BufferedWriter *writer = NULL; - int result = openLayoutWriter(layout, &layout->config, &writer); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, "failed to open config region"); - } - - result = writeConfigContents(writer, config); - if (result != UDS_SUCCESS) { - freeBufferedWriter(writer); - return logErrorWithStringError(result, "failed to write config region"); - } - result = flushBufferedWriter(writer); - if (result != UDS_SUCCESS) { - freeBufferedWriter(writer); - return logErrorWithStringError(result, "cannot flush config writer"); - } - freeBufferedWriter(writer); - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int verifyIndexConfig(IndexLayout *layout, UdsConfiguration config) -{ - BufferedReader *reader = NULL; - int result = openLayoutReader(layout, &layout->config, &reader); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, "failed to open config reader"); - } - - struct udsConfiguration storedConfig; - result = readConfigContents(reader, &storedConfig); - if (result != UDS_SUCCESS) { - freeBufferedReader(reader); - return logErrorWithStringError(result, "failed to read config region"); - } - freeBufferedReader(reader); - - return (areUdsConfigurationsEqual(&storedConfig, config) - ? UDS_SUCCESS - : UDS_NO_INDEX); -} - -#ifdef __KERNEL__ -/*****************************************************************************/ -int openVolumeBufio(IndexLayout *layout, - size_t blockSize, - unsigned int reservedBuffers, - struct dm_bufio_client **clientPtr) -{ - off_t offset = layout->index.volume.startBlock * layout->super.blockSize; - return makeBufio(layout->factory, offset, blockSize, reservedBuffers, - clientPtr); -} -#else -/*****************************************************************************/ -int openVolumeRegion(IndexLayout *layout, IORegion **regionPtr) -{ - LayoutRegion *lr = &layout->index.volume; - off_t start = lr->startBlock * layout->super.blockSize; - size_t size = lr->numBlocks * layout->super.blockSize; - int result = makeIORegion(layout->factory, start, size, regionPtr); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, - "cannot access index volume region"); - } - return UDS_SUCCESS; -} -#endif - -/*****************************************************************************/ -uint64_t getVolumeNonce(IndexLayout *layout) -{ - return layout->index.nonce; -} - -/*****************************************************************************/ -static uint64_t generateIndexSaveNonce(uint64_t volumeNonce, - IndexSaveLayout *isl) -{ - struct SaveNonceData { - IndexSaveData data; - uint64_t offset; - } nonceData; - - nonceData.data = isl->saveData; - nonceData.data.nonce = 0; - nonceData.offset = isl->indexSave.startBlock; - - byte buffer[sizeof(nonceData)]; - size_t offset = 0; - encodeUInt64LE(buffer, &offset, nonceData.data.timestamp); - encodeUInt64LE(buffer, &offset, nonceData.data.nonce); - encodeUInt32LE(buffer, &offset, nonceData.data.version); - encodeUInt32LE(buffer, &offset, 0U); // padding - encodeUInt64LE(buffer, &offset, nonceData.offset); - ASSERT_LOG_ONLY(offset == sizeof(nonceData), - "%zu bytes encoded of %zu expected", - offset, sizeof(nonceData)); - return 
generateSecondaryNonce(volumeNonce, buffer, sizeof(buffer)); -} - -/*****************************************************************************/ -static int validateIndexSaveLayout(IndexSaveLayout *isl, - uint64_t volumeNonce, - uint64_t *saveTimePtr) -{ - if (isl->saveType == NO_SAVE || isl->numZones == 0 || - isl->saveData.timestamp == 0) - { - return UDS_BAD_STATE; - } - if (isl->saveData.nonce != generateIndexSaveNonce(volumeNonce, isl)) { - return UDS_BAD_STATE; - } - if (saveTimePtr != NULL) { - *saveTimePtr = isl->saveData.timestamp; - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int selectOldestIndexSaveLayout(SubIndexLayout *sil, - unsigned int maxSaves, - IndexSaveLayout **islPtr) -{ - IndexSaveLayout *oldest = NULL; - uint64_t oldestTime = 0; - - // find the oldest valid or first invalid slot - IndexSaveLayout *isl; - for (isl = sil->saves; isl < sil->saves + maxSaves; ++isl) { - uint64_t saveTime = 0; - int result = validateIndexSaveLayout(isl, sil->nonce, &saveTime); - if (result != UDS_SUCCESS) { - saveTime = 0; - } - if (oldest == NULL || saveTime < oldestTime) { - oldest = isl; - oldestTime = saveTime; - } - } - - int result = ASSERT((oldest != NULL), "no oldest or free save slot"); - if (result != UDS_SUCCESS) { - return result; - } - *islPtr = oldest; - return UDS_SUCCESS; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int selectLatestIndexSaveLayout(SubIndexLayout *sil, - unsigned int maxSaves, - IndexSaveLayout **islPtr) -{ - IndexSaveLayout *latest = NULL; - uint64_t latestTime = 0; - - // find the latest valid save slot - IndexSaveLayout *isl; - for (isl = sil->saves; isl < sil->saves + maxSaves; ++isl) { - uint64_t saveTime = 0; - int result = validateIndexSaveLayout(isl, sil->nonce, &saveTime); - if (result != UDS_SUCCESS) { - continue; - } - if (saveTime > latestTime) { - latest = isl; - latestTime = saveTime; - } - } - - if (latest == NULL) { - return UDS_INDEX_NOT_SAVED_CLEANLY; - } - *islPtr = latest; - return UDS_SUCCESS; -} - -/*****************************************************************************/ -static uint64_t getTimeMS(AbsTime time) -{ - time_t t = asTimeT(time); - RelTime r = timeDifference(time, fromTimeT(t)); - return (uint64_t) t * 1000 + relTimeToMilliseconds(r); -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int instantiateIndexSaveLayout(IndexSaveLayout *isl, - SuperBlockData *super, - uint64_t volumeNonce, - unsigned int numZones, - IndexSaveType saveType) -{ - int result = UDS_SUCCESS; - if (isl->openChapter && saveType == IS_CHECKPOINT) { - FREE(isl->openChapter); - isl->openChapter = NULL; - } else if (isl->openChapter == NULL && saveType == IS_SAVE) { - result = ALLOCATE(1, LayoutRegion, "open chapter layout", - &isl->openChapter); - if (result != UDS_SUCCESS) { - return result; - } - } - if (numZones != isl->numZones) { - if (isl->masterIndexZones != NULL) { - FREE(isl->masterIndexZones); - } - result = ALLOCATE(numZones, LayoutRegion, "master index zone layouts", - &isl->masterIndexZones); - if (result != UDS_SUCCESS) { - return result; - } - isl->numZones = numZones; - } - - populateIndexSaveLayout(isl, super, numZones, saveType); - - result = makeBuffer(INDEX_STATE_BUFFER_SIZE, &isl->indexStateBuffer); - if (result != UDS_SUCCESS) { - return result; - } - - 
isl->read = isl->written = false; - isl->saveType = saveType; - memset(&isl->saveData, 0, sizeof(isl->saveData)); - isl->saveData.timestamp = getTimeMS(currentTime(CLOCK_REALTIME)); - isl->saveData.version = 1; - - isl->saveData.nonce = generateIndexSaveNonce(volumeNonce, isl); - - return UDS_SUCCESS; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int invalidateOldSave(IndexLayout *layout, IndexSaveLayout *isl) -{ - uint64_t startBlock = isl->indexSave.startBlock; - uint64_t saveBlocks = isl->indexSave.numBlocks; - unsigned int save = isl->indexSave.instance; - - int result = resetIndexSaveLayout(isl, &startBlock, saveBlocks, - layout->super.pageMapBlocks, save); - if (result != UDS_SUCCESS) { - return result; - } - - return writeIndexSaveLayout(layout, isl); -} - -/*****************************************************************************/ -int setupIndexSaveSlot(IndexLayout *layout, - unsigned int numZones, - IndexSaveType saveType, - unsigned int *saveSlotPtr) -{ - SubIndexLayout *sil = &layout->index; - - IndexSaveLayout *isl = NULL; - int result = selectOldestIndexSaveLayout(sil, layout->super.maxSaves, &isl); - if (result != UDS_SUCCESS) { - return result; - } - - result = invalidateOldSave(layout, isl); - if (result != UDS_SUCCESS) { - return result; - } - - result = instantiateIndexSaveLayout(isl, &layout->super, sil->nonce, - numZones, saveType); - if (result != UDS_SUCCESS) { - return result; - } - - *saveSlotPtr = isl - sil->saves; - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int findLatestIndexSaveSlot(IndexLayout *layout, - unsigned int *numZonesPtr, - unsigned int *slotPtr) -{ - SubIndexLayout *sil = &layout->index; - - IndexSaveLayout *isl = NULL; - int result = selectLatestIndexSaveLayout(sil, layout->super.maxSaves, &isl); - if (result != UDS_SUCCESS) { - return result; - } - - if (numZonesPtr != NULL) { - *numZonesPtr = isl->numZones; - } - if (slotPtr != NULL) { - *slotPtr = isl - sil->saves; - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int makeIndexSaveRegionTable(IndexSaveLayout *isl, - unsigned int *numRegionsPtr, - RegionTable **tablePtr) -{ - unsigned int numRegions = - 1 + // header - 1 + // index page map - isl->numZones + // master index zones - (bool) isl->openChapter; // open chapter if needed - - if (isl->freeSpace.numBlocks > 0) { - numRegions++; - } - - RegionTable *table; - int result = ALLOCATE_EXTENDED(RegionTable, numRegions, LayoutRegion, - "layout region table for ISL", &table); - if (result != UDS_SUCCESS) { - return result; - } - - LayoutRegion *lr = &table->regions[0]; - *lr++ = isl->header; - *lr++ = isl->indexPageMap; - unsigned int z; - for (z = 0; z < isl->numZones; ++z) { - *lr++ = isl->masterIndexZones[z]; - } - if (isl->openChapter) { - *lr++ = *isl->openChapter; - } - if (isl->freeSpace.numBlocks > 0) { - *lr++ = isl->freeSpace; - } - - result = ASSERT((lr == &table->regions[numRegions]), - "incorrect number of ISL regions"); - if (result != UDS_SUCCESS) { - return result; - } - - *numRegionsPtr = numRegions; - *tablePtr = table; - return UDS_SUCCESS; -} - -/*****************************************************************************/ -static unsigned int regionTypeForSaveType(IndexSaveType saveType) -{ - switch (saveType) { - case IS_SAVE: - return RH_TYPE_SAVE; - - case 
IS_CHECKPOINT: - return RH_TYPE_CHECKPOINT; - - default: - break; - } - - return RH_TYPE_UNSAVED; -} - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int writeIndexSaveHeader(IndexSaveLayout *isl, - RegionTable *table, - unsigned int numRegions, - BufferedWriter *writer) -{ - size_t payload = sizeof(isl->saveData); - if (isl->indexStateBuffer != NULL) { - payload += contentLength(isl->indexStateBuffer); - } - - table->header = (RegionHeader) { - .magic = REGION_MAGIC, - .regionBlocks = isl->indexSave.numBlocks, - .type = regionTypeForSaveType(isl->saveType), - .version = 1, - .numRegions = numRegions, - .payload = payload, - }; - - size_t tableSize = sizeof(RegionTable) + numRegions * sizeof(LayoutRegion); - Buffer *buffer; - int result = makeBuffer(tableSize, &buffer); - if (result != UDS_SUCCESS) { - return result; - } - - result = encodeRegionHeader(buffer, &table->header); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return result; - } - - unsigned int i; - for (i = 0; i < numRegions; i++) { - result = encodeLayoutRegion(buffer, &table->regions[i]); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return result; - } - } - result = ASSERT_LOG_ONLY(contentLength(buffer) == tableSize, - "%zu bytes encoded of %zu expected", - contentLength(buffer), tableSize); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return result; - } - - result = writeToBufferedWriter(writer, getBufferContents(buffer), - contentLength(buffer)); - freeBuffer(&buffer); - if (result != UDS_SUCCESS) { - return result; - } - - result = makeBuffer(sizeof(isl->saveData), &buffer); - if (result != UDS_SUCCESS) { - return result; - } - - result = encodeIndexSaveData(buffer, &isl->saveData); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return result; - } - - result = writeToBufferedWriter(writer, getBufferContents(buffer), - contentLength(buffer)); - freeBuffer(&buffer); - if (result != UDS_SUCCESS) { - return result; - } - - if (isl->indexStateBuffer != NULL) { - result = writeToBufferedWriter(writer, - getBufferContents(isl->indexStateBuffer), - contentLength(isl->indexStateBuffer)); - if (result != UDS_SUCCESS) { - return result; - } - } - - return flushBufferedWriter(writer); -} - -/*****************************************************************************/ -static int writeIndexSaveLayout(IndexLayout *layout, IndexSaveLayout *isl) -{ - unsigned int numRegions; - RegionTable *table; - int result = makeIndexSaveRegionTable(isl, &numRegions, &table); - if (result != UDS_SUCCESS) { - return result; - } - - BufferedWriter *writer = NULL; - result = openLayoutWriter(layout, &isl->header, &writer); - if (result != UDS_SUCCESS) { - FREE(table); - return result; - } - - result = writeIndexSaveHeader(isl, table, numRegions, writer); - FREE(table); - freeBufferedWriter(writer); - - isl->written = true; - return result; -} - -/*****************************************************************************/ -int commitIndexSave(IndexLayout *layout, unsigned int saveSlot) -{ - int result = ASSERT((saveSlot < layout->super.maxSaves), - "save slot out of range"); - if (result != UDS_SUCCESS) { - return result; - } - - IndexSaveLayout *isl = &layout->index.saves[saveSlot]; - - if (bufferUsed(isl->indexStateBuffer) == 0) { - return logErrorWithStringError(UDS_UNEXPECTED_RESULT, - "%s: no index state data saved", __func__); - } - - return writeIndexSaveLayout(layout, isl); -} - 
-/*****************************************************************************/ - -static void mutilateIndexSaveInfo(IndexSaveLayout *isl) -{ - memset(&isl->saveData, 0, sizeof(isl->saveData)); - isl->read = isl->written = 0; - isl->saveType = NO_SAVE; - isl->numZones = 0; - freeBuffer(&isl->indexStateBuffer); -} - -/*****************************************************************************/ -int cancelIndexSave(IndexLayout *layout, unsigned int saveSlot) -{ - int result = ASSERT((saveSlot < layout->super.maxSaves), - "save slot out of range"); - if (result != UDS_SUCCESS) { - return result; - } - - mutilateIndexSaveInfo(&layout->index.saves[saveSlot]); - - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int discardIndexSaves(IndexLayout *layout, bool all) -{ - int result = UDS_SUCCESS; - SubIndexLayout *sil = &layout->index; - - if (all) { - unsigned int i; - for (i = 0; i < layout->super.maxSaves; ++i) { - IndexSaveLayout *isl = &sil->saves[i]; - result = firstError(result, invalidateOldSave(layout, isl)); - } - } else { - IndexSaveLayout *isl; - result = selectLatestIndexSaveLayout(sil, layout->super.maxSaves, &isl); - if (result == UDS_SUCCESS) { - result = invalidateOldSave(layout, isl); - } - } - - return result; -} - -/*****************************************************************************/ -static int createIndexLayout(IndexLayout *layout, - uint64_t size, - const UdsConfiguration config) -{ - if (config == NULL) { - return UDS_CONF_PTR_REQUIRED; - } - - SaveLayoutSizes sizes; - int result = computeSizes(&sizes, config, UDS_BLOCK_SIZE, 0); - if (result != UDS_SUCCESS) { - return result; - } - - if (size < sizes.totalBlocks * sizes.blockSize) { - return logErrorWithStringError(UDS_INSUFFICIENT_INDEX_SPACE, - "layout requires at least %" PRIu64 - " bytes", - sizes.totalBlocks * sizes.blockSize); - } - - result = initSingleFileLayout(layout, layout->offset, size, &sizes); - if (result != UDS_SUCCESS) { - return result; - } - - result = saveSingleFileConfiguration(layout); - if (result != UDS_SUCCESS) { - return result; - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -Buffer *getIndexStateBuffer(IndexLayout *layout, unsigned int slot) -{ - return layout->index.saves[slot].indexStateBuffer; -} - -/*****************************************************************************/ -static int findLayoutRegion(IndexLayout *layout, - unsigned int slot, - const char *operation, - RegionKind kind, - unsigned int zone, - LayoutRegion **lrPtr) -{ - int result = ASSERT((slot < layout->super.maxSaves), "%s not started", - operation); - if (result != UDS_SUCCESS) { - return result; - } - - IndexSaveLayout *isl = &layout->index.saves[slot]; - - LayoutRegion *lr = NULL; - switch (kind) { - case RL_KIND_INDEX_PAGE_MAP: - lr = &isl->indexPageMap; - break; - - case RL_KIND_OPEN_CHAPTER: - if (isl->openChapter == NULL) { - return logErrorWithStringError(UDS_UNEXPECTED_RESULT, - "%s: %s has no open chapter", - __func__, operation); - } - lr = isl->openChapter; - break; - - case RL_KIND_MASTER_INDEX: - if (isl->masterIndexZones == NULL || zone >= isl->numZones) { - return logErrorWithStringError(UDS_UNEXPECTED_RESULT, - "%s: %s has no master index zone %u", - __func__, operation, zone); - } - lr = &isl->masterIndexZones[zone]; - break; - - default: - return logErrorWithStringError(UDS_INVALID_ARGUMENT, - "%s: unexpected kind %u", - __func__, kind); - } - - *lrPtr = lr; - return 
UDS_SUCCESS; -} - -/*****************************************************************************/ -int openIndexBufferedReader(IndexLayout *layout, - unsigned int slot, - RegionKind kind, - unsigned int zone, - BufferedReader **readerPtr) -{ - LayoutRegion *lr = NULL; - int result = findLayoutRegion(layout, slot, "load", kind, zone, &lr); - if (result != UDS_SUCCESS) { - return result; - } - return openLayoutReader(layout, lr, readerPtr); -} - -/*****************************************************************************/ -int openIndexBufferedWriter(IndexLayout *layout, - unsigned int slot, - RegionKind kind, - unsigned int zone, - BufferedWriter **writerPtr) -{ - LayoutRegion *lr = NULL; - int result = findLayoutRegion(layout, slot, "save", kind, zone, &lr); - if (result != UDS_SUCCESS) { - return result; - } - return openLayoutWriter(layout, lr, writerPtr); -} - -/*****************************************************************************/ -int makeIndexLayoutFromFactory(IOFactory *factory, - off_t offset, - uint64_t namedSize, - bool newLayout, - const UdsConfiguration config, - IndexLayout **layoutPtr) -{ - // Get the device size and round it down to a multiple of UDS_BLOCK_SIZE. - size_t size = getWritableSize(factory) & -UDS_BLOCK_SIZE; - if (namedSize > size) { - return logErrorWithStringError(UDS_INSUFFICIENT_INDEX_SPACE, - "index storage (%zu) is smaller than the" - " requested size %llu", - size, namedSize); - } - if ((namedSize > 0) && (namedSize < size)) { - size = namedSize; - } - - // Get the index size according the the config - uint64_t configSize; - int result = udsComputeIndexSize(config, 0, &configSize); - if (result != UDS_SUCCESS) { - return result; - } - if (size < configSize) { - return logErrorWithStringError(UDS_INSUFFICIENT_INDEX_SPACE, - "index storage (%zu) is smaller than the" - " required size %llu", - size, configSize); - } - size = configSize; - - IndexLayout *layout = NULL; - result = ALLOCATE(1, IndexLayout, __func__, &layout); - if (result != UDS_SUCCESS) { - return result; - } - layout->refCount = 1; - - getIOFactory(factory); - layout->factory = factory; - layout->offset = offset; - - if (newLayout) { - // Populate the layout from the UDSConfiguration - result = createIndexLayout(layout, size, config); - } else { - // Populate the layout from the saved index. - result = loadIndexLayout(layout); - } - if (result != UDS_SUCCESS) { - putIndexLayout(&layout); - return result; - } - *layoutPtr = layout; - return UDS_SUCCESS; -} diff --git a/uds/indexLayout.h b/uds/indexLayout.h deleted file mode 100644 index 4144799..0000000 --- a/uds/indexLayout.h +++ /dev/null @@ -1,261 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/jasper/src/uds/indexLayout.h#13 $ - */ - -#ifndef INDEX_LAYOUT_H -#define INDEX_LAYOUT_H - -#include "buffer.h" -#include "indexState.h" -#include "indexVersion.h" -#include "ioFactory.h" -#include "uds.h" - -typedef struct indexLayout IndexLayout; - -/** - * Construct an index layout. This is a platform specific function that uses - * the name string, a flag that indicates old vs. new indices, and a - * UDSConfiguration (for new indices) to make an IOFactory and invoke - * makeIndexLayoutFromFactory. - * - * @param name String naming the index. Each platform will use its own - * conventions to interpret the string, but in general it is - * a space-separated sequence of param=value settings. For - * backward compatibility a string without an equals is - * treated as a platform-specific default parameter value. - * @param newLayout Whether this is a new layout. - * @param config The UdsConfiguration required for a new layout. - * @param layoutPtr Where to store the new index layout - * - * @return UDS_SUCCESS or an error code. - **/ -int makeIndexLayout(const char *name, - bool newLayout, - const UdsConfiguration config, - IndexLayout **layoutPtr) - __attribute__((warn_unused_result)); - -/** - * Construct an index layout using an IOFactory. This method is common to all - * platforms. - * - * @param factory The IOFactory for the block storage containing the index. - * @param offset The offset of the start of the index within the block - * storage address space. - * @param namedSize The size in bytes of the space within the block storage - * address space, as specified in the name string. - * @param newLayout Whether this is a new layout. - * @param config The UdsConfiguration required for a new layout. - * @param layoutPtr Where to store the new index layout - * - * @return UDS_SUCCESS or an error code. - **/ -int makeIndexLayoutFromFactory(IOFactory *factory, - off_t offset, - uint64_t namedSize, - bool newLayout, - const UdsConfiguration config, - IndexLayout **layoutPtr) - __attribute__((warn_unused_result)); - -/** - * Decrement the use count of an index layout. If the count goes to zero, free - * the index layout. - * - * @param layoutPtr Where the layout is being stored. Always reset to NULL. - **/ -void putIndexLayout(IndexLayout **layoutPtr); - -/*****************************************************************************/ -int cancelIndexSave(IndexLayout *layout, unsigned int saveSlot) - __attribute__((warn_unused_result)); - -/*****************************************************************************/ -int commitIndexSave(IndexLayout *layout, unsigned int saveSlot) - __attribute__((warn_unused_result)); - -/*****************************************************************************/ -int discardIndexSaves(IndexLayout *layout, bool all) - __attribute__((warn_unused_result)); - -/** - * Find the latest index save slot. - * - * @param [in] layout The single file layout. - * @param [out] numZonesPtr Where to store the actual number of zones - * that were saved. - * @param [out] slotPtr Where to store the slot number we found. - * - * @return UDS_SUCCESS or an error code. - **/ -int findLatestIndexSaveSlot(IndexLayout *layout, - unsigned int *numZonesPtr, - unsigned int *slotPtr) - __attribute__((warn_unused_result)); - -/** - * Get another reference to an index layout, incrementing it's use count. - * - * @param layout The index layout. - * @param layoutPtr Where the new layout pointer is being stored. 
- **/ -void getIndexLayout(IndexLayout *layout, IndexLayout **layoutPtr); - -/** - * Open a BufferedReader for a specified state, kind, and zone. - * - * @param layout The index layout - * @param slot The save slot - * @param kind The kind if index save region to open. - * @param zone The zone number for the region. - * @param readerPtr Where to store the BufferedReader. - * - * @return UDS_SUCCESS or an error code. - **/ -int openIndexBufferedReader(IndexLayout *layout, - unsigned int slot, - RegionKind kind, - unsigned int zone, - BufferedReader **readerPtr) - __attribute__((warn_unused_result)); - -/** - * Open a BufferedWriter for a specified state, kind, and zone. - * - * @param layout The index layout - * @param slot The save slot - * @param kind The kind if index save region to open. - * @param zone The zone number for the region. - * @param writerPtr Where to store the BufferedWriter. - * - * @return UDS_SUCCESS or an error code. - **/ -int openIndexBufferedWriter(IndexLayout *layout, - unsigned int slot, - RegionKind kind, - unsigned int zone, - BufferedWriter **writerPtr) - __attribute__((warn_unused_result)); - -/** - * Obtain the nonce to be used to store or validate the loading of volume index - * pages. - * - * @param [in] layout The index layout. - * - * @return The nonce to use. - **/ -uint64_t getVolumeNonce(IndexLayout *layout) - __attribute__((warn_unused_result)); - -#ifdef __KERNEL__ -/** - * Obtain a dm_bufio_client for the specified index volume. - * - * @param [in] layout The index layout. - * @param [in] blockSize The size of a volume page - * @param [in] reservedBuffers The count of reserved buffers - * @param [out] clientPtr Where to put the new dm_bufio_client - * - * @return UDS_SUCCESS or an error code. - **/ -int openVolumeBufio(IndexLayout *layout, - size_t blockSize, - unsigned int reservedBuffers, - struct dm_bufio_client **clientPtr) - __attribute__((warn_unused_result)); -#else -/** - * Obtain an IORegion for the specified index volume. - * - * @param [in] layout The index layout. - * @param [out] regionPtr Where to put the new region. - * - * @return UDS_SUCCESS or an error code. - **/ -int openVolumeRegion(IndexLayout *layout, struct ioRegion **regionPtr) - __attribute__((warn_unused_result)); -#endif - -/** - * Read the index configuration, and verify that it matches the given - * configuration. - * - * @param layout the generic index layout - * @param config the index configuration - * - * @return UDS_SUCCESS or an error code - **/ -int verifyIndexConfig(IndexLayout *layout, UdsConfiguration config) - __attribute__((warn_unused_result)); - -/** - * Determine which index save slot to use for a new index save. - * - * Also allocates the masterIndex regions and, if needed, the openChapter - * region. - * - * @param [in] layout The index layout. - * @param [in] numZones Actual number of zones currently in use. - * @param [in] saveType The index save type. - * @param [out] saveSlotPtr Where to store the save slot number. - * - * @return UDS_SUCCESS or an error code - **/ -int setupIndexSaveSlot(IndexLayout *layout, - unsigned int numZones, - IndexSaveType saveType, - unsigned int *saveSlotPtr) - __attribute__((warn_unused_result)); - -/** - * Write the index configuration. 
- * - * @param layout the generic index layout - * @param config the index configuration to write - * - * @return UDS_SUCCESS or an error code - **/ -int writeIndexConfig(IndexLayout *layout, UdsConfiguration config) - __attribute__((warn_unused_result)); - -/** - * Get the index state buffer - * - * @param layout the index layout - * @param slot the save slot - * - * @return UDS_SUCCESS or an error code - **/ -Buffer *getIndexStateBuffer(IndexLayout *layout, unsigned int slot) - __attribute__((warn_unused_result)); - -/** - * Get the index version parameters. - * - * @param layout the index layout - * - * @return the index version parameters. - **/ -const struct index_version *getIndexVersion(IndexLayout *layout) - __attribute__((warn_unused_result)); - -#endif // INDEX_LAYOUT_H diff --git a/uds/indexLayoutLinuxKernel.c b/uds/indexLayoutLinuxKernel.c deleted file mode 100644 index 8301166..0000000 --- a/uds/indexLayoutLinuxKernel.c +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/kernelLinux/uds/indexLayoutLinuxKernel.c#5 $ - */ - -#include "indexLayout.h" -#include "indexLayoutParser.h" -#include "memoryAlloc.h" - -/*****************************************************************************/ -int makeIndexLayout(const char *name, - bool newLayout, - const UdsConfiguration config, - IndexLayout **layoutPtr) -{ - char *dev = NULL; - uint64_t offset = 0; - uint64_t size = 0; - - LayoutParameter parameterTable[] = { - { "dev", LP_STRING | LP_DEFAULT, { .str = &dev } }, - { "offset", LP_UINT64, { .num = &offset } }, - { "size", LP_UINT64, { .num = &size } }, - }; - size_t numParameters = sizeof(parameterTable) / sizeof(*parameterTable); - - char *params = NULL; - int result = duplicateString(name, "makeIndexLayout parameters", ¶ms); - if (result != UDS_SUCCESS) { - return result; - } - - // note dev will be set to memory owned by params - result = parseLayoutString(params, parameterTable, numParameters); - if (result != UDS_SUCCESS) { - FREE(params); - return result; - } - - IOFactory *factory = NULL; - result = makeIOFactory(dev, &factory); - FREE(params); - if (result != UDS_SUCCESS) { - return result; - } - IndexLayout *layout; - result = makeIndexLayoutFromFactory(factory, offset, size, newLayout, config, - &layout); - putIOFactory(factory); - if (result != UDS_SUCCESS) { - return result; - } - *layoutPtr = layout; - return UDS_SUCCESS; -} diff --git a/uds/indexLayoutParser.c b/uds/indexLayoutParser.c deleted file mode 100644 index 808def7..0000000 --- a/uds/indexLayoutParser.c +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/indexLayoutParser.c#2 $ - */ - -#include "indexLayoutParser.h" - -#include "errors.h" -#include "logger.h" -#include "permassert.h" -#include "stringUtils.h" -#include "typeDefs.h" -#include "uds.h" - -/*****************************************************************************/ -__attribute__((warn_unused_result)) -static int setParameterValue(LayoutParameter *lp, char *data) -{ - if ((lp->type & LP_TYPE_MASK) == LP_UINT64) { - int result = parseUint64(data, lp->value.num); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(UDS_INDEX_NAME_REQUIRED, - "bad numeric value %s", data); - } - } else if ((lp->type & LP_TYPE_MASK) == LP_STRING) { - *lp->value.str = data; - } else { - return logErrorWithStringError(UDS_INVALID_ARGUMENT, - "unkown LayoutParameter type code %x", - (lp->type & LP_TYPE_MASK)); - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int parseLayoutString(char *info, LayoutParameter *params, size_t count) -{ - if (!strchr(info, '=')) { - LayoutParameter *lp; - for (lp = params; lp < params + count; ++lp) { - if (lp->type & LP_DEFAULT) { - int result = setParameterValue(lp, info); - if (result != UDS_SUCCESS) { - return result; - } - break; - } - } - } else { - char *data = NULL; - char *token; - for (token = nextToken(info, " ", &data); - token; - token = nextToken(NULL, " ", &data)) - { - char *equal = strchr(token, '='); - LayoutParameter *lp; - for (lp = params; lp < params + count; ++lp) { - if (!equal && (lp->type & LP_DEFAULT)) { - break; - } else if (strncmp(token, lp->name, equal - token) == 0 && - strlen(lp->name) == (size_t) (equal - token)) { - break; - } - } - if (lp == NULL) { - return logErrorWithStringError(UDS_INDEX_NAME_REQUIRED, - "unkown index parameter %s", - token); - } - if (lp->seen) { - return logErrorWithStringError(UDS_INDEX_NAME_REQUIRED, - "duplicate index parameter %s", - token); - } - lp->seen = true; - int result = setParameterValue(lp, equal ? equal + 1 : token); - if (result != UDS_SUCCESS) { - return result; - } - } - } - return UDS_SUCCESS; -} diff --git a/uds/indexLayoutParser.h b/uds/indexLayoutParser.h deleted file mode 100644 index 35b492a..0000000 --- a/uds/indexLayoutParser.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
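A usage sketch for parseLayoutString above, assuming the LayoutParameter and LPType declarations from indexLayoutParser.h just below (and a uint64_t type) are in scope; the parameter table mirrors the one in indexLayoutLinuxKernel.c earlier in this patch, and the input string is illustrative:

/* Sketch only: the spec buffer must be writable, because the parser inserts
 * NUL terminators and string-valued parameters point into it. */
static int exampleParse(void)
{
  char *dev = NULL;
  uint64_t offset = 0;
  uint64_t size = 0;
  LayoutParameter table[] = {
    { "dev",    LP_STRING | LP_DEFAULT, { .str = &dev    } },
    { "offset", LP_UINT64,              { .num = &offset } },
    { "size",   LP_UINT64,              { .num = &size   } },
  };

  char spec[] = "dev=/dev/vdb offset=4096";
  int result = parseLayoutString(spec, table, sizeof(table) / sizeof(*table));
  /* On UDS_SUCCESS: dev points at "/dev/vdb" inside spec, offset == 4096,
   * and size keeps its initial value because it never appeared. */
  return result;
}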
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/indexLayoutParser.h#1 $ - */ - -#ifndef INDEX_LAYOUT_PARSER_H -#define INDEX_LAYOUT_PARSER_H - -#include "typeDefs.h" - -typedef enum { - LP_STRING = 0x001, - LP_UINT64 = 0x002, - LP_TYPE_MASK = 0x0FF, - LP_DEFAULT = 0x100, -} LPType; - -typedef struct layoutParameter { - const char *name; - LPType type; - union { - char **str; - uint64_t *num; - } value; - bool seen; -} LayoutParameter; - -/** - * Function to parse an index layout specification. - * - * This parser treats the specification as a set of name=value parameters - * or, in the absence of an '=' character, a single value for a default - * parameter. The list of acceptable parameters is specified as an array - * of LayoutParameter entries. Each such parameter contains the address - * of the variable in which the value is to be stored. - * - * @param info A copy of the index layout specification that - * will be altered by the parser to insert null - * characters after each value. Note that string - * parameter values will point into the memory of - * this string, so this specification cannot be - * deallocated until all uses of the parameter - * values are over. - * @param params The table of parameters the caller expects to - * find in the ``info'' string. Currently this - * parser can handle string and uint64_t values. - * @param count The size of the parameter table. - * - * @return UDS_SUCCESS or an error code, particularly - * UDS_INDEX_NAME_REQUIRED for all parsing errors. - **/ -int parseLayoutString(char *info, LayoutParameter *params, size_t count) - __attribute__((warn_unused_result)); - -#endif // INDEX_LAYOUT_PARSER_H diff --git a/uds/indexPageMap.c b/uds/indexPageMap.c deleted file mode 100644 index a915179..0000000 --- a/uds/indexPageMap.c +++ /dev/null @@ -1,361 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/jasper/src/uds/indexPageMap.c#4 $ - */ - -#include "indexPageMap.h" - -#include "buffer.h" -#include "bufferedWriter.h" -#include "compiler.h" -#include "errors.h" -#include "hashUtils.h" -#include "indexComponent.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "stringUtils.h" -#include "threads.h" -#include "uds.h" - -static int readIndexPageMap(ReadPortal *portal); -static int writeIndexPageMap(IndexComponent *component, - BufferedWriter *writer, - unsigned int zone); - -static const byte INDEX_PAGE_MAP_MAGIC[] = "ALBIPM02"; -enum { - INDEX_PAGE_MAP_MAGIC_LENGTH = sizeof(INDEX_PAGE_MAP_MAGIC) - 1, -}; - -const IndexComponentInfo INDEX_PAGE_MAP_INFO = { - .kind = RL_KIND_INDEX_PAGE_MAP, - .name = "index page map", - .saveOnly = false, - .chapterSync = true, - .multiZone = false, - .ioStorage = true, - .loader = readIndexPageMap, - .saver = writeIndexPageMap, - .incremental = NULL, -}; - -/*****************************************************************************/ -static INLINE size_t numEntries(const Geometry *geometry) -{ - return geometry->chaptersPerVolume * (geometry->indexPagesPerChapter - 1); -} - -/*****************************************************************************/ -int makeIndexPageMap(const Geometry *geometry, IndexPageMap **mapPtr) -{ - unsigned int deltaListsPerChapter = geometry->deltaListsPerChapter; - int result - = ASSERT_WITH_ERROR_CODE(((deltaListsPerChapter - 1) <= UINT16_MAX), - UDS_BAD_STATE, - "delta lists per chapter (%u) is too large", - deltaListsPerChapter); - if (result != UDS_SUCCESS) { - return result; - } - - IndexPageMap *map; - result = ALLOCATE(1, IndexPageMap, "Index Page Map", &map); - if (result != UDS_SUCCESS) { - return result; - } - - map->geometry = geometry; - - result = ALLOCATE(numEntries(geometry), - IndexPageMapEntry, - "Index Page Map Entries", - &map->entries); - if (result != UDS_SUCCESS) { - freeIndexPageMap(map); - return result; - } - - *mapPtr = map; - return UDS_SUCCESS; -} - -/*****************************************************************************/ -void freeIndexPageMap(IndexPageMap *map) -{ - if (map != NULL) { - FREE(map->entries); - FREE(map); - } -} - -/*****************************************************************************/ -uint64_t getLastUpdate(const IndexPageMap *map) -{ - return map->lastUpdate; -} - -/*****************************************************************************/ -int updateIndexPageMap(IndexPageMap *map, - uint64_t virtualChapterNumber, - unsigned int chapterNumber, - unsigned int indexPageNumber, - unsigned int deltaListNumber) -{ - const Geometry *geometry = map->geometry; - if ((virtualChapterNumber < map->lastUpdate) - || (virtualChapterNumber > map->lastUpdate + 1)) { - // if the lastUpdate is 0, this is likely to be normal because we are - // replaying the volume - if (map->lastUpdate != 0) { - logWarning("unexpected index page map update, jumping from %" PRIu64 - " to %llu", - map->lastUpdate, virtualChapterNumber); - } - } - map->lastUpdate = virtualChapterNumber; - - if (chapterNumber >= geometry->chaptersPerVolume) { - return logErrorWithStringError( - UDS_INVALID_ARGUMENT, "chapter number %u exceeds maximum %u", - chapterNumber, geometry->chaptersPerVolume - 1); - } - if (indexPageNumber >= geometry->indexPagesPerChapter) { - return logErrorWithStringError( - UDS_INVALID_ARGUMENT, "index page number %u exceeds maximum %u", - indexPageNumber, geometry->indexPagesPerChapter - 1); - } - if (deltaListNumber 
>= geometry->deltaListsPerChapter) { - return logErrorWithStringError( - UDS_INVALID_ARGUMENT, "delta list number %u exceeds maximum %u", - deltaListNumber, geometry->deltaListsPerChapter - 1); - } - - if (indexPageNumber == (geometry->indexPagesPerChapter - 1)) { - /* - * There is no entry for the last index page of a chapter since its entry - * would always be geometry->deltaListsPerChapter - 1. - */ - return UDS_SUCCESS; - } - - size_t slot - = (chapterNumber * (geometry->indexPagesPerChapter - 1)) + indexPageNumber; - map->entries[slot] = (IndexPageMapEntry) deltaListNumber; - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int findIndexPageNumber(const IndexPageMap *map, - const UdsChunkName *name, - unsigned int chapterNumber, - unsigned int *indexPageNumberPtr) -{ - const Geometry *geometry = map->geometry; - if (chapterNumber >= geometry->chaptersPerVolume) { - return logErrorWithStringError( - UDS_INVALID_ARGUMENT, "chapter number %u exceeds maximum %u", - chapterNumber, geometry->chaptersPerVolume - 1); - } - - unsigned int deltaListNumber = hashToChapterDeltaList(name, geometry); - unsigned int slot = (chapterNumber * (geometry->indexPagesPerChapter - 1)); - unsigned int limit = slot + (geometry->indexPagesPerChapter - 1); - unsigned int indexPageNumber = 0; - for (; slot < limit; indexPageNumber++, slot++) { - if (deltaListNumber <= map->entries[slot]) { - break; - } - } - - // This should be a clear post-condition of the loop above, but just in case - // it's not obvious, the check is cheap. - int result = ASSERT((indexPageNumber < geometry->indexPagesPerChapter), - "index page number too large"); - if (result != UDS_SUCCESS) { - return result; - } - - *indexPageNumberPtr = indexPageNumber; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int getListNumberBounds(const IndexPageMap *map, - unsigned int chapterNumber, - unsigned int indexPageNumber, - IndexPageBounds *bounds) -{ - const Geometry *geometry = map->geometry; - int result = ASSERT((chapterNumber < geometry->chaptersPerVolume), - "chapter number is valid"); - if (result != UDS_SUCCESS) { - return result; - } - - result = ASSERT((indexPageNumber < geometry->indexPagesPerChapter), - "index page number is valid"); - if (result != UDS_SUCCESS) { - return result; - } - - unsigned int slot = chapterNumber * (geometry->indexPagesPerChapter - 1); - bounds->lowestList = ((indexPageNumber == 0) - ? 0 - : map->entries[slot + indexPageNumber - 1] + 1); - bounds->highestList = ((indexPageNumber == geometry->indexPagesPerChapter - 1) - ? 
geometry->deltaListsPerChapter - 1 - : map->entries[slot + indexPageNumber]); - - return UDS_SUCCESS; -} - -/*****************************************************************************/ -size_t indexPageMapSize(const Geometry *geometry) -{ - return sizeof(IndexPageMapEntry) * numEntries(geometry); -} - -/*****************************************************************************/ -static int writeIndexPageMap(IndexComponent *component, - BufferedWriter *writer, - unsigned int zone) -{ - int result = ASSERT((zone == 0), "unimplemented zone %d", zone); - if (result != UDS_SUCCESS) { - return result; - } - - IndexPageMap *map = indexComponentData(component); - - Buffer *buffer; - result = makeBuffer(INDEX_PAGE_MAP_MAGIC_LENGTH + sizeof(map->lastUpdate), - &buffer); - if (result != UDS_SUCCESS) { - return result; - } - result = putBytes(buffer, INDEX_PAGE_MAP_MAGIC_LENGTH, INDEX_PAGE_MAP_MAGIC); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return result; - } - result = putUInt64LEIntoBuffer(buffer, map->lastUpdate); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return result; - } - result = writeToBufferedWriter(writer, getBufferContents(buffer), - contentLength(buffer)); - freeBuffer(&buffer); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, - "cannot write index page map header"); - } - result = makeBuffer(indexPageMapSize(map->geometry), &buffer); - if (result != UDS_SUCCESS) { - return result; - } - result - = putUInt16LEsIntoBuffer(buffer, numEntries(map->geometry), map->entries); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return result; - } - result = writeToBufferedWriter(writer, getBufferContents(buffer), - contentLength(buffer)); - freeBuffer(&buffer); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, - "cannot write index page map data"); - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -uint64_t computeIndexPageMapSaveSize(const Geometry *geometry) -{ - return indexPageMapSize(geometry) + - INDEX_PAGE_MAP_MAGIC_LENGTH + sizeof(((IndexPageMap *) 0)->lastUpdate); -} - -/**********************************************************************/ -__attribute__((warn_unused_result)) -static int decodeIndexPageMap(Buffer *buffer, IndexPageMap *map) -{ - int result = getUInt64LEFromBuffer(buffer, &map->lastUpdate); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt16LEsFromBuffer(buffer, numEntries(map->geometry), - map->entries); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, - "%zu bytes decoded of %zu expected", - bufferLength(buffer) - contentLength(buffer), - bufferLength(buffer)); - return result; -} - -/*****************************************************************************/ -static int readIndexPageMap(ReadPortal *portal) -{ - IndexPageMap *map = indexComponentData(portal->component); - - BufferedReader *reader = NULL; - - int result = getBufferedReaderForPortal(portal, 0, &reader); - if (result != UDS_SUCCESS) { - return result; - } - - result = verifyBufferedData(reader, INDEX_PAGE_MAP_MAGIC, - INDEX_PAGE_MAP_MAGIC_LENGTH); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, "bad index page map saved magic"); - } - - Buffer *buffer; - result - = makeBuffer(sizeof(map->lastUpdate) + indexPageMapSize(map->geometry), - &buffer); - if (result != UDS_SUCCESS) { - return result; - } - result = readFromBufferedReader(reader, 
getBufferContents(buffer), - bufferLength(buffer)); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - logErrorWithStringError(result, "cannot read index page map data"); - return result; - } - result = resetBufferEnd(buffer, bufferLength(buffer)); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return result; - } - result = decodeIndexPageMap(buffer, map); - freeBuffer(&buffer); - if (result != UDS_SUCCESS) { - return result; - } - logDebug("read index page map, last update %llu", map->lastUpdate); - return UDS_SUCCESS; -} diff --git a/uds/indexPageMap.h b/uds/indexPageMap.h deleted file mode 100644 index 3767cdd..0000000 --- a/uds/indexPageMap.h +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/indexPageMap.h#2 $ - */ - -#ifndef INDEX_PAGE_MAP_H -#define INDEX_PAGE_MAP_H 1 - -#include "common.h" -#include "geometry.h" -#include "indexComponent.h" - -extern const IndexComponentInfo INDEX_PAGE_MAP_INFO; - -typedef struct indexPageMap IndexPageMap; - -typedef struct { - unsigned int lowestList; - unsigned int highestList; -} IndexPageBounds; - -/* - * Notes on IndexPageMap - * - * Each volume maintains an index page map which records how the chapter delta - * lists are distributed among the index pages for that chapter. - * - * The map is conceptually a two-dimensional array indexed by chapter number - * and index page number within the chapter. Each entry contains the number - * of the last delta list on that index page. In order to save memory, the - * information for the last page in each chapter is not recorded, as it is - * known from the geometry. - */ - -typedef uint16_t IndexPageMapEntry; - -struct indexPageMap { - const Geometry *geometry; - uint64_t lastUpdate; - IndexPageMapEntry *entries; -}; - -/** - * Create an index page map. - * - * @param geometry The geometry governing the index. - * @param mapPtr A pointer to hold the new map. - * - * @return A success or error code. - **/ -int makeIndexPageMap(const Geometry *geometry, IndexPageMap **mapPtr) - __attribute__((warn_unused_result)); - -/** - * Free an index page map. - * - * @param map The index page map to destroy. - **/ -void freeIndexPageMap(IndexPageMap *map); - -/** - * Get the virtual chapter number of the last update to the index page map. - * - * @param map The index page map - * - * @return the virtual chapter number of the last chapter updated - **/ -uint64_t getLastUpdate(const IndexPageMap *map); - -/** - * Update an index page map entry. - * - * @param map The map to update - * @param virtualChapterNumber The virtual chapter number being updated. 
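To make the two-dimensional layout described in the IndexPageMap notes above concrete: because the last index page of each chapter is omitted, the map holds chaptersPerVolume * (indexPagesPerChapter - 1) entries, and the entry for a (chapter, page) pair lives at a row-major slot, exactly as updateIndexPageMap computes it. A stand-alone sketch with illustrative geometry numbers (not taken from a real configuration):

#include <stdio.h>

int main(void)
{
  /* Illustrative geometry values only. */
  unsigned int chaptersPerVolume    = 1024;
  unsigned int indexPagesPerChapter = 16;

  size_t numEntries = (size_t) chaptersPerVolume * (indexPagesPerChapter - 1);

  unsigned int chapter = 7, page = 3;   /* page < indexPagesPerChapter - 1 */
  size_t slot = (size_t) chapter * (indexPagesPerChapter - 1) + page;

  printf("entries=%zu slot=%zu\n", numEntries, slot); /* entries=15360 slot=108 */
  return 0;
}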
- * @param chapterNumber The chapter of the entry to update - * @param indexPageNumber The index page of the entry to update - * @param deltaListNumber The value of the new entry - * - * @return UDS_SUCCESS or an error code - **/ -int updateIndexPageMap(IndexPageMap *map, - uint64_t virtualChapterNumber, - unsigned int chapterNumber, - unsigned int indexPageNumber, - unsigned int deltaListNumber) - __attribute__((warn_unused_result)); - -/** - * Find the page number of the index page in a chapter that will contain the - * chapter index entry for a given chunk name, if it exists. - * - * @param [in] map The map to search - * @param [in] name The chunk name - * @param [in] chapterNumber The chapter containing the index page - * @param [out] indexPageNumberPtr A pointer to hold the result, guaranteed to - * be a valid index page number on UDS_SUCCESS - * - * @return UDS_SUCCESS, or UDS_INVALID_ARGUMENT if the chapter number - * is out of range - **/ -int findIndexPageNumber(const IndexPageMap *map, - const UdsChunkName *name, - unsigned int chapterNumber, - unsigned int *indexPageNumberPtr) - __attribute__((warn_unused_result)); - -/** - * Get the lowest and highest numbered delta lists for the given immutable - * chapter index page from the index page map. - * - * @param map The index page map - * @param chapterNumber The chapter containing the delta list - * @param indexPageNumber The index page number within the chapter - * @param bounds A structure to hold the list number bounds - * for the given page - * - * @return UDS_SUCCESS or an error code - **/ -int getListNumberBounds(const IndexPageMap *map, - unsigned int chapterNumber, - unsigned int indexPageNumber, - IndexPageBounds *bounds) - __attribute__((warn_unused_result)); - -/** - * Compute the size of the index page map save image, including all headers. - * - * @param geometry The index geometry. - * - * @return The number of bytes required to save the index page map. - **/ -uint64_t computeIndexPageMapSaveSize(const Geometry *geometry); - -/** - * Escaped for testing.... - * - * @param geometry The index geometry. - * - * @return The number of bytes required for the page map data, - * exclusive of headers. - **/ -size_t indexPageMapSize(const Geometry *geometry) - __attribute__((warn_unused_result)); - -#endif // INDEX_PAGE_MAP_H diff --git a/uds/indexRouter.c b/uds/indexRouter.c deleted file mode 100644 index b9b0a9e..0000000 --- a/uds/indexRouter.c +++ /dev/null @@ -1,254 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/jasper/src/uds/indexRouter.c#7 $ - */ - -#include "indexRouter.h" - -#include "compiler.h" -#include "indexCheckpoint.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "requestQueue.h" -#include "zone.h" - -/** - * This is the request processing function invoked by the zone's RequestQueue - * worker thread. - * - * @param request the request to be indexed or executed by the zone worker - **/ -static void executeZoneRequest(Request *request) -{ - executeIndexRouterRequest(request->router, request); -} - -/** - * Construct and enqueue asynchronous control messages to add the chapter - * index for a given virtual chapter to the sparse chapter index cache. - * - * @param router the router containing the relevant queues - * @param index the index with the relevant cache and chapter - * @param virtualChapter the virtual chapter number of the chapter to cache - **/ -static void enqueueBarrierMessages(IndexRouter *router, - Index *index, - uint64_t virtualChapter) -{ - ZoneMessage barrier = { - .index = index, - .data = { - .barrier = { - .virtualChapter = virtualChapter, - } - } - }; - unsigned int zone; - for (zone = 0; zone < router->zoneCount; zone++) { - int result = launchZoneControlMessage(REQUEST_SPARSE_CACHE_BARRIER, - barrier, zone, router); - ASSERT_LOG_ONLY((result == UDS_SUCCESS), "barrier message allocation"); - } -} - -/** - * This is the request processing function for the triage stage queue. Each - * request is resolved in the master index, determining if it is a hook or - * not, and if a hook, what virtual chapter (if any) it might be found in. If - * a virtual chapter is found, this enqueues a sparse chapter cache barrier in - * every zone before enqueueing the request in its zone. - * - * @param request the request to triage - **/ -static void triageRequest(Request *request) -{ - IndexRouter *router = request->router; - Index *index = router->index; - - // Check if the name is a hook in the index pointing at a sparse chapter. - uint64_t sparseVirtualChapter = triageIndexRequest(index, request); - if (sparseVirtualChapter != UINT64_MAX) { - // Generate and place a barrier request on every zone queue. - enqueueBarrierMessages(router, index, sparseVirtualChapter); - } - - enqueueRequest(request, STAGE_INDEX); -} - -/** - * Initialize the zone queues and the triage queue. - * - * @param router the router containing the queues - * @param geometry the geometry governing the indexes - * - * @return UDS_SUCCESS or error code - **/ -static int initializeLocalIndexQueues(IndexRouter *router, - const Geometry *geometry) -{ - unsigned int i; - for (i = 0; i < router->zoneCount; i++) { - int result = makeRequestQueue("indexW", &executeZoneRequest, - &router->zoneQueues[i]); - if (result != UDS_SUCCESS) { - return result; - } - } - - // The triage queue is only needed for sparse multi-zone indexes. 
- if ((router->zoneCount > 1) && isSparse(geometry)) { - int result = makeRequestQueue("triageW", &triageRequest, - &router->triageQueue); - if (result != UDS_SUCCESS) { - return result; - } - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -static INLINE RequestQueue *getZoneQueue(IndexRouter *router, - unsigned int zoneNumber) -{ - return router->zoneQueues[zoneNumber]; -} - -/**********************************************************************/ -int makeIndexRouter(IndexLayout *layout, - const Configuration *config, - const struct uds_parameters *userParams, - LoadType loadType, - IndexLoadContext *loadContext, - IndexRouterCallback callback, - IndexRouter **routerPtr) -{ - unsigned int zoneCount = getZoneCount(userParams); - IndexRouter *router; - int result = ALLOCATE_EXTENDED(IndexRouter, zoneCount, RequestQueue *, - "index router", &router); - if (result != UDS_SUCCESS) { - return result; - } - - router->callback = callback; - router->zoneCount = zoneCount; - - result = initializeLocalIndexQueues(router, config->geometry); - if (result != UDS_SUCCESS) { - freeIndexRouter(router); - return result; - } - - result = makeIndex(layout, config, userParams, router->zoneCount, loadType, - loadContext, &router->index); - if (result != UDS_SUCCESS) { - freeIndexRouter(router); - return logErrorWithStringError(result, "failed to create index"); - } - - router->needToSave = (router->index->loadedType != LOAD_LOAD); - *routerPtr = router; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int saveIndexRouter(IndexRouter *router) -{ - if (!router->needToSave) { - return UDS_SUCCESS; - } - int result = saveIndex(router->index); - router->needToSave = (result != UDS_SUCCESS); - return result; -} - -/**********************************************************************/ -void freeIndexRouter(IndexRouter *router) -{ - if (router == NULL) { - return; - } - requestQueueFinish(router->triageQueue); - unsigned int i; - for (i = 0; i < router->zoneCount; i++) { - requestQueueFinish(router->zoneQueues[i]); - } - freeIndex(router->index); - FREE(router); -} - -/**********************************************************************/ -RequestQueue *selectIndexRouterQueue(IndexRouter *router, - Request *request, - RequestStage nextStage) -{ - if (request->isControlMessage) { - return getZoneQueue(router, request->zoneNumber); - } - - if (nextStage == STAGE_TRIAGE) { - // The triage queue is only needed for multi-zone sparse indexes and won't - // be allocated by the router if not needed, so simply check for NULL. - if (router->triageQueue != NULL) { - return router->triageQueue; - } - // Dense index or single zone, so route it directly to the zone queue. 
- } else if (nextStage != STAGE_INDEX) { - ASSERT_LOG_ONLY(false, "invalid index stage: %d", nextStage); - return NULL; - } - - Index *index = router->index; - request->zoneNumber = getMasterIndexZone(index->masterIndex, - &request->chunkName); - return getZoneQueue(router, request->zoneNumber); -} - -/**********************************************************************/ -void executeIndexRouterRequest(IndexRouter *router, Request *request) -{ - if (request->isControlMessage) { - int result = dispatchIndexZoneControlRequest(request); - if (result != UDS_SUCCESS) { - logErrorWithStringError(result, "error executing control message: %d", - request->action); - } - request->status = result; - enterCallbackStage(request); - return; - } - - router->needToSave = true; - if (request->requeued && !isSuccessful(request->status)) { - request->status = makeUnrecoverable(request->status); - router->callback(request); - return; - } - - Index *index = router->index; - int result = dispatchIndexRequest(index, request); - if (result == UDS_QUEUED) { - // Take the request off the pipeline. - return; - } - - request->status = result; - router->callback(request); -} diff --git a/uds/indexRouter.h b/uds/indexRouter.h deleted file mode 100644 index a96262b..0000000 --- a/uds/indexRouter.h +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/indexRouter.h#3 $ - */ - -#ifndef INDEX_ROUTER_H -#define INDEX_ROUTER_H - -#include "compiler.h" -#include "index.h" -#include "indexSession.h" -#include "request.h" - -/** - * Callback after a query, update or remove request completes and fills in - * select fields in the request: status for all requests, oldMetadata and - * hashExists for query and update requests. - * - * @param request request object. - **/ -typedef void (*IndexRouterCallback)(Request *request); - -struct indexRouter { - IndexRouterCallback callback; - unsigned int zoneCount; - bool needToSave; - Index *index; - RequestQueue *triageQueue; - RequestQueue *zoneQueues[]; -}; - -/** - * Construct and initialize an IndexRouter instance. - * - * @param layout the IndexLayout that describes the stored index - * @param config the configuration to use - * @param userParams the index session parameters. If NULL, the default - * session parameters will be used. 
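The zoneQueues[] member of struct indexRouter above is a C99 flexible array member, which is why makeIndexRouter (earlier in indexRouter.c) sizes the allocation with ALLOCATE_EXTENDED rather than a plain ALLOCATE. A generic sketch of the same idiom using calloc; the struct here is a stand-in, not the real IndexRouter:

#include <stdlib.h>

struct router_like {
  unsigned int  zoneCount;
  void         *zoneQueues[];   /* flexible array member: no storage of its own */
};

static struct router_like *allocateExtended(unsigned int zoneCount)
{
  /* One zeroed allocation covers the fixed part plus zoneCount trailing
   * pointers, which the flexible array member then indexes. */
  struct router_like *r =
      calloc(1, sizeof(*r) + zoneCount * sizeof(r->zoneQueues[0]));
  if (r != NULL) {
    r->zoneCount = zoneCount;
  }
  return r;
}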
- * @param loadType selects whether to create, load, or rebuild the index - * @param loadContext the index load context to use - * @param callback the function to invoke when a request completes or fails - * @param routerPtr a pointer in which to store the new router - * - * @return UDS_SUCCESS or an error code - **/ -int makeIndexRouter(IndexLayout *layout, - const Configuration *config, - const struct uds_parameters *userParams, - LoadType loadType, - IndexLoadContext *loadContext, - IndexRouterCallback callback, - IndexRouter **routerPtr) - __attribute__((warn_unused_result)); - -/** - * Executes the index operation for a UDS request and calls the callback upon - * completion. - * - * @param router The index router. - * @param request A pointer to the Request to process. - **/ -void executeIndexRouterRequest(IndexRouter *router, Request *request); - -/** - * Save the index router state to persistent storage. - * - * It is the responsibility of the caller to ensure that there are no other - * uses of the index during a call to this method. It is necessary that there - * be no index requests from any block context nor any other attempt to save - * the index until after a call to saveIndexRouter returns. - * - * @param router the index router to save - * - * @return UDS_SUCCESS if successful. - **/ -int saveIndexRouter(IndexRouter *router) __attribute__((warn_unused_result)); - -/** - * Destroy the index router and free its memory. - * - * @param router the index router to destroy (may be NULL) - * - * @return UDS_SUCCESS if successful. - **/ -void freeIndexRouter(IndexRouter *router); - -/** - * Select and return the request queue responsible for executing the next - * index stage of a request, updating the request with any associated state - * (such as the zone number for UDS requests on a local index). - * - * @param router The index router. - * @param request The Request destined for the queue. - * @param nextStage The next request stage (STAGE_TRIAGE or STAGE_INDEX). - * - * @return the next index stage queue (the local triage queue, local zone - * queue, or remote RPC send queue) - **/ -RequestQueue *selectIndexRouterQueue(IndexRouter *router, - Request *request, - RequestStage nextStage); - -/** - * Wait for the index router to finish all operations that access a local - * storage device. - * - * @param router The index router. - **/ -static INLINE void waitForIdleIndexRouter(IndexRouter *router) -{ - waitForIdleChapterWriter(router->index->chapterWriter); -} - -#endif /* INDEX_ROUTER_H */ diff --git a/uds/indexSession.c b/uds/indexSession.c deleted file mode 100644 index 15e5b3f..0000000 --- a/uds/indexSession.c +++ /dev/null @@ -1,554 +0,0 @@ -/* - * %Copyright% - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/jasper/src/uds/indexSession.c#10 $ - */ - -#include "indexSession.h" - -#include "indexCheckpoint.h" -#include "indexRouter.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "requestQueue.h" - -/**********************************************************************/ -static void collectStats(const struct uds_index_session *indexSession, - UdsContextStats *stats) -{ - const SessionStats *sessionStats = &indexSession->stats; - - stats->currentTime = asTimeT(currentTime(CLOCK_REALTIME)); - - stats->postsFound = READ_ONCE(sessionStats->postsFound); - stats->inMemoryPostsFound = READ_ONCE(sessionStats->postsFoundOpenChapter); - stats->densePostsFound = READ_ONCE(sessionStats->postsFoundDense); - stats->sparsePostsFound = READ_ONCE(sessionStats->postsFoundSparse); - stats->postsNotFound = READ_ONCE(sessionStats->postsNotFound); - stats->updatesFound = READ_ONCE(sessionStats->updatesFound); - stats->updatesNotFound = READ_ONCE(sessionStats->updatesNotFound); - stats->deletionsFound = READ_ONCE(sessionStats->deletionsFound); - stats->deletionsNotFound = READ_ONCE(sessionStats->deletionsNotFound); - stats->queriesFound = READ_ONCE(sessionStats->queriesFound); - stats->queriesNotFound = READ_ONCE(sessionStats->queriesNotFound); - stats->requests = READ_ONCE(sessionStats->requests); -} - -/**********************************************************************/ -static void handleCallbacks(Request *request) -{ - if (request->status == UDS_SUCCESS) { - // Measure the turnaround time of this request and include that time, - // along with the rest of the request, in the context's StatCounters. - updateRequestContextStats(request); - } - - if (request->callback != NULL) { - // The request has specified its own callback and does not expect to be - // freed. - struct uds_index_session *indexSession = request->session; - request->found = (request->location != LOC_UNAVAILABLE); - request->callback((UdsRequest *) request); - // We do this release after the callback because of the contract of the - // udsFlushIndexSession method. - releaseIndexSession(indexSession); - return; - } - - // Should not get here, because this is either a control message or it has a - // callback method. 
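collectStats above reads counters that the callback thread updates concurrently, hence the READ_ONCE wrapper around every field. As a rough analog only (UDS uses its own READ_ONCE/WRITE_ONCE macros, not C11 atomics), relaxed atomics express the same one-writer, one-reader counter pattern:

#include <stdatomic.h>
#include <stdint.h>

/* Rough analog of the pattern, not the UDS implementation. One thread bumps
 * the counter; another takes an untorn snapshot of it. */
static _Atomic uint64_t examplePostsFound;

void countPostFound(void)              /* writer: callback thread   */
{
  atomic_fetch_add_explicit(&examplePostsFound, 1, memory_order_relaxed);
}

uint64_t snapshotPostsFound(void)      /* reader: stats collection  */
{
  return atomic_load_explicit(&examplePostsFound, memory_order_relaxed);
}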
- freeRequest(request); -} - -/**********************************************************************/ -int checkIndexSession(struct uds_index_session *indexSession) -{ - lockMutex(&indexSession->requestMutex); - unsigned int state = indexSession->state; - unlockMutex(&indexSession->requestMutex); - - if (state == IS_FLAG_LOADED) { - return UDS_SUCCESS; - } else if (state & IS_FLAG_DISABLED) { - return UDS_DISABLED; - } else if ((state & IS_FLAG_LOADING) - || (state & IS_FLAG_SUSPENDED) - || (state & IS_FLAG_WAITING)) { - return UDS_SUSPENDED; - } - - return UDS_NO_INDEXSESSION; -} - -/**********************************************************************/ -int getIndexSession(struct uds_index_session *indexSession) -{ - lockMutex(&indexSession->requestMutex); - indexSession->requestCount++; - unlockMutex(&indexSession->requestMutex); - - int result = checkIndexSession(indexSession); - if (result != UDS_SUCCESS) { - releaseIndexSession(indexSession); - return result; - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -void releaseIndexSession(struct uds_index_session *indexSession) -{ - lockMutex(&indexSession->requestMutex); - if (--indexSession->requestCount == 0) { - broadcastCond(&indexSession->requestCond); - } - unlockMutex(&indexSession->requestMutex); -} - -/**********************************************************************/ -int startLoadingIndexSession(struct uds_index_session *indexSession) -{ - int result; - lockMutex(&indexSession->requestMutex); - if (indexSession->state & IS_FLAG_SUSPENDED) { - result = UDS_SUSPENDED; - } else if (indexSession->state != 0) { - result = UDS_INDEXSESSION_IN_USE; - } else { - indexSession->state |= IS_FLAG_LOADING; - result = UDS_SUCCESS; - } - unlockMutex(&indexSession->requestMutex); - return result; -} - -/**********************************************************************/ -void finishLoadingIndexSession(struct uds_index_session *indexSession, - int result) -{ - lockMutex(&indexSession->requestMutex); - indexSession->state &= ~IS_FLAG_LOADING; - if (result == UDS_SUCCESS) { - indexSession->state |= IS_FLAG_LOADED; - } - broadcastCond(&indexSession->requestCond); - unlockMutex(&indexSession->requestMutex); -} - -/**********************************************************************/ -void disableIndexSession(struct uds_index_session *indexSession) -{ - lockMutex(&indexSession->requestMutex); - indexSession->state |= IS_FLAG_DISABLED; - unlockMutex(&indexSession->requestMutex); -} - -/**********************************************************************/ -int makeEmptyIndexSession(struct uds_index_session **indexSessionPtr) -{ - struct uds_index_session *session; - int result = ALLOCATE(1, struct uds_index_session, __func__, &session); - if (result != UDS_SUCCESS) { - return result; - } - - result = initMutex(&session->requestMutex); - if (result != UDS_SUCCESS) { - FREE(session); - return result; - } - - result = initCond(&session->requestCond); - if (result != UDS_SUCCESS) { - destroyMutex(&session->requestMutex); - FREE(session); - return result; - } - - result = initMutex(&session->loadContext.mutex); - if (result != UDS_SUCCESS) { - destroyCond(&session->requestCond); - destroyMutex(&session->requestMutex); - FREE(session); - return result; - } - - result = initCond(&session->loadContext.cond); - if (result != UDS_SUCCESS) { - destroyMutex(&session->loadContext.mutex); - destroyCond(&session->requestCond); - destroyMutex(&session->requestMutex); - FREE(session); - return 
result; - } - - result = makeRequestQueue("callbackW", &handleCallbacks, - &session->callbackQueue); - if (result != UDS_SUCCESS) { - destroyCond(&session->loadContext.cond); - destroyMutex(&session->loadContext.mutex); - destroyCond(&session->requestCond); - destroyMutex(&session->requestMutex); - FREE(session); - return result; - } - - *indexSessionPtr = session; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int udsSuspendIndexSession(struct uds_index_session *session, bool save) -{ - int result; - bool saveIndex = false; - bool suspendIndex = false; - lockMutex(&session->requestMutex); - // Wait for any pending close operation to complete. - while (session->state & IS_FLAG_CLOSING) { - waitCond(&session->requestCond, &session->requestMutex); - } - if ((session->state & IS_FLAG_WAITING) - || (session->state & IS_FLAG_DESTROYING)) { - result = EBUSY; - } else if (session->state & IS_FLAG_SUSPENDED) { - result = UDS_SUCCESS; - } else if (session->state & IS_FLAG_LOADING) { - session->state |= IS_FLAG_WAITING; - suspendIndex = true; - result = UDS_SUCCESS; - } else if (!(session->state & IS_FLAG_LOADED)) { - session->state |= IS_FLAG_SUSPENDED; - broadcastCond(&session->requestCond); - result = UDS_SUCCESS; - } else { - saveIndex = save; - if (saveIndex) { - session->state |= IS_FLAG_WAITING; - } else { - session->state |= IS_FLAG_SUSPENDED; - broadcastCond(&session->requestCond); - } - result = UDS_SUCCESS; - } - unlockMutex(&session->requestMutex); - - if (!saveIndex && !suspendIndex) { - return result; - } - - if (saveIndex) { - result = udsSaveIndex(session); - lockMutex(&session->requestMutex); - session->state &= ~IS_FLAG_WAITING; - session->state |= IS_FLAG_SUSPENDED; - broadcastCond(&session->requestCond); - unlockMutex(&session->requestMutex); - return result; - } - - lockMutex(&session->loadContext.mutex); - switch (session->loadContext.status) { - case INDEX_OPENING: - session->loadContext.status = INDEX_SUSPENDING; - - // Wait until the index indicates that it is not replaying. - while ((session->loadContext.status != INDEX_SUSPENDED) - && (session->loadContext.status != INDEX_READY)) { - waitCond(&session->loadContext.cond, - &session->loadContext.mutex); - } - break; - - case INDEX_READY: - // Index load does not need to be suspended. - break; - - case INDEX_SUSPENDED: - case INDEX_SUSPENDING: - case INDEX_FREEING: - default: - // These cases should not happen. 
- ASSERT_LOG_ONLY(false, "Bad load context state %u", - session->loadContext.status); - break; - } - unlockMutex(&session->loadContext.mutex); - - lockMutex(&session->requestMutex); - session->state &= ~IS_FLAG_WAITING; - session->state |= IS_FLAG_SUSPENDED; - broadcastCond(&session->requestCond); - unlockMutex(&session->requestMutex); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int udsResumeIndexSession(struct uds_index_session *session) -{ - lockMutex(&session->requestMutex); - if (session->state & IS_FLAG_WAITING) { - unlockMutex(&session->requestMutex); - return EBUSY; - } - - /* If not suspended, just succeed */ - if (!(session->state & IS_FLAG_SUSPENDED)) { - unlockMutex(&session->requestMutex); - return UDS_SUCCESS; - } - - if (!(session->state & IS_FLAG_LOADING)) { - session->state &= ~IS_FLAG_SUSPENDED; - unlockMutex(&session->requestMutex); - return UDS_SUCCESS; - } - - session->state |= IS_FLAG_WAITING; - unlockMutex(&session->requestMutex); - - lockMutex(&session->loadContext.mutex); - switch (session->loadContext.status) { - case INDEX_SUSPENDED: - session->loadContext.status = INDEX_OPENING; - // Notify the index to start replaying again. - broadcastCond(&session->loadContext.cond); - break; - - case INDEX_READY: - // There is no index rebuild to resume. - break; - - case INDEX_OPENING: - case INDEX_SUSPENDING: - case INDEX_FREEING: - default: - // These cases should not happen; do nothing. - ASSERT_LOG_ONLY(false, "Bad load context state %u", - session->loadContext.status); - break; - } - unlockMutex(&session->loadContext.mutex); - - lockMutex(&session->requestMutex); - session->state &= ~IS_FLAG_WAITING; - session->state &= ~IS_FLAG_SUSPENDED; - broadcastCond(&session->requestCond); - unlockMutex(&session->requestMutex); - return UDS_SUCCESS; -} - -/**********************************************************************/ -static void waitForNoRequestsInProgress(struct uds_index_session *indexSession) -{ - lockMutex(&indexSession->requestMutex); - while (indexSession->requestCount > 0) { - waitCond(&indexSession->requestCond, &indexSession->requestMutex); - } - unlockMutex(&indexSession->requestMutex); -} - -/**********************************************************************/ -int saveAndFreeIndex(struct uds_index_session *indexSession) -{ - int result = UDS_SUCCESS; - IndexRouter *router = indexSession->router; - if (router != NULL) { - lockMutex(&indexSession->requestMutex); - bool suspended = (indexSession->state & IS_FLAG_SUSPENDED); - unlockMutex(&indexSession->requestMutex); - if (!suspended) { - result = saveIndexRouter(router); - if (result != UDS_SUCCESS) { - logWarningWithStringError(result, "ignoring error from saveIndexRouter"); - } - } - freeIndexRouter(router); - indexSession->router = NULL; - - // Reset all index state that happens to be in the index session, so it - // doesn't affect any future index. - lockMutex(&indexSession->loadContext.mutex); - indexSession->loadContext.status = INDEX_OPENING; - unlockMutex(&indexSession->loadContext.mutex); - - lockMutex(&indexSession->requestMutex); - // Only the suspend bit will remain relevant. 
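waitForNoRequestsInProgress above, together with getIndexSession and releaseIndexSession earlier in indexSession.c, forms a counted-reference rendezvous: increment under a mutex, decrement and broadcast when the count reaches zero, and sleep on the condition variable until it does. A stand-alone pthread sketch of the same pattern (the real code uses UDS's own Mutex/CondVar wrappers, not pthreads directly):

#include <pthread.h>

static pthread_mutex_t requestMutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  requestCond  = PTHREAD_COND_INITIALIZER;
static int requestCount;

void acquire(void)                        /* cf. getIndexSession()     */
{
  pthread_mutex_lock(&requestMutex);
  requestCount++;
  pthread_mutex_unlock(&requestMutex);
}

void release(void)                        /* cf. releaseIndexSession() */
{
  pthread_mutex_lock(&requestMutex);
  if (--requestCount == 0) {
    pthread_cond_broadcast(&requestCond);
  }
  pthread_mutex_unlock(&requestMutex);
}

void waitForIdle(void)                    /* cf. waitForNoRequestsInProgress() */
{
  pthread_mutex_lock(&requestMutex);
  while (requestCount > 0) {
    pthread_cond_wait(&requestCond, &requestMutex);
  }
  pthread_mutex_unlock(&requestMutex);
}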
- indexSession->state &= IS_FLAG_SUSPENDED; - unlockMutex(&indexSession->requestMutex); - } - - logDebug("Closed index"); - return result; -} - -/**********************************************************************/ -int udsCloseIndex(struct uds_index_session *indexSession) -{ - lockMutex(&indexSession->requestMutex); - - // Wait for any pending suspend, resume or close operations to complete. - while ((indexSession->state & IS_FLAG_WAITING) - || (indexSession->state & IS_FLAG_CLOSING)) { - waitCond(&indexSession->requestCond, &indexSession->requestMutex); - } - - int result = UDS_SUCCESS; - if (indexSession->state & IS_FLAG_SUSPENDED) { - result = UDS_SUSPENDED; - } else if ((indexSession->state & IS_FLAG_DESTROYING) - || !(indexSession->state & IS_FLAG_LOADED)) { - // The index doesn't exist, hasn't finished loading, or is being destroyed. - result = UDS_NO_INDEXSESSION; - } else { - indexSession->state |= IS_FLAG_CLOSING; - } - unlockMutex(&indexSession->requestMutex); - if (result != UDS_SUCCESS) { - return result; - } - - logDebug("Closing index"); - waitForNoRequestsInProgress(indexSession); - result = saveAndFreeIndex(indexSession); - - lockMutex(&indexSession->requestMutex); - indexSession->state &= ~IS_FLAG_CLOSING; - broadcastCond(&indexSession->requestCond); - unlockMutex(&indexSession->requestMutex); - return result; -} - -/**********************************************************************/ -int udsDestroyIndexSession(struct uds_index_session *indexSession) -{ - logDebug("Destroying index session"); - - bool loadPending = false; - lockMutex(&indexSession->requestMutex); - - // Wait for any pending suspend, resume, or close operations to complete. - while ((indexSession->state & IS_FLAG_WAITING) - || (indexSession->state & IS_FLAG_CLOSING)) { - waitCond(&indexSession->requestCond, &indexSession->requestMutex); - } - - if (indexSession->state & IS_FLAG_DESTROYING) { - unlockMutex(&indexSession->requestMutex); - return EBUSY; - } - - indexSession->state |= IS_FLAG_DESTROYING; - loadPending = ((indexSession->state & IS_FLAG_LOADING) - && (indexSession->state & IS_FLAG_SUSPENDED)); - unlockMutex(&indexSession->requestMutex); - - if (loadPending) { - // Tell the index to terminate the rebuild. - lockMutex(&indexSession->loadContext.mutex); - if (indexSession->loadContext.status == INDEX_SUSPENDED) { - indexSession->loadContext.status = INDEX_FREEING; - broadcastCond(&indexSession->loadContext.cond); - } - unlockMutex(&indexSession->loadContext.mutex); - - // Wait until the load exits before proceeding. 
- lockMutex(&indexSession->requestMutex); - while (indexSession->state & IS_FLAG_LOADING) { - waitCond(&indexSession->requestCond, &indexSession->requestMutex); - } - unlockMutex(&indexSession->requestMutex); - } - - waitForNoRequestsInProgress(indexSession); - int result = saveAndFreeIndex(indexSession); - requestQueueFinish(indexSession->callbackQueue); - indexSession->callbackQueue = NULL; - destroyCond(&indexSession->loadContext.cond); - destroyMutex(&indexSession->loadContext.mutex); - destroyCond(&indexSession->requestCond); - destroyMutex(&indexSession->requestMutex); - logDebug("Destroyed index session"); - FREE(indexSession); - return result; -} - -/**********************************************************************/ -int udsFlushIndexSession(struct uds_index_session *indexSession) -{ - waitForNoRequestsInProgress(indexSession); - // Wait until any open chapter writes are complete - waitForIdleIndexRouter(indexSession->router); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int udsSaveIndex(struct uds_index_session *indexSession) -{ - waitForNoRequestsInProgress(indexSession); - // saveIndexRouter waits for open chapter writes to complete - return saveIndexRouter(indexSession->router); -} - -/**********************************************************************/ -int udsSetCheckpointFrequency(struct uds_index_session *indexSession, - unsigned int frequency) -{ - setIndexCheckpointFrequency(indexSession->router->index->checkpoint, - frequency); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int udsGetIndexConfiguration(struct uds_index_session *indexSession, - UdsConfiguration *conf) -{ - if (conf == NULL) { - return logErrorWithStringError(UDS_CONF_PTR_REQUIRED, - "received a NULL config pointer"); - } - int result = ALLOCATE(1, struct udsConfiguration, __func__, conf); - if (result == UDS_SUCCESS) { - **conf = indexSession->userConfig; - } - return result; -} - -/**********************************************************************/ -int udsGetIndexStats(struct uds_index_session *indexSession, - UdsIndexStats *stats) -{ - if (stats == NULL) { - return logErrorWithStringError(UDS_INDEX_STATS_PTR_REQUIRED, - "received a NULL index stats pointer"); - } - getIndexStats(indexSession->router->index, stats); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int udsGetIndexSessionStats(struct uds_index_session *indexSession, - UdsContextStats *stats) -{ - if (stats == NULL) { - return logWarningWithStringError(UDS_CONTEXT_STATS_PTR_REQUIRED, - "received a NULL context stats pointer"); - } - collectStats(indexSession, stats); - return UDS_SUCCESS; -} diff --git a/uds/indexSession.h b/uds/indexSession.h deleted file mode 100644 index 1467fd2..0000000 --- a/uds/indexSession.h +++ /dev/null @@ -1,234 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- *
- * $Id: //eng/uds-releases/jasper/src/uds/indexSession.h#6 $
- */
-
-#ifndef INDEX_SESSION_H
-#define INDEX_SESSION_H
-
-#include "atomicDefs.h"
-#include "config.h"
-#include "cpu.h"
-#include "opaqueTypes.h"
-#include "threads.h"
-#include "uds.h"
-
-/**
- * The bit position of flags used to indicate index session states.
- **/
-typedef enum {
- IS_FLAG_BIT_START = 8,
- /** Flag indicating that the session is loading */
- IS_FLAG_BIT_LOADING = IS_FLAG_BIT_START,
- /** Flag indicating that the session has been loaded */
- IS_FLAG_BIT_LOADED,
- /** Flag indicating that the session is disabled permanently */
- IS_FLAG_BIT_DISABLED,
- /** Flag indicating that the session is suspended */
- IS_FLAG_BIT_SUSPENDED,
- /** Flag indicating that the session is waiting for an index state change */
- IS_FLAG_BIT_WAITING,
- /** Flag indicating that the session is closing */
- IS_FLAG_BIT_CLOSING,
- /** Flag indicating that the session is being destroyed */
- IS_FLAG_BIT_DESTROYING,
-} IndexSessionFlagBit;
-
-/**
- * The index session state flags.
- **/
-typedef enum {
- IS_FLAG_LOADED = (1 << IS_FLAG_BIT_LOADED),
- IS_FLAG_LOADING = (1 << IS_FLAG_BIT_LOADING),
- IS_FLAG_DISABLED = (1 << IS_FLAG_BIT_DISABLED),
- IS_FLAG_SUSPENDED = (1 << IS_FLAG_BIT_SUSPENDED),
- IS_FLAG_WAITING = (1 << IS_FLAG_BIT_WAITING),
- IS_FLAG_CLOSING = (1 << IS_FLAG_BIT_CLOSING),
- IS_FLAG_DESTROYING = (1 << IS_FLAG_BIT_DESTROYING),
-} IndexSessionFlag;
-
-typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) sessionStats {
- uint64_t postsFound; /* Post calls that found an entry */
- uint64_t postsFoundOpenChapter; /* Post calls found in the open chapter */
- uint64_t postsFoundDense; /* Post calls found in the dense index */
- uint64_t postsFoundSparse; /* Post calls found in the sparse index */
- uint64_t postsNotFound; /* Post calls that did not find an entry */
- uint64_t updatesFound; /* Update calls that found an entry */
- uint64_t updatesNotFound; /* Update calls that did not find an entry */
- uint64_t deletionsFound; /* Delete calls that found an entry */
- uint64_t deletionsNotFound; /* Delete calls that did not find an entry */
- uint64_t queriesFound; /* Query calls that found an entry */
- uint64_t queriesNotFound; /* Query calls that did not find an entry */
- uint64_t requests; /* Total number of requests */
-} SessionStats;
-
-/**
- * States used in the index load context, reflecting the state of the index.
- **/
-typedef enum {
- /** The index has not been loaded or rebuilt completely */
- INDEX_OPENING = 0,
- /** The index is able to handle requests */
- INDEX_READY,
- /** The index has a pending request to suspend */
- INDEX_SUSPENDING,
- /** The index is suspended in the midst of a rebuild */
- INDEX_SUSPENDED,
- /** The index is being shut down while suspended */
- INDEX_FREEING,
-} IndexSuspendStatus;
-
-/**
- * The CondVar here must be notified when the status changes to
- * INDEX_SUSPENDED, in order to wake up the waiting udsSuspendIndexSession()
- * call. It must also be notified when the status changes away from
- * INDEX_SUSPENDED, to resume rebuilding the index from checkForSuspend() in
- * the index.
- **/
-typedef struct indexLoadContext {
- Mutex mutex;
- CondVar cond;
- IndexSuspendStatus status; // Covered by indexLoadContext.mutex.
-} IndexLoadContext; - -/** - * The request CondVar here must be notified when IS_FLAG_WAITING is cleared, - * in case udsCloseIndex() or udsDestroyIndexSession() is waiting on that flag. - * It must also be notified when IS_FLAG_CLOSING is cleared, in case - * udsSuspendIndexSession(), udsCloseIndex() or udsDestroyIndexSession() is - * waiting on that flag. - * Finally, it must also be notified when IS_FLAG_LOADING is cleared, to inform - * udsDestroyIndexSession() that the index session can be safely freed. - **/ -struct uds_index_session { - unsigned int state; // Covered by requestMutex. - IndexRouter *router; - RequestQueue *callbackQueue; - struct udsConfiguration userConfig; - IndexLoadContext loadContext; - // Asynchronous Request synchronization - Mutex requestMutex; - CondVar requestCond; - int requestCount; - // Request statistics, all owned by the callback thread - SessionStats stats; -}; - -/** - * Check that the index session is usable. - * - * @param indexSession the session to query - * - * @return UDS_SUCCESS or an error code - **/ -int checkIndexSession(struct uds_index_session *indexSession) - __attribute__((warn_unused_result)); - -/** - * Make sure that the IndexSession is allowed to load an index, and if so, set - * its state to indicate that the load has started. - * - * @param indexSession the session to load with - * - * @return UDS_SUCCESS, or an error code if an index already exists. - **/ -int startLoadingIndexSession(struct uds_index_session *indexSession) - __attribute__((warn_unused_result)); - -/** - * Update the IndexSession state after attempting to load an index, to indicate - * that the load has completed, and whether or not it succeeded. - * - * @param indexSession the session that was loading - * @param result the result of the load operation - **/ -void finishLoadingIndexSession(struct uds_index_session *indexSession, - int result); - -/** - * Disable an index session due to an error. - * - * @param indexSession the session to be disabled - **/ -void disableIndexSession(struct uds_index_session *indexSession); - -/** - * Acquire the index session for an asynchronous index request. - * - * The pointer must eventually be released with a corresponding call to - * releaseIndexSession(). - * - * @param indexSession The index session - * - * @return UDS_SUCCESS or an error code - **/ -int getIndexSession(struct uds_index_session *indexSession) - __attribute__((warn_unused_result)); - -/** - * Release a pointer to an index session. - * - * @param indexSession The session to release - **/ -void releaseIndexSession(struct uds_index_session *indexSession); - -/** - * Construct a new, empty index session. - * - * @param indexSessionPtr The pointer to receive the new session - * - * @return UDS_SUCCESS or an error code - **/ -int makeEmptyIndexSession(struct uds_index_session **indexSessionPtr) - __attribute__((warn_unused_result)); - -/** - * Save an index while the session is quiescent. - * - * During the call to #udsSaveIndex, there should be no other call to - * #udsSaveIndex and there should be no calls to #udsStartChunkOperation. - * - * @param indexSession The session to save - * - * @return Either #UDS_SUCCESS or an error code - **/ -int udsSaveIndex(struct uds_index_session *indexSession) - __attribute__((warn_unused_result)); - -/** - * Close the index by saving the underlying index. 
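The acquire/release pair described above brackets each asynchronous request. A sketch of that pattern, with the actual request handling left as a placeholder and the wrapper name withIndexSession chosen only for illustration:

/* Sketch only: the request-processing step is a placeholder. */
static int withIndexSession(struct uds_index_session *session)
{
  int result = getIndexSession(session);
  if (result != UDS_SUCCESS) {
    return result;              /* the session is not currently usable */
  }
  /* ... hand the request to the index here ... */
  releaseIndexSession(session); /* lets a pending close or destroy proceed */
  return UDS_SUCCESS;
}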
- * - * @param indexSession The index session to be shut down and freed - **/ -int saveAndFreeIndex(struct uds_index_session *indexSession); - -/** - * Set the checkpoint frequency of the grid. - * - * @param session The index session to be modified. - * @param frequency New checkpoint frequency. - * - * @return Either UDS_SUCCESS or an error code. - * - **/ -int udsSetCheckpointFrequency(struct uds_index_session *session, - unsigned int frequency) - __attribute__((warn_unused_result)); - -#endif /* INDEX_SESSION_H */ diff --git a/uds/indexState.c b/uds/indexState.c deleted file mode 100644 index 86b9fd3..0000000 --- a/uds/indexState.c +++ /dev/null @@ -1,512 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/indexState.c#6 $ - */ - -#include "indexState.h" - -#include "errors.h" -#include "indexComponent.h" -#include "indexLayout.h" -#include "logger.h" -#include "memoryAlloc.h" - - -/*****************************************************************************/ -int makeIndexState(IndexLayout *layout, - unsigned int numZones, - unsigned int maxComponents, - IndexState **statePtr) -{ - if (maxComponents == 0) { - return logErrorWithStringError( - UDS_INVALID_ARGUMENT, "cannot make index state with maxComponents 0"); - } - - IndexState *state = NULL; - int result = ALLOCATE_EXTENDED(IndexState, maxComponents, IndexComponent *, - "index state", &state); - if (result != UDS_SUCCESS) { - return result; - } - - state->count = 0; - state->layout = layout; - state->length = maxComponents; - state->loadZones = 0; - state->loadSlot = UINT_MAX; - state->saveSlot = UINT_MAX; - state->saving = false; - state->zoneCount = numZones; - - *statePtr = state; - return UDS_SUCCESS; -} - -/*****************************************************************************/ -void freeIndexState(IndexState **statePtr) -{ - IndexState *state = *statePtr; - *statePtr = NULL; - if (state != NULL) { - unsigned int i; - for (i = 0; i < state->count; ++i) { - freeIndexComponent(&state->entries[i]); - } - FREE(state); - } -} - -/*****************************************************************************/ -/** - * Add a component to the index state. - * - * @param state The index state. - * @param component The index component. - * - * @return UDS_SUCCESS or an error code. 
- **/ -static int addComponentToIndexState(IndexState *state, - IndexComponent *component) -{ - if (findIndexComponent(state, component->info) != NULL) { - return logErrorWithStringError( - UDS_INVALID_ARGUMENT, "cannot add state component %s: already present", - component->info->name); - } - - if (state->count >= state->length) { - return logErrorWithStringError( - UDS_RESOURCE_LIMIT_EXCEEDED, - "cannot add state component %s, %u components already added", - component->info->name, state->count); - } - - state->entries[state->count] = component; - ++state->count; - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int addIndexStateComponent(IndexState *state, - const IndexComponentInfo *info, - void *data, - void *context) -{ - IndexComponent *component = NULL; - int result = makeIndexComponent(state, info, state->zoneCount, data, context, - &component); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, - "cannot make region index component"); - } - - result = addComponentToIndexState(state, component); - if (result != UDS_SUCCESS) { - freeIndexComponent(&component); - return result; - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -IndexComponent *findIndexComponent(const IndexState *state, - const IndexComponentInfo *info) -{ - unsigned int i; - for (i = 0; i < state->count; ++i) { - IndexComponent *component = state->entries[i]; - if (info == component->info) { - return component; - } - } - return NULL; -} - -/*****************************************************************************/ -static const char *indexSaveTypeName(IndexSaveType saveType) -{ - return saveType == IS_SAVE ? "save" : "checkpoint"; -} - -/*****************************************************************************/ -int loadIndexState(IndexState *state, bool *replayPtr) -{ - int result = findLatestIndexSaveSlot(state->layout, &state->loadZones, - &state->loadSlot); - if (result != UDS_SUCCESS) { - return result; - } - - bool replayRequired = false; - unsigned int i; - for (i = 0; i < state->count; ++i) { - IndexComponent *component = state->entries[i]; - result = readIndexComponent(component); - if (result != UDS_SUCCESS) { - if (!missingIndexComponentRequiresReplay(component)) { - state->loadZones = 0; - state->loadSlot = UINT_MAX; - return logErrorWithStringError(result, "index component %s", - indexComponentName(component)); - } - replayRequired = true; - } - } - - state->loadZones = 0; - state->loadSlot = UINT_MAX; - if (replayPtr != NULL) { - *replayPtr = replayRequired; - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int prepareToSaveIndexState(IndexState *state, IndexSaveType saveType) -{ - if (state->saving) { - return logErrorWithStringError(UDS_BAD_STATE, - "already saving the index state"); - } - int result = setupIndexSaveSlot(state->layout, state->zoneCount, saveType, - &state->saveSlot); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, "cannot prepare index %s", - indexSaveTypeName(saveType)); - } - - return UDS_SUCCESS; -} - -/*****************************************************************************/ -/** - * Complete the saving of an index state. 
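Putting makeIndexState(), addIndexStateComponent(), and loadIndexState() together, a sketch of one possible lifecycle; the layout, component info, data, and context are placeholders supplied by the caller, and the single-zone, single-component sizing is only for illustration:

/* Sketch only: `layout`, `info`, `data`, and `context` are placeholders. */
static int loadOneComponentState(IndexLayout *layout,
                                 const IndexComponentInfo *info,
                                 void *data,
                                 void *context)
{
  IndexState *state = NULL;
  int result = makeIndexState(layout, 1, 1, &state); /* one zone, one slot */
  if (result != UDS_SUCCESS) {
    return result;
  }

  result = addIndexStateComponent(state, info, data, context);
  if (result == UDS_SUCCESS) {
    bool replayRequired = false;
    result = loadIndexState(state, &replayRequired); /* reads each component */
  }

  freeIndexState(&state);
  return result;
}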
- * - * @param state the index state - * - * @return UDS_SUCCESS or an error code - **/ -static int completeIndexSaving(IndexState *state) -{ - state->saving = false; - int result = commitIndexSave(state->layout, state->saveSlot); - state->saveSlot = UINT_MAX; - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, "cannot commit index state"); - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -static int cleanupSave(IndexState *state) -{ - int result = cancelIndexSave(state->layout, state->saveSlot); - state->saveSlot = UINT_MAX; - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, "cannot cancel index save"); - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int saveIndexState(IndexState *state) -{ - int result = prepareToSaveIndexState(state, IS_SAVE); - if (result != UDS_SUCCESS) { - return result; - } - - -unsigned int i; - for (i = 0; i < state->count; ++i) { - IndexComponent *component = state->entries[i]; - result = writeIndexComponent(component); - if (result != UDS_SUCCESS) { - cleanupSave(state); - return result; - } - } - return completeIndexSaving(state); -} - -/*****************************************************************************/ -int writeIndexStateCheckpoint(IndexState *state) -{ - int result = prepareToSaveIndexState(state, IS_CHECKPOINT); - if (result != UDS_SUCCESS) { - return result; - } - - unsigned int i; - for (i = 0; i < state->count; ++i) { - IndexComponent *component = state->entries[i]; - if (skipIndexComponentOnCheckpoint(component)) { - continue; - } - result = writeIndexComponent(component); - if (result != UDS_SUCCESS) { - cleanupSave(state); - return result; - } - } - - return completeIndexSaving(state); -} - -/*****************************************************************************/ -int startIndexStateCheckpoint(IndexState *state) -{ - int result = prepareToSaveIndexState(state, IS_CHECKPOINT); - if (result != UDS_SUCCESS) { - return result; - } - - state->saving = true; - - unsigned int i; - for (i = 0; i < state->count; ++i) { - IndexComponent *component = state->entries[i]; - if (skipIndexComponentOnCheckpoint(component)) { - continue; - } - result = startIndexComponentIncrementalSave(component); - if (result != UDS_SUCCESS) { - abortIndexStateCheckpoint(state); - return result; - } - } - - return result; -} - -/*****************************************************************************/ -int performIndexStateCheckpointChapterSynchronizedSaves(IndexState *state) -{ - if (!state->saving) { - return UDS_SUCCESS; - } - - unsigned int i; - for (i = 0; i < state->count; ++i) { - IndexComponent *component = state->entries[i]; - if (skipIndexComponentOnCheckpoint(component) || - !deferIndexComponentCheckpointToChapterWriter(component)) { - continue; - } - int result = performIndexComponentChapterWriterSave(component); - if (result != UDS_SUCCESS) { - return result; - } - } - - return UDS_SUCCESS; -} - -/** - * Wrapper function to do a zone-based checkpoint operation. 
- * - * @param [in] state the index state - * @param [in] zone the zone number - * @param [in] compFunc the index component function to use - * @param [out] completed if non-NULL, where to save the completion status - * - * @return UDS_SUCCESS or an error code - * - **/ -static int doIndexStateCheckpointInZone(IndexState *state, - unsigned int zone, - int (*compFunc)(IndexComponent *, - unsigned int, - CompletionStatus *), - CompletionStatus *completed) -{ - if (!state->saving) { - if (completed != NULL) { - *completed = CS_COMPLETED_PREVIOUSLY; - } - return UDS_SUCCESS; - } - - CompletionStatus status = CS_COMPLETED_PREVIOUSLY; - - unsigned int i; - for (i = 0; i < state->count; ++i) { - IndexComponent *component = state->entries[i]; - if (skipIndexComponentOnCheckpoint(component)) { - continue; - } - if (zone > 0 && !component->info->multiZone) { - continue; - } - CompletionStatus componentStatus = CS_NOT_COMPLETED; - int result = (*compFunc)(component, zone, &componentStatus); - if (result != UDS_SUCCESS) { - return result; - } - // compute rolling least status - if (componentStatus < status) { - status = componentStatus; - } - } - - if (completed != NULL) { - *completed = status; - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int performIndexStateCheckpointInZone(IndexState *state, - unsigned int zone, - CompletionStatus *completed) -{ - return doIndexStateCheckpointInZone(state, zone, - &performIndexComponentZoneSave, - completed); -} - -/*****************************************************************************/ -int finishIndexStateCheckpointInZone(IndexState *state, - unsigned int zone, - CompletionStatus *completed) -{ - return doIndexStateCheckpointInZone(state, zone, - &finishIndexComponentZoneSave, - completed); -} - -/*****************************************************************************/ -int abortIndexStateCheckpointInZone(IndexState *state, - unsigned int zone, - CompletionStatus *completed) -{ - return doIndexStateCheckpointInZone(state, zone, - &abortIndexComponentZoneSave, completed); -} - -/*****************************************************************************/ -int finishIndexStateCheckpoint(IndexState *state) -{ - if (!state->saving) { - return UDS_SUCCESS; - } - - unsigned int i; - for (i = 0; i < state->count; ++i) { - IndexComponent *component = state->entries[i]; - if (skipIndexComponentOnCheckpoint(component)) { - continue; - } - int result = finishIndexComponentIncrementalSave(component); - if (result != UDS_SUCCESS) { - abortIndexStateCheckpoint(state); - return result; - } - } - - int result = completeIndexSaving(state); - if (result != UDS_SUCCESS) { - return result; - } - - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int abortIndexStateCheckpoint(IndexState *state) -{ - if (!state->saving) { - return logErrorWithStringError(UDS_BAD_STATE, - "not saving the index state"); - } - - logError("aborting index state checkpoint"); - - int result = UDS_SUCCESS; - unsigned int i; - for (i = 0; i < state->count; ++i) { - IndexComponent *component = state->entries[i]; - if (skipIndexComponentOnCheckpoint(component)) { - continue; - } - int tmp = abortIndexComponentIncrementalSave(component); - if (result == UDS_SUCCESS) { - result = tmp; - } - } - - cleanupSave(state); - state->saving = false; - - return result; -} - -/*****************************************************************************/ -int 
discardIndexStateData(IndexState *state) -{ - int result = discardIndexSaves(state->layout, true); - state->saveSlot = UINT_MAX; - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, - "%s: cannot destroy all index saves", - __func__); - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int discardLastIndexStateSave(IndexState *state) -{ - int result = discardIndexSaves(state->layout, false); - state->saveSlot = UINT_MAX; - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, - "%s: cannot destroy latest index save", - __func__); - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -Buffer *getStateIndexStateBuffer(IndexState *state, IOAccessMode mode) -{ - unsigned int slot = mode == IO_READ ? state->loadSlot : state->saveSlot; - return getIndexStateBuffer(state->layout, slot); -} - -/*****************************************************************************/ -int openStateBufferedReader(IndexState *state, - RegionKind kind, - unsigned int zone, - BufferedReader **readerPtr) -{ - return openIndexBufferedReader(state->layout, state->loadSlot, kind, zone, - readerPtr); -} - -/*****************************************************************************/ -int openStateBufferedWriter(IndexState *state, - RegionKind kind, - unsigned int zone, - BufferedWriter **writerPtr) -{ - return openIndexBufferedWriter(state->layout, state->saveSlot, kind, zone, - writerPtr); -} diff --git a/uds/indexState.h b/uds/indexState.h deleted file mode 100644 index 82899c1..0000000 --- a/uds/indexState.h +++ /dev/null @@ -1,312 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/indexState.h#5 $ - */ - -#ifndef INDEX_STATE_H -#define INDEX_STATE_H 1 - -#include "buffer.h" -#include "indexComponent.h" - - -/** - * Used here and in SingleFileLayout. - **/ -typedef enum { - IS_SAVE, - IS_CHECKPOINT, - NO_SAVE = 9999, -} IndexSaveType; - -/* - * Used in getStateIndexStateBuffer to identify whether the index state buffer - * is for the index being loaded or the index being saved. - */ -typedef enum { - IO_READ = 0x1, - IO_WRITE = 0x2, -} IOAccessMode; - -/** - * The index state structure controls the loading and saving of the index - * state. 
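A single-zone sketch of the incremental checkpoint entry points implemented above; in the running index the per-zone steps are driven from the zone threads and the chapter writer, so this serial form is only for illustration:

/* Sketch only: walks the incremental checkpoint calls for zone 0. */
static int checkpointSingleZone(IndexState *state)
{
  int result = startIndexStateCheckpoint(state);
  if (result != UDS_SUCCESS) {
    return result;
  }

  CompletionStatus status = CS_NOT_COMPLETED;
  result = performIndexStateCheckpointInZone(state, 0, &status);
  if (result == UDS_SUCCESS) {
    result = finishIndexStateCheckpointInZone(state, 0, &status);
  }
  if (result != UDS_SUCCESS) {
    abortIndexStateCheckpoint(state);       /* cancels the save slot */
    return result;
  }

  /* finishIndexStateCheckpoint() aborts internally if a component fails. */
  return finishIndexStateCheckpoint(state); /* commits the save slot */
}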
- **/ -typedef struct indexState { - struct indexLayout *layout; - unsigned int zoneCount; // number of index zones to use - unsigned int loadZones; - unsigned int loadSlot; - unsigned int saveSlot; - unsigned int count; // count of registered entries (<= length) - unsigned int length; // total span of array allocation - bool saving; // incremental save in progress - IndexComponent *entries[]; // array of index component entries -} IndexState; - -/** - * Make an index state object, - * - * @param [in] layout The index layout. - * @param [in] numZones The number of zones to use. - * @param [in] maxComponents The maximum number of components to be handled. - * @param [out] statePtr Where to store the index state object. - * - * @return UDS_SUCCESS or an error code - **/ -int makeIndexState(struct indexLayout *layout, - unsigned int numZones, - unsigned int maxComponents, - IndexState **statePtr) - __attribute__((warn_unused_result)); - -/** - * Free an index state (generically). - * - * @param statePtr The pointer to the index state to be freed and - * set to NULL. - **/ -void freeIndexState(IndexState **statePtr); - -/** - * Add an index component to an index state. - * - * @param state The index directory in which to add this component. - * @param info The index component file specification. - * @param data The per-component data structure. - * @param context The load/save context of the component. - * - * @return UDS_SUCCESS or an error code. - **/ -int addIndexStateComponent(IndexState *state, - const IndexComponentInfo *info, - void *data, - void *context) - __attribute__((warn_unused_result)); - -/** - * Load index state - * - * @param state The index state. - * @param replayPtr If set, the place to hold whether a replay is required. - * - * @return UDS_SUCCESS or error - **/ -int loadIndexState(IndexState *state, bool *replayPtr) - __attribute__((warn_unused_result)); - -/** - * Save the current index state, including the open chapter. - * - * @param state The index state. - * - * @return UDS_SUCCESS or error - **/ -int saveIndexState(IndexState *state) __attribute__((warn_unused_result)); - -/** - * Prepare to save the index state. - * - * @param state the index state - * @param saveType whether a checkpoint or save - * - * @return UDS_SUCCESS or an error code - **/ -int prepareToSaveIndexState(IndexState *state, IndexSaveType saveType) - __attribute__((warn_unused_result)); - -/** - * Write index checkpoint non-incrementally (for testing). - * - * @param state The index state. - * - * @return UDS_SUCCESS or error - **/ -int writeIndexStateCheckpoint(IndexState *state) - __attribute__((warn_unused_result)); - -/** - * Sets up an index state checkpoint which will proceed incrementally. - * May create the directory but does not actually write any data. - * - * @param state The index state. - * - * @return UDS_SUCCESS or an error code. - **/ -int startIndexStateCheckpoint(IndexState *state) - __attribute__((warn_unused_result)); - -/** - * Perform operations on index state checkpoints that are synchronized to - * the chapter writer thread. - * - * @param state The index state. - * - * @return UDS_SUCCESS or an error code. - **/ -int performIndexStateCheckpointChapterSynchronizedSaves(IndexState *state) - __attribute__((warn_unused_result)); - -/** - * Performs zone-specific (and, for zone 0, general) incremental checkpointing. - * - * @param [in] state The index state. - * @param [in] zone The zone number. 
- * @param [out] completed Set to whether the checkpoint has completed - * for this zone. - * - * @return UDS_SUCCESS or an error code. - **/ -int performIndexStateCheckpointInZone(IndexState *state, - unsigned int zone, - CompletionStatus *completed) - __attribute__((warn_unused_result)); - -/** - * Force the completion of an incremental index state checkpoint - * for a particular zone. - * - * @param [in] state The index state. - * @param [in] zone The zone number. - * @param [out] completed Set to whether the checkpoint has completed - * for this zone. - * - * @return UDS_SUCCESS or an error code. - **/ -int finishIndexStateCheckpointInZone(IndexState *state, - unsigned int zone, - CompletionStatus *completed) - __attribute__((warn_unused_result)); - -/** - * Force the completion of an incremental index state checkpoint once - * all zones are completed. - * - * @param [in] state The index state. - * - * @return UDS_SUCCESS or an error code. - **/ -int finishIndexStateCheckpoint(IndexState *state) - __attribute__((warn_unused_result)); - -/** - * Aborts an index state checkpoint which is proceeding incrementally - * for a particular zone. - * - * @param [in] state The index state. - * @param [in] zone The zone number. - * @param [out] completed Set to whether the checkpoint has completed or - * aborted for this zone. - * - * @return UDS_SUCCESS or an error code. - **/ -int abortIndexStateCheckpointInZone(IndexState *state, - unsigned int zone, - CompletionStatus *completed); - -/** - * Aborts an index state checkpoint which is proceeding incrementally, - * once all the zones are aborted. - * - * @param [in] state The index state. - * - * @return UDS_SUCCESS or an error code. - **/ -int abortIndexStateCheckpoint(IndexState *state); - -/** - * Remove or disable the index state data, for testing. - * - * @param state The index state - * - * @return UDS_SUCCESS or an error code - * - * @note the return value of this function is frequently ignored - **/ -int discardIndexStateData(IndexState *state); - -/** - * Discard the last index state save, for testing. - * - * @param state The index state - * - * @return UDS_SUCCESS or an error code - * - * @note the return value of this function is frequently ignored - **/ -int discardLastIndexStateSave(IndexState *state); - -/** - * Find index component, for testing. - * - * @param state The index state - * @param info The index component file specification - * - * @return The index component, or NULL if not found - **/ -IndexComponent *findIndexComponent(const IndexState *state, - const IndexComponentInfo *info) - __attribute__((warn_unused_result)); - -/** - * Get the indexStateBuffer for a specified mode. - * - * @param state The index state. - * @param mode One of IO_READ or IO_WRITE. - * - * @return the index state buffer - **/ -Buffer *getStateIndexStateBuffer(IndexState *state, IOAccessMode mode) - __attribute__((warn_unused_result)); - -/** - * Open a BufferedReader for a specified state, kind, and zone. - * This helper function is used by IndexComponent. - * - * @param state The index state. - * @param kind The kind if index save region to open. - * @param zone The zone number for the region. - * @param readerPtr Where to store the BufferedReader. - * - * @return UDS_SUCCESS or an error code. - **/ -int openStateBufferedReader(IndexState *state, - RegionKind kind, - unsigned int zone, - BufferedReader **readerPtr) - __attribute__((warn_unused_result)); - -/** - * Open a BufferedWriter for a specified state, kind, and zone. 
- * This helper function is used by IndexComponent. - * - * @param state The index state. - * @param kind The kind if index save region to open. - * @param zone The zone number for the region. - * @param writerPtr Where to store the BufferedWriter. - * - * @return UDS_SUCCESS or an error code. - **/ -int openStateBufferedWriter(IndexState *state, - RegionKind kind, - unsigned int zone, - BufferedWriter **writerPtr) - __attribute__((warn_unused_result)); - -#endif // INDEX_STATE_H diff --git a/uds/indexStateData.c b/uds/indexStateData.c deleted file mode 100644 index 62038f0..0000000 --- a/uds/indexStateData.c +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/indexStateData.c#3 $ - */ - -#include "indexStateData.h" - -#include "buffer.h" -#include "errors.h" -#include "index.h" -#include "logger.h" -#include "uds.h" - -/* The index state version header */ -typedef struct { - int32_t signature; - int32_t versionID; -} IndexStateVersion; - -/* The version 301 index state */ -typedef struct { - uint64_t newestChapter; - uint64_t oldestChapter; - uint64_t lastCheckpoint; - uint32_t unused; - uint32_t padding; -} IndexStateData301; - -static const IndexStateVersion INDEX_STATE_VERSION_301 = { - .signature = -1, - .versionID = 301, -}; - -/** - * The index state index component reader. 
- * - * @param portal the ReadPortal that handles the read of the component - * - * @return UDS_SUCCESS or an error code - **/ -static int readIndexStateData(ReadPortal *portal) -{ - Buffer *buffer = getStateIndexStateBuffer(portal->component->state, IO_READ); - int result = rewindBuffer(buffer, uncompactedAmount(buffer)); - if (result != UDS_SUCCESS) { - return result; - } - - IndexStateVersion fileVersion; - result = getInt32LEFromBuffer(buffer, &fileVersion.signature); - if (result != UDS_SUCCESS) { - return result; - } - result = getInt32LEFromBuffer(buffer, &fileVersion.versionID); - if (result != UDS_SUCCESS) { - return result; - } - - if (fileVersion.signature != -1 || fileVersion.versionID != 301) { - return logErrorWithStringError(UDS_UNSUPPORTED_VERSION, - "Index state version %d,%d is unsupported", - fileVersion.signature, - fileVersion.versionID); - } - - IndexStateData301 state; - result = getUInt64LEFromBuffer(buffer, &state.newestChapter); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt64LEFromBuffer(buffer, &state.oldestChapter); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt64LEFromBuffer(buffer, &state.lastCheckpoint); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt32LEFromBuffer(buffer, &state.unused); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt32LEFromBuffer(buffer, &state.padding); - if (result != UDS_SUCCESS) { - return result; - } - - if ((state.unused != 0) || (state.padding != 0)) { - return UDS_CORRUPT_COMPONENT; - } - - Index *index = indexComponentData(portal->component); - index->newestVirtualChapter = state.newestChapter; - index->oldestVirtualChapter = state.oldestChapter; - index->lastCheckpoint = state.lastCheckpoint; - return UDS_SUCCESS; -} - -/** - * The index state index component writer. - * - * @param component The component whose state is to be saved (an Index) - * @param writer The buffered writer. - * @param zone The zone to write. 
- * - * @return UDS_SUCCESS or an error code - **/ -static int writeIndexStateData(IndexComponent *component, - BufferedWriter *writer __attribute__((unused)), - unsigned int zone __attribute__((unused))) -{ - Buffer *buffer = getStateIndexStateBuffer(component->state, IO_WRITE); - int result = resetBufferEnd(buffer, 0); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt32LEIntoBuffer(buffer, INDEX_STATE_VERSION_301.signature); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt32LEIntoBuffer(buffer, INDEX_STATE_VERSION_301.versionID); - if (result != UDS_SUCCESS) { - return result; - } - - Index *index = indexComponentData(component); - IndexStateData301 state = { - .newestChapter = index->newestVirtualChapter, - .oldestChapter = index->oldestVirtualChapter, - .lastCheckpoint = index->lastCheckpoint, - }; - - result = putUInt64LEIntoBuffer(buffer, state.newestChapter); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt64LEIntoBuffer(buffer, state.oldestChapter); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt64LEIntoBuffer(buffer, state.lastCheckpoint); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt32LEIntoBuffer(buffer, state.unused); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt32LEIntoBuffer(buffer, state.padding); - if (result != UDS_SUCCESS) { - return result; - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ - -const IndexComponentInfo INDEX_STATE_INFO = { - .kind = RL_KIND_INDEX_STATE, - .name = "index state", - .saveOnly = false, - .chapterSync = true, - .multiZone = false, - .ioStorage = false, - .loader = readIndexStateData, - .saver = writeIndexStateData, - .incremental = NULL, -}; diff --git a/uds/indexStateData.h b/uds/indexStateData.h deleted file mode 100644 index b6aa9b2..0000000 --- a/uds/indexStateData.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/indexStateData.h#1 $ - */ - -#ifndef INDEX_STATE_DATA_H -#define INDEX_STATE_DATA_H 1 - -#include "indexComponent.h" - -extern const IndexComponentInfo INDEX_STATE_INFO; - -#endif /* not INDEX_STATE_DATA_H */ diff --git a/uds/indexVersion.c b/uds/indexVersion.c deleted file mode 100644 index df16e73..0000000 --- a/uds/indexVersion.c +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
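How INDEX_STATE_INFO gets registered is not shown in this patch; presumably the index adds it with addIndexStateComponent(), passing the Index as the component data so that readIndexStateData() and writeIndexStateData() can recover it through indexComponentData(). A sketch under that assumption, with the NULL context also an assumption:

/* Sketch only: `state` and `index` stand in for the index's state and
 * Index objects. */
static int registerIndexStateComponent(IndexState *state, Index *index)
{
  return addIndexStateComponent(state, &INDEX_STATE_INFO, index, NULL);
}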
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- *
- * $Id: //eng/uds-releases/jasper/src/uds/indexVersion.c#1 $
- */
-
-#include "indexVersion.h"
-
-void initializeIndexVersion(struct index_version *version,
- uint32_t superVersion)
-{
- /*
- * Version 1 was introduced for the first single file layout. It was used in
- * RHEL7 and in RHEL8.0 Beta. No kernel index ever used an earlier version.
- */
-
- /*
- * Version 2 was created when we discovered that the volume header page was
- * written in native endian format. It was used in RHEL8.0 and RHEL8.1. We
- * stopped reading the volume header page, and changed to version 2 so
- * that an index created on RHEL8 cannot be taken back and used on RHEL7.
- *
- * Versions 1 and 2 are identical in normal operation (i.e. after the index
- * is loaded).
- */
-
- /*
- * Version 3 was created when we discovered that the chapter index headers
- * were written in native endian format. It was first used in RHEL8.2 and is
- * the current version for new indices.
- *
- * Versions before 3 read and write native endian chapter headers. Version 3
- * reads chapter headers in any endian order, and writes little-endian
- * chapter headers.
- */
- bool chapterIndexHeaderNativeEndian = superVersion < 3;
-
- *version = (struct index_version) {
- .chapterIndexHeaderNativeEndian = chapterIndexHeaderNativeEndian,
- };
-}
diff --git a/uds/indexVersion.h b/uds/indexVersion.h
deleted file mode 100644
index f46b2e9..0000000
--- a/uds/indexVersion.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2020 Red Hat, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- *
- * $Id: //eng/uds-releases/jasper/src/uds/indexVersion.h#1 $
- */
-
-#ifndef INDEX_VERSION_H
-#define INDEX_VERSION_H
-
-#include "typeDefs.h"
-
-struct index_version {
- bool chapterIndexHeaderNativeEndian;
-};
-
-enum {
- SUPER_VERSION_MINIMUM = 1,
- SUPER_VERSION_MAXIMUM = 3,
- SUPER_VERSION_CURRENT = 3,
-};
-
-/**
- * Initialize the version parameters that we normally learn when loading the
- * index but need to use during index operation.
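A short sketch of initializeIndexVersion() in use, checking the endianness rule spelled out in the comments above; the helper name is illustrative:

#include <assert.h>

/* Sketch only: exercises the super-version-to-endianness rule. */
static void checkVersionBehavior(void)
{
  struct index_version oldVersion, newVersion;
  initializeIndexVersion(&oldVersion, 2);                     /* RHEL8.0/8.1-era layout */
  initializeIndexVersion(&newVersion, SUPER_VERSION_CURRENT); /* version 3 */

  /* Super block versions below 3 keep native-endian chapter index headers. */
  assert(oldVersion.chapterIndexHeaderNativeEndian);
  assert(!newVersion.chapterIndexHeaderNativeEndian);
}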
- * - * @param version The version parameters - * @param superVersion The SuperBlock version number - **/ -void initializeIndexVersion(struct index_version *version, - uint32_t superVersion); - -#endif // INDEX_VERSION_H diff --git a/uds/indexZone.c b/uds/indexZone.c deleted file mode 100644 index f3cd8ed..0000000 --- a/uds/indexZone.c +++ /dev/null @@ -1,401 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/indexZone.c#4 $ - */ - -#include "indexZone.h" - -#include "errors.h" -#include "index.h" -#include "indexCheckpoint.h" -#include "indexRouter.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "request.h" -#include "sparseCache.h" -#include "uds.h" - -/**********************************************************************/ -int makeIndexZone(struct index *index, unsigned int zoneNumber) -{ - IndexZone *zone; - int result = ALLOCATE(1, IndexZone, "index zone", &zone); - if (result != UDS_SUCCESS) { - return result; - } - - result = makeOpenChapter(index->volume->geometry, index->zoneCount, - &zone->openChapter); - if (result != UDS_SUCCESS) { - freeIndexZone(zone); - return result; - } - - result = makeOpenChapter(index->volume->geometry, index->zoneCount, - &zone->writingChapter); - if (result != UDS_SUCCESS) { - freeIndexZone(zone); - return result; - } - - zone->index = index; - zone->id = zoneNumber; - index->zones[zoneNumber] = zone; - - return UDS_SUCCESS; -} - -/**********************************************************************/ -void freeIndexZone(IndexZone *zone) -{ - if (zone == NULL) { - return; - } - - freeOpenChapter(zone->openChapter); - freeOpenChapter(zone->writingChapter); - FREE(zone); -} - -/**********************************************************************/ -bool isZoneChapterSparse(const IndexZone *zone, - uint64_t virtualChapter) -{ - return isChapterSparse(zone->index->volume->geometry, - zone->oldestVirtualChapter, - zone->newestVirtualChapter, - virtualChapter); -} - -/**********************************************************************/ -void setActiveChapters(IndexZone *zone) -{ - zone->oldestVirtualChapter = zone->index->oldestVirtualChapter; - zone->newestVirtualChapter = zone->index->newestVirtualChapter; -} - -/** - * Swap the open and writing chapters after blocking until there are no active - * chapter writers on the index. 
- * - * @param zone The zone swapping chapters - * - * @return UDS_SUCCESS or a return code - **/ -static int swapOpenChapter(IndexZone *zone) -{ - // Wait for any currently writing chapter to complete - int result = finishPreviousChapter(zone->index->chapterWriter, - zone->newestVirtualChapter); - if (result != UDS_SUCCESS) { - return result; - } - - // Swap the writing and open chapters - OpenChapterZone *tempChapter = zone->openChapter; - zone->openChapter = zone->writingChapter; - zone->writingChapter = tempChapter; - return UDS_SUCCESS; -} - -/** - * Advance to a new open chapter, and forget the oldest chapter in the - * index if necessary. - * - * @param zone The zone containing the chapter to reap - * - * @return UDS_SUCCESS or an error code - **/ -static int reapOldestChapter(IndexZone *zone) -{ - Index *index = zone->index; - unsigned int chaptersPerVolume = index->volume->geometry->chaptersPerVolume; - int result - = ASSERT(((zone->newestVirtualChapter - zone->oldestVirtualChapter) - <= chaptersPerVolume), - "newest (%llu) and oldest (%llu) virtual chapters " - "less than or equal to chapters per volume (%u)", - zone->newestVirtualChapter, zone->oldestVirtualChapter, - chaptersPerVolume); - if (result != UDS_SUCCESS) { - return result; - } - - setMasterIndexZoneOpenChapter(index->masterIndex, zone->id, - zone->newestVirtualChapter); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int executeSparseCacheBarrierMessage(IndexZone *zone, - BarrierMessageData *barrier) -{ - /* - * Check if the chapter index for the virtual chapter is already in the - * cache, and if it's not, rendezvous with the other zone threads to add the - * chapter index to the sparse index cache. - */ - return updateSparseCache(zone, barrier->virtualChapter); -} - -/** - * Handle notification that some other zone has closed its open chapter. If - * the chapter that was closed is still the open chapter for this zone, - * close it now in order to minimize skew. - * - * @param zone The zone receiving the notification - * @param chapterClosed The notification - * - * @return UDS_SUCCESS or an error code - **/ -static int handleChapterClosed(IndexZone *zone, - ChapterClosedMessageData *chapterClosed) -{ - if (zone->newestVirtualChapter == chapterClosed->virtualChapter) { - return openNextChapter(zone, NULL); - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -int dispatchIndexZoneControlRequest(Request *request) -{ - ZoneMessage *message = &request->zoneMessage; - IndexZone *zone = message->index->zones[request->zoneNumber]; - - switch (request->action) { - case REQUEST_SPARSE_CACHE_BARRIER: - return executeSparseCacheBarrierMessage(zone, &message->data.barrier); - - case REQUEST_ANNOUNCE_CHAPTER_CLOSED: - return handleChapterClosed(zone, &message->data.chapterClosed); - - default: - return ASSERT_FALSE("valid control message type: %d", request->action); - } -} - -/** - * Announce the closure of the current open chapter to the other zones. - * - * @param request The request which caused the chapter to close - * (may be NULL) - * @param zone The zone which first closed the chapter - * @param closedChapter The chapter which was closed - * - * @return UDS_SUCCESS or an error code - **/ -static int announceChapterClosed(Request *request, - IndexZone *zone, - uint64_t closedChapter) -{ - IndexRouter *router = ((request != NULL) ? 
request->router : NULL); - - ZoneMessage zoneMessage = { - .index = zone->index, - .data = { - .chapterClosed = { .virtualChapter = closedChapter } - } - }; - - unsigned int i; - for (i = 0; i < zone->index->zoneCount; i++) { - if (zone->id == i) { - continue; - } - int result; - if (router != NULL) { - result = launchZoneControlMessage(REQUEST_ANNOUNCE_CHAPTER_CLOSED, - zoneMessage, i, router); - } else { - // We're in a test which doesn't have zone queues, so we can just - // call the message function directly. - result = handleChapterClosed(zone->index->zones[i], - &zoneMessage.data.chapterClosed); - } - if (result != UDS_SUCCESS) { - return result; - } - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -int openNextChapter(IndexZone *zone, Request *request) -{ - logDebug("closing chapter %llu of zone %d after %u entries (%u short)", - zone->newestVirtualChapter, zone->id, zone->openChapter->size, - zone->openChapter->capacity - zone->openChapter->size); - - int result = swapOpenChapter(zone); - if (result != UDS_SUCCESS) { - return result; - } - - uint64_t closedChapter = zone->newestVirtualChapter++; - result = reapOldestChapter(zone); - if (result != UDS_SUCCESS) { - return logUnrecoverable(result, "reapOldestChapter failed"); - } - - resetOpenChapter(zone->openChapter); - - // begin, continue, or finish the checkpoint processing - // moved above startClosingChapter because some of the - // checkpoint processing now done by the chapter writer thread - result = processCheckpointing(zone->index, - zone->id, - zone->newestVirtualChapter); - if (result != UDS_SUCCESS) { - return result; - } - - unsigned int finishedZones = startClosingChapter(zone->index->chapterWriter, - zone->id, - zone->writingChapter); - if ((finishedZones == 1) && (zone->index->zoneCount > 1)) { - // This is the first zone of a multi-zone index to close this chapter, - // so inform the other zones in order to control zone skew. - result = announceChapterClosed(request, zone, closedChapter); - if (result != UDS_SUCCESS) { - return result; - } - } - - // If the chapter being opened won't overwrite the oldest chapter, we're - // done. - if (!areSamePhysicalChapter(zone->index->volume->geometry, - zone->newestVirtualChapter, - zone->oldestVirtualChapter)) { - return UDS_SUCCESS; - } - - uint64_t victim = zone->oldestVirtualChapter++; - if (finishedZones < zone->index->zoneCount) { - // We are not the last zone to close the chapter, so we're done - return UDS_SUCCESS; - } - - /* - * We are the last zone to close the chapter, so clean up the cache. That - * it is safe to let the last thread out of the previous chapter to do this - * relies on the fact that although the new open chapter shadows the oldest - * chapter in the cache, until we write the new open chapter to disk, we'll - * never look for it in the cache. - */ - return forgetChapter(zone->index->volume, victim, INVALIDATION_EXPIRE); -} - -/**********************************************************************/ -IndexRegion computeIndexRegion(const IndexZone *zone, - uint64_t virtualChapter) -{ - if (virtualChapter == zone->newestVirtualChapter) { - return LOC_IN_OPEN_CHAPTER; - } - return (isZoneChapterSparse(zone, virtualChapter) - ? 
LOC_IN_SPARSE : LOC_IN_DENSE); -} - -/**********************************************************************/ -int getRecordFromZone(IndexZone *zone, - Request *request, - bool *found, - uint64_t virtualChapter) -{ - if (virtualChapter == zone->newestVirtualChapter) { - searchOpenChapter(zone->openChapter, &request->chunkName, - &request->oldMetadata, found); - return UDS_SUCCESS; - } - - if ((zone->newestVirtualChapter > 0) - && (virtualChapter == (zone->newestVirtualChapter - 1)) - && (zone->writingChapter->size > 0)) { - // Only search the writing chapter if it is full, else look on disk. - searchOpenChapter(zone->writingChapter, &request->chunkName, - &request->oldMetadata, found); - return UDS_SUCCESS; - } - - // The slow lane thread has determined the location previously. We don't need - // to search again. Just return the location. - if (request->slLocationKnown) { - *found = request->slLocation != LOC_UNAVAILABLE; - return UDS_SUCCESS; - } - - Volume *volume = zone->index->volume; - if (isZoneChapterSparse(zone, virtualChapter) - && sparseCacheContains(volume->sparseCache, virtualChapter, - request->zoneNumber)) { - // The named chunk, if it exists, is in a sparse chapter that is cached, - // so just run the chunk through the sparse chapter cache search. - return searchSparseCacheInZone(zone, request, virtualChapter, found); - } - - return searchVolumePageCache(volume, request, &request->chunkName, - virtualChapter, &request->oldMetadata, found); -} - -/**********************************************************************/ -int putRecordInZone(IndexZone *zone, - Request *request, - const UdsChunkData *metadata) -{ - unsigned int remaining; - int result = putOpenChapter(zone->openChapter, &request->chunkName, metadata, - &remaining); - if (result != UDS_SUCCESS) { - return result; - } - - if (remaining == 0) { - return openNextChapter(zone, request); - } - - return UDS_SUCCESS; -} - -/**************************************************************************/ -int searchSparseCacheInZone(IndexZone *zone, - Request *request, - uint64_t virtualChapter, - bool *found) -{ - int recordPageNumber; - int result = searchSparseCache(zone, &request->chunkName, &virtualChapter, - &recordPageNumber); - if ((result != UDS_SUCCESS) || (virtualChapter == UINT64_MAX)) { - return result; - } - - Volume *volume = zone->index->volume; - // XXX map to physical chapter and validate. It would be nice to just pass - // the virtual in to the slow lane, since it's tracking invalidations. - unsigned int chapter - = mapToPhysicalChapter(volume->geometry, virtualChapter); - - return searchCachedRecordPage(volume, request, &request->chunkName, chapter, - recordPageNumber, &request->oldMetadata, - found); -} diff --git a/uds/indexZone.h b/uds/indexZone.h deleted file mode 100644 index 8301894..0000000 --- a/uds/indexZone.h +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
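A toy lookup-then-insert flow over the zone entry points above; the real request dispatch lives elsewhere in the index, and the wrapper name lookupThenStore is only illustrative:

/* Sketch only: `zone`, `request`, and `metadata` come from the caller. */
static int lookupThenStore(IndexZone *zone,
                           Request *request,
                           const UdsChunkData *metadata,
                           uint64_t virtualChapter)
{
  bool found = false;
  int result = getRecordFromZone(zone, request, &found, virtualChapter);
  if (result != UDS_SUCCESS) {
    return result;
  }
  if (found) {
    /* The record is already present in the open chapter or on the volume. */
    return UDS_SUCCESS;
  }
  /* Not present: add it to the open chapter, which may roll the chapter. */
  return putRecordInZone(zone, request, metadata);
}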
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- *
- * $Id: //eng/uds-releases/jasper/src/uds/indexZone.h#2 $
- */
-
-#ifndef INDEX_ZONE_H
-#define INDEX_ZONE_H
-
-#include "common.h"
-#include "openChapterZone.h"
-#include "request.h"
-
-typedef struct {
- struct index *index;
- OpenChapterZone *openChapter;
- OpenChapterZone *writingChapter;
- uint64_t oldestVirtualChapter;
- uint64_t newestVirtualChapter;
- unsigned int id;
-} IndexZone;
-
-/**
- * Allocate an index zone.
- *
- * @param index The index receiving the zone
- * @param zoneNumber The number of the zone to allocate
- *
- * @return UDS_SUCCESS or an error code.
- **/
-int makeIndexZone(struct index *index, unsigned int zoneNumber)
- __attribute__((warn_unused_result));
-
-/**
- * Clean up an index zone.
- *
- * @param zone The index zone to free
- **/
-void freeIndexZone(IndexZone *zone);
-
-/**
- * Check whether a chapter is sparse or dense based on the current state of
- * the index zone.
- *
- * @param zone The index zone to check against
- * @param virtualChapter The virtual chapter number of the chapter to check
- *
- * @return true if the chapter is in the sparse part of the volume
- **/
-bool isZoneChapterSparse(const IndexZone *zone,
- uint64_t virtualChapter)
- __attribute__((warn_unused_result));
-
-/**
- * Set the active chapter numbers for a zone based on its index. The active
- * chapters consist of the range of chapters from the current oldest to
- * the current newest virtual chapter.
- *
- * @param zone The zone to set
- **/
-void setActiveChapters(IndexZone *zone);
-
-/**
- * Dispatch a control request to an index zone.
- *
- * @param request The request to dispatch
- *
- * @return UDS_SUCCESS or an error code
- **/
-int dispatchIndexZoneControlRequest(Request *request)
- __attribute__((warn_unused_result));
-
-/**
- * Execute a sparse chapter index cache barrier control request on the zone
- * worker thread. This calls into the sparse cache to coordinate the cache
- * update with the other zones.
- *
- * @param zone The index zone receiving the barrier message
- * @param barrier The barrier control message data
- *
- * @return UDS_SUCCESS or an error code if the chapter index could not be
- * read or decoded
- **/
-int executeSparseCacheBarrierMessage(IndexZone *zone,
- BarrierMessageData *barrier)
- __attribute__((warn_unused_result));
-
-/**
- * Open the next chapter.
- *
- * @param zone The zone containing the open chapter
- * @param request The request which requires the next chapter to be
- * opened
- *
- * @return UDS_SUCCESS if successful.
- **/
-int openNextChapter(IndexZone *zone, Request *request)
- __attribute__((warn_unused_result));
-
-/**
- * Determine the IndexRegion in which a block was found.
- *
- * @param zone The zone that was searched
- * @param virtualChapter The virtual chapter number
- *
- * @return the IndexRegion of the chapter in which the block was found
- **/
-IndexRegion computeIndexRegion(const IndexZone *zone,
- uint64_t virtualChapter);
-
-/**
- * Get a record from either the volume or the open chapter in a zone.
- *
- * @param zone The index zone to query
- * @param request The request originating the query
- * @param found A pointer to a bool which will be set to
- * true if the record was found.
- * @param virtualChapter The chapter in which to search - * - * @return UDS_SUCCESS or an error code - **/ -int getRecordFromZone(IndexZone *zone, - Request *request, - bool *found, - uint64_t virtualChapter) - __attribute__((warn_unused_result)); - -/** - * Put a record in the open chapter. If this fills the chapter, the chapter - * will be closed and a new one will be opened. - * - * @param zone The index zone containing the chapter - * @param request The request containing the name of the record - * @param metadata The record metadata - * - * @return UDS_SUCCESS or an error - **/ -int putRecordInZone(IndexZone *zone, - Request *request, - const UdsChunkData *metadata) - __attribute__((warn_unused_result)); - -/** - * Search the cached sparse chapter index, either for a cached sparse hook, or - * as the last chance for finding the record named by a request. - * - * @param [in] zone the index zone - * @param [in] request the request originating the search - * @param [in] virtualChapter if UINT64_MAX, search the entire cache; - * otherwise search this chapter, if cached - * @param [out] found A pointer to a bool which will be set to - * true if the record was found - * - * @return UDS_SUCCESS or an error code - **/ -int searchSparseCacheInZone(IndexZone *zone, - Request *request, - uint64_t virtualChapter, - bool *found) - __attribute__((warn_unused_result)); - -#endif /* INDEX_ZONE_H */ diff --git a/uds/ioFactory.h b/uds/ioFactory.h deleted file mode 100644 index ef6cc90..0000000 --- a/uds/ioFactory.h +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/ioFactory.h#7 $ - */ - -#ifndef IO_FACTORY_H -#define IO_FACTORY_H - -#include "bufferedReader.h" -#include "bufferedWriter.h" -#ifdef __KERNEL__ -#include -#else -#include "fileUtils.h" -#include "ioRegion.h" -#endif - -/* - * An IOFactory object is responsible for controlling access to index storage. - * The index is a contiguous range of blocks on a block device or within a - * file. - * - * The IOFactory holds the open device or file and is responsible for closing - * it. The IOFactory has methods to make IORegions that are used to access - * sections of the index. - */ -typedef struct ioFactory IOFactory; - -/* - * Define the UDS block size as 4K. Historically, we wrote the volume file in - * large blocks, but wrote all the other index data into byte streams stored in - * files. When we converted to writing an index into a block device, we - * changed to writing the byte streams into page sized blocks. Now that we - * support multiple architectures, we write into 4K blocks on all platforms. - * - * XXX We must convert all the rogue 4K constants to use UDS_BLOCK_SIZE. 
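Putting the pieces of this header together, a typical kernel-mode caller creates a factory, opens a reader on a 4K-aligned region, and drops its reference when done. The sketch below is illustrative only; sketchReadIndexHeader() is not part of the source, and it assumes the usual freeBufferedReader() from bufferedReader.h:

/*
 * Sketch of the IOFactory lifetime described above: makeIOFactory() returns
 * the factory with one reference, and putIOFactory() releases it.  The offset
 * and size here are examples; the kernel implementation requires the size to
 * be a multiple of UDS_BLOCK_SIZE.
 */
static int sketchReadIndexHeader(const char *path)
{
  IOFactory *factory;
  int result = makeIOFactory(path, &factory);
  if (result != UDS_SUCCESS) {
    return result;
  }

  BufferedReader *reader;
  result = openBufferedReader(factory, 0, UDS_BLOCK_SIZE, &reader);
  if (result == UDS_SUCCESS) {
    /* ... readFromBufferedReader(reader, data, length) ... */
    freeBufferedReader(reader);
  }

  putIOFactory(factory);
  return result;
}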
- */ -enum { UDS_BLOCK_SIZE = 4096 }; - -#ifdef __KERNEL__ -/** - * Create an IOFactory. The IOFactory is returned with a reference count of 1. - * - * @param path The path to the block device or file that contains the - * block stream - * @param factoryPtr The IOFactory is returned here - * - * @return UDS_SUCCESS or an error code - **/ -int makeIOFactory(const char *path, IOFactory **factoryPtr) - __attribute__((warn_unused_result)); -#else -/** - * Create an IOFactory. The IOFactory is returned with a reference count of 1. - * - * @param path The path to the block device or file that contains the - * block stream - * @param access The requested access kind. - * @param factoryPtr The IOFactory is returned here - * - * @return UDS_SUCCESS or an error code - **/ -int makeIOFactory(const char *path, - FileAccess access, - IOFactory **factoryPtr) - __attribute__((warn_unused_result)); -#endif - -/** - * Get another reference to an IOFactory, incrementing its reference count. - * - * @param factory The IOFactory - **/ -void getIOFactory(IOFactory *factory); - -/** - * Free a reference to an IOFactory. If the reference count drops to zero, - * free the IOFactory and release all its resources. - * - * @param factory The IOFactory - **/ -void putIOFactory(IOFactory *factory); - -/** - * Get the maximum potential size of the device or file. For a device, this is - * the actual size of the device. For a file, this is the largest file that we - * can possibly write. - * - * @param factory The IOFactory - * - * @return the writable size (in bytes) - **/ -size_t getWritableSize(IOFactory *factory) __attribute__((warn_unused_result)); - -#ifdef __KERNEL__ -/** - * Create a struct dm_bufio_client for a region of the index. - * - * @param factory The IOFactory - * @param offset The byte offset to the region within the index - * @param size The size of a block, in bytes - * @param reservedBuffers The number of buffers that can be reserved - * @param clientPtr The struct dm_bufio_client is returned here - * - * @return UDS_SUCCESS or an error code - **/ -int makeBufio(IOFactory *factory, - off_t offset, - size_t blockSize, - unsigned int reservedBuffers, - struct dm_bufio_client **clientPtr) - __attribute__((warn_unused_result)); -#else -/** - * Create an IORegion for a region of the index. - * - * @param factory The IOFactory - * @param offset The byte offset to the region within the index - * @param size The size in bytes of the region - * @param regionPtr The IORegion is returned here - * - * @return UDS_SUCCESS or an error code - **/ -int makeIORegion(IOFactory *factory, - off_t offset, - size_t size, - IORegion **regionPtr) - __attribute__((warn_unused_result)); -#endif - -/** - * Create a BufferedReader for a region of the index. - * - * @param factory The IOFactory - * @param offset The byte offset to the region within the index - * @param size The size in bytes of the region - * @param regionPtr The IORegion is returned here - * - * @return UDS_SUCCESS or an error code - **/ -int openBufferedReader(IOFactory *factory, - off_t offset, - size_t size, - BufferedReader **readerPtr) - __attribute__((warn_unused_result)); - -/** - * Create a BufferedWriter for a region of the index. 
- * - * @param factory The IOFactory - * @param offset The byte offset to the region within the index - * @param size The size in bytes of the region - * @param regionPtr The IORegion is returned here - * - * @return UDS_SUCCESS or an error code - **/ -int openBufferedWriter(IOFactory *factory, - off_t offset, - size_t size, - BufferedWriter **writerPtr) - __attribute__((warn_unused_result)); - -#endif // IO_FACTORY_H diff --git a/uds/ioFactoryLinuxKernel.c b/uds/ioFactoryLinuxKernel.c deleted file mode 100644 index 9e45920..0000000 --- a/uds/ioFactoryLinuxKernel.c +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/kernelLinux/uds/ioFactoryLinuxKernel.c#9 $ - */ - -#include -#include - -#include "atomicDefs.h" -#include "ioFactory.h" -#include "logger.h" -#include "memoryAlloc.h" - -enum { BLK_FMODE = FMODE_READ | FMODE_WRITE }; - -/* - * A kernel mode IOFactory object controls access to an index stored on a block - * device. - */ -struct ioFactory { - struct block_device *bdev; - atomic_t refCount; -}; - -/*****************************************************************************/ -void getIOFactory(IOFactory *factory) -{ - atomic_inc(&factory->refCount); -} - -/*****************************************************************************/ -int makeIOFactory(const char *path, IOFactory **factoryPtr) -{ - struct block_device *bdev; - dev_t device = name_to_dev_t(path); - if (device != 0) { - bdev = blkdev_get_by_dev(device, BLK_FMODE, NULL); - } else { - bdev = blkdev_get_by_path(path, BLK_FMODE, NULL); - } - if (IS_ERR(bdev)) { - logErrorWithStringError(-PTR_ERR(bdev), "%s is not a block device", path); - return UDS_INVALID_ARGUMENT; - } - - IOFactory *factory; - int result = ALLOCATE(1, IOFactory, __func__, &factory); - if (result != UDS_SUCCESS) { - blkdev_put(bdev, BLK_FMODE); - return result; - } - - factory->bdev = bdev; - atomic_set_release(&factory->refCount, 1); - - *factoryPtr = factory; - return UDS_SUCCESS; -} - -/*****************************************************************************/ -void putIOFactory(IOFactory *factory) -{ - if (atomic_add_return(-1, &factory->refCount) <= 0) { - blkdev_put(factory->bdev, BLK_FMODE); - FREE(factory); - } -} - -/*****************************************************************************/ -size_t getWritableSize(IOFactory *factory) -{ - return i_size_read(factory->bdev->bd_inode); -} - -/*****************************************************************************/ -int makeBufio(IOFactory *factory, - off_t offset, - size_t blockSize, - unsigned int reservedBuffers, - struct dm_bufio_client **clientPtr) -{ - if (offset % SECTOR_SIZE != 0) { - return logErrorWithStringError(UDS_INCORRECT_ALIGNMENT, - "offset %zd not multiple of %d", - offset, 
SECTOR_SIZE); - } - if (blockSize % UDS_BLOCK_SIZE != 0) { - return logErrorWithStringError(UDS_INCORRECT_ALIGNMENT, - "blockSize %zd not multiple of %d", - blockSize, UDS_BLOCK_SIZE); - } - - struct dm_bufio_client *client = dm_bufio_client_create(factory->bdev, - blockSize, - reservedBuffers, 0, - NULL, NULL); - if (IS_ERR(client)) { - return -PTR_ERR(client); - } - - dm_bufio_set_sector_offset(client, offset >> SECTOR_SHIFT); - *clientPtr = client; - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int openBufferedReader(IOFactory *factory, - off_t offset, - size_t size, - BufferedReader **readerPtr) -{ - if (size % UDS_BLOCK_SIZE != 0) { - return logErrorWithStringError(UDS_INCORRECT_ALIGNMENT, - "region size %zd is not multiple of %d", - size, UDS_BLOCK_SIZE); - } - - struct dm_bufio_client *client = NULL; - int result = makeBufio(factory, offset, UDS_BLOCK_SIZE, 1, &client); - if (result != UDS_SUCCESS) { - return result; - } - - result = makeBufferedReader(factory, client, size / UDS_BLOCK_SIZE, - readerPtr); - if (result != UDS_SUCCESS) { - dm_bufio_client_destroy(client); - } - return result; -} - -/*****************************************************************************/ -int openBufferedWriter(IOFactory *factory, - off_t offset, - size_t size, - BufferedWriter **writerPtr) -{ - if (size % UDS_BLOCK_SIZE != 0) { - return logErrorWithStringError(UDS_INCORRECT_ALIGNMENT, - "region size %zd is not multiple of %d", - size, UDS_BLOCK_SIZE); - } - - struct dm_bufio_client *client = NULL; - int result = makeBufio(factory, offset, UDS_BLOCK_SIZE, 1, &client); - if (result != UDS_SUCCESS) { - return result; - } - - result = makeBufferedWriter(factory, client, size / UDS_BLOCK_SIZE, - writerPtr); - if (result != UDS_SUCCESS) { - dm_bufio_client_destroy(client); - } - return result; -} diff --git a/uds/layoutRegion.h b/uds/layoutRegion.h deleted file mode 100644 index b49f979..0000000 --- a/uds/layoutRegion.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/layoutRegion.h#1 $ - */ - -#ifndef LAYOUT_REGION_H -#define LAYOUT_REGION_H - -/** - * Single file layouts are defined in terms of data regions. Each data region - * is a sub-section of the available space. Some data regions may contain - * subsidiary data regions, for example, a checkpoint or index save will - * contain master index regions (according to the number of zones), an - * index page map region, and possibly an open chapter region. - **/ - -static const uint64_t REGION_MAGIC = 0x416c6252676e3031; // 'AlbRgn01' - -typedef struct regionHeader { - uint64_t magic; // REGION_MAGIC - uint64_t regionBlocks; // size of whole region - uint16_t type; // RH_TYPE_... 
- uint16_t version; // 1 - uint16_t numRegions; // number of layouts in the table - uint16_t payload; // extra data beyond region table -} RegionHeader; - -typedef struct layoutRegion { - uint64_t startBlock; - uint64_t numBlocks; - uint32_t checksum; // only used for save regions - uint16_t kind; - uint16_t instance; -} LayoutRegion; - -typedef struct regionTable { - RegionHeader header; - LayoutRegion regions[]; -} RegionTable; - -#endif // LAYOUT_REGION_H diff --git a/uds/loadType.c b/uds/loadType.c deleted file mode 100644 index 125f8b0..0000000 --- a/uds/loadType.c +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/loadType.c#1 $ - */ - -#include "loadType.h" - -#include "logger.h" - -/**********************************************************************/ -const char *getLoadType(LoadType loadType) -{ - switch (loadType) { - case LOAD_CREATE: - return "creating index"; - case LOAD_LOAD: - return "loading index"; - case LOAD_REBUILD: - return "loading or rebuilding index"; - default: - return "no load method specified"; - } -} diff --git a/uds/loadType.h b/uds/loadType.h deleted file mode 100644 index 2b93e72..0000000 --- a/uds/loadType.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/loadType.h#1 $ - */ - -#ifndef LOAD_TYPE_H -#define LOAD_TYPE_H - -/** - * Methods of starting the index. (Keep getLoadType() in sync.) - * - * Usage number 1 is to note the interface method that initiates loading the - * index. As in this table: - * - * name type opened by - * =========== ====== ==================== - * LOAD_CREATE local udsCreateLocalIndex - * LOAD_LOAD local udsLoadLocalIndex - * LOAD_REBUILD local udsRebuildLocalIndex - * - * Usage number 2 is to record how an index was really opened. 
As in this - * table: - * - * LOAD_CREATE new empty index - * LOAD_LOAD loaded saved index - * LOAD_REPLAY loaded checkpoint and replayed new chapters - * LOAD_EMPTY empty master index from empty volume data - * LOAD_REBUILD rebuilt master index from volume data - **/ -typedef enum { - LOAD_UNDEFINED = 0, - LOAD_CREATE, - LOAD_LOAD, - LOAD_REBUILD, - LOAD_EMPTY, - LOAD_REPLAY, -} LoadType; - -/** - * get a string indicating how an index is to be loaded. - * - * @param loadType The load type to log - **/ -const char *getLoadType(LoadType loadType); - -#endif /* LOAD_TYPE_H */ diff --git a/uds/logger.c b/uds/logger.c deleted file mode 100644 index 311bae1..0000000 --- a/uds/logger.c +++ /dev/null @@ -1,322 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/logger.c#3 $ - */ - -#include "logger.h" - -#include "common.h" -#include "errors.h" -#include "stringUtils.h" -#include "threads.h" -#include "uds.h" - -typedef struct { - const char *name; - const int priority; -} PriorityName; - -static const PriorityName PRIORITIES[] = { - { "ALERT", LOG_ALERT }, - { "CRITICAL", LOG_CRIT }, - { "CRIT", LOG_CRIT }, - { "DEBUG", LOG_DEBUG }, - { "EMERGENCY", LOG_EMERG }, - { "EMERG", LOG_EMERG }, - { "ERROR", LOG_ERR }, - { "ERR", LOG_ERR }, - { "INFO", LOG_INFO }, - { "NOTICE", LOG_NOTICE }, - { "PANIC", LOG_EMERG }, - { "WARN", LOG_WARNING }, - { "WARNING", LOG_WARNING }, - { NULL, -1 }, -}; - -static const char *const PRIORITY_STRINGS[] = { - "EMERGENCY", - "ALERT", - "CRITICAL", - "ERROR", - "WARN", - "NOTICE", - "INFO", - "DEBUG", -}; - -static int logLevel = LOG_INFO; - -/*****************************************************************************/ -int getLogLevel(void) -{ - return logLevel; -} - -/*****************************************************************************/ -void setLogLevel(int newLogLevel) -{ - logLevel = newLogLevel; -} - -/*****************************************************************************/ -int stringToPriority(const char *string) -{ - int i; - for (i = 0; PRIORITIES[i].name != NULL; i++) { - if (strcasecmp(string, PRIORITIES[i].name) == 0) { - return PRIORITIES[i].priority; - } - } - return LOG_INFO; -} - -/*****************************************************************************/ -const char *priorityToString(int priority) -{ - if ((priority < 0) || (priority >= (int) COUNT_OF(PRIORITY_STRINGS))) { - return "unknown"; - } - return PRIORITY_STRINGS[priority]; -} - -/*****************************************************************************/ -void logEmbeddedMessage(int priority, - const char *prefix, - const char *fmt1, - va_list args1, - const char *fmt2, - ...) 
-{ - va_list ap; - va_start(ap, fmt2); - logMessagePack(priority, prefix, fmt1, args1, fmt2, ap); - va_end(ap); -} - -#pragma GCC diagnostic push -/* - * GCC (version 8.1.1 20180502 (Red Hat 8.1.1-1)) on Fedora 28 seems - * to think that this function should get a printf format - * attribute. But we have no second format string, and no additional - * arguments at the call site, and GCC also gets unhappy trying to - * analyze the format and values when there are none. So we'll just - * shut it up. - */ -#pragma GCC diagnostic ignored "-Wsuggest-attribute=format" -/** - * Log a message. - * - * This helper function exists solely to create a valid va_list with - * no useful info. It does the real work of vLogMessage, which wants a - * second va_list object to pass down. - * - * @param priority The syslog priority value for the message. - * @param format The format of the message (a printf style format) - * @param args The variadic argument list of format parameters. - **/ -static void vLogMessageHelper(int priority, - const char *format, - va_list args, - ...) -{ - va_list dummy; - va_start(dummy, args); - logMessagePack(priority, NULL, format, args, NULL, dummy); - va_end(dummy); -} -#pragma GCC diagnostic pop - -/*****************************************************************************/ -void vLogMessage(int priority, const char *format, va_list args) -{ - vLogMessageHelper(priority, format, args); -} - -/*****************************************************************************/ -void logMessage(int priority, const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogMessage(priority, format, args); - va_end(args); -} - -/*****************************************************************************/ -void logDebug(const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogMessage(LOG_DEBUG, format, args); - va_end(args); -} - -/*****************************************************************************/ -void logInfo(const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogMessage(LOG_INFO, format, args); - va_end(args); -} - -/*****************************************************************************/ -void logNotice(const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogMessage(LOG_NOTICE, format, args); - va_end(args); -} - -/*****************************************************************************/ -void logWarning(const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogMessage(LOG_WARNING, format, args); - va_end(args); -} - -/*****************************************************************************/ -void logError(const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogMessage(LOG_ERR, format, args); - va_end(args); -} - -/*****************************************************************************/ -int vLogWithStringError(int priority, - int errnum, - const char *format, - va_list args) -{ - char errbuf[ERRBUF_SIZE]; - logEmbeddedMessage(priority, NULL, format, args, ": %s (%d)", - stringError(errnum, errbuf, sizeof(errbuf)), - errnum); - return errnum; -} - -/*****************************************************************************/ -int logWithStringError(int priority, int errnum, const char *format, ...) 
-{ - va_list args; - - va_start(args, format); - vLogWithStringError(priority, errnum, format, args); - va_end(args); - return errnum; -} - -/*****************************************************************************/ -int logErrorWithStringError(int errnum, const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogWithStringError(LOG_ERR, errnum, format, args); - va_end(args); - return errnum; -} - -/*****************************************************************************/ -int logWarningWithStringError(int errnum, const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogWithStringError(LOG_WARNING, errnum, format, args); - va_end(args); - return errnum; -} - -/*****************************************************************************/ -int logDebugWithStringError(int errnum, const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogWithStringError(LOG_DEBUG, errnum, format, args); - va_end(args); - return errnum; -} - -/*****************************************************************************/ -int logInfoWithStringError(int errnum, const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogWithStringError(LOG_INFO, errnum, format, args); - va_end(args); - return errnum; -} - -/*****************************************************************************/ -int logNoticeWithStringError(int errnum, const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogWithStringError(LOG_NOTICE, errnum, format, args); - va_end(args); - return errnum; -} - -/*****************************************************************************/ -int logFatalWithStringError(int errnum, const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogWithStringError(LOG_CRIT, errnum, format, args); - va_end(args); - return errnum; -} - -/*****************************************************************************/ -int logUnrecoverable(int errnum, const char *format, ...) -{ - if (isSuccessful(errnum)) { - return errnum; - } - va_list args; - va_start(args, format); - vLogWithStringError(LOG_CRIT, errnum, format, args); - va_end(args); - return makeUnrecoverable(errnum); -} - -/*****************************************************************************/ -void logFatal(const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogMessage(LOG_CRIT, format, args); - va_end(args); -} diff --git a/uds/logger.h b/uds/logger.h deleted file mode 100644 index b1f9d56..0000000 --- a/uds/logger.h +++ /dev/null @@ -1,315 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
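One detail worth calling out: every logXxxWithStringError() helper above returns the errnum it was given, which is what lets callers throughout this code log and propagate a failure in a single return statement. A small illustrative sketch (sketchOpenRegion() and its message are hypothetical):

static int sketchOpenRegion(IOFactory *factory, off_t offset, size_t size,
                            BufferedReader **readerPtr)
{
  int result = openBufferedReader(factory, offset, size, readerPtr);
  if (result != UDS_SUCCESS) {
    /*
     * Logs "cannot open index region: <error text> (<code>)" at ERROR
     * priority and hands the same code straight back to the caller.
     */
    return logErrorWithStringError(result, "cannot open index region");
  }
  return UDS_SUCCESS;
}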
- * - * $Id: //eng/uds-releases/jasper/src/uds/logger.h#5 $ - */ - -#ifndef LOGGER_H -#define LOGGER_H 1 - -#ifdef __KERNEL__ -#include -#include -#else -#include -#include "minisyslog.h" -#endif - -#ifdef __KERNEL__ -#define LOG_EMERG 0 /* system is unusable */ -#define LOG_ALERT 1 /* action must be taken immediately */ -#define LOG_CRIT 2 /* critical conditions */ -#define LOG_ERR 3 /* error conditions */ -#define LOG_WARNING 4 /* warning conditions */ -#define LOG_NOTICE 5 /* normal but significant condition */ -#define LOG_INFO 6 /* informational */ -#define LOG_DEBUG 7 /* debug-level messages */ -#endif - -#ifdef __KERNEL__ -// Make it easy to log real pointer values using %px when in development. -#ifdef LOG_INTERNAL -#define PRIptr "px" -#else -#define PRIptr "pK" -#endif -#else // not __KERNEL__ -// For compatibility with hooks we need when compiling in kernel mode. -#define PRIptr "p" -#endif - -/* - * Apply a rate limiter to a log method call. - * - * @param logFunc A method that does logging, which is not invoked if we are - * running in the kernel and the ratelimiter detects that we - * are calling it frequently. - */ -#ifdef __KERNEL__ -#define logRatelimit(logFunc, ...) \ - do { \ - static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, \ - DEFAULT_RATELIMIT_BURST); \ - if (__ratelimit(&_rs)) { \ - logFunc(__VA_ARGS__); \ - } \ - } while (0) -#else -#define logRatelimit(logFunc, ...) logFunc(__VA_ARGS__) -#endif - -/** - * @file - * - * All of the log() functions will preserve the callers value of errno. - **/ - -#ifndef __KERNEL__ -/* - * In user mode, the functions in this file are not thread safe in the sense - * that nothing prevents multiple threads from closing loggers out from under - * other threads. In reality this isn't a problem since there are no calls to - * closeLogger() in production code. - */ - -/** - * Start the logger. - **/ -void openLogger(void); - -/** - * Stop the logger. - **/ -void closeLogger(void); -#endif - -/** - * Get the current logging level. - * - * @return the current logging priority level. - **/ -int getLogLevel(void); - -/** - * Set the current logging level. - * - * @param newLogLevel the new value for the logging priority level. - **/ -void setLogLevel(int newLogLevel); - -/** - * Return the integer logging priority represented by a name. - * - * @param string the name of the logging priority (case insensitive). - * - * @return the integer priority named by string, or LOG_INFO if not recognized. - **/ -int stringToPriority(const char *string); - -/** - * Return the printable name of a logging priority. - * - * @return the priority name - **/ -const char *priorityToString(int priority); - -/** - * Log a debug message. - * - * @param format The format of the message (a printf style format) - **/ -void logDebug(const char *format, ...) __attribute__((format(printf, 1, 2))); - -/** - * Log an informational message. - * - * @param format The format of the message (a printf style format) - **/ -void logInfo(const char *format, ...) __attribute__((format(printf, 1, 2))); - -/** - * Log a normal (but notable) condition. - * - * @param format The format of the message (a printf style format) - **/ -void logNotice(const char *format, ...) __attribute__((format(printf, 1, 2))); - -/** - * Log a warning. - * - * @param format The format of the message (a printf style format) - **/ -void logWarning(const char *format, ...) __attribute__((format(printf, 1, 2))); - -/** - * Log an error. 
- * - * @param format The format of the message (a printf style format) - **/ -void logError(const char *format, ...) __attribute__((format(printf, 1, 2))); - -/** - * Log a message embedded within another message. - * - * @param priority the priority at which to log the message - * @param prefix optional string prefix to message, may be NULL - * @param fmt1 format of message first part, may be NULL - * @param args1 arguments for message first part - * @param fmt2 format of message second part - **/ -void logEmbeddedMessage(int priority, - const char *prefix, - const char *fmt1, - va_list args1, - const char *fmt2, - ...) - __attribute__((format(printf, 3, 0), format(printf, 5, 6))); - -/** - * Log a message pack consisting of multiple variable sections. - * - * @param priority the priority at which to log the message - * @param prefix optional string prefix to message, may be NULL - * @param fmt1 format of message first part, may be NULL - * @param args1 arguments for message first part - * @param fmt2 format of message second part, may be NULL - * @param args2 arguments for message second part - **/ -void logMessagePack(int priority, - const char *prefix, - const char *fmt1, - va_list args1, - const char *fmt2, - va_list args2) - __attribute__((format(printf, 3, 0))); - -/** - * Log a stack backtrace. - * - * @param priority The priority at which to log the backtrace - **/ -void logBacktrace(int priority); - -/** - * Log a message with an error from an error code. - * - * @param priority The priority of the logging entry - * @param errnum Int value of errno or a UDS_* value. - * @param format The format of the message (a printf style format) - * - * @return errnum - **/ -int logWithStringError(int priority, int errnum, const char *format, ...) - __attribute__((format(printf, 3, 4))); - -/** - * Log a message with an error from an error code. - * - * @param priority The priority of the logging entry - * @param errnum Int value of errno or a UDS_* value. - * @param format The format of the message (a printf style format) - * @param args The list of arguments with format. - * - * @return errnum - **/ -int vLogWithStringError(int priority, - int errnum, - const char *format, - va_list args) - __attribute__((format(printf, 3, 0))); - -/** - * Log an error prefixed with the string associated with the errnum. - * - * @param errnum Int value of errno or a UDS_* value. - * @param format The format of the message (a printf style format) - * - * @return errnum - **/ -int logErrorWithStringError(int errnum, const char *format, ...) - __attribute__((format(printf, 2, 3))); - -/**********************************************************************/ -int logDebugWithStringError(int errnum, const char *format, ...) - __attribute__((format(printf, 2, 3))); - -/**********************************************************************/ -int logInfoWithStringError(int errnum, const char *format, ...) - __attribute__((format(printf, 2, 3))); - -/**********************************************************************/ -int logNoticeWithStringError(int errnum, const char *format, ...) - __attribute__((format(printf, 2, 3))); - -/**********************************************************************/ -int logWarningWithStringError(int errnum, const char *format, ...) - __attribute__((format(printf, 2, 3))); - -/**********************************************************************/ -int logFatalWithStringError(int errnum, const char *format, ...) 
- __attribute__((format(printf, 2, 3))); - -/** - * IF the result is an error, log a FATAL level message and return the result - * after marking it unrecoverable. The UDS_SUCCESS and UDS_QUEUED results are - * not considered errors and are returned unmodified. - * - * @param errnum int value of errno or a UDS_* value. - * @param format The format of the message (a printf style format) - * - * @return makeUnrecoverable(errnum) or UDS_SUCCESS or UDS_QUEUED - **/ -int logUnrecoverable(int errnum, const char *format, ...) - __attribute__((format(printf, 2, 3))); - -/** - * Log a fatal error. - * - * @param format The format of the message (a printf style format) - **/ -void logFatal(const char *format, ...) __attribute__((format(printf, 1, 2))); - -/** - * Log a message -- for internal use only. - * - * @param priority The syslog priority value for the message. - * @param format The format of the message (a printf style format) - * @param args The variadic argument list of format parameters. - **/ -void vLogMessage(int priority, const char *format, va_list args) - __attribute__((format(printf, 2, 0))); - -/** - * Log a message - * - * @param priority The syslog priority value for the message. - * @param format The format of the message (a printf style format) - **/ -void logMessage(int priority, const char *format, ...) - __attribute__((format(printf, 2, 3))); - -/** - * Sleep or delay a short time (likely a few milliseconds) in an attempt allow - * the log buffers to be written out in case they might be overrun. This is - * unnecessary in user-space (and is a no-op there), but is needed when - * quickly issuing a lot of log output in the Linux kernel, as when dumping a - * large number of data structures. - **/ -void pauseForLogger(void); - -#endif /* LOGGER_H */ diff --git a/uds/loggerLinuxKernel.c b/uds/loggerLinuxKernel.c deleted file mode 100644 index bb1ad0b..0000000 --- a/uds/loggerLinuxKernel.c +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
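The logRatelimit() macro declared in logger.h above exists for hot paths that might otherwise flood the kernel log; in user mode it degenerates to a plain call. A hedged usage sketch (the condition and message are illustrative only):

/* Report a frequent condition without flooding the log in kernel builds. */
static void sketchReportCacheMiss(uint64_t virtualChapter)
{
  logRatelimit(logWarning, "sparse cache miss for virtual chapter %llu",
               (unsigned long long) virtualChapter);
}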
- * - * $Id: //eng/uds-releases/jasper/kernelLinux/uds/loggerLinuxKernel.c#2 $ - */ - -#include -#include -#include -#include - -#include "logger.h" - -/**********************************************************************/ -static const char *priorityToLogLevel(int priority) -{ - switch (priority) { - case LOG_EMERG: - case LOG_ALERT: - case LOG_CRIT: - return KERN_CRIT; - case LOG_ERR: - return KERN_ERR; - case LOG_WARNING: - return KERN_WARNING; - case LOG_NOTICE: - return KERN_NOTICE; - case LOG_INFO: - return KERN_INFO; - case LOG_DEBUG: - return KERN_DEBUG; - default: - return ""; - } -} - -/**********************************************************************/ -static const char *getCurrentInterruptType(void) -{ - if (in_nmi()) { - return "NMI"; - } - if (in_irq()) { - return "HI"; - } - if (in_softirq()) { - return "SI"; - } - return "INTR"; -} - -/**********************************************************************/ -void logMessagePack(int priority, - const char *prefix, - const char *fmt1, - va_list args1, - const char *fmt2, - va_list args2) -{ - if (priority > getLogLevel()) { - return; - } - - /* - * The kernel's printk has some magic for indirection to a secondary - * va_list. It wants us to supply a pointer to the va_list. - * - * However, va_list varies across platforms and can be an array - * type, which makes passing it around as an argument kind of - * tricky, due to the automatic conversion to a pointer. This makes - * taking the address of the argument a dicey thing; if we use "&a" - * it works fine for non-array types, but for array types we get the - * address of a pointer. Functions like va_copy and sprintf don't - * care as they get "va_list" values passed and are written to do - * the right thing, but printk explicitly wants the address of the - * va_list. - * - * So, we copy the va_list values to ensure that "&" consistently - * works the way we want. - */ - va_list args1Copy; - va_copy(args1Copy, args1); - va_list args2Copy; - va_copy(args2Copy, args2); - struct va_format vaf1 = { - .fmt = (fmt1 != NULL) ? fmt1 : "", - .va = &args1Copy, - }; - struct va_format vaf2 = { - .fmt = (fmt2 != NULL) ? fmt2 : "", - .va = &args2Copy, - }; - - if (prefix == NULL) { - prefix = ""; - } - - /* - * Context info formats: - * - * interrupt: uds[NMI]: blah - * process: uds: myprog: blah - * - * Fields: module name, interrupt level or process name. - * - * XXX need the equivalent of VDO's deviceInstance here - */ - if (in_interrupt()) { - printk("%s%s[%s]: %s%pV%pV\n", priorityToLogLevel(priority), - THIS_MODULE->name, getCurrentInterruptType(), prefix, &vaf1, &vaf2); - } else { - printk("%s%s: %s: %s%pV%pV\n", priorityToLogLevel(priority), - THIS_MODULE->name, current->comm, prefix, &vaf1, &vaf2); - } - - va_end(args1Copy); - va_end(args2Copy); -} - -/**********************************************************************/ -void logBacktrace(int priority) -{ - if (priority > getLogLevel()) { - return; - } - logMessage(priority, "[backtrace]"); - dump_stack(); -} - -/**********************************************************************/ -void pauseForLogger(void) -{ - // Hopefully, a few milliseconds of sleep will be large enough - // for the kernel log buffer to be flushed. - msleep(4); -} diff --git a/uds/masterIndex005.c b/uds/masterIndex005.c deleted file mode 100644 index 3f9a5b2..0000000 --- a/uds/masterIndex005.c +++ /dev/null @@ -1,1470 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
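The va_copy dance in logMessagePack() above is easier to see in isolation: printk's %pV extension expands a (format, va_list) pair passed through a struct va_format, and it needs the address of a va_list object it can safely dereference. A minimal sketch of the same pattern for a kernel translation unit (sketchForwardToPrintk() is illustrative, not part of the source):

#include <linux/kernel.h>

/*
 * Copy the caller's va_list into a local object so that "&copy" is
 * well-defined on every architecture, then let printk expand it via %pV.
 */
static void sketchForwardToPrintk(const char *fmt, va_list args)
{
  va_list copy;
  va_copy(copy, args);
  struct va_format vaf = {
    .fmt = fmt,
    .va  = &copy,
  };
  printk(KERN_INFO "uds: %pV\n", &vaf);
  va_end(copy);
}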
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- *
- * $Id: //eng/uds-releases/jasper/src/uds/masterIndex005.c#3 $
- */
-#include "masterIndex005.h"
-
-#include "buffer.h"
-#include "compiler.h"
-#include "errors.h"
-#include "hashUtils.h"
-#include "logger.h"
-#include "memoryAlloc.h"
-#include "uds.h"
-#include "zone.h"
-
-/*
- * The master index is kept as a delta index where the payload is a
- * chapter number. The master index adds 2 basic functions to the delta
- * index:
- *
- * (1) How to get the delta list number and address out of the chunk name.
- *
- * (2) Dealing with chapter numbers, and especially the lazy flushing of
- * chapters from the index.
- *
- * There are three ways of expressing chapter numbers: virtual, index, and
- * rolling. The interface to the master index uses virtual chapter
- * numbers, which are 64 bits long. We do not store such large values in
- * memory, so we internally use a binary value using the minimal number of
- * bits.
- *
- * The delta index stores the index chapter number, which is the low-order
- * bits of the virtual chapter number.
- *
- * When we need to deal with ordering of index chapter numbers, we roll the
- * index chapter number around so that the smallest one we are using has
- * the representation 0. See convertIndexToVirtual() or
- * flushInvalidEntries() for an example of this technique.
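A worked example of the three numbering schemes described in the comment above, using the same mask arithmetic as convertVirtualToIndex() and convertIndexToVirtual() below (the concrete values are invented for illustration):

/*
 * Sketch: with chapterBits == 10, chapterMask == 1023.  The index chapter is
 * just the low-order bits of the virtual chapter, and the "rolling" form
 * renumbers index chapters so the zone's oldest chapter compares as 0.
 */
static uint64_t sketchChapterNumbering(void)
{
  const unsigned int chapterMask       = (1 << 10) - 1; /* 1023            */
  const uint64_t     virtualChapterLow = 4500;          /* oldest in zone  */
  const uint64_t     virtualChapter    = 5000;          /* example chapter */

  unsigned int indexChapter = virtualChapter & chapterMask;               /* 904 */
  unsigned int rolling = (indexChapter - virtualChapterLow) & chapterMask; /* 500 */

  return virtualChapterLow + rolling;                   /* 5000, recovered */
}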
- */ - -typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) masterIndexZone { - uint64_t virtualChapterLow; // The lowest virtual chapter indexed - uint64_t virtualChapterHigh; // The highest virtual chapter indexed - long numEarlyFlushes; // The number of early flushes -} MasterIndexZone; - -typedef struct { - MasterIndex common; // Common master index methods - DeltaIndex deltaIndex; // The delta index - uint64_t *flushChapters; // The first chapter to be flushed - MasterIndexZone *masterZones; // The Zones - uint64_t volumeNonce; // The volume nonce - uint64_t chapterZoneBits; // Expected size of a chapter (per zone) - uint64_t maxZoneBits; // Maximum size index (per zone) - unsigned int addressBits; // Number of bits in address mask - unsigned int addressMask; // Mask to get address within delta list - unsigned int chapterBits; // Number of bits in chapter number - unsigned int chapterMask; // Largest storable chapter number - unsigned int numChapters; // Number of chapters used - unsigned int numDeltaLists; // The number of delta lists - unsigned int numZones; // The number of zones -} MasterIndex5; - -typedef struct chapterRange { - unsigned int chapterStart; // The first chapter - unsigned int chapterCount; // The number of chapters -} ChapterRange; - -// Constants for the magic byte of a MasterIndexRecord -static const byte masterIndexRecordMagic = 0xAA; -static const byte badMagic = 0; - -/* - * In production, the default value for minMasterIndexDeltaLists will be - * replaced by MAX_ZONES*MAX_ZONES. Some unit tests will replace - * minMasterIndexDeltaLists with the non-default value 1, because those - * tests really want to run with a single delta list. - */ -unsigned int minMasterIndexDeltaLists; - -/** - * Maximum of two unsigned ints - * - * @param a One unsigned int - * @param b Another unsigned int - * - * @return the bigger one - **/ -static INLINE unsigned int maxUint(unsigned int a, unsigned int b) -{ - return a > b ? a : b; -} - -/** - * Extract the address from a block name. - * - * @param mi5 The master index - * @param name The block name - * - * @return the address - **/ -static INLINE unsigned int extractAddress(const MasterIndex5 *mi5, - const UdsChunkName *name) -{ - return extractMasterIndexBytes(name) & mi5->addressMask; -} - -/** - * Extract the delta list number from a block name. - * - * @param mi5 The master index - * @param name The block name - * - * @return the delta list number - **/ -static INLINE unsigned int extractDListNum(const MasterIndex5 *mi5, - const UdsChunkName *name) -{ - uint64_t bits = extractMasterIndexBytes(name); - return (bits >> mi5->addressBits) % mi5->numDeltaLists; -} - -/** - * Get the master index zone containing a given master index record - * - * @param record The master index record - * - * @return the master index zone - **/ -static INLINE const MasterIndexZone *getMasterZone(const MasterIndexRecord *record) -{ - const MasterIndex5 *mi5 = container_of(record->masterIndex, MasterIndex5, - common); - return &mi5->masterZones[record->zoneNumber]; -} - -/** - * Convert an index chapter number to a virtual chapter number. 
- * - * @param record The master index record - * @param indexChapter The index chapter number - * - * @return the virtual chapter number - **/ -static INLINE uint64_t convertIndexToVirtual(const MasterIndexRecord *record, - unsigned int indexChapter) -{ - const MasterIndex5 *mi5 = container_of(record->masterIndex, MasterIndex5, - common); - const MasterIndexZone *masterZone = getMasterZone(record); - unsigned int rollingChapter - = ((indexChapter - masterZone->virtualChapterLow) & mi5->chapterMask); - return masterZone->virtualChapterLow + rollingChapter; -} - -/** - * Convert a virtual chapter number to an index chapter number. - * - * @param mi5 The master index - * @param virtualChapter The virtual chapter number - * - * @return the index chapter number - **/ -static INLINE unsigned int convertVirtualToIndex(const MasterIndex5 *mi5, - uint64_t virtualChapter) -{ - return virtualChapter & mi5->chapterMask; -} - -/** - * Determine whether a virtual chapter number is in the range being indexed - * - * @param record The master index record - * @param virtualChapter The virtual chapter number - * - * @return true if the virtual chapter number is being indexed - **/ -static INLINE bool isVirtualChapterIndexed(const MasterIndexRecord *record, - uint64_t virtualChapter) -{ - const MasterIndexZone *masterZone = getMasterZone(record); - return ((virtualChapter >= masterZone->virtualChapterLow) - && (virtualChapter <= masterZone->virtualChapterHigh)); -} - -/***********************************************************************/ -/** - * Flush an invalid entry from the master index, advancing to the next - * valid entry. - * - * @param record Updated to describe the next valid record - * @param flushRange Range of chapters to flush from the index - * @param nextChapterToInvalidate Updated to record the next chapter that we - * will need to invalidate - * - * @return UDS_SUCCESS or an error code - **/ -static INLINE int flushInvalidEntries(MasterIndexRecord *record, - ChapterRange *flushRange, - unsigned int *nextChapterToInvalidate) -{ - const MasterIndex5 *mi5 = container_of(record->masterIndex, MasterIndex5, - common); - int result = nextDeltaIndexEntry(&record->deltaEntry); - if (result != UDS_SUCCESS) { - return result; - } - while (!record->deltaEntry.atEnd) { - unsigned int indexChapter = getDeltaEntryValue(&record->deltaEntry); - unsigned int relativeChapter = ((indexChapter - flushRange->chapterStart) - & mi5->chapterMask); - if (likely(relativeChapter >= flushRange->chapterCount)) { - if (relativeChapter < *nextChapterToInvalidate) { - *nextChapterToInvalidate = relativeChapter; - } - break; - } - result = removeDeltaIndexEntry(&record->deltaEntry); - if (result != UDS_SUCCESS) { - return result; - } - } - return UDS_SUCCESS; -} - -/** - * Find the delta index entry, or the insertion point for a delta index - * entry, while processing chapter LRU flushing. 
- * - * @param record Updated to describe the entry being looked for - * @param listNumber The delta list number - * @param key The address field being looked for - * @param flushRange The range of chapters to flush from the index - * - * @return UDS_SUCCESS or an error code - **/ -static int getMasterIndexEntry(MasterIndexRecord *record, - unsigned int listNumber, - unsigned int key, - ChapterRange *flushRange) -{ - const MasterIndex5 *mi5 = container_of(record->masterIndex, MasterIndex5, - common); - unsigned int nextChapterToInvalidate = mi5->chapterMask; - - int result = startDeltaIndexSearch(&mi5->deltaIndex, listNumber, 0, - false, &record->deltaEntry); - if (result != UDS_SUCCESS) { - return result; - } - do { - result = flushInvalidEntries(record, flushRange, &nextChapterToInvalidate); - if (result != UDS_SUCCESS) { - return result; - } - } while (!record->deltaEntry.atEnd && (key > record->deltaEntry.key)); - - result = rememberDeltaIndexOffset(&record->deltaEntry); - if (result != UDS_SUCCESS) { - return result; - } - - // We probably found the record we want, but we need to keep going - MasterIndexRecord otherRecord = *record; - if (!otherRecord.deltaEntry.atEnd && (key == otherRecord.deltaEntry.key)) { - for (;;) { - result = flushInvalidEntries(&otherRecord, flushRange, - &nextChapterToInvalidate); - if (result != UDS_SUCCESS) { - return result; - } - if (otherRecord.deltaEntry.atEnd - || !otherRecord.deltaEntry.isCollision) { - break; - } - byte collisionName[UDS_CHUNK_NAME_SIZE]; - result = getDeltaEntryCollision(&otherRecord.deltaEntry, collisionName); - if (result != UDS_SUCCESS) { - return result; - } - if (memcmp(collisionName, record->name, UDS_CHUNK_NAME_SIZE) == 0) { - // This collision record is the one we are looking for - *record = otherRecord; - break; - } - } - } - while (!otherRecord.deltaEntry.atEnd) { - result = flushInvalidEntries(&otherRecord, flushRange, - &nextChapterToInvalidate); - if (result != UDS_SUCCESS) { - return result; - } - } - nextChapterToInvalidate += flushRange->chapterStart; - nextChapterToInvalidate &= mi5->chapterMask; - flushRange->chapterStart = nextChapterToInvalidate; - flushRange->chapterCount = 0; - return UDS_SUCCESS; -} - -/***********************************************************************/ -/** - * Terminate and clean up the master index - * - * @param masterIndex The master index to terminate - **/ -static void freeMasterIndex_005(MasterIndex *masterIndex) -{ - if (masterIndex != NULL) { - MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); - FREE(mi5->flushChapters); - mi5->flushChapters = NULL; - FREE(mi5->masterZones); - mi5->masterZones = NULL; - uninitializeDeltaIndex(&mi5->deltaIndex); - FREE(masterIndex); - } -} - -/** - * Constants and structures for the saved master index file. "MI5" is for - * masterIndex005, and "-XXXX" is a number to increment when the format of - * the data changes. - **/ -enum { MAGIC_SIZE = 8 }; -static const char MAGIC_MI_START[] = "MI5-0005"; - -struct mi005_data { - char magic[MAGIC_SIZE]; // MAGIC_MI_START - uint64_t volumeNonce; - uint64_t virtualChapterLow; - uint64_t virtualChapterHigh; - unsigned int firstList; - unsigned int numLists; -}; - -/***********************************************************************/ -/** - * Set the tag value used when saving and/or restoring a master index. 
- * - * @param masterIndex The master index - * @param tag The tag value - **/ -static void setMasterIndexTag_005(MasterIndex *masterIndex, byte tag) -{ - MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); - setDeltaIndexTag(&mi5->deltaIndex, tag); -} - -/***********************************************************************/ -__attribute__((warn_unused_result)) -static int encodeMasterIndexHeader(Buffer *buffer, struct mi005_data *header) -{ - int result = putBytes(buffer, MAGIC_SIZE, MAGIC_MI_START); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt64LEIntoBuffer(buffer, header->volumeNonce); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt64LEIntoBuffer(buffer, header->virtualChapterLow); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt64LEIntoBuffer(buffer, header->virtualChapterHigh); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt32LEIntoBuffer(buffer, header->firstList); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt32LEIntoBuffer(buffer, header->numLists); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_LOG_ONLY(contentLength(buffer) == sizeof(struct mi005_data), - "%zu bytes of config written, of %zu expected", - contentLength(buffer), sizeof(struct mi005_data)); - return result; -} - -/** - * Start saving a master index to a buffered output stream. - * - * @param masterIndex The master index - * @param zoneNumber The number of the zone to save - * @param bufferedWriter The index state component being written - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -static int startSavingMasterIndex_005(const MasterIndex *masterIndex, - unsigned int zoneNumber, - BufferedWriter *bufferedWriter) -{ - const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, - common); - MasterIndexZone *masterZone = &mi5->masterZones[zoneNumber]; - unsigned int firstList = getDeltaIndexZoneFirstList(&mi5->deltaIndex, - zoneNumber); - unsigned int numLists = getDeltaIndexZoneNumLists(&mi5->deltaIndex, - zoneNumber); - - struct mi005_data header; - memset(&header, 0, sizeof(header)); - memcpy(header.magic, MAGIC_MI_START, MAGIC_SIZE); - header.volumeNonce = mi5->volumeNonce; - header.virtualChapterLow = masterZone->virtualChapterLow; - header.virtualChapterHigh = masterZone->virtualChapterHigh; - header.firstList = firstList; - header.numLists = numLists; - - Buffer *buffer; - int result = makeBuffer(sizeof(struct mi005_data), &buffer); - if (result != UDS_SUCCESS) { - return result; - } - result = encodeMasterIndexHeader(buffer, &header); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return result; - } - result = writeToBufferedWriter(bufferedWriter, getBufferContents(buffer), - contentLength(buffer)); - freeBuffer(&buffer); - if (result != UDS_SUCCESS) { - return logWarningWithStringError(result, - "failed to write master index header"); - } - result = makeBuffer(numLists * sizeof(uint64_t), &buffer); - if (result != UDS_SUCCESS) { - return result; - } - uint64_t *firstFlushChapter = &mi5->flushChapters[firstList]; - result = putUInt64LEsIntoBuffer(buffer, numLists, firstFlushChapter); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return result; - } - result = writeToBufferedWriter(bufferedWriter, getBufferContents(buffer), - contentLength(buffer)); - freeBuffer(&buffer); - if (result != UDS_SUCCESS) { - return logWarningWithStringError(result, - "failed to write master index flush " - "ranges"); - 
} - - return startSavingDeltaIndex(&mi5->deltaIndex, zoneNumber, bufferedWriter); -} - -/***********************************************************************/ -/** - * Have all the data been written while saving a master index to an output - * stream? If the answer is yes, it is still necessary to call - * finishSavingMasterIndex(), which will return quickly. - * - * @param masterIndex The master index - * @param zoneNumber The number of the zone to save - * - * @return true if all the data are written - **/ -static bool isSavingMasterIndexDone_005(const MasterIndex *masterIndex, - unsigned int zoneNumber) -{ - const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, - common); - return isSavingDeltaIndexDone(&mi5->deltaIndex, zoneNumber); -} - -/***********************************************************************/ -/** - * Finish saving a master index to an output stream. Force the writing of - * all of the remaining data. If an error occurred asynchronously during - * the save operation, it will be returned here. - * - * @param masterIndex The master index - * @param zoneNumber The number of the zone to save - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -static int finishSavingMasterIndex_005(const MasterIndex *masterIndex, - unsigned int zoneNumber) -{ - const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, - common); - return finishSavingDeltaIndex(&mi5->deltaIndex, zoneNumber); -} - -/***********************************************************************/ -/** - * Abort saving a master index to an output stream. If an error occurred - * asynchronously during the save operation, it will be dropped. - * - * @param masterIndex The master index - * @param zoneNumber The number of the zone to save - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -static int abortSavingMasterIndex_005(const MasterIndex *masterIndex, - unsigned int zoneNumber) -{ - const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, - common); - return abortSavingDeltaIndex(&mi5->deltaIndex, zoneNumber); -} - -/***********************************************************************/ -__attribute__((warn_unused_result)) -static int decodeMasterIndexHeader(Buffer *buffer, struct mi005_data *header) -{ - int result = getBytesFromBuffer(buffer, sizeof(header->magic), - &header->magic); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt64LEFromBuffer(buffer, &header->volumeNonce); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt64LEFromBuffer(buffer, &header->virtualChapterLow); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt64LEFromBuffer(buffer, &header->virtualChapterHigh); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt32LEFromBuffer(buffer, &header->firstList); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt32LEFromBuffer(buffer, &header->numLists); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, - "%zu bytes decoded of %zu expected", - bufferLength(buffer) - contentLength(buffer), - bufferLength(buffer)); - if (result != UDS_SUCCESS) { - result = UDS_CORRUPT_COMPONENT; - } - return result; -} - -/** - * Start restoring the master index from multiple buffered readers - * - * @param masterIndex The master index to restore into - * @param bufferedReaders The buffered readers to read the master index from - * @param numReaders The number of buffered 
readers - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -static int startRestoringMasterIndex_005(MasterIndex *masterIndex, - BufferedReader **bufferedReaders, - int numReaders) -{ - if (masterIndex == NULL) { - return logWarningWithStringError(UDS_BAD_STATE, - "cannot restore to null master index"); - } - MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); - emptyDeltaIndex(&mi5->deltaIndex); - - uint64_t virtualChapterLow = 0; - uint64_t virtualChapterHigh = 0; - int i; - for (i = 0; i < numReaders; i++) { - Buffer *buffer; - int result = makeBuffer(sizeof(struct mi005_data), &buffer); - if (result != UDS_SUCCESS) { - return result; - } - result = readFromBufferedReader(bufferedReaders[i], - getBufferContents(buffer), - bufferLength(buffer)); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return logWarningWithStringError(result, - "failed to read master index header"); - } - result = resetBufferEnd(buffer, bufferLength(buffer)); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return result; - } - struct mi005_data header; - result = decodeMasterIndexHeader(buffer, &header); - freeBuffer(&buffer); - if (result != UDS_SUCCESS) { - return result; - } - if (memcmp(header.magic, MAGIC_MI_START, MAGIC_SIZE) != 0) { - return logWarningWithStringError(UDS_CORRUPT_COMPONENT, - "master index file had bad magic" - " number"); - } - if (mi5->volumeNonce == 0) { - mi5->volumeNonce = header.volumeNonce; - } else if (header.volumeNonce != mi5->volumeNonce) { - return logWarningWithStringError(UDS_CORRUPT_COMPONENT, - "master index volume nonce incorrect"); - } - if (i == 0) { - virtualChapterLow = header.virtualChapterLow; - virtualChapterHigh = header.virtualChapterHigh; - } else if (virtualChapterHigh != header.virtualChapterHigh) { - return logWarningWithStringError(UDS_CORRUPT_COMPONENT, - "Inconsistent master index zone files:" - " Chapter range is [%llu,%" - PRIu64 "], chapter range %d is [%" - PRIu64 ",%llu]", - virtualChapterLow, virtualChapterHigh, - i, header.virtualChapterLow, - header.virtualChapterHigh); - } else if (virtualChapterLow < header.virtualChapterLow) { - virtualChapterLow = header.virtualChapterLow; - } - uint64_t *firstFlushChapter = &mi5->flushChapters[header.firstList]; - result = makeBuffer(header.numLists * sizeof(uint64_t), &buffer); - if (result != UDS_SUCCESS) { - return result; - } - result = readFromBufferedReader(bufferedReaders[i], - getBufferContents(buffer), - bufferLength(buffer)); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return logWarningWithStringError(result, - "failed to read master index flush" - " ranges"); - } - result = resetBufferEnd(buffer, bufferLength(buffer)); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return result; - } - result = getUInt64LEsFromBuffer(buffer, header.numLists, - firstFlushChapter); - freeBuffer(&buffer); - if (result != UDS_SUCCESS) { - return result; - } - } - - unsigned int z; - for (z = 0; z < mi5->numZones; z++) { - memset(&mi5->masterZones[z], 0, sizeof(MasterIndexZone)); - mi5->masterZones[z].virtualChapterLow = virtualChapterLow; - mi5->masterZones[z].virtualChapterHigh = virtualChapterHigh; - } - - int result = startRestoringDeltaIndex(&mi5->deltaIndex, bufferedReaders, - numReaders); - if (result != UDS_SUCCESS) { - return logWarningWithStringError(result, "restoring delta index failed"); - } - return UDS_SUCCESS; -} - -/***********************************************************************/ -/** - * Have all the data been read while 
restoring a master index from an - * input stream? - * - * @param masterIndex The master index to restore into - * - * @return true if all the data are read - **/ -static bool isRestoringMasterIndexDone_005(const MasterIndex *masterIndex) -{ - const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, - common); - return isRestoringDeltaIndexDone(&mi5->deltaIndex); -} - -/***********************************************************************/ -/** - * Restore a saved delta list - * - * @param masterIndex The master index to restore into - * @param dlsi The DeltaListSaveInfo describing the delta list - * @param data The saved delta list bit stream - * - * @return error code or UDS_SUCCESS - **/ -static int restoreDeltaListToMasterIndex_005(MasterIndex *masterIndex, - const DeltaListSaveInfo *dlsi, - const byte data[DELTA_LIST_MAX_BYTE_COUNT]) -{ - MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); - return restoreDeltaListToDeltaIndex(&mi5->deltaIndex, dlsi, data); -} - -/***********************************************************************/ -/** - * Abort restoring a master index from an input stream. - * - * @param masterIndex The master index - **/ -static void abortRestoringMasterIndex_005(MasterIndex *masterIndex) -{ - MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); - abortRestoringDeltaIndex(&mi5->deltaIndex); -} - -/***********************************************************************/ -static void removeNewestChapters(MasterIndex5 *mi5, - unsigned int zoneNumber, - uint64_t virtualChapter) -{ - // Get the range of delta lists belonging to this zone - unsigned int firstList = getDeltaIndexZoneFirstList(&mi5->deltaIndex, - zoneNumber); - unsigned int numLists = getDeltaIndexZoneNumLists(&mi5->deltaIndex, - zoneNumber); - unsigned int lastList = firstList + numLists - 1; - - if (virtualChapter > mi5->chapterMask) { - // The virtual chapter number is large enough so that we can use the - // normal LRU mechanism without an unsigned underflow. - virtualChapter -= mi5->chapterMask + 1; - // Eliminate the newest chapters by renumbering them to become the - // oldest chapters - unsigned int i; - for (i = firstList; i <= lastList; i++) { - if (virtualChapter < mi5->flushChapters[i]) { - mi5->flushChapters[i] = virtualChapter; - } - } - } else { - // Underflow will prevent the fast path. Do it the slow and painful way. - MasterIndexZone *masterZone = &mi5->masterZones[zoneNumber]; - ChapterRange range; - range.chapterStart = convertVirtualToIndex(mi5, virtualChapter); - range.chapterCount = (mi5->chapterMask + 1 - - (virtualChapter - masterZone->virtualChapterLow)); - UdsChunkName name; - memset(&name, 0, sizeof(UdsChunkName)); - MasterIndexRecord record = (MasterIndexRecord) { - .magic = masterIndexRecordMagic, - .masterIndex = &mi5->common, - .name = &name, - .zoneNumber = zoneNumber, - }; - unsigned int i; - for (i = firstList; i <= lastList; i++) { - ChapterRange tempRange = range; - getMasterIndexEntry(&record, i, 0, &tempRange); - } - } -} - -/***********************************************************************/ -/** - * Set the open chapter number on a zone. The master index zone will be - * modified to index the proper number of chapters ending with the new open - * chapter. 
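 *
 * A worked example with hypothetical numbers (purely illustrative, not taken
 * from the original sources): assume numChapters == 1024 and that this zone
 * currently indexes chapters [3976, 4999].  Setting the open chapter to 5000
 * computes newVirtualLow = 5000 - 1024 + 1 = 3977; since 3977 still falls
 * inside the old range, the existing entries are kept and the indexed range
 * becomes [3977, 5000].  Setting the open chapter to 7000 instead lands
 * entirely after the old range, so the range collapses to [7000, 7000] and
 * no previously indexed chapter remains in range.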
- * - * @param masterIndex The master index - * @param zoneNumber The zone number - * @param virtualChapter The new open chapter number - **/ -static void setMasterIndexZoneOpenChapter_005(MasterIndex *masterIndex, - unsigned int zoneNumber, - uint64_t virtualChapter) -{ - MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); - MasterIndexZone *masterZone = &mi5->masterZones[zoneNumber]; - // Take care here to avoid underflow of an unsigned value. Note that - // this is the smallest valid virtual low. We may or may not actually - // use this value. - uint64_t newVirtualLow = (virtualChapter >= mi5->numChapters - ? virtualChapter - mi5->numChapters + 1 - : 0); - - if (virtualChapter <= masterZone->virtualChapterLow) { - /* - * Moving backwards and the new range is totally before the old range. - * Note that moving to the lowest virtual chapter counts as totally before - * the old range, as we need to remove the entries in the open chapter. - */ - emptyDeltaIndexZone(&mi5->deltaIndex, zoneNumber); - masterZone->virtualChapterLow = virtualChapter; - masterZone->virtualChapterHigh = virtualChapter; - } else if (virtualChapter <= masterZone->virtualChapterHigh) { - // Moving backwards and the new range overlaps the old range. Note - // that moving to the same open chapter counts as backwards, as we need - // to remove the entries in the open chapter. - removeNewestChapters(mi5, zoneNumber, virtualChapter); - masterZone->virtualChapterHigh = virtualChapter; - } else if (newVirtualLow < masterZone->virtualChapterLow) { - // Moving forwards and we can keep all the old chapters - masterZone->virtualChapterHigh = virtualChapter; - } else if (newVirtualLow <= masterZone->virtualChapterHigh) { - // Moving forwards and we can keep some old chapters - masterZone->virtualChapterLow = newVirtualLow; - masterZone->virtualChapterHigh = virtualChapter; - } else { - // Moving forwards and the new range is totally after the old range - masterZone->virtualChapterLow = virtualChapter; - masterZone->virtualChapterHigh = virtualChapter; - } - // Check to see if the zone data has grown to be too large - if (masterZone->virtualChapterLow < masterZone->virtualChapterHigh) { - uint64_t usedBits = getDeltaIndexZoneDlistBitsUsed(&mi5->deltaIndex, - zoneNumber); - if (usedBits > mi5->maxZoneBits) { - // Expire enough chapters to free the desired space - uint64_t expireCount - = 1 + (usedBits - mi5->maxZoneBits) / mi5->chapterZoneBits; - if (expireCount == 1) { - logRatelimit(logInfo, - "masterZone %u: At chapter %" PRIu64 - ", expiring chapter %llu early", - zoneNumber, virtualChapter, - masterZone->virtualChapterLow); - masterZone->numEarlyFlushes++; - masterZone->virtualChapterLow++; - } else { - uint64_t firstExpired = masterZone->virtualChapterLow; - if (firstExpired + expireCount < masterZone->virtualChapterHigh) { - masterZone->numEarlyFlushes += expireCount; - masterZone->virtualChapterLow += expireCount; - } else { - masterZone->numEarlyFlushes - += masterZone->virtualChapterHigh - masterZone->virtualChapterLow; - masterZone->virtualChapterLow = masterZone->virtualChapterHigh; - } - logRatelimit(logInfo, - "masterZone %u: At chapter %" PRIu64 - ", expiring chapters %llu to %llu early", - zoneNumber, virtualChapter, firstExpired, - masterZone->virtualChapterLow - 1); - } - } - } -} - -/***********************************************************************/ -/** - * Set the open chapter number. 
The master index will be modified to index - * the proper number of chapters ending with the new open chapter. - * - * @param masterIndex The master index - * @param virtualChapter The new open chapter number - **/ -static void setMasterIndexOpenChapter_005(MasterIndex *masterIndex, - uint64_t virtualChapter) -{ - MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); - unsigned int z; - for (z = 0; z < mi5->numZones; z++) { - // In normal operation, we advance forward one chapter at a time. - // Log all abnormal changes. - MasterIndexZone *masterZone = &mi5->masterZones[z]; - bool logMove = virtualChapter != masterZone->virtualChapterHigh + 1; - if (logMove) { - logDebug("masterZone %u: The range of indexed chapters is moving from [%" - PRIu64 ", %llu] ...", - z, - masterZone->virtualChapterLow, - masterZone->virtualChapterHigh); - } - - setMasterIndexZoneOpenChapter_005(masterIndex, z, virtualChapter); - - if (logMove) { - logDebug("masterZone %u: ... and moving to [%llu, %llu]", - z, - masterZone->virtualChapterLow, - masterZone->virtualChapterHigh); - } - } -} - -/***********************************************************************/ -/** - * Find the master index zone associated with a chunk name - * - * @param masterIndex The master index - * @param name The chunk name - * - * @return the zone that the chunk name belongs to - **/ -static unsigned int getMasterIndexZone_005(const MasterIndex *masterIndex, - const UdsChunkName *name) -{ - const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, - common); - unsigned int deltaListNumber = extractDListNum(mi5, name); - return getDeltaIndexZone(&mi5->deltaIndex, deltaListNumber); -} - -/***********************************************************************/ -/** - * Do a quick read-only lookup of the chunk name and return information - * needed by the index code to process the chunk name. - * - * @param masterIndex The master index - * @param name The chunk name - * @param triage Information about the chunk name - * - * @return UDS_SUCCESS or an error code - **/ -static int lookupMasterIndexName_005(const MasterIndex *masterIndex, - const UdsChunkName *name, - MasterIndexTriage *triage) -{ - triage->isSample = false; - triage->inSampledChapter = false; - triage->zone = getMasterIndexZone_005(masterIndex, name); - return UDS_SUCCESS; -} - -/***********************************************************************/ -/** - * Do a quick read-only lookup of the sampled chunk name and return - * information needed by the index code to process the chunk name. - * - * @param masterIndex The master index - * @param name The chunk name - * @param triage Information about the chunk name. The zone and - * isSample fields are already filled in. Set - * inSampledChapter and virtualChapter if the chunk - * name is found in the index. 
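 *
 * Note: the delta entry stores the chapter number modulo (chapterMask + 1),
 * so the virtualChapter reported here is reconstructed relative to the zone.
 * A worked example with hypothetical numbers: if chapterMask == 1023,
 * virtualChapterLow == 2000 and the entry value is 10, then
 * rollingChapter = (10 - 2000) & 1023 == 58 and the candidate virtualChapter
 * is 2000 + 58 == 2058 (and 2058 & 1023 == 10).  A candidate above
 * virtualChapterHigh causes inSampledChapter to be reset to false.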
- * - * @return UDS_SUCCESS or an error code - **/ -static int lookupMasterIndexSampledName_005(const MasterIndex *masterIndex, - const UdsChunkName *name, - MasterIndexTriage *triage) -{ - const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, - common); - unsigned int address = extractAddress(mi5, name); - unsigned int deltaListNumber = extractDListNum(mi5, name); - DeltaIndexEntry deltaEntry; - int result = getDeltaIndexEntry(&mi5->deltaIndex, deltaListNumber, address, - name->name, true, &deltaEntry); - if (result != UDS_SUCCESS) { - return result; - } - triage->inSampledChapter = !deltaEntry.atEnd && (deltaEntry.key == address); - if (triage->inSampledChapter) { - const MasterIndexZone *masterZone = &mi5->masterZones[triage->zone]; - unsigned int indexChapter = getDeltaEntryValue(&deltaEntry); - unsigned int rollingChapter = ((indexChapter - - masterZone->virtualChapterLow) - & mi5->chapterMask); - triage->virtualChapter = masterZone->virtualChapterLow + rollingChapter; - if (triage->virtualChapter > masterZone->virtualChapterHigh) { - triage->inSampledChapter = false; - } - } - return UDS_SUCCESS; -} - -/***********************************************************************/ -/** - * Find the master index record associated with a block name - * - * This is always the first routine to be called when dealing with a delta - * master index entry. The fields of the record parameter should be - * examined to determine the state of the record: - * - * If isFound is false, then we did not find an entry for the block - * name. Information is saved in the MasterIndexRecord so that - * putMasterIndexRecord() will insert an entry for that block name at - * the proper place. - * - * If isFound is true, then we did find an entry for the block name. - * Information is saved in the MasterIndexRecord so that the "chapter" - * and "isCollision" fields reflect the entry found. - * Calls to removeMasterIndexRecord() will remove the entry, calls to - * setMasterIndexRecordChapter() can modify the entry, and calls to - * putMasterIndexRecord() can insert a collision record with this - * entry. - * - * @param masterIndex The master index to search - * @param name The chunk name - * @param record Set to the info about the record searched for - * - * @return UDS_SUCCESS or an error code - **/ -static int getMasterIndexRecord_005(MasterIndex *masterIndex, - const UdsChunkName *name, - MasterIndexRecord *record) -{ - MasterIndex5 *mi5 = container_of(masterIndex, MasterIndex5, common); - unsigned int address = extractAddress(mi5, name); - unsigned int deltaListNumber = extractDListNum(mi5, name); - uint64_t flushChapter = mi5->flushChapters[deltaListNumber]; - record->magic = masterIndexRecordMagic; - record->masterIndex = masterIndex; - record->mutex = NULL; - record->name = name; - record->zoneNumber = getDeltaIndexZone(&mi5->deltaIndex, deltaListNumber); - const MasterIndexZone *masterZone = getMasterZone(record); - - int result; - if (flushChapter < masterZone->virtualChapterLow) { - ChapterRange range; - uint64_t flushCount = masterZone->virtualChapterLow - flushChapter; - range.chapterStart = convertVirtualToIndex(mi5, flushChapter); - range.chapterCount = (flushCount > mi5->chapterMask - ? 
mi5->chapterMask + 1 - : flushCount); - result = getMasterIndexEntry(record, deltaListNumber, address, &range); - flushChapter = convertIndexToVirtual(record, range.chapterStart); - if (flushChapter > masterZone->virtualChapterHigh) { - flushChapter = masterZone->virtualChapterHigh; - } - mi5->flushChapters[deltaListNumber] = flushChapter; - } else { - result = getDeltaIndexEntry(&mi5->deltaIndex, deltaListNumber, address, - name->name, false, &record->deltaEntry); - } - if (result != UDS_SUCCESS) { - return result; - } - record->isFound = (!record->deltaEntry.atEnd - && (record->deltaEntry.key == address)); - if (record->isFound) { - unsigned int indexChapter = getDeltaEntryValue(&record->deltaEntry); - record->virtualChapter = convertIndexToVirtual(record, indexChapter); - } - record->isCollision = record->deltaEntry.isCollision; - return UDS_SUCCESS; -} - -/***********************************************************************/ -/** - * Create a new record associated with a block name. - * - * @param record The master index record found by getRecord() - * @param virtualChapter The chapter number where block info is found - * - * @return UDS_SUCCESS or an error code - **/ -int putMasterIndexRecord(MasterIndexRecord *record, uint64_t virtualChapter) -{ - const MasterIndex5 *mi5 = container_of(record->masterIndex, MasterIndex5, - common); - if (record->magic != masterIndexRecordMagic) { - return logWarningWithStringError(UDS_BAD_STATE, - "bad magic number in master index record"); - } - if (!isVirtualChapterIndexed(record, virtualChapter)) { - const MasterIndexZone *masterZone = getMasterZone(record); - return logWarningWithStringError(UDS_INVALID_ARGUMENT, - "cannot put record into chapter number %" - PRIu64 " that is out of the valid range %" - PRIu64 " to %llu", - virtualChapter, - masterZone->virtualChapterLow, - masterZone->virtualChapterHigh); - } - unsigned int address = extractAddress(mi5, record->name); - if (unlikely(record->mutex != NULL)) { - lockMutex(record->mutex); - } - int result = putDeltaIndexEntry(&record->deltaEntry, address, - convertVirtualToIndex(mi5, virtualChapter), - record->isFound ? record->name->name : NULL); - if (unlikely(record->mutex != NULL)) { - unlockMutex(record->mutex); - } - switch (result) { - case UDS_SUCCESS: - record->virtualChapter = virtualChapter; - record->isCollision = record->deltaEntry.isCollision; - record->isFound = true; - break; - case UDS_OVERFLOW: - logRatelimit(logWarningWithStringError, UDS_OVERFLOW, - "Master index entry dropped due to overflow condition"); - logDeltaIndexEntry(&record->deltaEntry); - break; - default: - break; - } - return result; -} - -/**********************************************************************/ -static INLINE int validateRecord(MasterIndexRecord *record) -{ - if (record->magic != masterIndexRecordMagic) { - return logWarningWithStringError( - UDS_BAD_STATE, "bad magic number in master index record"); - } - if (!record->isFound) { - return logWarningWithStringError(UDS_BAD_STATE, - "illegal operation on new record"); - } - return UDS_SUCCESS; -} - -/***********************************************************************/ -/** - * Remove an existing record. 
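 *
 * A minimal usage sketch (hypothetical caller code; error handling and any
 * locking done by the enclosing index are omitted):
 *
 *   MasterIndexRecord record;
 *   int result = getMasterIndexRecord(masterIndex, &name, &record);
 *   if ((result == UDS_SUCCESS) && record.isFound) {
 *     result = removeMasterIndexRecord(&record);
 *   }
 *
 * After a successful removal the record's magic is set to badMagic, so the
 * record cannot be used for further operations.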
- * - * @param record The master index record found by getRecord() - * - * @return UDS_SUCCESS or an error code - **/ -int removeMasterIndexRecord(MasterIndexRecord *record) -{ - int result = validateRecord(record); - if (result != UDS_SUCCESS) { - return result; - } - // Mark the record so that it cannot be used again - record->magic = badMagic; - if (unlikely(record->mutex != NULL)) { - lockMutex(record->mutex); - } - result = removeDeltaIndexEntry(&record->deltaEntry); - if (unlikely(record->mutex != NULL)) { - unlockMutex(record->mutex); - } - return result; -} - -/***********************************************************************/ -/** - * Set the chapter number associated with a block name. - * - * @param record The master index record found by getRecord() - * @param virtualChapter The chapter number where the block info is now found. - * - * @return UDS_SUCCESS or an error code - **/ -int setMasterIndexRecordChapter(MasterIndexRecord *record, - uint64_t virtualChapter) -{ - const MasterIndex5 *mi5 = container_of(record->masterIndex, MasterIndex5, - common); - int result = validateRecord(record); - if (result != UDS_SUCCESS) { - return result; - } - if (!isVirtualChapterIndexed(record, virtualChapter)) { - const MasterIndexZone *masterZone = getMasterZone(record); - return logWarningWithStringError(UDS_INVALID_ARGUMENT, - "cannot set chapter number %" PRIu64 - " that is out of the valid range %" PRIu64 - " to %llu", - virtualChapter, - masterZone->virtualChapterLow, - masterZone->virtualChapterHigh); - } - if (unlikely(record->mutex != NULL)) { - lockMutex(record->mutex); - } - result = setDeltaEntryValue(&record->deltaEntry, - convertVirtualToIndex(mi5, virtualChapter)); - if (unlikely(record->mutex != NULL)) { - unlockMutex(record->mutex); - } - if (result != UDS_SUCCESS) { - return result; - } - record->virtualChapter = virtualChapter; - return UDS_SUCCESS; -} - -/***********************************************************************/ -/** - * Get the number of bytes used for master index entries. - * - * @param masterIndex The master index - * - * @return The number of bytes in use - **/ -static size_t getMasterIndexMemoryUsed_005(const MasterIndex *masterIndex) -{ - const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, - common); - uint64_t bits = getDeltaIndexDlistBitsUsed(&mi5->deltaIndex); - return (bits + CHAR_BIT - 1) / CHAR_BIT; -} - -/***********************************************************************/ -/** - * Return the master index stats. There is only one portion of the master - * index in this implementation, and we call it the dense portion of the - * index. 
- * - * @param masterIndex The master index - * @param dense Stats for the dense portion of the index - * @param sparse Stats for the sparse portion of the index - **/ -static void getMasterIndexStats_005(const MasterIndex *masterIndex, - MasterIndexStats *dense, - MasterIndexStats *sparse) -{ - const MasterIndex5 *mi5 = const_container_of(masterIndex, MasterIndex5, - common); - DeltaIndexStats dis; - getDeltaIndexStats(&mi5->deltaIndex, &dis); - dense->memoryAllocated = (dis.memoryAllocated - + sizeof(MasterIndex5) - + mi5->numDeltaLists * sizeof(uint64_t) - + mi5->numZones * sizeof(MasterIndexZone)); - dense->rebalanceTime = dis.rebalanceTime; - dense->rebalanceCount = dis.rebalanceCount; - dense->recordCount = dis.recordCount; - dense->collisionCount = dis.collisionCount; - dense->discardCount = dis.discardCount; - dense->overflowCount = dis.overflowCount; - dense->numLists = dis.numLists; - dense->earlyFlushes = 0; - unsigned int z; - for (z = 0; z < mi5->numZones; z++) { - dense->earlyFlushes += mi5->masterZones[z].numEarlyFlushes; - } - memset(sparse, 0, sizeof(MasterIndexStats)); -} - -/***********************************************************************/ -/** - * Determine whether a given chunk name is a hook. - * - * @param masterIndex The master index - * @param name The block name - * - * @return whether to use as sample - **/ -static bool isMasterIndexSample_005(const MasterIndex *masterIndex - __attribute__((unused)), - const UdsChunkName *name - __attribute__((unused))) -{ - return false; -} - -/***********************************************************************/ -typedef struct { - unsigned int addressBits; // Number of bits in address mask - unsigned int chapterBits; // Number of bits in chapter number - unsigned int meanDelta; // The mean delta - unsigned long numDeltaLists; // The number of delta lists - unsigned long numChapters; // Number of chapters used - size_t numBitsPerChapter; // The number of bits per chapter - size_t memorySize; // The number of bytes of delta list memory - size_t targetFreeSize; // The number of free bytes we desire -} Parameters005; - -/***********************************************************************/ -static int computeMasterIndexParameters005(const Configuration *config, - Parameters005 *params) -{ - enum { DELTA_LIST_SIZE = 256 }; - /* - * For a given zone count, setting the the minimum number of delta lists - * to the square of the number of zones ensures that the distribution of - * delta lists over zones doesn't underflow, leaving the last zone with - * an invalid number of delta lists. See the explanation in - * initializeDeltaIndex(). Because we can restart with a different number - * of zones but the number of delta lists is invariant across restart, - * we must use the largest number of zones to compute this minimum. - */ - unsigned long minDeltaLists = (minMasterIndexDeltaLists - ? 
minMasterIndexDeltaLists - : MAX_ZONES * MAX_ZONES); - - Geometry *geometry = config->geometry; - unsigned long recordsPerChapter = geometry->recordsPerChapter; - params->numChapters = geometry->chaptersPerVolume; - unsigned long recordsPerVolume = recordsPerChapter * params->numChapters; - unsigned int numAddresses = config->masterIndexMeanDelta * DELTA_LIST_SIZE; - params->numDeltaLists - = maxUint(recordsPerVolume / DELTA_LIST_SIZE, minDeltaLists); - params->addressBits = computeBits(numAddresses - 1); - params->chapterBits = computeBits(params->numChapters - 1); - - if ((unsigned int) params->numDeltaLists != params->numDeltaLists) { - return logWarningWithStringError(UDS_INVALID_ARGUMENT, - "cannot initialize master index with %lu" - " delta lists", - params->numDeltaLists); - } - if (params->addressBits > 31) { - return logWarningWithStringError(UDS_INVALID_ARGUMENT, - "cannot initialize master index with %u" - " address bits", - params->addressBits); - } - if (geometry->sparseChaptersPerVolume > 0) { - return logWarningWithStringError(UDS_INVALID_ARGUMENT, - "cannot initialize dense master index" - " with %u sparse chapters", - geometry->sparseChaptersPerVolume); - } - if (recordsPerChapter == 0) { - return logWarningWithStringError(UDS_INVALID_ARGUMENT, - "cannot initialize master index with %lu" - " records per chapter", - recordsPerChapter); - } - if (params->numChapters == 0) { - return logWarningWithStringError(UDS_INVALID_ARGUMENT, - "cannot initialize master index with %lu" - " chapters per volume", - params->numChapters); - } - - /* - * We can now compute the probability that a delta list is not touched during - * the writing of an entire chapter. The computation is: - * - * double pNotTouched = pow((double) (params->numDeltaLists - 1) - * / params->numDeltaLists, - * recordsPerChapter); - * - * For the standard index sizes, about 78% of the delta lists are not - * touched, and therefore contain dead index entries that have not been - * eliminated by the lazy LRU processing. We can then compute how many dead - * index entries accumulate over time. The computation is: - * - * double invalidChapters = pNotTouched / (1.0 - pNotTouched); - * - * For the standard index sizes, we will need about 3.5 chapters of space for - * the dead index entries in a 1K chapter index. Since we do not want to do - * that floating point computation, we use 4 chapters per 1K of chapters. - */ - unsigned long invalidChapters = maxUint(params->numChapters / 256, 2); - unsigned long chaptersInMasterIndex = params->numChapters + invalidChapters; - unsigned long entriesInMasterIndex - = recordsPerChapter * chaptersInMasterIndex; - // Compute the mean delta - unsigned long addressSpan = params->numDeltaLists << params->addressBits; - params->meanDelta = addressSpan / entriesInMasterIndex; - // Project how large we expect a chapter to be - params->numBitsPerChapter = getDeltaMemorySize(recordsPerChapter, - params->meanDelta, - params->chapterBits); - // Project how large we expect the index to be - size_t numBitsPerIndex = params->numBitsPerChapter * chaptersInMasterIndex; - size_t expectedIndexSize = numBitsPerIndex / CHAR_BIT; - /* - * Set the total memory to be 6% larger than the expected index size. We - * want this number to be large enough that the we do not do a great many - * rebalances as the list when the list is full. We use MasterIndex_p1 - * to tune this setting. 
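 *
 * A worked example with illustrative (assumed) numbers: for a geometry with
 * 1024 chapters, 65536 records per chapter and 262144 delta lists,
 * pNotTouched = (262143.0 / 262144) ^ 65536 ~= e^(-1/4) ~= 0.78, so
 * invalidChapters ~= 0.78 / 0.22 ~= 3.5 chapters of dead entries, which the
 * integer approximation rounds to numChapters / 256 = 4.  The index is then
 * sized for 1024 + 4 = 1028 chapters, padded by 6%, with a 5% target free
 * size.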
- */ - params->memorySize = expectedIndexSize * 106 / 100; - // Set the target free size to 5% of the expected index size - params->targetFreeSize = expectedIndexSize / 20; - return UDS_SUCCESS; -} - -/***********************************************************************/ -int computeMasterIndexSaveBytes005(const Configuration *config, - size_t *numBytes) -{ - Parameters005 params = { .addressBits = 0 }; - int result = computeMasterIndexParameters005(config, ¶ms); - if (result != UDS_SUCCESS) { - return result; - } - // Saving a MasterIndex005 needs a header plus one uint64_t per delta - // list plus the delta index. - *numBytes = (sizeof(struct mi005_data) - + params.numDeltaLists * sizeof(uint64_t) - + computeDeltaIndexSaveBytes(params.numDeltaLists, - params.memorySize)); - return UDS_SUCCESS; -} - -/***********************************************************************/ -int makeMasterIndex005(const Configuration *config, unsigned int numZones, - uint64_t volumeNonce, MasterIndex **masterIndex) -{ - Parameters005 params = { .addressBits = 0 }; - int result = computeMasterIndexParameters005(config, ¶ms); - if (result != UDS_SUCCESS) { - return result; - } - - MasterIndex5 *mi5; - result = ALLOCATE(1, MasterIndex5, "master index", &mi5); - if (result != UDS_SUCCESS) { - *masterIndex = NULL; - return result; - } - - mi5->common.abortRestoringMasterIndex = abortRestoringMasterIndex_005; - mi5->common.abortSavingMasterIndex = abortSavingMasterIndex_005; - mi5->common.finishSavingMasterIndex = finishSavingMasterIndex_005; - mi5->common.freeMasterIndex = freeMasterIndex_005; - mi5->common.getMasterIndexMemoryUsed = getMasterIndexMemoryUsed_005; - mi5->common.getMasterIndexRecord = getMasterIndexRecord_005; - mi5->common.getMasterIndexStats = getMasterIndexStats_005; - mi5->common.getMasterIndexZone = getMasterIndexZone_005; - mi5->common.isMasterIndexSample = isMasterIndexSample_005; - mi5->common.isRestoringMasterIndexDone = isRestoringMasterIndexDone_005; - mi5->common.isSavingMasterIndexDone = isSavingMasterIndexDone_005; - mi5->common.lookupMasterIndexName = lookupMasterIndexName_005; - mi5->common.lookupMasterIndexSampledName = lookupMasterIndexSampledName_005; - mi5->common.restoreDeltaListToMasterIndex = restoreDeltaListToMasterIndex_005; - mi5->common.setMasterIndexOpenChapter = setMasterIndexOpenChapter_005; - mi5->common.setMasterIndexTag = setMasterIndexTag_005; - mi5->common.setMasterIndexZoneOpenChapter = setMasterIndexZoneOpenChapter_005; - mi5->common.startRestoringMasterIndex = startRestoringMasterIndex_005; - mi5->common.startSavingMasterIndex = startSavingMasterIndex_005; - - mi5->addressBits = params.addressBits; - mi5->addressMask = (1u << params.addressBits) - 1; - mi5->chapterBits = params.chapterBits; - mi5->chapterMask = (1u << params.chapterBits) - 1; - mi5->numChapters = params.numChapters; - mi5->numDeltaLists = params.numDeltaLists; - mi5->numZones = numZones; - mi5->chapterZoneBits = params.numBitsPerChapter / numZones; - mi5->volumeNonce = volumeNonce; - - result = initializeDeltaIndex(&mi5->deltaIndex, numZones, - params.numDeltaLists, params.meanDelta, - params.chapterBits, params.memorySize); - if (result == UDS_SUCCESS) { - mi5->maxZoneBits = ((getDeltaIndexDlistBitsAllocated(&mi5->deltaIndex) - - params.targetFreeSize * CHAR_BIT) - / numZones); - } - - // Initialize the chapter flush ranges to be empty. This depends upon - // allocate returning zeroed memory. 
- if (result == UDS_SUCCESS) { - result = ALLOCATE(params.numDeltaLists, uint64_t, - "first chapter to flush", &mi5->flushChapters); - } - - // Initialize the virtual chapter ranges to start at zero. This depends - // upon allocate returning zeroed memory. - if (result == UDS_SUCCESS) { - result = ALLOCATE(numZones, MasterIndexZone, "master index zones", - &mi5->masterZones); - } - - if (result == UDS_SUCCESS) { - *masterIndex = &mi5->common; - } else { - freeMasterIndex_005(&mi5->common); - *masterIndex = NULL; - } - return result; -} diff --git a/uds/masterIndex005.h b/uds/masterIndex005.h deleted file mode 100644 index 5436c7f..0000000 --- a/uds/masterIndex005.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/masterIndex005.h#1 $ - */ - -#ifndef MASTERINDEX005_H -#define MASTERINDEX005_H 1 - -#include "masterIndexOps.h" - -/** - * Make a new master index. - * - * @param config The configuration of the master index - * @param numZones The number of zones - * @param volumeNonce The nonce used to authenticate the index - * @param masterIndex Location to hold new master index ptr - * - * @return error code or UDS_SUCCESS - **/ -int makeMasterIndex005(const Configuration *config, unsigned int numZones, - uint64_t volumeNonce, MasterIndex **masterIndex) - __attribute__((warn_unused_result)); - -/** - * Compute the number of bytes required to save a master index of a given - * configuration. - * - * @param config The configuration of the master index - * @param numBytes The number of bytes required to save the master index - * - * @return UDS_SUCCESS or an error code. - **/ -int computeMasterIndexSaveBytes005(const Configuration *config, - size_t *numBytes) - __attribute__((warn_unused_result)); - -#endif /* MASTERINDEX005_H */ diff --git a/uds/masterIndex006.c b/uds/masterIndex006.c deleted file mode 100644 index 3e1ef00..0000000 --- a/uds/masterIndex006.c +++ /dev/null @@ -1,791 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/jasper/src/uds/masterIndex006.c#2 $ - */ -#include "masterIndex006.h" - -#include "buffer.h" -#include "compiler.h" -#include "errors.h" -#include "hashUtils.h" -#include "logger.h" -#include "masterIndex005.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "threads.h" -#include "uds.h" - -/* - * The master index is a kept as a wrapper around 2 master index - * implementations, one for dense chapters and one for sparse chapters. - * Methods will be routed to one or the other, or both, depending on the - * method and data passed in. - * - * The master index is divided into zones, and in normal operation there is - * one thread operating on each zone. Any operation that operates on all - * the zones needs to do its operation at a safe point that ensures that - * only one thread is operating on the master index. - * - * The only multithreaded operation supported by the sparse master index is - * the lookupMasterIndexName() method. It is called by the thread that - * assigns an index request to the proper zone, and needs to do a master - * index query for sampled chunk names. The zone mutexes are used to make - * this lookup operation safe. - */ - -typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) masterIndexZone { - Mutex hookMutex; // Protects the sampled index in this zone -} MasterIndexZone; - -typedef struct { - MasterIndex common; // Common master index methods - unsigned int sparseSampleRate; // The sparse sample rate - unsigned int numZones; // The number of zones - MasterIndex *miNonHook; // The non-hook index - MasterIndex *miHook; // The hook index == sample index - MasterIndexZone *masterZones; // The zones -} MasterIndex6; - -/** - * Determine whether a given chunk name is a hook. - * - * @param masterIndex The master index - * @param name The block name - * - * @return whether to use as sample - **/ -static INLINE bool isMasterIndexSample_006(const MasterIndex *masterIndex, - const UdsChunkName *name) -{ - const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, - common); - return (extractSamplingBytes(name) % mi6->sparseSampleRate) == 0; -} - -/***********************************************************************/ -/** - * Get the subindex for the given chunk name - * - * @param masterIndex The master index - * @param name The block name - * - * @return the subindex - **/ -static INLINE MasterIndex *getSubIndex(const MasterIndex *masterIndex, - const UdsChunkName *name) -{ - const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, - common); - return (isMasterIndexSample_006(masterIndex, name) - ? 
mi6->miHook - : mi6->miNonHook); -} - -/***********************************************************************/ -/** - * Terminate and clean up the master index - * - * @param masterIndex The master index to terminate - **/ -static void freeMasterIndex_006(MasterIndex *masterIndex) -{ - if (masterIndex != NULL) { - MasterIndex6 *mi6 = container_of(masterIndex, MasterIndex6, common); - if (mi6->masterZones != NULL) { - unsigned int zone; - for (zone = 0; zone < mi6->numZones; zone++) { - destroyMutex(&mi6->masterZones[zone].hookMutex); - } - FREE(mi6->masterZones); - mi6->masterZones = NULL; - } - if (mi6->miNonHook != NULL) { - freeMasterIndex(mi6->miNonHook); - mi6->miNonHook = NULL; - } - if (mi6->miHook != NULL) { - freeMasterIndex(mi6->miHook); - mi6->miHook = NULL; - } - FREE(masterIndex); - } -} - -/***********************************************************************/ -/** - * Constants and structures for the saved master index file. "MI6" is for - * masterIndex006, and "-XXXX" is a number to increment when the format of - * the data changes. - **/ -enum { MAGIC_SIZE = 8 }; -static const char MAGIC_MI_START[] = "MI6-0001"; - -struct mi006_data { - char magic[MAGIC_SIZE]; // MAGIC_MI_START - unsigned int sparseSampleRate; -}; - -/***********************************************************************/ -/** - * Set the tag value used when saving and/or restoring a master index. - * - * @param masterIndex The master index - * @param tag The tag value - **/ -static void setMasterIndexTag_006(MasterIndex *masterIndex - __attribute__((unused)), - byte tag __attribute__((unused))) -{ -} - -/***********************************************************************/ -__attribute__((warn_unused_result)) -static int encodeMasterIndexHeader(Buffer *buffer, struct mi006_data *header) -{ - int result = putBytes(buffer, MAGIC_SIZE, MAGIC_MI_START); - if (result != UDS_SUCCESS) { - return result; - } - result = putUInt32LEIntoBuffer(buffer, header->sparseSampleRate); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_LOG_ONLY(contentLength(buffer) == sizeof(struct mi006_data), - "%zu bytes of config written, of %zu expected", - contentLength(buffer), sizeof(struct mi006_data)); - return result; -} - -/** - * Start saving a master index to a buffered output stream. 
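 *
 * Each zone's save stream consists of an mi006_data header (the magic string
 * plus the sparse sample rate), followed by that zone's save of the non-hook
 * (dense) sub-index, followed by that zone's save of the hook (sparse
 * sample) sub-index; startRestoringMasterIndex_006() expects the same
 * ordering.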
- * - * @param masterIndex The master index - * @param zoneNumber The number of the zone to save - * @param bufferedWriter The index state component being written - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -static int startSavingMasterIndex_006(const MasterIndex *masterIndex, - unsigned int zoneNumber, - BufferedWriter *bufferedWriter) -{ - const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, - common); - Buffer *buffer; - int result = makeBuffer(sizeof(struct mi006_data), &buffer); - if (result != UDS_SUCCESS) { - return result; - } - struct mi006_data header; - memset(&header, 0, sizeof(header)); - memcpy(header.magic, MAGIC_MI_START, MAGIC_SIZE); - header.sparseSampleRate = mi6->sparseSampleRate; - result = encodeMasterIndexHeader(buffer, &header); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return result; - } - result = writeToBufferedWriter(bufferedWriter, getBufferContents(buffer), - contentLength(buffer)); - freeBuffer(&buffer); - if (result != UDS_SUCCESS) { - logWarningWithStringError(result, "failed to write master index header"); - return result; - } - - result = startSavingMasterIndex(mi6->miNonHook, zoneNumber, bufferedWriter); - if (result != UDS_SUCCESS) { - return result; - } - - result = startSavingMasterIndex(mi6->miHook, zoneNumber, bufferedWriter); - if (result != UDS_SUCCESS) { - return result; - } - return UDS_SUCCESS; -} - -/***********************************************************************/ -/** - * Have all the data been written while saving a master index to an output - * stream? If the answer is yes, it is still necessary to call - * finishSavingMasterIndex(), which will return quickly. - * - * @param masterIndex The master index - * @param zoneNumber The number of the zone to save - * - * @return true if all the data are written - **/ -static bool isSavingMasterIndexDone_006(const MasterIndex *masterIndex, - unsigned int zoneNumber) -{ - const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, - common); - return (isSavingMasterIndexDone(mi6->miNonHook, zoneNumber) - && isSavingMasterIndexDone(mi6->miHook, zoneNumber)); -} - -/***********************************************************************/ -/** - * Finish saving a master index to an output stream. Force the writing of - * all of the remaining data. If an error occurred asynchronously during - * the save operation, it will be returned here. - * - * @param masterIndex The master index - * @param zoneNumber The number of the zone to save - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -static int finishSavingMasterIndex_006(const MasterIndex *masterIndex, - unsigned int zoneNumber) -{ - const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, - common); - int result = finishSavingMasterIndex(mi6->miNonHook, zoneNumber); - if (result == UDS_SUCCESS) { - result = finishSavingMasterIndex(mi6->miHook, zoneNumber); - } - return result; -} - -/***********************************************************************/ -/** - * Abort saving a master index to an output stream. If an error occurred - * asynchronously during the save operation, it will be dropped. 
- * - * @param masterIndex The master index - * @param zoneNumber The number of the zone to save - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -static int abortSavingMasterIndex_006(const MasterIndex *masterIndex, - unsigned int zoneNumber) -{ - const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, - common); - int result = abortSavingMasterIndex(mi6->miNonHook, zoneNumber); - int result2 = abortSavingMasterIndex(mi6->miHook, zoneNumber); - if (result == UDS_SUCCESS) { - result = result2; - } - return result; -} - -/***********************************************************************/ -__attribute__((warn_unused_result)) -static int decodeMasterIndexHeader(Buffer *buffer, struct mi006_data *header) -{ - int result = getBytesFromBuffer(buffer, sizeof(header->magic), - &header->magic); - if (result != UDS_SUCCESS) { - return result; - } - result = getUInt32LEFromBuffer(buffer, &header->sparseSampleRate); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_LOG_ONLY(contentLength(buffer) == 0, - "%zu bytes decoded of %zu expected", - bufferLength(buffer) - contentLength(buffer), - bufferLength(buffer)); - if (result != UDS_SUCCESS) { - result = UDS_CORRUPT_COMPONENT; - } - return result; -} - -/** - * Start restoring the master index from multiple buffered readers - * - * @param masterIndex The master index to restore into - * @param bufferedReaders The buffered reader to read the master index from - * @param numReaders The number of buffered readers - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -static int startRestoringMasterIndex_006(MasterIndex *masterIndex, - BufferedReader **bufferedReaders, - int numReaders) -{ - MasterIndex6 *mi6 = container_of(masterIndex, MasterIndex6, common); - int result = ASSERT_WITH_ERROR_CODE(masterIndex != NULL, UDS_BAD_STATE, - "cannot restore to null master index"); - if (result != UDS_SUCCESS) { - return result; - } - - int i; - for (i = 0; i < numReaders; i++) { - Buffer *buffer; - result = makeBuffer(sizeof(struct mi006_data), &buffer); - if (result != UDS_SUCCESS) { - return result; - } - result = readFromBufferedReader(bufferedReaders[i], - getBufferContents(buffer), - bufferLength(buffer)); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return logWarningWithStringError(result, - "failed to read master index header"); - } - result = resetBufferEnd(buffer, bufferLength(buffer)); - if (result != UDS_SUCCESS) { - freeBuffer(&buffer); - return result; - } - struct mi006_data header; - result = decodeMasterIndexHeader(buffer, &header); - freeBuffer(&buffer); - if (result != UDS_SUCCESS) { - return result; - } - if (memcmp(header.magic, MAGIC_MI_START, MAGIC_SIZE) != 0) { - return logWarningWithStringError(UDS_CORRUPT_COMPONENT, - "master index file had bad magic" - " number"); - } - if (i == 0) { - mi6->sparseSampleRate = header.sparseSampleRate; - } else if (mi6->sparseSampleRate != header.sparseSampleRate) { - logWarningWithStringError(UDS_CORRUPT_COMPONENT, - "Inconsistent sparse sample rate in delta" - " index zone files: %u vs. 
%u", - mi6->sparseSampleRate, - header.sparseSampleRate); - return UDS_CORRUPT_COMPONENT; - } - } - - result = startRestoringMasterIndex(mi6->miNonHook, bufferedReaders, - numReaders); - if (result != UDS_SUCCESS) { - return result; - } - return startRestoringMasterIndex(mi6->miHook, bufferedReaders, numReaders); -} - -/***********************************************************************/ -/** - * Have all the data been read while restoring a master index from an - * input stream? - * - * @param masterIndex The master index to restore into - * - * @return true if all the data are read - **/ -static bool isRestoringMasterIndexDone_006(const MasterIndex *masterIndex) -{ - const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, - common); - return (isRestoringMasterIndexDone(mi6->miNonHook) - && isRestoringMasterIndexDone(mi6->miHook)); -} - -/***********************************************************************/ -/** - * Restore a saved delta list - * - * @param masterIndex The master index to restore into - * @param dlsi The DeltaListSaveInfo describing the delta list - * @param data The saved delta list bit stream - * - * @return error code or UDS_SUCCESS - **/ -static int restoreDeltaListToMasterIndex_006(MasterIndex *masterIndex, - const DeltaListSaveInfo *dlsi, - const byte data[DELTA_LIST_MAX_BYTE_COUNT]) -{ - MasterIndex6 *mi6 = container_of(masterIndex, MasterIndex6, common); - int result = restoreDeltaListToMasterIndex(mi6->miNonHook, dlsi, data); - if (result != UDS_SUCCESS) { - result = restoreDeltaListToMasterIndex(mi6->miHook, dlsi, data); - } - return result; -} - -/***********************************************************************/ -/** - * Abort restoring a master index from an input stream. - * - * @param masterIndex The master index - **/ -static void abortRestoringMasterIndex_006(MasterIndex *masterIndex) -{ - MasterIndex6 *mi6 = container_of(masterIndex, MasterIndex6, common); - abortRestoringMasterIndex(mi6->miNonHook); - abortRestoringMasterIndex(mi6->miHook); -} - -/***********************************************************************/ -/** - * Set the open chapter number on a zone. The master index zone will be - * modified to index the proper number of chapters ending with the new open - * chapter. - * - * @param masterIndex The master index - * @param zoneNumber The zone number - * @param virtualChapter The new open chapter number - **/ -static void setMasterIndexZoneOpenChapter_006(MasterIndex *masterIndex, - unsigned int zoneNumber, - uint64_t virtualChapter) -{ - MasterIndex6 *mi6 = container_of(masterIndex, MasterIndex6, common); - setMasterIndexZoneOpenChapter(mi6->miNonHook, zoneNumber, virtualChapter); - - // We need to prevent a lookupMasterIndexName() happening while we are - // changing the open chapter number - Mutex *mutex = &mi6->masterZones[zoneNumber].hookMutex; - lockMutex(mutex); - setMasterIndexZoneOpenChapter(mi6->miHook, zoneNumber, virtualChapter); - unlockMutex(mutex); -} - -/***********************************************************************/ -/** - * Set the open chapter number. The master index will be modified to index - * the proper number of chapters ending with the new open chapter. 
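 *
 * Each zone is moved by setMasterIndexZoneOpenChapter_006(), which updates
 * the non-hook sub-index first and then the hook sub-index while holding
 * that zone's hookMutex, so that a concurrent lookupMasterIndexName() call
 * (which takes the same mutex) always sees a consistent sampled index.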
- * - * @param masterIndex The master index - * @param virtualChapter The new open chapter number - **/ -static void setMasterIndexOpenChapter_006(MasterIndex *masterIndex, - uint64_t virtualChapter) -{ - MasterIndex6 *mi6 = container_of(masterIndex, MasterIndex6, common); - unsigned int zone; - for (zone = 0; zone < mi6->numZones; zone++) { - setMasterIndexZoneOpenChapter_006(masterIndex, zone, virtualChapter); - } -} - -/***********************************************************************/ -/** - * Find the master index zone associated with a chunk name - * - * @param masterIndex The master index - * @param name The chunk name - * - * @return the zone that the chunk name belongs to - **/ -static unsigned int getMasterIndexZone_006(const MasterIndex *masterIndex, - const UdsChunkName *name) -{ - return getMasterIndexZone(getSubIndex(masterIndex, name), name); -} - -/***********************************************************************/ -/** - * Do a quick read-only lookup of the chunk name and return information - * needed by the index code to process the chunk name. - * - * @param masterIndex The master index - * @param name The chunk name - * @param triage Information about the chunk name - * - * @return UDS_SUCCESS or an error code - **/ -static int lookupMasterIndexName_006(const MasterIndex *masterIndex, - const UdsChunkName *name, - MasterIndexTriage *triage) -{ - const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, - common); - triage->isSample = isMasterIndexSample_006(masterIndex, name); - triage->inSampledChapter = false; - triage->zone = getMasterIndexZone_006(masterIndex, name); - int result = UDS_SUCCESS; - if (triage->isSample) { - Mutex *mutex = &mi6->masterZones[triage->zone].hookMutex; - lockMutex(mutex); - result = lookupMasterIndexSampledName(mi6->miHook, name, triage); - unlockMutex(mutex); - } - return result; -} - -/***********************************************************************/ -/** - * Do a quick read-only lookup of the sampled chunk name and return - * information needed by the index code to process the chunk name. - * - * @param masterIndex The master index - * @param name The chunk name - * @param triage Information about the chunk name. The zone and - * isSample fields are already filled in. Set - * inSampledChapter and virtualChapter if the chunk - * name is found in the index. - * - * @return UDS_SUCCESS or an error code - **/ -static int lookupMasterIndexSampledName_006(const MasterIndex *masterIndex - __attribute__((unused)), - const UdsChunkName *name - __attribute__((unused)), - MasterIndexTriage *triage - __attribute__((unused))) -{ - return ASSERT_WITH_ERROR_CODE(false, UDS_BAD_STATE, - "%s should not be called", __func__); -} - -/***********************************************************************/ -/** - * Find the master index record associated with a block name - * - * This is always the first routine to be called when dealing with a delta - * master index entry. The fields of the record parameter should be - * examined to determine the state of the record: - * - * If isFound is false, then we did not find an entry for the block - * name. Information is saved in the MasterIndexRecord so that - * putMasterIndexRecord() will insert an entry for that block name at - * the proper place. - * - * If isFound is true, then we did find an entry for the block name. - * Information is saved in the MasterIndexRecord so that the "chapter" - * and "isCollision" fields reflect the entry found. 
- * Calls to removeMasterIndexRecord() will remove the entry, calls to - * setMasterIndexRecordChapter() can modify the entry, and calls to - * putMasterIndexRecord() can insert a collision record with this - * entry. - * - * @param masterIndex The master index to search - * @param name The chunk name - * @param record Set to the info about the record searched for - * - * @return UDS_SUCCESS or an error code - **/ -static int getMasterIndexRecord_006(MasterIndex *masterIndex, - const UdsChunkName *name, - MasterIndexRecord *record) -{ - const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, - common); - int result; - if (isMasterIndexSample_006(masterIndex, name)) { - /* - * We need to prevent a lookupMasterIndexName() happening while we are - * finding the master index record. Remember that because of lazy LRU - * flushing of the master index, getMasterIndexRecord() is not a - * read-only operation. - */ - unsigned int zone = getMasterIndexZone(mi6->miHook, name); - Mutex *mutex = &mi6->masterZones[zone].hookMutex; - lockMutex(mutex); - result = getMasterIndexRecord(mi6->miHook, name, record); - unlockMutex(mutex); - // Remember the mutex so that other operations on the MasterIndexRecord - // can use it - record->mutex = mutex; - } else { - result = getMasterIndexRecord(mi6->miNonHook, name, record); - } - return result; -} - -/***********************************************************************/ -/** - * Get the number of bytes used for master index entries. - * - * @param masterIndex The master index - * - * @return The number of bytes in use - **/ -static size_t getMasterIndexMemoryUsed_006(const MasterIndex *masterIndex) -{ - const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, - common); - return (getMasterIndexMemoryUsed(mi6->miNonHook) - + getMasterIndexMemoryUsed(mi6->miHook)); -} - -/***********************************************************************/ -/** - * Return the master index stats. There is only one portion of the master - * index in this implementation, and we call it the dense portion of the - * index. 
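 * (That description is inherited from the dense-only masterIndex005
 * implementation; in this sparse+dense wrapper the dense stats are filled
 * in from the non-hook sub-index and the sparse stats from the hook
 * sub-index.)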
- * - * @param masterIndex The master index - * @param dense Stats for the dense portion of the index - * @param sparse Stats for the sparse portion of the index - **/ -static void getMasterIndexStats_006(const MasterIndex *masterIndex, - MasterIndexStats *dense, - MasterIndexStats *sparse) -{ - const MasterIndex6 *mi6 = const_container_of(masterIndex, MasterIndex6, - common); - MasterIndexStats dummyStats; - getMasterIndexStats(mi6->miNonHook, dense, &dummyStats); - getMasterIndexStats(mi6->miHook, sparse, &dummyStats); -} - -/***********************************************************************/ -typedef struct { - Configuration hookConfig; // Describe the hook part of the index - Geometry hookGeometry; - Configuration nonHookConfig; // Describe the non-hook part of the index - Geometry nonHookGeometry; -} SplitConfig; - -/***********************************************************************/ -static int splitConfiguration006(const Configuration *config, - SplitConfig *split) -{ - int result - = ASSERT_WITH_ERROR_CODE(config->geometry->sparseChaptersPerVolume != 0, - UDS_INVALID_ARGUMENT, - "cannot initialize sparse+dense master index" - " with no sparse chapters"); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_WITH_ERROR_CODE(config->sparseSampleRate != 0, - UDS_INVALID_ARGUMENT, - "cannot initialize sparse+dense master" - " index with a sparse sample rate of %u", - config->sparseSampleRate); - if (result != UDS_SUCCESS) { - return result; - } - - // Start with copies of the base configuration - split->hookConfig = *config; - split->hookGeometry = *config->geometry; - split->hookConfig.geometry = &split->hookGeometry; - split->nonHookConfig = *config; - split->nonHookGeometry = *config->geometry; - split->nonHookConfig.geometry = &split->nonHookGeometry; - - uint64_t sampleRate = config->sparseSampleRate; - uint64_t numChapters = config->geometry->chaptersPerVolume; - uint64_t numSparseChapters = config->geometry->sparseChaptersPerVolume; - uint64_t numDenseChapters = numChapters - numSparseChapters; - uint64_t sampleRecords = config->geometry->recordsPerChapter / sampleRate; - - // Adjust the number of records indexed for each chapter - split->hookGeometry.recordsPerChapter = sampleRecords; - split->nonHookGeometry.recordsPerChapter -= sampleRecords; - - // Adjust the number of chapters indexed - split->hookGeometry.sparseChaptersPerVolume = 0; - split->nonHookGeometry.sparseChaptersPerVolume = 0; - split->nonHookGeometry.chaptersPerVolume = numDenseChapters; - return UDS_SUCCESS; -} - -/***********************************************************************/ -int computeMasterIndexSaveBytes006(const Configuration *config, - size_t *numBytes) -{ - SplitConfig split; - int result = splitConfiguration006(config, &split); - if (result != UDS_SUCCESS) { - return result; - } - size_t hookBytes, nonHookBytes; - result = computeMasterIndexSaveBytes005(&split.hookConfig, &hookBytes); - if (result != UDS_SUCCESS) { - return result; - } - result = computeMasterIndexSaveBytes005(&split.nonHookConfig, &nonHookBytes); - if (result != UDS_SUCCESS) { - return result; - } - // Saving a MasterIndex006 needs a header plus the hook index plus the - // non-hook index - *numBytes = sizeof(struct mi006_data) + hookBytes + nonHookBytes; - return UDS_SUCCESS; -} - -/***********************************************************************/ -int makeMasterIndex006(const Configuration *config, unsigned int numZones, - uint64_t volumeNonce, MasterIndex **masterIndex) -{ - SplitConfig 
split; - int result = splitConfiguration006(config, &split); - if (result != UDS_SUCCESS) { - return result; - } - - MasterIndex6 *mi6; - result = ALLOCATE(1, MasterIndex6, "master index", &mi6); - if (result != UDS_SUCCESS) { - return result; - } - - mi6->common.abortRestoringMasterIndex = abortRestoringMasterIndex_006; - mi6->common.abortSavingMasterIndex = abortSavingMasterIndex_006; - mi6->common.finishSavingMasterIndex = finishSavingMasterIndex_006; - mi6->common.freeMasterIndex = freeMasterIndex_006; - mi6->common.getMasterIndexMemoryUsed = getMasterIndexMemoryUsed_006; - mi6->common.getMasterIndexRecord = getMasterIndexRecord_006; - mi6->common.getMasterIndexStats = getMasterIndexStats_006; - mi6->common.getMasterIndexZone = getMasterIndexZone_006; - mi6->common.isMasterIndexSample = isMasterIndexSample_006; - mi6->common.isRestoringMasterIndexDone = isRestoringMasterIndexDone_006; - mi6->common.isSavingMasterIndexDone = isSavingMasterIndexDone_006; - mi6->common.lookupMasterIndexName = lookupMasterIndexName_006; - mi6->common.lookupMasterIndexSampledName = lookupMasterIndexSampledName_006; - mi6->common.restoreDeltaListToMasterIndex = restoreDeltaListToMasterIndex_006; - mi6->common.setMasterIndexOpenChapter = setMasterIndexOpenChapter_006; - mi6->common.setMasterIndexTag = setMasterIndexTag_006; - mi6->common.setMasterIndexZoneOpenChapter = setMasterIndexZoneOpenChapter_006; - mi6->common.startRestoringMasterIndex = startRestoringMasterIndex_006; - mi6->common.startSavingMasterIndex = startSavingMasterIndex_006; - - mi6->numZones = numZones; - mi6->sparseSampleRate = config->sparseSampleRate; - - result = ALLOCATE(numZones, MasterIndexZone, "master index zones", - &mi6->masterZones); - unsigned int zone; - for (zone = 0; zone < numZones; zone++) { - if (result == UDS_SUCCESS) { - result = initMutex(&mi6->masterZones[zone].hookMutex); - } - } - if (result != UDS_SUCCESS) { - freeMasterIndex_006(&mi6->common); - return result; - } - - result = makeMasterIndex005(&split.nonHookConfig, numZones, volumeNonce, - &mi6->miNonHook); - if (result != UDS_SUCCESS) { - freeMasterIndex_006(&mi6->common); - return logErrorWithStringError(result, - "Error creating non hook master index"); - } - setMasterIndexTag(mi6->miNonHook, 'd'); - - result = makeMasterIndex005(&split.hookConfig, numZones, volumeNonce, - &mi6->miHook); - if (result != UDS_SUCCESS) { - freeMasterIndex_006(&mi6->common); - return logErrorWithStringError(result, - "Error creating hook master index"); - } - setMasterIndexTag(mi6->miHook, 's'); - - *masterIndex = &mi6->common; - return UDS_SUCCESS; -} diff --git a/uds/masterIndex006.h b/uds/masterIndex006.h deleted file mode 100644 index 1d3b377..0000000 --- a/uds/masterIndex006.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/jasper/src/uds/masterIndex006.h#1 $ - */ - -#ifndef MASTERINDEX006_H -#define MASTERINDEX006_H 1 - -#include "masterIndexOps.h" - -/** - * Make a new master index. - * - * @param config The configuration of the master index - * @param numZones The number of zones - * @param volumeNonce The nonce used to authenticate the index - * @param masterIndex Location to hold new master index ptr - * - * @return error code or UDS_SUCCESS - **/ -int makeMasterIndex006(const Configuration *config, unsigned int numZones, - uint64_t volumeNonce, MasterIndex **masterIndex) - __attribute__((warn_unused_result)); - -/** - * Compute the number of bytes required to save a master index of a given - * configuration. - * - * @param config The configuration of the master index - * @param numBytes The number of bytes required to save the master index - * - * @return UDS_SUCCESS or an error code. - **/ -int computeMasterIndexSaveBytes006(const Configuration *config, - size_t *numBytes) - __attribute__((warn_unused_result)); - -#endif /* MASTERINDEX006_H */ diff --git a/uds/masterIndexOps.c b/uds/masterIndexOps.c deleted file mode 100644 index 1cbd10b..0000000 --- a/uds/masterIndexOps.c +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/jasper/src/uds/masterIndexOps.c#4 $ - */ -#include "masterIndexOps.h" - -#include "compiler.h" -#include "errors.h" -#include "indexComponent.h" -#include "logger.h" -#include "masterIndex005.h" -#include "masterIndex006.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "uds.h" -#include "zone.h" - -/**********************************************************************/ -static INLINE bool usesSparse(const Configuration *config) -{ - return config->geometry->sparseChaptersPerVolume > 0; -} - -/**********************************************************************/ -void getMasterIndexCombinedStats(const MasterIndex *masterIndex, - MasterIndexStats *stats) -{ - MasterIndexStats dense, sparse; - getMasterIndexStats(masterIndex, &dense, &sparse); - stats->memoryAllocated = dense.memoryAllocated + sparse.memoryAllocated; - stats->rebalanceTime = dense.rebalanceTime + sparse.rebalanceTime; - stats->rebalanceCount = dense.rebalanceCount + sparse.rebalanceCount; - stats->recordCount = dense.recordCount + sparse.recordCount; - stats->collisionCount = dense.collisionCount + sparse.collisionCount; - stats->discardCount = dense.discardCount + sparse.discardCount; - stats->overflowCount = dense.overflowCount + sparse.overflowCount; - stats->numLists = dense.numLists + sparse.numLists; - stats->earlyFlushes = dense.earlyFlushes + sparse.earlyFlushes; -} - -/**********************************************************************/ -int makeMasterIndex(const Configuration *config, unsigned int numZones, - uint64_t volumeNonce, MasterIndex **masterIndex) -{ - if (usesSparse(config)) { - return makeMasterIndex006(config, numZones, volumeNonce, masterIndex); - } else { - return makeMasterIndex005(config, numZones, volumeNonce, masterIndex); - } -} - -/**********************************************************************/ -int computeMasterIndexSaveBlocks(const Configuration *config, - size_t blockSize, uint64_t *blockCount) -{ - size_t numBytes; - int result = (usesSparse(config) - ? 
computeMasterIndexSaveBytes006(config, &numBytes) - : computeMasterIndexSaveBytes005(config, &numBytes)); - if (result != UDS_SUCCESS) { - return result; - } - numBytes += sizeof(DeltaListSaveInfo); - *blockCount = (numBytes + blockSize - 1) / blockSize + MAX_ZONES; - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int readMasterIndex(ReadPortal *portal) -{ - MasterIndex *masterIndex = indexComponentContext(portal->component); - unsigned int numZones = portal->zones; - if (numZones > MAX_ZONES) { - return logErrorWithStringError(UDS_BAD_STATE, - "zone count %u must not exceed MAX_ZONES", - numZones); - } - - BufferedReader *readers[MAX_ZONES]; - unsigned int z; - for (z = 0; z < numZones; ++z) { - int result = getBufferedReaderForPortal(portal, z, &readers[z]); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, - "cannot read component for zone %u", z); - } - } - return restoreMasterIndex(readers, numZones, masterIndex); -} - -/**********************************************************************/ -static int writeMasterIndex(IndexComponent *component, - BufferedWriter *writer, - unsigned int zone, - IncrementalWriterCommand command, - bool *completed) -{ - MasterIndex *masterIndex = indexComponentContext(component); - bool isComplete = false; - - int result = UDS_SUCCESS; - - switch (command) { - case IWC_START: - result = startSavingMasterIndex(masterIndex, zone, writer); - isComplete = result != UDS_SUCCESS; - break; - case IWC_CONTINUE: - isComplete = isSavingMasterIndexDone(masterIndex, zone); - break; - case IWC_FINISH: - result = finishSavingMasterIndex(masterIndex, zone); - if (result == UDS_SUCCESS) { - result = writeGuardDeltaList(writer); - } - isComplete = true; - break; - case IWC_ABORT: - result = abortSavingMasterIndex(masterIndex, zone); - isComplete = true; - break; - default: - result = logWarningWithStringError(UDS_INVALID_ARGUMENT, - "Invalid writer command"); - break; - } - if (completed != NULL) { - *completed = isComplete; - } - return result; -} - -/**********************************************************************/ - -static const IndexComponentInfo MASTER_INDEX_INFO_DATA = { - .kind = RL_KIND_MASTER_INDEX, - .name = "master index", - .saveOnly = false, - .chapterSync = false, - .multiZone = true, - .ioStorage = true, - .loader = readMasterIndex, - .saver = NULL, - .incremental = writeMasterIndex, -}; -const IndexComponentInfo *const MASTER_INDEX_INFO = &MASTER_INDEX_INFO_DATA; - -/**********************************************************************/ -static int restoreMasterIndexBody(BufferedReader **bufferedReaders, - unsigned int numReaders, - MasterIndex *masterIndex, - byte dlData[DELTA_LIST_MAX_BYTE_COUNT]) -{ - // Start by reading the "header" section of the stream - int result = startRestoringMasterIndex(masterIndex, bufferedReaders, - numReaders); - if (result != UDS_SUCCESS) { - return result; - } - // Loop to read the delta lists, stopping when they have all been processed. 
- unsigned int z; - for (z = 0; z < numReaders; z++) { - for (;;) { - DeltaListSaveInfo dlsi; - result = readSavedDeltaList(&dlsi, dlData, bufferedReaders[z]); - if (result == UDS_END_OF_FILE) { - break; - } else if (result != UDS_SUCCESS) { - abortRestoringMasterIndex(masterIndex); - return result; - } - result = restoreDeltaListToMasterIndex(masterIndex, &dlsi, dlData); - if (result != UDS_SUCCESS) { - abortRestoringMasterIndex(masterIndex); - return result; - } - } - } - if (!isRestoringMasterIndexDone(masterIndex)) { - abortRestoringMasterIndex(masterIndex); - return logWarningWithStringError(UDS_CORRUPT_COMPONENT, - "incomplete delta list data"); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int restoreMasterIndex(BufferedReader **bufferedReaders, - unsigned int numReaders, - MasterIndex *masterIndex) -{ - byte *dlData; - int result = ALLOCATE(DELTA_LIST_MAX_BYTE_COUNT, byte, __func__, &dlData); - if (result != UDS_SUCCESS) { - return result; - } - result = restoreMasterIndexBody(bufferedReaders, numReaders, masterIndex, - dlData); - FREE(dlData); - return result; -} diff --git a/uds/masterIndexOps.h b/uds/masterIndexOps.h deleted file mode 100644 index 90802ac..0000000 --- a/uds/masterIndexOps.h +++ /dev/null @@ -1,527 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/masterIndexOps.h#1 $ - */ - -#ifndef MASTERINDEXOPS_H -#define MASTERINDEXOPS_H 1 - -#include "compiler.h" -#include "deltaIndex.h" -#include "indexComponent.h" -#include "indexConfig.h" -#include "threads.h" -#include "uds.h" - -extern const IndexComponentInfo *const MASTER_INDEX_INFO; -extern unsigned int minMasterIndexDeltaLists; - -typedef struct masterIndex MasterIndex; - -typedef struct { - size_t memoryAllocated; // Number of bytes allocated - RelTime rebalanceTime; // The number of seconds spent rebalancing - int rebalanceCount; // Number of memory rebalances - long recordCount; // The number of records in the index - long collisionCount; // The number of collision records - long discardCount; // The number of records removed - long overflowCount; // The number of UDS_OVERFLOWs detected - unsigned int numLists; // The number of delta lists - long earlyFlushes; // Number of early flushes -} MasterIndexStats; - -/* - * The MasterIndexTriage structure is used by lookupMasterIndexName(), - * which is a read-only operation that looks at the chunk name and returns - * some information used by the index to select the thread/queue/code_path - * that will process the chunk. 
- */ -typedef struct { - uint64_t virtualChapter; // If inSampledChapter is true, then this is the - // chapter containing the entry for the chunk name - unsigned int zone; // The zone containing the chunk name - bool isSample; // If true, this chunk name belongs to the - // sampled index - bool inSampledChapter; // If true, this chunk already has an entry in the - // sampled index and virtualChapter is valid -} MasterIndexTriage; - -/* - * The MasterIndexRecord structure is used for normal index read-write - * processing of a chunk name. The first call must be to - * getMasterIndexRecord() to find the master index record for a chunk name. - * This call can be followed by putMasterIndexRecord() to add a master - * index record, or by setMasterIndexRecordChapter() to associate the chunk - * name with a different chapter, or by removeMasterIndexRecord() to delete - * a master index record. - */ -typedef struct { - // Public fields - uint64_t virtualChapter; // Chapter where the block info is found - bool isCollision; // This record is a collision - bool isFound; // This record is the block searched for - - // Private fields - unsigned char magic; // The magic number for valid records - unsigned int zoneNumber; // Zone that contains this block - MasterIndex *masterIndex; // The master index - Mutex *mutex; // Mutex that must be held while accessing - // this delta index entry; used only for - // a sampled index; otherwise is NULL - const UdsChunkName *name; // The blockname to which this record refers - DeltaIndexEntry deltaEntry; // The delta index entry for this record -} MasterIndexRecord; - -struct masterIndex { - void (*abortRestoringMasterIndex)(MasterIndex *masterIndex); - int (*abortSavingMasterIndex)(const MasterIndex *masterIndex, - unsigned int zoneNumber); - int (*finishSavingMasterIndex)(const MasterIndex *masterIndex, - unsigned int zoneNumber); - void (*freeMasterIndex)(MasterIndex *masterIndex); - size_t (*getMasterIndexMemoryUsed)(const MasterIndex *masterIndex); - int (*getMasterIndexRecord)(MasterIndex *masterIndex, - const UdsChunkName *name, - MasterIndexRecord *record); - void (*getMasterIndexStats)(const MasterIndex *masterIndex, - MasterIndexStats *dense, - MasterIndexStats *sparse); - unsigned int (*getMasterIndexZone)(const MasterIndex *masterIndex, - const UdsChunkName *name); - bool (*isMasterIndexSample)(const MasterIndex *masterIndex, - const UdsChunkName *name); - bool (*isRestoringMasterIndexDone)(const MasterIndex *masterIndex); - bool (*isSavingMasterIndexDone)(const MasterIndex *masterIndex, - unsigned int zoneNumber); - int (*lookupMasterIndexName)(const MasterIndex *masterIndex, - const UdsChunkName *name, - MasterIndexTriage *triage); - int (*lookupMasterIndexSampledName)(const MasterIndex *masterIndex, - const UdsChunkName *name, - MasterIndexTriage *triage); - int (*restoreDeltaListToMasterIndex)(MasterIndex *masterIndex, - const DeltaListSaveInfo *dlsi, - const byte data[DELTA_LIST_MAX_BYTE_COUNT]); - void (*setMasterIndexOpenChapter)(MasterIndex *masterIndex, - uint64_t virtualChapter); - void (*setMasterIndexTag)(MasterIndex *masterIndex, byte tag); - void (*setMasterIndexZoneOpenChapter)(MasterIndex *masterIndex, - unsigned int zoneNumber, - uint64_t virtualChapter); - int (*startRestoringMasterIndex)(MasterIndex *masterIndex, - BufferedReader **bufferedReaders, - int numReaders); - int (*startSavingMasterIndex)(const MasterIndex *masterIndex, - unsigned int zoneNumber, - BufferedWriter *bufferedWriter); -}; - -/** - * Return the combined master 
index stats. - * - * @param masterIndex The master index - * @param stats Combined stats for the index - **/ -void getMasterIndexCombinedStats(const MasterIndex *masterIndex, - MasterIndexStats *stats); - -/** - * Make a new master index. - * - * @param config The configuration of the master index - * @param numZones The number of zones - * @param volumeNonce The nonce used to store the index - * @param masterIndex Location to hold new master index ptr - * - * @return error code or UDS_SUCCESS - **/ -int makeMasterIndex(const Configuration *config, unsigned int numZones, - uint64_t volumeNonce, MasterIndex **masterIndex) - __attribute__((warn_unused_result)); - -/** - * Compute the number of blocks required to save a master index of a given - * configuration. - * - * @param [in] config The configuration of a master index - * @param [in] blockSize The size of a block in bytes. - * @param [out] blockCount The resulting number of blocks. - * - * @return UDS_SUCCESS or an error code. - **/ -int computeMasterIndexSaveBlocks(const Configuration *config, - size_t blockSize, - uint64_t *blockCount) - __attribute__((warn_unused_result)); - -/** - * Restore a master index. This is exposed for unit tests. - * - * @param readers The readers to read from. - * @param numReaders The number of readers. - * @param masterIndex The master index - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -int restoreMasterIndex(BufferedReader **readers, - unsigned int numReaders, - MasterIndex *masterIndex) - __attribute__((warn_unused_result)); - -/** - * Abort restoring a master index from an input stream. - * - * @param masterIndex The master index - **/ -static INLINE void abortRestoringMasterIndex(MasterIndex *masterIndex) -{ - masterIndex->abortRestoringMasterIndex(masterIndex); -} - -/** - * Abort saving a master index to an output stream. If an error occurred - * asynchronously during the save operation, it will be dropped. - * - * @param masterIndex The master index - * @param zoneNumber The number of the zone to save - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -static INLINE int abortSavingMasterIndex(const MasterIndex *masterIndex, - unsigned int zoneNumber) -{ - return masterIndex->abortSavingMasterIndex(masterIndex, zoneNumber); -} - -/** - * Finish saving a master index to an output stream. Force the writing of - * all of the remaining data. If an error occurred asynchronously during - * the save operation, it will be returned here. - * - * @param masterIndex The master index - * @param zoneNumber The number of the zone to save - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -static INLINE int finishSavingMasterIndex(const MasterIndex *masterIndex, - unsigned int zoneNumber) -{ - return masterIndex->finishSavingMasterIndex(masterIndex, zoneNumber); -} - -/** - * Terminate and clean up the master index - * - * @param masterIndex The master index to terminate - **/ -static INLINE void freeMasterIndex(MasterIndex *masterIndex) -{ - masterIndex->freeMasterIndex(masterIndex); -} - -/** - * Get the number of bytes used for master index entries. 
- * - * @param masterIndex The master index - * - * @return The number of bytes in use - **/ -static INLINE size_t getMasterIndexMemoryUsed(const MasterIndex *masterIndex) -{ - return masterIndex->getMasterIndexMemoryUsed(masterIndex); -} - -/** - * Find the master index record associated with a block name - * - * This is always the first routine to be called when dealing with a delta - * master index entry. The fields of the record parameter should be - * examined to determine the state of the record: - * - * If isFound is false, then we did not find an entry for the block name. - * Information is saved in the MasterIndexRecord so that - * putMasterIndexRecord() will insert an entry for that block name at the - * proper place. - * - * If isFound is true, then we did find an entry for the block name. - * Information is saved in the MasterIndexRecord so that the "chapter" and - * "isCollision" fields reflect the entry found. Calls to - * removeMasterIndexRecord() will remove the entry, calls to - * setMasterIndexRecordChapter() can modify the entry, and calls to - * putMasterIndexRecord() can insert a collision record with this entry. - * - * @param masterIndex The master index to search - * @param name The chunk name - * @param record Set to the info about the record searched for - * - * @return UDS_SUCCESS or an error code - **/ -static INLINE int getMasterIndexRecord(MasterIndex *masterIndex, - const UdsChunkName *name, - MasterIndexRecord *record) -{ - return masterIndex->getMasterIndexRecord(masterIndex, name, record); -} - -/** - * Return the master index stats. - * - * @param masterIndex The master index - * @param dense Stats for the dense portion of the index - * @param sparse Stats for the sparse portion of the index - **/ -static INLINE void getMasterIndexStats(const MasterIndex *masterIndex, - MasterIndexStats *dense, - MasterIndexStats *sparse) -{ - masterIndex->getMasterIndexStats(masterIndex, dense, sparse); -} - -/** - * Find the master index zone associated with a chunk name - * - * @param masterIndex The master index - * @param name The chunk name - * - * @return the zone that the chunk name belongs to - **/ -static INLINE unsigned int getMasterIndexZone(const MasterIndex *masterIndex, - const UdsChunkName *name) -{ - return masterIndex->getMasterIndexZone(masterIndex, name); -} - -/** - * Determine whether a given chunk name is a hook. - * - * @param masterIndex The master index - * @param name The block name - * - * @return whether to use as sample - **/ -static INLINE bool isMasterIndexSample(const MasterIndex *masterIndex, - const UdsChunkName *name) -{ - return masterIndex->isMasterIndexSample(masterIndex, name); -} - -/** - * Have all the data been read while restoring a master index from an input - * stream? - * - * @param masterIndex The master index to restore into - * - * @return true if all the data are read - **/ -static INLINE bool isRestoringMasterIndexDone(const MasterIndex *masterIndex) -{ - return masterIndex->isRestoringMasterIndexDone(masterIndex); -} - -/** - * Have all the data been written while saving a master index to an - * output stream? If the answer is yes, it is still necessary to call - * finishSavingMasterIndex(), which will return quickly. 
- * - * @param masterIndex The master index - * @param zoneNumber The number of the zone to save - * - * @return true if all the data are written - **/ -static INLINE bool isSavingMasterIndexDone(const MasterIndex *masterIndex, - unsigned int zoneNumber) -{ - return masterIndex->isSavingMasterIndexDone(masterIndex, zoneNumber); -} - -/** - * Do a quick read-only lookup of the chunk name and return information - * needed by the index code to process the chunk name. - * - * @param masterIndex The master index - * @param name The chunk name - * @param triage Information about the chunk name - * - * @return UDS_SUCCESS or an error code - **/ -static INLINE int lookupMasterIndexName(const MasterIndex *masterIndex, - const UdsChunkName *name, - MasterIndexTriage *triage) -{ - return masterIndex->lookupMasterIndexName(masterIndex, name, triage); -} - -/** - * Do a quick read-only lookup of the sampled chunk name and return - * information needed by the index code to process the chunk name. - * - * @param masterIndex The master index - * @param name The chunk name - * @param triage Information about the chunk name. The zone and - * isSample fields are already filled in. Set - * inSampledChapter and virtualChapter if the chunk - * name is found in the index. - * - * @return UDS_SUCCESS or an error code - **/ -static INLINE int lookupMasterIndexSampledName(const MasterIndex *masterIndex, - const UdsChunkName *name, - MasterIndexTriage *triage) -{ - return masterIndex->lookupMasterIndexSampledName(masterIndex, name, triage); -} - -/** - * Create a new record associated with a block name. - * - * @param record The master index record found by getRecord() - * @param virtualChapter The chapter number where block info is found - * - * @return UDS_SUCCESS or an error code - **/ -int putMasterIndexRecord(MasterIndexRecord *record, uint64_t virtualChapter) - __attribute__((warn_unused_result)); - -/** - * Remove an existing record. - * - * @param record The master index record found by getRecord() - * - * @return UDS_SUCCESS or an error code - **/ -int removeMasterIndexRecord(MasterIndexRecord *record) - __attribute__((warn_unused_result)); - -/** - * Restore a saved delta list - * - * @param masterIndex The master index to restore into - * @param dlsi The DeltaListSaveInfo describing the delta list - * @param data The saved delta list bit stream - * - * @return error code or UDS_SUCCESS - **/ -static INLINE int restoreDeltaListToMasterIndex(MasterIndex *masterIndex, - const DeltaListSaveInfo *dlsi, - const byte data[DELTA_LIST_MAX_BYTE_COUNT]) -{ - return masterIndex->restoreDeltaListToMasterIndex(masterIndex, dlsi, data); -} - -/** - * Set the open chapter number. The master index will be modified to index - * the proper number of chapters ending with the new open chapter. - * - * In normal operation, the virtual chapter number will be the next chapter - * following the currently open chapter. We will advance the master index - * one chapter forward in the virtual chapter space, invalidating the - * oldest chapter in the index and be prepared to add index entries for the - * newly opened chapter. - * - * In abnormal operation we make a potentially large change to the range of - * chapters being indexed. This happens when we are replaying chapters or - * rebuilding an entire index. If we move the open chapter forward, we - * will invalidate many chapters (potentially the entire index). 
If we - * move the open chapter backward, we invalidate any entry in the newly - * open chapter and any higher numbered chapter (potentially the entire - * index). - * - * @param masterIndex The master index - * @param virtualChapter The new open chapter number - **/ -static INLINE void setMasterIndexOpenChapter(MasterIndex *masterIndex, - uint64_t virtualChapter) -{ - masterIndex->setMasterIndexOpenChapter(masterIndex, virtualChapter); -} - -/** - * Set the chapter number associated with a block name. - * - * @param record The master index record found by getRecord() - * @param virtualChapter The chapter number where block info is now found. - * - * @return UDS_SUCCESS or an error code - **/ -int setMasterIndexRecordChapter(MasterIndexRecord *record, uint64_t chapter) - __attribute__((warn_unused_result)); - -/** - * Set the tag value used when saving and/or restoring a master index. - * - * @param masterIndex The master index - * @param tag The tag value - **/ -static INLINE void setMasterIndexTag(MasterIndex *masterIndex, byte tag) -{ - masterIndex->setMasterIndexTag(masterIndex, tag); -} - -/** - * Set the open chapter number on a zone. The master index zone will be - * modified to index the proper number of chapters ending with the new open - * chapter. - * - * @param masterIndex The master index - * @param zoneNumber The zone number - * @param virtualChapter The new open chapter number - **/ -static INLINE void setMasterIndexZoneOpenChapter(MasterIndex *masterIndex, - unsigned int zoneNumber, - uint64_t virtualChapter) -{ - masterIndex->setMasterIndexZoneOpenChapter(masterIndex, zoneNumber, - virtualChapter); -} - -/** - * Start restoring the master index from multiple buffered readers - * - * @param masterIndex The master index to restore into - * @param bufferedReaders The buffered reader to read the master index from - * @param numReaders The number of buffered readers - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -static INLINE int startRestoringMasterIndex(MasterIndex *masterIndex, - BufferedReader **bufferedReaders, - int numReaders) -{ - return masterIndex->startRestoringMasterIndex(masterIndex, bufferedReaders, - numReaders); -} - -/** - * Start saving a master index to a buffered output stream. - * - * @param masterIndex The master index - * @param zoneNumber The number of the zone to save - * @param bufferedWriter The index state component being written - * - * @return UDS_SUCCESS on success, or an error code on failure - **/ -static INLINE int startSavingMasterIndex(const MasterIndex *masterIndex, - unsigned int zoneNumber, - BufferedWriter *bufferedWriter) -{ - return masterIndex->startSavingMasterIndex(masterIndex, zoneNumber, - bufferedWriter); -} - -#endif /* MASTERINDEXOPS_H */ diff --git a/uds/memoryAlloc.c b/uds/memoryAlloc.c deleted file mode 100644 index e47494c..0000000 --- a/uds/memoryAlloc.c +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/memoryAlloc.c#1 $ - */ - -#include "memoryAlloc.h" - -#include "stringUtils.h" - -/**********************************************************************/ -int duplicateString(const char *string, const char *what, char **newString) -{ - return memdup(string, strlen(string) + 1, what, newString); -} - -/**********************************************************************/ -int memdup(const void *buffer, size_t size, const char *what, void *dupPtr) -{ - byte *dup; - int result = ALLOCATE(size, byte, what, &dup); - if (result != UDS_SUCCESS) { - return result; - } - - memcpy(dup, buffer, size); - *((void **) dupPtr) = dup; - return UDS_SUCCESS; -} diff --git a/uds/memoryAlloc.h b/uds/memoryAlloc.h deleted file mode 100644 index c669e2b..0000000 --- a/uds/memoryAlloc.h +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/memoryAlloc.h#2 $ - */ - -#ifndef MEMORY_ALLOC_H -#define MEMORY_ALLOC_H 1 - -#include - -#include "compiler.h" -#include "cpu.h" -#include "memoryDefs.h" -#include "permassert.h" - -/** - * Allocate storage based on memory size and alignment, logging an error if - * the allocation fails. The memory will be zeroed. - * - * @param size The size of an object - * @param align The required alignment - * @param what What is being allocated (for error logging) - * @param ptr A pointer to hold the allocated memory - * - * @return UDS_SUCCESS or an error code - **/ -int allocateMemory(size_t size, size_t align, const char *what, void *ptr) - __attribute__((warn_unused_result)); - -/** - * Free storage - * - * @param ptr The memory to be freed - **/ -void freeMemory(void *ptr); - -/** - * Allocate storage based on element counts, sizes, and alignment. - * - * This is a generalized form of our allocation use case: It allocates - * an array of objects, optionally preceded by one object of another - * type (i.e., a struct with trailing variable-length array), with the - * alignment indicated. - * - * Why is this inline? The sizes and alignment will always be - * constant, when invoked through the macros below, and often the - * count will be a compile-time constant 1 or the number of extra - * bytes will be a compile-time constant 0. So at least some of the - * arithmetic can usually be optimized away, and the run-time - * selection between allocation functions always can. In many cases, - * it'll boil down to just a function call with a constant size. 
- * - * @param count The number of objects to allocate - * @param size The size of an object - * @param extra The number of additional bytes to allocate - * @param align The required alignment - * @param what What is being allocated (for error logging) - * @param ptr A pointer to hold the allocated memory - * - * @return UDS_SUCCESS or an error code - **/ -static INLINE int doAllocation(size_t count, - size_t size, - size_t extra, - size_t align, - const char *what, - void *ptr) -{ - size_t totalSize = count * size + extra; - // Overflow check: - if ((size > 0) && (count > ((SIZE_MAX - extra) / size))) { - /* - * This is kind of a hack: We rely on the fact that SIZE_MAX would - * cover the entire address space (minus one byte) and thus the - * system can never allocate that much and the call will always - * fail. So we can report an overflow as "out of memory" by asking - * for "merely" SIZE_MAX bytes. - */ - totalSize = SIZE_MAX; - } - - return allocateMemory(totalSize, align, what, ptr); -} - -/** - * Reallocate dynamically allocated memory. There are no alignment guarantees - * for the reallocated memory. - * - * @param ptr The memory to reallocate. - * @param oldSize The old size of the memory - * @param size The new size to allocate - * @param what What is being allocated (for error logging) - * @param newPtr A pointer to hold the reallocated pointer - * - * @return UDS_SUCCESS or an error code - **/ -int reallocateMemory(void *ptr, - size_t oldSize, - size_t size, - const char *what, - void *newPtr) - __attribute__((warn_unused_result)); - -/** - * Allocate one or more elements of the indicated type, logging an - * error if the allocation fails. The memory will be zeroed. - * - * @param COUNT The number of objects to allocate - * @param TYPE The type of objects to allocate. This type determines the - * alignment of the allocated memory. - * @param WHAT What is being allocated (for error logging) - * @param PTR A pointer to hold the allocated memory - * - * @return UDS_SUCCESS or an error code - **/ -#define ALLOCATE(COUNT, TYPE, WHAT, PTR) \ - doAllocation(COUNT, sizeof(TYPE), 0, __alignof__(TYPE), WHAT, PTR) - -/** - * Allocate one object of an indicated type, followed by one or more - * elements of a second type, logging an error if the allocation - * fails. The memory will be zeroed. - * - * @param TYPE1 The type of the primary object to allocate. This type - * determines the alignment of the allocated memory. - * @param COUNT The number of objects to allocate - * @param TYPE2 The type of array objects to allocate - * @param WHAT What is being allocated (for error logging) - * @param PTR A pointer to hold the allocated memory - * - * @return UDS_SUCCESS or an error code - **/ -#define ALLOCATE_EXTENDED(TYPE1, COUNT, TYPE2, WHAT, PTR) \ - __extension__ ({ \ - TYPE1 **_ptr = (PTR); \ - STATIC_ASSERT(__alignof__(TYPE1) >= __alignof__(TYPE2)); \ - int _result = doAllocation(COUNT, sizeof(TYPE2), sizeof(TYPE1), \ - __alignof__(TYPE1), WHAT, _ptr); \ - _result; \ - }) - -/** - * Free memory allocated with ALLOCATE(). - * - * @param ptr Pointer to the memory to free - **/ -static INLINE void FREE(void *ptr) -{ - freeMemory(ptr); -} - -/** - * Allocate memory starting on a cache line boundary, logging an error if the - * allocation fails. The memory will be zeroed. 
- * - * @param size The number of bytes to allocate - * @param what What is being allocated (for error logging) - * @param ptr A pointer to hold the allocated memory - * - * @return UDS_SUCCESS or an error code - **/ -__attribute__((warn_unused_result)) -static INLINE int allocateCacheAligned(size_t size, - const char *what, - void *ptr) -{ - return allocateMemory(size, CACHE_LINE_BYTES, what, ptr); -} - -/** - * Duplicate a string. - * - * @param string The string to duplicate - * @param what What is being allocated (for error logging) - * @param newString A pointer to hold the duplicated string - * - * @return UDS_SUCCESS or an error code - **/ -int duplicateString(const char *string, const char *what, char **newString) - __attribute__((warn_unused_result)); - -/** - * Duplicate a buffer, logging an error if the allocation fails. - * - * @param ptr The buffer to copy - * @param size The size of the buffer - * @param what What is being duplicated (for error logging) - * @param dupPtr A pointer to hold the allocated array - * - * @return UDS_SUCCESS or ENOMEM - **/ -int memdup(const void *ptr, size_t size, const char *what, void *dupPtr) - __attribute__((warn_unused_result)); - -/** - * Wrapper which permits freeing a const pointer. - * - * @param pointer the pointer to be freed - **/ -static INLINE void freeConst(const void *pointer) -{ - union { - const void *constP; - void *notConst; - } u = { .constP = pointer }; - FREE(u.notConst); -} - -/** - * Wrapper which permits freeing a volatile pointer. - * - * @param pointer the pointer to be freed - **/ -static INLINE void freeVolatile(volatile void *pointer) -{ - union { - volatile void *volP; - void *notVol; - } u = { .volP = pointer }; - FREE(u.notVol); -} - -#endif /* MEMORY_ALLOC_H */ diff --git a/uds/memoryDefs.h b/uds/memoryDefs.h deleted file mode 100644 index 3f8041e..0000000 --- a/uds/memoryDefs.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/kernelLinux/uds/memoryDefs.h#2 $ - */ - -#ifndef LINUX_KERNEL_MEMORY_DEFS_H -#define LINUX_KERNEL_MEMORY_DEFS_H 1 - -#include // for PAGE_SIZE - -#include "compiler.h" -#include "threadRegistry.h" -#include "typeDefs.h" - -/** - * Allocate one or more elements of the indicated type, aligning them - * on the boundary that will allow them to be used in io, logging an - * error if the allocation fails. The memory will be zeroed. 
- * - * @param COUNT The number of objects to allocate - * @param TYPE The type of objects to allocate - * @param WHAT What is being allocated (for error logging) - * @param PTR A pointer to hold the allocated memory - * - * @return UDS_SUCCESS or an error code - **/ -#define ALLOCATE_IO_ALIGNED(COUNT, TYPE, WHAT, PTR) \ - doAllocation(COUNT, sizeof(TYPE), 0, PAGE_SIZE, WHAT, PTR) - -/** - * Allocate one element of the indicated type immediately, failing if the - * required memory is not immediately available. - * - * @param TYPE The type of objects to allocate - * @param WHAT What is being allocated (for error logging) - * - * @return pointer to the memory, or NULL if the memory is not available. - **/ -#define ALLOCATE_NOWAIT(TYPE, WHAT) allocateMemoryNowait(sizeof(TYPE), WHAT) - -/** - * Perform termination of the memory allocation subsystem. - **/ -void memoryExit(void); - -/** - * Perform initialization of the memory allocation subsystem. - **/ -void memoryInit(void); - -/** - * Allocate storage based on memory size, failing immediately if the required - * memory is not available. The memory will be zeroed. - * - * @param size The size of an object. - * @param what What is being allocated (for error logging) - * - * @return pointer to the allocated memory, or NULL if the required space is - * not available. - **/ -void *allocateMemoryNowait(size_t size, const char *what) - __attribute__((warn_unused_result)); - - -/** - * Register the current thread as an allocating thread. - * - * An optional flag location can be supplied indicating whether, at - * any given point in time, the threads associated with that flag - * should be allocating storage. If the flag is false, a message will - * be logged. - * - * If no flag is supplied, the thread is always allowed to allocate - * storage without complaint. - * - * @param newThread RegisteredThread structure to use for the current thread - * @param flagPtr Location of the allocation-allowed flag - **/ -void registerAllocatingThread(RegisteredThread *newThread, - const bool *flagPtr); - -/** - * Unregister the current thread as an allocating thread. - **/ -void unregisterAllocatingThread(void); - -/** - * Get the memory statistics. - * - * @param bytesUsed A pointer to hold the number of bytes in use - * @param peakBytesUsed A pointer to hold the maximum value bytesUsed has - * attained - **/ -void getMemoryStats(uint64_t *bytesUsed, uint64_t *peakBytesUsed); - -/** - * Report stats on any allocated memory that we're tracking. - * - * Not all allocation types are guaranteed to be tracked in bytes - * (e.g., bios). - **/ -void reportMemoryUsage(void); - - -#endif /* LINUX_KERNEL_MEMORY_DEFS_H */ diff --git a/uds/memoryLinuxKernel.c b/uds/memoryLinuxKernel.c deleted file mode 100644 index 5a42583..0000000 --- a/uds/memoryLinuxKernel.c +++ /dev/null @@ -1,426 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/kernelLinux/uds/memoryLinuxKernel.c#6 $ - */ - -#include -#include -#include -#include -#include -#include -#include - -#include "compilerDefs.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" - - -/* - ****************************************************************************** - * Production: UDS and VDO keep track of which threads are allowed to allocate - * memory freely, and which threads must be careful to not do a memory - * allocation that does an I/O request. The allocatingThreads ThreadsRegistry - * and its associated methods implement this tracking. - */ - -static ThreadRegistry allocatingThreads; - -/*****************************************************************************/ -static bool allocationsAllowed(void) -{ - const bool *pointer = lookupThread(&allocatingThreads); - return pointer != NULL ? *pointer : false; -} - -/*****************************************************************************/ -void registerAllocatingThread(RegisteredThread *newThread, const bool *flagPtr) -{ - if (flagPtr == NULL) { - static const bool allocationAlwaysAllowed = true; - flagPtr = &allocationAlwaysAllowed; - } - registerThread(&allocatingThreads, newThread, flagPtr); -} - -/*****************************************************************************/ -void unregisterAllocatingThread(void) -{ - unregisterThread(&allocatingThreads); -} - -/* - ****************************************************************************** - * Production: We track how much memory has been allocated and freed. When we - * unload the UDS module, we log an error if we have not freed all the memory - * that we allocated. Nearly all memory allocation and freeing is done using - * this module. - * - * We do not use kernel functions like the kvasprintf() method, which allocate - * memory indirectly using kmalloc. - * - * These data structures and methods are used to track the amount of memory - * used. - */ - -// We allocate very few large objects, and allocation/deallocation isn't done -// in a performance-critical stage for us, so a linked list should be fine. 
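/*
 * A minimal usage sketch of the allocating-thread registry described above,
 * assuming a hypothetical worker thread. Only RegisteredThread,
 * registerAllocatingThread(), unregisterAllocatingThread(), allocateMemory(),
 * and freeMemory() are taken from this module and its headers; the worker
 * function, its argument, and the flag are illustrative, not part of the
 * original file.
 */
static int exampleWorkerThread(void *arg __attribute__((unused)))
{
  /*
   * Per the registry's documentation, this flag is consulted when the thread
   * allocates; if it were false at that moment, a message would be logged.
   * It must outlive the registration, hence static storage.
   */
  static const bool allocationsAllowedFlag = true;

  RegisteredThread thread;
  registerAllocatingThread(&thread, &allocationsAllowedFlag);

  // Allocations made while registered are tracked and gated as described.
  void *buffer = NULL;
  int result = allocateMemory(4096, 0, "example buffer", &buffer);
  if (result == UDS_SUCCESS) {
    freeMemory(buffer);
  }

  unregisterAllocatingThread();
  return result;
}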
-typedef struct vmallocBlockInfo { - void *ptr; - size_t size; - struct vmallocBlockInfo *next; -} VmallocBlockInfo; - -static struct { - spinlock_t lock; - size_t kmallocBlocks; - size_t kmallocBytes; - size_t vmallocBlocks; - size_t vmallocBytes; - size_t peakBytes; - VmallocBlockInfo *vmallocList; -} memoryStats __cacheline_aligned; - -/*****************************************************************************/ -static void updatePeakUsage(void) -{ - size_t totalBytes = memoryStats.kmallocBytes + memoryStats.vmallocBytes; - if (totalBytes > memoryStats.peakBytes) { - memoryStats.peakBytes = totalBytes; - } -} - -/*****************************************************************************/ -static void addKmallocBlock(size_t size) -{ - unsigned long flags; - spin_lock_irqsave(&memoryStats.lock, flags); - memoryStats.kmallocBlocks++; - memoryStats.kmallocBytes += size; - updatePeakUsage(); - spin_unlock_irqrestore(&memoryStats.lock, flags); -} - -/*****************************************************************************/ -static void removeKmallocBlock(size_t size) -{ - unsigned long flags; - spin_lock_irqsave(&memoryStats.lock, flags); - memoryStats.kmallocBlocks--; - memoryStats.kmallocBytes -= size; - spin_unlock_irqrestore(&memoryStats.lock, flags); -} - -/*****************************************************************************/ -static void addVmallocBlock(VmallocBlockInfo *block) -{ - unsigned long flags; - spin_lock_irqsave(&memoryStats.lock, flags); - block->next = memoryStats.vmallocList; - memoryStats.vmallocList = block; - memoryStats.vmallocBlocks++; - memoryStats.vmallocBytes += block->size; - updatePeakUsage(); - spin_unlock_irqrestore(&memoryStats.lock, flags); -} - -/*****************************************************************************/ -static void removeVmallocBlock(void *ptr) -{ - VmallocBlockInfo *block, **blockPtr; - unsigned long flags; - spin_lock_irqsave(&memoryStats.lock, flags); - for (blockPtr = &memoryStats.vmallocList; - (block = *blockPtr) != NULL; - blockPtr = &block->next) { - if (block->ptr == ptr) { - *blockPtr = block->next; - memoryStats.vmallocBlocks--; - memoryStats.vmallocBytes -= block->size; - break; - } - } - spin_unlock_irqrestore(&memoryStats.lock, flags); - if (block != NULL) { - FREE(block); - } else { - logInfo("attempting to remove ptr %" PRIptr " not found in vmalloc list", - ptr); - } -} - - - -/** - * Determine whether allocating a memory block should use kmalloc or vmalloc. - * - * vmalloc can allocate any integral number of pages. - * - * kmalloc can allocate any number of bytes up to a configured limit, which - * defaults to 8 megabytes on some of our systems. kmalloc is especially good - * when memory is being both allocated and freed, and it does this efficiently - * in a multi CPU environment. - * - * kmalloc usually rounds the size of the block up to the next power of two. - * So when the requested block is bigger than PAGE_SIZE / 2 bytes, kmalloc will - * never give you less space than the corresponding vmalloc allocation. - * Sometimes vmalloc will use less overhead than kmalloc. - * - * The advantages of kmalloc do not help out UDS or VDO, because we allocate - * all our memory up front and do not free and reallocate it. Sometimes we - * have problems using kmalloc, because the Linux memory page map can become so - * fragmented that kmalloc will not give us a 32KB chunk. We have used vmalloc - * as a backup to kmalloc in the past, and a followup vmalloc of 32KB will - * work. 
But there is no strong case to be made for using kmalloc over vmalloc - * for these size chunks. - * - * The kmalloc/vmalloc boundary is set at 4KB, and kmalloc gets the 4KB - * requests. There is no strong reason for favoring either kmalloc or vmalloc - * for 4KB requests, except that the keeping of vmalloc statistics uses a - * linked list implementation. Using a simple test, this choice of boundary - * results in 132 vmalloc calls. Using vmalloc for requests of exactly 4KB - * results in an additional 6374 vmalloc calls, which will require a change to - * the code that tracks vmalloc statistics. - * - * @param size How many bytes to allocate - **/ -static INLINE bool useKmalloc(size_t size) -{ - return size <= PAGE_SIZE; -} - -/*****************************************************************************/ -int allocateMemory(size_t size, size_t align, const char *what, void *ptr) -{ - if (ptr == NULL) { - return UDS_INVALID_ARGUMENT; - } - if (size == 0) { - *((void **) ptr) = NULL; - return UDS_SUCCESS; - } - - - /* - * The __GFP_RETRY_MAYFAIL means: The VM implementation will retry memory - * reclaim procedures that have previously failed if there is some indication - * that progress has been made else where. It can wait for other tasks to - * attempt high level approaches to freeing memory such as compaction (which - * removes fragmentation) and page-out. There is still a definite limit to - * the number of retries, but it is a larger limit than with __GFP_NORETRY. - * Allocations with this flag may fail, but only when there is genuinely - * little unused memory. While these allocations do not directly trigger the - * OOM killer, their failure indicates that the system is likely to need to - * use the OOM killer soon. The caller must handle failure, but can - * reasonably do so by failing a higher-level request, or completing it only - * in a much less efficient manner. - */ - const gfp_t gfpFlags = GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL; - - bool allocationsRestricted = !allocationsAllowed(); - unsigned int noioFlags; - if (allocationsRestricted) { - noioFlags = memalloc_noio_save(); - } - - unsigned long startTime = jiffies; - void *p = NULL; - if (useKmalloc(size) && (align < PAGE_SIZE)) { - p = kmalloc(size, gfpFlags | __GFP_NOWARN); - if (p == NULL) { - /* - * If we had just done kmalloc(size, gfpFlags) it is possible that the - * allocation would fail (see VDO-3688). The kernel log would then - * contain a long report about the failure. Although the failure occurs - * because there is no page available to allocate, by the time it logs - * the available space, there is a page available. So hopefully a short - * sleep will allow the page reclaimer to free a single page, which is - * all that we need. - */ - msleep(1); - p = kmalloc(size, gfpFlags); - } - if (p != NULL) { - addKmallocBlock(ksize(p)); - } - } else { - VmallocBlockInfo *block; - if (ALLOCATE(1, VmallocBlockInfo, __func__, &block) == UDS_SUCCESS) { - /* - * If we just do __vmalloc(size, gfpFlags, PAGE_KERNEL) it is possible - * that the allocation will fail (see VDO-3661). The kernel log will - * then contain a long report about the failure. Although the failure - * occurs because there are not enough pages available to allocate, by - * the time it logs the available space, there may enough pages available - * for smaller allocations. So hopefully a short sleep will allow the - * page reclaimer to free enough pages for us. 
- * - * For larger allocations, the kernel page_alloc code is racing against - * the page reclaimer. If the page reclaimer can stay ahead of - * page_alloc, the __vmalloc will succeed. But if page_alloc overtakes - * the page reclaimer, the allocation fails. It is possible that more - * retries will succeed. - */ - for (;;) { - p = __vmalloc(size, gfpFlags | __GFP_NOWARN, PAGE_KERNEL); - // Try again unless we succeeded or more than 1 second has elapsed. - if ((p != NULL) || (jiffies_to_msecs(jiffies - startTime) > 1000)) { - break; - } - msleep(1); - } - if (p == NULL) { - // Try one more time, logging a failure for this call. - p = __vmalloc(size, gfpFlags, PAGE_KERNEL); - } - if (p == NULL) { - FREE(block); - } else { - block->ptr = p; - block->size = PAGE_ALIGN(size); - addVmallocBlock(block); - } - } - } - - if (allocationsRestricted) { - memalloc_noio_restore(noioFlags); - } - - if (p == NULL) { - unsigned int duration = jiffies_to_msecs(jiffies - startTime); - logError("Could not allocate %zu bytes for %s in %u msecs", - size, what, duration); - return ENOMEM; - } - *((void **) ptr) = p; - return UDS_SUCCESS; -} - -/*****************************************************************************/ -void *allocateMemoryNowait(size_t size, - const char *what __attribute__((unused))) -{ - void *p = kmalloc(size, GFP_NOWAIT | __GFP_ZERO); - if (p != NULL) { - addKmallocBlock(ksize(p)); - } - return p; -} - -/*****************************************************************************/ -void freeMemory(void *ptr) -{ - if (ptr != NULL) { - if (is_vmalloc_addr(ptr)) { - removeVmallocBlock(ptr); - vfree(ptr); - } else { - removeKmallocBlock(ksize(ptr)); - kfree(ptr); - } - } -} - -/*****************************************************************************/ -int reallocateMemory(void *ptr, - size_t oldSize, - size_t size, - const char *what, - void *newPtr) -{ - // Handle special case of zero sized result - if (size == 0) { - FREE(ptr); - *(void **)newPtr = NULL; - return UDS_SUCCESS; - } - - int result = ALLOCATE(size, char, what, newPtr); - if (result != UDS_SUCCESS) { - return result; - } - - if (ptr != NULL) { - if (oldSize < size) { - size = oldSize; - } - memcpy(*((void **) newPtr), ptr, size); - FREE(ptr); - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -void memoryInit(void) -{ - - spin_lock_init(&memoryStats.lock); - initializeThreadRegistry(&allocatingThreads); -} - - -/*****************************************************************************/ -void memoryExit(void) -{ - - ASSERT_LOG_ONLY(memoryStats.kmallocBytes == 0, - "kmalloc memory used (%zd bytes in %zd blocks)" - " is returned to the kernel", - memoryStats.kmallocBytes, memoryStats.kmallocBlocks); - ASSERT_LOG_ONLY(memoryStats.vmallocBytes == 0, - "vmalloc memory used (%zd bytes in %zd blocks)" - " is returned to the kernel", - memoryStats.vmallocBytes, memoryStats.vmallocBlocks); - logDebug("%s peak usage %zd bytes", THIS_MODULE->name, - memoryStats.peakBytes); -} - -/**********************************************************************/ -void getMemoryStats(uint64_t *bytesUsed, uint64_t *peakBytesUsed) -{ - unsigned long flags; - spin_lock_irqsave(&memoryStats.lock, flags); - *bytesUsed = memoryStats.kmallocBytes + memoryStats.vmallocBytes; - *peakBytesUsed = memoryStats.peakBytes; - spin_unlock_irqrestore(&memoryStats.lock, flags); -} - -/**********************************************************************/ -void reportMemoryUsage() -{ - unsigned long 
flags; - spin_lock_irqsave(&memoryStats.lock, flags); - uint64_t kmallocBlocks = memoryStats.kmallocBlocks; - uint64_t kmallocBytes = memoryStats.kmallocBytes; - uint64_t vmallocBlocks = memoryStats.vmallocBlocks; - uint64_t vmallocBytes = memoryStats.vmallocBytes; - uint64_t peakUsage = memoryStats.peakBytes; - spin_unlock_irqrestore(&memoryStats.lock, flags); - uint64_t totalBytes = kmallocBytes + vmallocBytes; - logInfo("current module memory tracking" - " (actual allocation sizes, not requested):"); - logInfo(" %llu bytes in %llu kmalloc blocks", - kmallocBytes, kmallocBlocks); - logInfo(" %llu bytes in %llu vmalloc blocks", - vmallocBytes, vmallocBlocks); - logInfo(" total %llu bytes, peak usage %llu bytes", - totalBytes, peakUsage); -} diff --git a/uds/murmur/MurmurHash3.c b/uds/murmur/MurmurHash3.c deleted file mode 100644 index 42af11a..0000000 --- a/uds/murmur/MurmurHash3.c +++ /dev/null @@ -1,379 +0,0 @@ -//----------------------------------------------------------------------------- -// MurmurHash3 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. - -// Note - The x86 and x64 versions do _not_ produce the same results, as the -// algorithms are optimized for their respective platforms. You can still -// compile and run any of them on any platform, but your performance with the -// non-native version will be less than optimal. - -#include "MurmurHash3.h" - -#include "cpu.h" - -//----------------------------------------------------------------------------- -// Platform-specific functions and macros - -// Microsoft Visual Studio - -#if defined(_MSC_VER) - -#define FORCE_INLINE __forceinline - -#include - -#define ROTL32(x,y) _rotl(x,y) -#define ROTL64(x,y) _rotl64(x,y) - -#define BIG_CONSTANT(x) (x) - -// Other compilers - -#else // defined(_MSC_VER) - -#if __GNUC__ >= 7 -#pragma GCC diagnostic warning "-Wimplicit-fallthrough=0" -#endif - -#define FORCE_INLINE __attribute__((always_inline)) inline - -static inline uint32_t rotl32 ( uint32_t x, int8_t r ) -{ - return (x << r) | (x >> (32 - r)); -} - -static inline uint64_t rotl64 ( uint64_t x, int8_t r ) -{ - return (x << r) | (x >> (64 - r)); -} - -#define ROTL32(x,y) rotl32(x,y) -#define ROTL64(x,y) rotl64(x,y) - -#define BIG_CONSTANT(x) (x##LLU) - -#endif // !defined(_MSC_VER) - -//----------------------------------------------------------------------------- -// Block read - if your platform needs to do endian-swapping or can only -// handle aligned reads, do the conversion here - -static FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i ) -{ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - return p[i]; -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - return __builtin_bswap32(p[i]); -#else -#error "can't figure out byte order" -#endif -} - -static FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i ) -{ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - return p[i]; -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - return __builtin_bswap64(p[i]); -#else -#error "can't figure out byte order" -#endif -} - -// Block write -static FORCE_INLINE void putblock (uint32_t *p, int i, uint32_t value) -{ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - p[i] = value; -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - p[i] = __builtin_bswap32(value); -#else -#error "can't figure out byte order" -#endif -} - -static FORCE_INLINE void putblock64 (uint64_t *p, int i, uint64_t value) -{ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - p[i] = value; -#elif 
__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - p[i] = __builtin_bswap64(value); -#else -#error "can't figure out byte order" -#endif -} - -//----------------------------------------------------------------------------- -// Finalization mix - force all bits of a hash block to avalanche - -static FORCE_INLINE uint32_t fmix32 ( uint32_t h ) -{ - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - - return h; -} - -//---------- - -static FORCE_INLINE uint64_t fmix64 ( uint64_t k ) -{ - k ^= k >> 33; - k *= BIG_CONSTANT(0xff51afd7ed558ccd); - k ^= k >> 33; - k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); - k ^= k >> 33; - - return k; -} - -//----------------------------------------------------------------------------- - -void MurmurHash3_x86_32 ( const void * key, int len, - uint32_t seed, void * out ) -{ - const uint8_t * data = (const uint8_t*)key; - const int nblocks = len / 4; - - uint32_t h1 = seed; - - uint32_t c1 = 0xcc9e2d51; - uint32_t c2 = 0x1b873593; - - //---------- - // body - - const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); - - int i; - for(i = -nblocks; i; i++) - { - uint32_t k1 = getblock(blocks,i); - - k1 *= c1; - k1 = ROTL32(k1,15); - k1 *= c2; - - h1 ^= k1; - h1 = ROTL32(h1,13); - h1 = h1*5+0xe6546b64; - } - - //---------- - // tail - - const uint8_t * tail = (const uint8_t*)(data + nblocks*4); - - uint32_t k1 = 0; - - switch(len & 3) - { - case 3: k1 ^= tail[2] << 16; - case 2: k1 ^= tail[1] << 8; - case 1: k1 ^= tail[0]; - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; - default: break; - }; - - //---------- - // finalization - - h1 ^= len; - - h1 = fmix32(h1); - - putblock(out, 0, h1); -} - -//----------------------------------------------------------------------------- - -void MurmurHash3_x86_128 ( const void * key, const int len, - uint32_t seed, void * out ) -{ - const uint8_t * data = (const uint8_t*)key; - const int nblocks = len / 16; - - uint32_t h1 = seed; - uint32_t h2 = seed; - uint32_t h3 = seed; - uint32_t h4 = seed; - - uint32_t c1 = 0x239b961b; - uint32_t c2 = 0xab0e9789; - uint32_t c3 = 0x38b34ae5; - uint32_t c4 = 0xa1e38b93; - - //---------- - // body - - const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); - - int i; - for(i = -nblocks; i; i++) - { - uint32_t k1 = getblock(blocks,i*4+0); - uint32_t k2 = getblock(blocks,i*4+1); - uint32_t k3 = getblock(blocks,i*4+2); - uint32_t k4 = getblock(blocks,i*4+3); - - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; - - h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; - - k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; - - h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; - - k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; - - h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; - - k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; - - h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; - } - - //---------- - // tail - - const uint8_t * tail = (const uint8_t*)(data + nblocks*16); - - uint32_t k1 = 0; - uint32_t k2 = 0; - uint32_t k3 = 0; - uint32_t k4 = 0; - - switch(len & 15) - { - case 15: k4 ^= tail[14] << 16; - case 14: k4 ^= tail[13] << 8; - case 13: k4 ^= tail[12] << 0; - k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; - - case 12: k3 ^= tail[11] << 24; - case 11: k3 ^= tail[10] << 16; - case 10: k3 ^= tail[ 9] << 8; - case 9: k3 ^= tail[ 8] << 0; - k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; - - case 8: k2 ^= tail[ 7] << 24; - case 7: k2 ^= tail[ 6] << 16; - case 6: k2 ^= tail[ 5] << 8; - case 5: k2 ^= tail[ 4] << 0; - k2 *= c2; k2 = 
ROTL32(k2,16); k2 *= c3; h2 ^= k2; - - case 4: k1 ^= tail[ 3] << 24; - case 3: k1 ^= tail[ 2] << 16; - case 2: k1 ^= tail[ 1] << 8; - case 1: k1 ^= tail[ 0] << 0; - k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; - default: break; - }; - - //---------- - // finalization - - h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; - - h1 += h2; h1 += h3; h1 += h4; - h2 += h1; h3 += h1; h4 += h1; - - h1 = fmix32(h1); - h2 = fmix32(h2); - h3 = fmix32(h3); - h4 = fmix32(h4); - - h1 += h2; h1 += h3; h1 += h4; - h2 += h1; h3 += h1; h4 += h1; - - putblock((uint32_t*)out, 0, h1); - putblock((uint32_t*)out, 1, h2); - putblock((uint32_t*)out, 2, h3); - putblock((uint32_t*)out, 3, h4); -} - -//----------------------------------------------------------------------------- - -void MurmurHash3_x64_128 ( const void * key, const int len, - const uint32_t seed, void * out ) -{ - const uint8_t * data = (const uint8_t*)key; - const int nblocks = len / 16; - - uint64_t h1 = seed; - uint64_t h2 = seed; - - uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); - uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); - - //---------- - // body - - const uint64_t * blocks = (const uint64_t *)(data); - - int i; - for(i = 0; i < nblocks; i++) - { - uint64_t k1 = getblock64(blocks,i*2+0); - uint64_t k2 = getblock64(blocks,i*2+1); - - k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; - - h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; - - k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; - - h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; - } - - //---------- - // tail - - const uint8_t * tail = (const uint8_t*)(data + nblocks*16); - - uint64_t k1 = 0; - uint64_t k2 = 0; - - switch(len & 15) - { - case 15: k2 ^= ((uint64_t)tail[14]) << 48; - case 14: k2 ^= ((uint64_t)tail[13]) << 40; - case 13: k2 ^= ((uint64_t)tail[12]) << 32; - case 12: k2 ^= ((uint64_t)tail[11]) << 24; - case 11: k2 ^= ((uint64_t)tail[10]) << 16; - case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; - case 9: k2 ^= ((uint64_t)tail[ 8]) << 0; - k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; - - case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; - case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; - case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; - case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; - case 4: k1 ^= ((uint64_t)tail[ 3]) << 24; - case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; - case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; - case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; - k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; - default: break; - }; - - //---------- - // finalization - - h1 ^= len; h2 ^= len; - - h1 += h2; - h2 += h1; - - h1 = fmix64(h1); - h2 = fmix64(h2); - - h1 += h2; - h2 += h1; - - putblock64((uint64_t*)out, 0, h1); - putblock64((uint64_t*)out, 1, h2); -} diff --git a/uds/murmur/MurmurHash3.h b/uds/murmur/MurmurHash3.h deleted file mode 100644 index bebb8fa..0000000 --- a/uds/murmur/MurmurHash3.h +++ /dev/null @@ -1,44 +0,0 @@ -//----------------------------------------------------------------------------- -// MurmurHash3 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. 
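For orientation, here is a minimal userspace sketch (not taken from this patch) of calling the 128-bit x64 variant removed above. The input string, the zero seed, the hex printing, and main() are arbitrary illustrative choices, and it assumes MurmurHash3.c is built for userspace, where the header falls back to the standard integer types. As the comment at the top of MurmurHash3.c notes, the x86 and x64 variants do not produce interchangeable digests.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "MurmurHash3.h"

int main(void)
{
  static const char data[] = "example chunk name";
  uint8_t digest[16];  /* MurmurHash3_x64_128 always writes 16 bytes */

  MurmurHash3_x64_128(data, (int) strlen(data), 0, digest);

  unsigned int i;
  for (i = 0; i < sizeof(digest); i++) {
    printf("%02x", digest[i]);
  }
  printf("\n");
  return 0;
}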
- -#ifndef _MURMURHASH3_H_ -#define _MURMURHASH3_H_ - -//----------------------------------------------------------------------------- -// Platform-specific functions and macros - -// Linux kernel - -#ifdef __KERNEL__ -# include - -// Microsoft Visual Studio - -#else // defined(__KERNEL__) -# if defined(_MSC_VER) - - typedef unsigned char uint8_t; - typedef unsigned long uint32_t; - typedef unsigned __int64 uint64_t; - -// Other compilers - -# else // defined(_MSC_VER) - -# include - -# endif // !defined(_MSC_VER) -#endif // !defined(__KERNEL__) - -//----------------------------------------------------------------------------- - -void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); - -void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); - -void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); - -//----------------------------------------------------------------------------- - -#endif // _MURMURHASH3_H_ diff --git a/uds/nonce.c b/uds/nonce.c deleted file mode 100644 index 43b0f80..0000000 --- a/uds/nonce.c +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/jasper/src/uds/nonce.c#3 $ - */ - -#include "nonce.h" - -#include "murmur/MurmurHash3.h" -#include "numeric.h" -#include "random.h" -#include "stringUtils.h" -#include "timeUtils.h" - -/*****************************************************************************/ -static uint64_t hashStuff(uint64_t start, const void *data, size_t len) -{ - uint32_t seed = start ^ (start >> 27); - byte hashBuffer[16]; - MurmurHash3_x64_128(data, len, seed, hashBuffer); - return getUInt64LE(hashBuffer + 4); -} - -/*****************************************************************************/ -static void *memput(void *buf, void *end, const void *data, size_t len) -{ - byte *bp = buf; - byte *be = end; - - size_t chunk = minSizeT(len, be - bp); - memcpy(bp, data, chunk); - return bp + chunk; -} - -/*****************************************************************************/ -size_t createUniqueNonceData(byte *buffer, size_t length) -{ - AbsTime now = currentTime(CLOCK_REALTIME); - - byte *be = buffer + length; - byte *bp = memput(buffer, be, &now, sizeof(now)); - - uint32_t rand = randomInRange(1, (1<<30) - 1); - - bp = memput(bp, be, &rand, sizeof(rand)); - - while (bp < be) { - size_t n = minSizeT(be - bp, bp - buffer); - memcpy(bp, buffer, n); - bp += n; - } - - return bp - buffer; -} - -/*****************************************************************************/ -uint64_t generateMasterNonce(const void *data, size_t len) -{ - return hashStuff(0xa1b1e0fc, data, len); -} - -/*****************************************************************************/ -uint64_t generateSecondaryNonce(uint64_t nonce, - const void *data, - size_t len) -{ - return hashStuff(nonce + 1, data, len); -} diff --git a/uds/nonce.h b/uds/nonce.h deleted file mode 100644 index 43f2054..0000000 --- a/uds/nonce.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/nonce.h#1 $ - */ - -#ifndef NONCE_H -#define NONCE_H - -#include "typeDefs.h" - -/** - * Create unique data for the master nonce, using system-specific - * methods such as the current time and a random number. - * - * @param buffer A buffer of length specified next. - * @param length Length of the buffer. - * - * @return the amount of the buffer that has been filled with unique data - **/ -size_t createUniqueNonceData(byte *buffer, size_t length); - -/** - * Generate a master nonce, using the specified data. - * - * @param data Some arbitrary information. - * @param len The length of the information. - * - * @return a number which will be fairly unique - **/ -uint64_t generateMasterNonce(const void *data, size_t len); - -/** - * Deterministically generate a secondary nonce based on an existing - * nonce and some arbitrary data. 
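As a rough sketch of how these nonce helpers compose (not taken from this patch): the helper name, the 64-byte buffer, and the label string below are hypothetical; only the three nonce functions and the byte type from typeDefs.h come from the removed sources.

#include "nonce.h"
#include "typeDefs.h"  /* byte */

/* Hypothetical illustration only. */
static uint64_t exampleDeriveNonces(void)
{
  /* Gather unique data (current time plus a random value) for the master
   * nonce. */
  byte seedData[64];
  size_t len = createUniqueNonceData(seedData, sizeof(seedData));

  /* Hash that data into the master nonce. */
  uint64_t masterNonce = generateMasterNonce(seedData, len);

  /* Derive a dependent nonce deterministically from the master nonce and
   * some identifying data. */
  static const char label[] = "example component";
  return generateSecondaryNonce(masterNonce, label, sizeof(label));
}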
Effectively hashes the nonce and - * the data to produce a new nonce which is deterministic. - * - * @param nonce An existing nonce which is well known. - * @param data Some data related to the creation of this nonce. - * @param len The length of the data. - * - * @return a number which will be fairly unique and depend solely on - * the nonce and the data. - **/ -uint64_t generateSecondaryNonce(uint64_t nonce, - const void *data, - size_t len); - -#endif // NONCE_H diff --git a/uds/numeric.c b/uds/numeric.c deleted file mode 100644 index 4bc1e2d..0000000 --- a/uds/numeric.c +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/numeric.c#2 $ - */ - -#include "numeric.h" -#include "permassert.h" - -#define STATIC_ASSERT_ALIGNOF(type, expectedAlignment) \ - STATIC_ASSERT(__alignof__(type) == (expectedAlignment)) - -/**********************************************************************/ -bool multiplyWouldOverflow(uint64_t a, uint64_t b) -{ - return b != 0 && a > UINT64_MAX / b; -} - -/**********************************************************************/ -void numericCompileTimeAssertions(void) -{ - STATIC_ASSERT_SIZEOF(uint64_t, 8); - STATIC_ASSERT_SIZEOF(uint32_t, 4); - STATIC_ASSERT_SIZEOF(uint16_t, 2); - - STATIC_ASSERT_SIZEOF(UNALIGNED_WRAPPER(uint64_t), 8); - STATIC_ASSERT_SIZEOF(UNALIGNED_WRAPPER(uint32_t), 4); - STATIC_ASSERT_SIZEOF(UNALIGNED_WRAPPER(uint16_t), 2); - - STATIC_ASSERT_ALIGNOF(UNALIGNED_WRAPPER(uint64_t), 1); - STATIC_ASSERT_ALIGNOF(UNALIGNED_WRAPPER(uint32_t), 1); - STATIC_ASSERT_ALIGNOF(UNALIGNED_WRAPPER(uint16_t), 1); -} diff --git a/uds/numeric.h b/uds/numeric.h deleted file mode 100644 index 06d7eee..0000000 --- a/uds/numeric.h +++ /dev/null @@ -1,721 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/jasper/src/uds/numeric.h#2 $ - */ - -#ifndef NUMERIC_H -#define NUMERIC_H 1 - -#include "compiler.h" -#include "numericDefs.h" -#include "typeDefs.h" - -#if !defined(__ORDER_LITTLE_ENDIAN__) || !defined(__ORDER_BIG_ENDIAN__) \ - || !defined(__BYTE_ORDER__) -#error "GCC byte order macros not defined?" -#endif - -/* - * Define a type describing an integer value that is only byte-aligned - * and may explicitly alias other types. GCC keeps getting better - * about type-based alias analysis (both for optimization and for - * warnings), so simply casting a pointer to pointer-to-uintXX_t isn't - * good enough. - * - * C is okay with defining the structures directly in a cast, but - * C++ is not, and we use this header in some C++ code internally. - */ -#define UNALIGNED_WRAPPER(TYPE) \ - unaligned_wrap_##TYPE -#define UNALIGNED_WRAPPER_DEF(TYPE) \ - typedef struct __attribute__((packed, may_alias)) { TYPE value; } \ - UNALIGNED_WRAPPER(TYPE) -UNALIGNED_WRAPPER_DEF(int64_t); -UNALIGNED_WRAPPER_DEF(uint64_t); -UNALIGNED_WRAPPER_DEF(int32_t); -UNALIGNED_WRAPPER_DEF(uint32_t); -UNALIGNED_WRAPPER_DEF(uint16_t); - -#define GET_UNALIGNED(TYPE,ADDR) \ - (((const UNALIGNED_WRAPPER(TYPE) *)(ADDR))->value) -#define PUT_UNALIGNED(TYPE,ADDR,VALUE) \ - (((UNALIGNED_WRAPPER(TYPE) *)(ADDR))->value = (VALUE)) - -/** - * Find the minimum of two ints. - * - * @param a The first int - * @param b The second int - * - * @return The lesser of a and b - **/ -__attribute__((warn_unused_result)) -static INLINE int minInt(int a, int b) -{ - return ((a < b) ? a : b); -} - -/** - * Find the maximum of two ints. - * - * @param a The first int - * @param b The second int - * - * @return The greater of a and b - **/ -__attribute__((warn_unused_result)) -static INLINE int maxInt(int a, int b) -{ - return ((a > b) ? a : b); -} - -/** - * Find the maximum of two unsigned ints. - * - * @param a The first value - * @param b The second value - * - * @return The greater of a and b - **/ -__attribute__((warn_unused_result)) -static INLINE unsigned int maxUInt(unsigned int a, unsigned int b) -{ - return ((a > b) ? a : b); -} - -/** - * Find the maximum of two signed longs. - * - * @param a The first int - * @param b The second int - * - * @return The greater of a and b - **/ -__attribute__((warn_unused_result)) -static INLINE long maxLong(long a, long b) -{ - return ((a > b) ? a : b); -} - -/** - * Find the maximum of two unsigned longs. - * - * @param a The first int - * @param b The second int - * - * @return The greater of a and b - **/ -__attribute__((warn_unused_result)) -static INLINE unsigned long maxULong(unsigned long a, unsigned long b) -{ - return ((a > b) ? a : b); -} - -/** - * Find the minimum of two size_ts. - * - * @param a The first size_t - * @param b The second size_t - * - * @return The lesser of a and b - **/ -__attribute__((warn_unused_result)) -static INLINE size_t minSizeT(size_t a, size_t b) -{ - return ((a < b) ? a : b); -} - -/** - * Find the maximum of two size_ts. - * - * @param a The first size_t - * @param b The second size_t - * - * @return The greater of a and b - **/ -__attribute__((warn_unused_result)) -static INLINE size_t maxSizeT(size_t a, size_t b) -{ - return ((a > b) ? a : b); -} - -/** - * Find the minimum of two uint64_ts. - * - * @param a The first uint64_t - * @param b The second uint64_t - * - * @return The lesser of a and b - **/ -__attribute__((warn_unused_result)) -static INLINE uint64_t minUInt64(uint64_t a, uint64_t b) -{ - return ((a < b) ? 
a : b); -} - -/** - * Multiply two uint64_t and check for overflow. Does division. - **/ -bool multiplyWouldOverflow(uint64_t a, uint64_t b); - -/** - * Extract a 64 bit unsigned number from a buffer stored in - * big-endian representation. - * - * @param data The buffer from which to extract the number - * - * @return The extracted quantity - **/ -__attribute__((warn_unused_result)) -static INLINE uint64_t getUInt64BE(const byte* data) -{ - uint64_t num = GET_UNALIGNED(uint64_t, data); -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - num = __builtin_bswap64(num); -#endif - return num; -} - -/** - * Extract a 64 bit unsigned big-endian number from a buffer at a - * specified offset. The offset will be advanced to the first byte - * after the number. - * - * @param buffer The buffer from which to extract the number - * @param offset A pointer to the offset into the buffer at which to extract - * @param decoded A pointer to hold the extracted number - **/ -static INLINE void decodeUInt64BE(const byte *buffer, - size_t *offset, - uint64_t *decoded) -{ - *decoded = getUInt64BE(buffer + *offset); - *offset += sizeof(uint64_t); -} - -/** - * Store a 64 bit unsigned number in a buffer in - * big-endian representation. - * - * @param data The buffer in which to store the number - * @param num The number to store - **/ -static INLINE void storeUInt64BE(byte* data, uint64_t num) -{ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - num = __builtin_bswap64(num); -#endif - PUT_UNALIGNED(uint64_t, data, num); -} - -/** - * Encode a 64 bit unsigned number into a buffer at a given offset - * using a big-endian representation. The offset will be advanced to - * first byte after the encoded number. - * - * @param data The buffer to encode into - * @param offset A pointer to the offset at which to start encoding - * @param toEncode The number to encode - **/ -static INLINE void encodeUInt64BE(byte *data, - size_t *offset, - uint64_t toEncode) -{ - storeUInt64BE(data + *offset, toEncode); - *offset += sizeof(uint64_t); -} - -/** - * Extract a 32 bit unsigned number from a buffer stored in big-endian - * representation. - * - * @param data The buffer from which to extract the number - * - * @return The extracted quantity - **/ -__attribute__((warn_unused_result)) -static INLINE uint32_t getUInt32BE(const byte* data) -{ - uint32_t num = GET_UNALIGNED(uint32_t, data); -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - num = __builtin_bswap32(num); -#endif - return num; -} - -/** - * Extract a 32 bit unsigned big-endian number from a buffer at a - * specified offset. The offset will be advanced to the first byte - * after the number. - * - * @param buffer The buffer from which to extract the number - * @param offset A pointer to the offset into the buffer at which to extract - * @param decoded A pointer to hold the extracted number - **/ -static INLINE void decodeUInt32BE(const byte *buffer, - size_t *offset, - uint32_t *decoded) -{ - *decoded = getUInt32BE(buffer + *offset); - *offset += sizeof(uint32_t); -} - -/** - * Store a 32 bit number in a buffer in - * big-endian representation. - * - * @param data The buffer in which to store the number - * @param num The number to store - **/ -static INLINE void storeUInt32BE(byte* data, uint32_t num) -{ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - num = __builtin_bswap32(num); -#endif - PUT_UNALIGNED(uint32_t, data, num); -} - -/** - * Encode a 32 bit number into a buffer at a given offset using a - * big-endian representation. 
The offset will be advanced to first byte - * after the encoded number. - * - * @param data The buffer to encode into - * @param offset A pointer to the offset at which to start encoding - * @param toEncode The number to encode - **/ -static INLINE void encodeUInt32BE(byte *data, - size_t *offset, - uint32_t toEncode) -{ - storeUInt32BE(data + *offset, toEncode); - *offset += sizeof(uint32_t); -} - -/** - * Extract a 16 bit number from a buffer stored in - * big-endian representation. - * - * @param data The buffer from which to extract the number - * - * @return The extracted quantity - **/ -__attribute__((warn_unused_result)) -static INLINE uint16_t getUInt16BE(const byte* data) -{ - uint16_t num = GET_UNALIGNED(uint16_t, data); -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - num = bswap_16(num); -#endif - return num; -} - -/** - * Extract a 16 bit, big-endian number from a buffer at a specified offset. - * The offset will be advanced to the first byte after the number. - * - * @param buffer The buffer from which to extract the number - * @param offset A pointer to the offset into the buffer at which to - * extract - * @param decoded A pointer to hold the extracted number - **/ -static INLINE void decodeUInt16BE(const byte *buffer, - size_t *offset, - uint16_t *decoded) -{ - *decoded = getUInt16BE(buffer + *offset); - *offset += sizeof(uint16_t); -} - -/** - * Store a 16 bit number in a buffer in - * big-endian representation. - * - * @param data The buffer in which to store the number - * @param num The number to store - **/ -static INLINE void storeUInt16BE(byte* data, uint16_t num) -{ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - num = bswap_16(num); -#endif - PUT_UNALIGNED(uint16_t, data, num); -} - -/** - * Encode a 16 bit number into a buffer at a given offset using a - * big-endian representation. The offset will be advanced to first byte - * after the encoded number. - * - * @param data The buffer to encode into - * @param offset A pointer to the offset at which to start encoding - * @param toEncode The number to encode - **/ -static INLINE void encodeUInt16BE(byte *data, - size_t *offset, - uint16_t toEncode) -{ - storeUInt16BE(data + *offset, toEncode); - *offset += sizeof(uint16_t); -} - -/** - * Extract a 64 bit signed number from a buffer stored in - * little-endian representation. - * - * @param data The buffer from which to extract the number - * - * @return The extracted quantity - **/ -__attribute__((warn_unused_result)) -static INLINE int64_t getInt64LE(const byte* data) -{ - int64_t num = GET_UNALIGNED(int64_t, data); -#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ - num = __builtin_bswap64(num); -#endif - return num; -} - -/** - * Extract a 64 bit signed little-endian number from a buffer at a - * specified offset. The offset will be advanced to the first byte - * after the number. - * - * @param buffer The buffer from which to extract the number - * @param offset A pointer to the offset into the buffer at which to extract - * @param decoded A pointer to hold the extracted number - **/ -static INLINE void decodeInt64LE(const byte *buffer, - size_t *offset, - int64_t *decoded) -{ - *decoded = getInt64LE(buffer + *offset); - *offset += sizeof(int64_t); -} - -/** - * Store a signed 64 bit number in a buffer in little-endian - * representation. 
- * - * @param data The buffer in which to store the number - * @param num The number to store - **/ -static INLINE void storeInt64LE(byte* data, int64_t num) -{ -#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ - num = __builtin_bswap64(num); -#endif - PUT_UNALIGNED(int64_t, data, num); -} - -/** - * Encode a 64 bit signed number into a buffer at a given offset using - * a little-endian representation. The offset will be advanced to - * first byte after the encoded number. - * - * @param data The buffer to encode into - * @param offset A pointer to the offset at which to start encoding - * @param toEncode The number to encode - **/ -static INLINE void encodeInt64LE(byte *data, - size_t *offset, - int64_t toEncode) -{ - storeInt64LE(data + *offset, toEncode); - *offset += sizeof(int64_t); -} - -/** - * Extract a 64 bit number from a buffer stored in - * little-endian representation. - * - * @param data The buffer from which to extract the number - * - * @return The extracted quantity - **/ -__attribute__((warn_unused_result)) -static INLINE uint64_t getUInt64LE(const byte* data) -{ - uint64_t num = GET_UNALIGNED(uint64_t, data); -#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ - num = __builtin_bswap64(num); -#endif - return num; -} - -/** - * Extract a 64 bit unsigned little-endian number from a buffer at a - * specified offset. The offset will be advanced to the first byte - * after the number. - * - * @param buffer The buffer from which to extract the number - * @param offset A pointer to the offset into the buffer at which to extract - * @param decoded A pointer to hold the extracted number - **/ -static INLINE void decodeUInt64LE(const byte *buffer, - size_t *offset, - uint64_t *decoded) -{ - *decoded = getUInt64LE(buffer + *offset); - *offset += sizeof(uint64_t); -} - -/** - * Store a 64 bit unsigned number in a buffer in little-endian - * representation. - * - * @param data The buffer in which to store the number - * @param num The number to store - **/ -static INLINE void storeUInt64LE(byte* data, uint64_t num) -{ -#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ - num = __builtin_bswap64(num); -#endif - PUT_UNALIGNED(uint64_t, data, num); -} - -/** - * Encode a 64 bit unsigned number into a buffer at a given offset - * using a little-endian representation. The offset will be advanced - * to first byte after the encoded number. - * - * @param data The buffer to encode into - * @param offset A pointer to the offset at which to start encoding - * @param toEncode The number to encode - **/ -static INLINE void encodeUInt64LE(byte *data, - size_t *offset, - uint64_t toEncode) -{ - storeUInt64LE(data + *offset, toEncode); - *offset += sizeof(uint64_t); -} - -/** - * Extract a 32 bit signed number from a buffer stored in - * little-endian representation. - * - * @param data The buffer from which to extract the number - * - * @return The extracted quantity - **/ -__attribute__((warn_unused_result)) -static INLINE int32_t getInt32LE(const byte* data) -{ - int32_t num = GET_UNALIGNED(int32_t, data); -#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ - num = __builtin_bswap32(num); -#endif - return num; -} - -/** - * Extract a 32 bit signed little-endian number from a buffer at a - * specified offset. The offset will be advanced to the first byte - * after the number. 
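All of these accessors follow the same pattern: the get/store forms work at a fixed address, while the decode/encode forms also advance a caller-supplied offset. A small sketch (not from this patch) packing and unpacking a hypothetical two-field header shows the intended use; the helper names, the EXAMPLE_HEADER_SIZE constant, and the choice of fields are invented for illustration.

#include "numeric.h"
#include "typeDefs.h"  /* byte */

enum { EXAMPLE_HEADER_SIZE = sizeof(uint64_t) + sizeof(uint32_t) };

/* Hypothetical illustration: pack a nonce and a version in big-endian order. */
static void exampleEncodeHeader(byte buffer[EXAMPLE_HEADER_SIZE],
                                uint64_t nonce,
                                uint32_t version)
{
  size_t offset = 0;
  encodeUInt64BE(buffer, &offset, nonce);    /* offset is now 8 */
  encodeUInt32BE(buffer, &offset, version);  /* offset is now 12 */
}

/* Unpack the same fields; each decode advances the offset past the value. */
static void exampleDecodeHeader(const byte *buffer,
                                uint64_t   *nonce,
                                uint32_t   *version)
{
  size_t offset = 0;
  decodeUInt64BE(buffer, &offset, nonce);
  decodeUInt32BE(buffer, &offset, version);
}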
- * - * @param buffer The buffer from which to extract the number - * @param offset A pointer to the offset into the buffer at which to extract - * @param decoded A pointer to hold the extracted number - **/ -static INLINE void decodeInt32LE(const byte *buffer, - size_t *offset, - int32_t *decoded) -{ - *decoded = getInt32LE(buffer + *offset); - *offset += sizeof(int32_t); -} - -/** - * Store a signed 32 bit number in a buffer in little-endian - * representation. - * - * @param data The buffer in which to store the number - * @param num The number to store - **/ -static INLINE void storeInt32LE(byte* data, int32_t num) -{ -#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ - num = __builtin_bswap32(num); -#endif - PUT_UNALIGNED(int32_t, data, num); -} - -/** - * Encode a 32 bit signed number into a buffer at a given offset using - * a little-endian representation. The offset will be advanced to - * first byte after the encoded number. - * - * @param data The buffer to encode into - * @param offset A pointer to the offset at which to start encoding - * @param toEncode The number to encode - **/ -static INLINE void encodeInt32LE(byte *data, - size_t *offset, - int32_t toEncode) -{ - storeInt32LE(data + *offset, toEncode); - *offset += sizeof(int32_t); -} - -/** - * Extract a 32 bit unsigned number from a buffer stored in - * little-endian representation. - - * - * @param data The buffer from which to extract the number - * - * @return The extracted quantity - **/ -__attribute__((warn_unused_result)) -static INLINE uint32_t getUInt32LE(const byte* data) -{ - uint32_t num = GET_UNALIGNED(uint32_t, data); -#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ - num = __builtin_bswap32(num); -#endif - return num; -} - -/** - * Extract a 32 bit unsigned little-endian number from a buffer at a - * specified offset. The offset will be advanced to the first byte - * after the number. - * - * @param buffer The buffer from which to extract the number - * @param offset A pointer to the offset into the buffer at which to extract - * @param decoded A pointer to hold the extracted number - **/ -static INLINE void decodeUInt32LE(const byte *buffer, - size_t *offset, - uint32_t *decoded) -{ - *decoded = getUInt32LE(buffer + *offset); - *offset += sizeof(uint32_t); -} - -/** - * Store a 32 bit unsigned number in a buffer in little-endian - * representation. - * - * @param data The buffer in which to store the number - * @param num The number to store - **/ -static INLINE void storeUInt32LE(byte* data, uint32_t num) -{ -#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ - num = __builtin_bswap32(num); -#endif - PUT_UNALIGNED(uint32_t, data, num); -} - -/** - * Encode a 32 bit unsigned number into a buffer at a given offset - * using a little-endian representation. The offset will be advanced - * to first byte after the encoded number. - * - * @param data The buffer to encode into - * @param offset A pointer to the offset at which to start encoding - * @param toEncode The number to encode - **/ -static INLINE void encodeUInt32LE(byte *data, - size_t *offset, - uint32_t toEncode) -{ - storeUInt32LE(data + *offset, toEncode); - *offset += sizeof(uint32_t); -} - -/** - * Extract a 16 bit number from a buffer stored in - * little-endian representation. 
- * - * @param data The buffer from which to extract the number - * - * @return The extracted quantity - **/ -__attribute__((warn_unused_result)) -static INLINE uint16_t getUInt16LE(const byte* data) -{ - uint16_t num = GET_UNALIGNED(uint16_t, data); -#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ - num = bswap_16(num); -#endif - return num; -} - -/** - * Extract a 16 bit unsigned little-endian number from a buffer at a - * specified offset. The offset will be advanced to the first byte - * after the number. - * - * @param buffer The buffer from which to extract the number - * @param offset A pointer to the offset into the buffer at which to - * extract - * @param decoded A pointer to hold the extracted number - **/ -static INLINE void decodeUInt16LE(const byte *buffer, - size_t *offset, - uint16_t *decoded) -{ - *decoded = getUInt16LE(buffer + *offset); - *offset += sizeof(uint16_t); -} - -/** - * Store a 16 bit number in a buffer in little-endian representation. - * - * @param data The buffer in which to store the number - * @param num The number to store - **/ -static INLINE void storeUInt16LE(byte* data, uint16_t num) -{ -#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ - num = bswap_16(num); -#endif - PUT_UNALIGNED(uint16_t, data, num); -} - -/** - * Encode a 16 bit unsigned number into a buffer at a given offset - * using a little-endian representation. The offset will be advanced - * to first byte after the encoded number. - * - * @param data The buffer to encode into - * @param offset A pointer to the offset at which to start encoding - * @param toEncode The number to encode - **/ -static INLINE void encodeUInt16LE(byte *data, - size_t *offset, - uint16_t toEncode) -{ - storeUInt16LE(data + *offset, toEncode); - *offset += sizeof(uint16_t); -} - -/** - * Special function wrapper required for compile-time assertions. This - * function will fail to compile if any of the uint*_t types are not of the - * size we expect. This function should never be called. - **/ -void numericCompileTimeAssertions(void); - -#endif /* NUMERIC_H */ diff --git a/uds/numericDefs.h b/uds/numericDefs.h deleted file mode 100644 index c8795a1..0000000 --- a/uds/numericDefs.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/kernelLinux/uds/numericDefs.h#1 $ - */ - -#ifndef LINUX_KERNEL_NUMERIC_DEFS_H -#define LINUX_KERNEL_NUMERIC_DEFS_H 1 - -#ifdef __x86_64__ -/* - * __builtin_bswap16 should work fine here too, but check for a - * performance impact before changing it, just to be safe. 
- */ -#define bswap_16(x) \ - (__extension__ \ - ({ register unsigned short int __v, __x = (unsigned short int) (x); \ - __asm__ ("rorw $8, %w0" \ - : "=r" (__v) \ - : "0" (__x) \ - : "cc"); \ - __v; })) -#else -#define bswap_16(x) __builtin_bswap16(x) -#endif - -#endif /* LINUX_KERNEL_NUMERIC_DEFS_H */ diff --git a/uds/opaqueTypes.h b/uds/opaqueTypes.h deleted file mode 100644 index 478631a..0000000 --- a/uds/opaqueTypes.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/opaqueTypes.h#3 $ - */ - -#ifndef OPAQUE_TYPES_H -#define OPAQUE_TYPES_H - -/* - * This file contains typedefs of structures internal to the UDS library - * for which many users of those structures do need to know the details - * of the structures themselves. - */ -typedef struct indexRouter IndexRouter; -typedef struct internalRequest Request; -typedef struct requestQueue RequestQueue; - -#endif /* OPAQUE_TYPES_H */ diff --git a/uds/openChapter.c b/uds/openChapter.c deleted file mode 100644 index 7a8a613..0000000 --- a/uds/openChapter.c +++ /dev/null @@ -1,337 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/jasper/src/uds/openChapter.c#4 $ - */ - -#include "openChapter.h" - -#include "compiler.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "numeric.h" -#include "zone.h" - -static int readOpenChapters(ReadPortal *portal); -static int writeOpenChapters(IndexComponent *component, - BufferedWriter *writer, - unsigned int zone); - -const IndexComponentInfo OPEN_CHAPTER_INFO = { - .kind = RL_KIND_OPEN_CHAPTER, - .name = "open chapter", - .saveOnly = true, - .chapterSync = false, - .multiZone = false, - .ioStorage = true, - .loader = readOpenChapters, - .saver = writeOpenChapters, - .incremental = NULL, -}; - -static const byte OPEN_CHAPTER_MAGIC[] = "ALBOC"; -static const byte OPEN_CHAPTER_VERSION[] = "02.00"; - -enum { - OPEN_CHAPTER_MAGIC_LENGTH = sizeof(OPEN_CHAPTER_MAGIC) - 1, - OPEN_CHAPTER_VERSION_LENGTH = sizeof(OPEN_CHAPTER_VERSION) - 1 -}; - -/**********************************************************************/ -static int fillDeltaChapterIndex(OpenChapterZone **chapterZones, - unsigned int zoneCount, - OpenChapterIndex *index, - UdsChunkRecord *collatedRecords) -{ - // Find a record to replace any deleted records, and fill the chapter if - // it was closed early. The last record in any filled zone is guaranteed - // to not have been deleted in this chapter, so use one of those. - OpenChapterZone *fillChapterZone = NULL; - UdsChunkRecord *fillRecord = NULL; - unsigned int z; - for (z = 0; z < zoneCount; ++z) { - fillChapterZone = chapterZones[z]; - if (fillChapterZone->size == fillChapterZone->capacity) { - fillRecord = &fillChapterZone->records[fillChapterZone->size]; - break; - } - } - int result = ASSERT((fillRecord != NULL), - "some open chapter zone filled"); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT(!fillChapterZone->slots[fillChapterZone->size].recordDeleted, - "chapter fill record not deleted"); - if (result != UDS_SUCCESS) { - return result; - } - - const Geometry *geometry = index->geometry; - unsigned int pagesPerChapter = geometry->recordPagesPerChapter; - unsigned int recordsPerPage = geometry->recordsPerPage; - int overflowCount = 0; - unsigned int recordsAdded = 0; - unsigned int zone = 0; - - unsigned int page; - for (page = 0; page < pagesPerChapter; page++) { - unsigned int i; - for (i = 0; - i < recordsPerPage; - i++, recordsAdded++, zone = (zone + 1) % zoneCount) { - - // The record arrays are 1-based. - unsigned int recordNumber = 1 + (recordsAdded / zoneCount); - - // If the zone has been exhausted, or the record was deleted, - // add the fill record to the chapter. 
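      /* Note (not in the original source): collatedRecords is filled 1-based
       * here as well, in round-robin zone order, which is what interleaves
       * the zones' records and preserves their rough temporal order; entry 0
       * of the array is never written by this loop. */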
- if (recordNumber > chapterZones[zone]->size - || chapterZones[zone]->slots[recordNumber].recordDeleted) { - collatedRecords[1 + recordsAdded] = *fillRecord; - continue; - } - - UdsChunkRecord *nextRecord = &chapterZones[zone]->records[recordNumber]; - collatedRecords[1 + recordsAdded] = *nextRecord; - - int result = putOpenChapterIndexRecord(index, &nextRecord->name, page); - switch (result) { - case UDS_SUCCESS: - break; - case UDS_OVERFLOW: - overflowCount++; - break; - default: - logErrorWithStringError(result, "failed to build open chapter index"); - return result; - } - } - } - if (overflowCount > 0) { - logWarning("Failed to add %d entries to chapter index", overflowCount); - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int closeOpenChapter(OpenChapterZone **chapterZones, - unsigned int zoneCount, - Volume *volume, - OpenChapterIndex *chapterIndex, - UdsChunkRecord *collatedRecords, - uint64_t virtualChapterNumber) -{ - // Empty the delta chapter index, and prepare it for the new virtual chapter. - emptyOpenChapterIndex(chapterIndex, virtualChapterNumber); - - // Map each non-deleted record name to its record page number in the delta - // chapter index. - int result = fillDeltaChapterIndex(chapterZones, zoneCount, chapterIndex, - collatedRecords); - if (result != UDS_SUCCESS) { - return result; - } - - // Pass the populated chapter index and the records to the volume, which - // will generate and write the index and record pages for the chapter. - return writeChapter(volume, chapterIndex, collatedRecords); -} - -/**********************************************************************/ -int saveOpenChapters(Index *index, BufferedWriter *writer) -{ - int result = writeToBufferedWriter(writer, OPEN_CHAPTER_MAGIC, - OPEN_CHAPTER_MAGIC_LENGTH); - if (result != UDS_SUCCESS) { - return result; - } - - result = writeToBufferedWriter(writer, OPEN_CHAPTER_VERSION, - OPEN_CHAPTER_VERSION_LENGTH); - if (result != UDS_SUCCESS) { - return result; - } - - uint32_t totalRecords = 0; - unsigned int i; - for (i = 0; i < index->zoneCount; i++) { - totalRecords += openChapterSize(index->zones[i]->openChapter); - } - - // Store the record count in little-endian order. - byte totalRecordData[sizeof(totalRecords)]; - storeUInt32LE(totalRecordData, totalRecords); - - result = writeToBufferedWriter(writer, totalRecordData, - sizeof(totalRecordData)); - if (result != UDS_SUCCESS) { - return result; - } - - // Only write out the records that have been added and not deleted. 
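  /* Note (not in the original source): at this point the writer has emitted
   * the fixed-size prefix of the save image: the 5-byte magic "ALBOC", the
   * 5-byte version "02.00", and the 4-byte little-endian record count.  The
   * loop below appends the surviving records round-robin across zones, so
   * the image never exceeds computeSavedOpenChapterSize() below. */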
- uint32_t recordsAdded = 0; - unsigned int recordIndex = 1; - while(recordsAdded < totalRecords) { - unsigned int i; - for (i = 0; i < index->zoneCount; i++) { - if (recordIndex > index->zones[i]->openChapter->size) { - continue; - } - if (index->zones[i]->openChapter->slots[recordIndex].recordDeleted) { - continue; - } - UdsChunkRecord *record - = &index->zones[i]->openChapter->records[recordIndex]; - result = writeToBufferedWriter(writer, record, sizeof(UdsChunkRecord)); - if (result != UDS_SUCCESS) { - return result; - } - recordsAdded++; - } - recordIndex++; - } - - return flushBufferedWriter(writer); -} - -/**********************************************************************/ -uint64_t computeSavedOpenChapterSize(Geometry *geometry) -{ - return OPEN_CHAPTER_MAGIC_LENGTH + OPEN_CHAPTER_VERSION_LENGTH + - sizeof(uint32_t) + geometry->recordsPerChapter * sizeof(UdsChunkRecord); -} - -/**********************************************************************/ -static int writeOpenChapters(IndexComponent *component, - BufferedWriter *writer, - unsigned int zone) -{ - int result = ASSERT((zone == 0), "open chapter write not zoned"); - if (result != UDS_SUCCESS) { - return result; - } - - Index *index = indexComponentData(component); - return saveOpenChapters(index, writer); -} - -/** - * Read the version field from a buffered reader, checking whether it is a - * supported version. Returns (via a pointer parameter) the matching - * version constant, which can be used by comparing to the version - * constants using simple pointer equality. - * - * @param [in] reader A buffered reader. - * @param [out] version The version constant that was matched. - * - * @return UDS_SUCCESS or an error code if the file could not be read or - * the version is invalid or unsupported - **/ -static int readVersion(BufferedReader *reader, const byte **version) -{ - byte buffer[OPEN_CHAPTER_VERSION_LENGTH]; - int result = readFromBufferedReader(reader, buffer, sizeof(buffer)); - if (result != UDS_SUCCESS) { - return result; - } - if (memcmp(OPEN_CHAPTER_VERSION, buffer, sizeof(buffer)) != 0) { - return logErrorWithStringError(UDS_CORRUPT_COMPONENT, - "Invalid open chapter version: %.*s", - (int) sizeof(buffer), buffer); - } - *version = OPEN_CHAPTER_VERSION; - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int loadVersion20(Index *index, BufferedReader *reader) -{ - byte numRecordsData[sizeof(uint32_t)]; - int result - = readFromBufferedReader(reader, &numRecordsData, sizeof(numRecordsData)); - if (result != UDS_SUCCESS) { - return result; - } - uint32_t numRecords = getUInt32LE(numRecordsData); - - // Keep track of which zones cannot accept any more records. - bool fullFlags[MAX_ZONES] = { false, }; - - // Assign records to the correct zones. - UdsChunkRecord record; - uint32_t records; - for (records = 0; records < numRecords; records++) { - result = readFromBufferedReader(reader, &record, sizeof(UdsChunkRecord)); - if (result != UDS_SUCCESS) { - return result; - } - - unsigned int zone = 0; - if (index->zoneCount > 1) { - // A read-only index has no master index, but it also has only one zone. - zone = getMasterIndexZone(index->masterIndex, &record.name); - } - // Add records until the open chapter zone almost runs out of space. - // The chapter can't be closed here, so don't add the last record. 
- if (!fullFlags[zone]) { - unsigned int remaining; - result = putOpenChapter(index->zones[zone]->openChapter, - &record.name, &record.data, &remaining); - fullFlags[zone] = (remaining <= 1); - if (result != UDS_SUCCESS) { - return result; - } - } - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -int loadOpenChapters(Index *index, BufferedReader *reader) -{ - // Read and check the magic number. - int result = - verifyBufferedData(reader, OPEN_CHAPTER_MAGIC, OPEN_CHAPTER_MAGIC_LENGTH); - if (result != UDS_SUCCESS) { - return result; - } - - // Read and check the version. - const byte *version = NULL; - result = readVersion(reader, &version); - if (result != UDS_SUCCESS) { - return result; - } - - return loadVersion20(index, reader); -} - -/**********************************************************************/ -int readOpenChapters(ReadPortal *portal) -{ - Index *index = indexComponentData(portal->component); - - BufferedReader *reader; - int result = getBufferedReaderForPortal(portal, 0, &reader); - if (result != UDS_SUCCESS) { - return result; - } - return loadOpenChapters(index, reader); -} diff --git a/uds/openChapter.h b/uds/openChapter.h deleted file mode 100644 index 381badd..0000000 --- a/uds/openChapter.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/openChapter.h#1 $ - */ - -#ifndef OPENCHAPTER_H -#define OPENCHAPTER_H 1 - -#include "common.h" -#include "geometry.h" -#include "index.h" -#include "indexComponent.h" - -extern const IndexComponentInfo OPEN_CHAPTER_INFO; - -/** - * OpenChapter handles writing the open chapter records to the volume. It also - * manages the open chapter index component, and all the tools to generate and - * parse the open chapter file. The open chapter file interleaves records from - * each openChapterZone structure. - * - *
Once each open chapter zone is filled, the records are interleaved to - * preserve temporal locality, the index pages are generated through a - * delta chapter index, and the record pages are derived by sorting each - * page-sized batch of records by their names. - * - *
Upon index shutdown, the open chapter zone records are again - * interleaved, and the records are stored as a single array. The hash - * slots are not preserved, since the records may be reassigned to new - * zones at load time. - **/ - -/** - * Close the open chapter and write it to disk. - * - * @param chapterZones The zones of the chapter to close - * @param zoneCount The number of zones - * @param volume The volume to which to write the chapter - * @param chapterIndex The OpenChapterIndex to use while writing - * @param collatedRecords Collated records array to use while writing - * @param virtualChapterNumber The virtual chapter number of the open chapter - * - * @return UDS_SUCCESS or an error code - **/ -int closeOpenChapter(OpenChapterZone **chapterZones, - unsigned int zoneCount, - Volume *volume, - OpenChapterIndex *chapterIndex, - UdsChunkRecord *collatedRecords, - uint64_t virtualChapterNumber) - __attribute__((warn_unused_result)); - -/** - * Write out a partially filled chapter to a file. - * - * @param index the index to save the data from - * @param writer the writer to write out the chapters - * - * @return UDS_SUCCESS on success - **/ -int saveOpenChapters(Index *index, BufferedWriter *writer) - __attribute__((warn_unused_result)); - -/** - * Read a partially filled chapter from a file. - * - * @param index the index to load the data into - * @param reader the buffered reader to read from - * - * @return UDS_SUCCESS on success - **/ -int loadOpenChapters(Index *index, BufferedReader *reader) - __attribute__((warn_unused_result)); - -/** - * Compute the size of the maximum open chapter save image. - * - * @param geometry the index geometry - * - * @return the number of bytes of the largest possible open chapter save - * image - **/ -uint64_t computeSavedOpenChapterSize(Geometry *geometry); - -#endif /* OPENCHAPTER_H */ diff --git a/uds/openChapterZone.c b/uds/openChapterZone.c deleted file mode 100644 index f346409..0000000 --- a/uds/openChapterZone.c +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/openChapterZone.c#2 $ - */ - -#include "openChapterZone.h" - -#include "compiler.h" -#include "hashUtils.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" - -/**********************************************************************/ -static INLINE size_t recordsSize(const OpenChapterZone *openChapter) -{ - return (sizeof(UdsChunkRecord) * (1 + openChapter->capacity)); -} - -/**********************************************************************/ -static INLINE size_t slotsSize(size_t slotCount) -{ - return (sizeof(Slot) * slotCount); -} - -/** - * Round up to the first power of two greater than or equal - * to the supplied number. 
- * - * @param val the number to round up - * - * @return the first power of two not smaller than val for any - * val <= 2^63 - **/ -static INLINE size_t nextPowerOfTwo(size_t val) -{ - if (val == 0) { - return 1; - } - return (1 << computeBits(val - 1)); -} - -/**********************************************************************/ -int makeOpenChapter(const Geometry *geometry, - unsigned int zoneCount, - OpenChapterZone **openChapterPtr) -{ - int result = ASSERT(zoneCount > 0, "zone count must be > 0"); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_WITH_ERROR_CODE(geometry->openChapterLoadRatio > 1, - UDS_BAD_STATE, - "Open chapter hash table is too small"); - if (result != UDS_SUCCESS) { - return result; - } - result = ASSERT_WITH_ERROR_CODE((geometry->recordsPerChapter - <= OPEN_CHAPTER_MAX_RECORD_NUMBER), - UDS_BAD_STATE, - "Too many records (%u) for a single chapter", - geometry->recordsPerChapter); - if (result != UDS_SUCCESS) { - return result; - } - - if (geometry->recordsPerChapter < zoneCount) { - return logUnrecoverable( - UDS_INVALID_ARGUMENT, - "zone count: %u is larger than the records per chapter %u", - zoneCount, geometry->recordsPerChapter); - } - size_t capacity = geometry->recordsPerChapter / zoneCount; - - // The slot count must be at least one greater than the capacity. - // Using a power of two slot count guarantees that hash insertion - // will never fail if the hash table is not full. - size_t slotCount = nextPowerOfTwo(capacity * geometry->openChapterLoadRatio); - OpenChapterZone *openChapter; - result = ALLOCATE_EXTENDED(OpenChapterZone, slotCount, Slot, - "open chapter", &openChapter); - if (result != UDS_SUCCESS) { - return result; - } - openChapter->slotCount = slotCount; - openChapter->capacity = capacity; - result = allocateCacheAligned(recordsSize(openChapter), "record pages", - &openChapter->records); - if (result != UDS_SUCCESS) { - freeOpenChapter(openChapter); - return result; - } - - *openChapterPtr = openChapter; - return UDS_SUCCESS; -} - -/**********************************************************************/ -size_t openChapterSize(const OpenChapterZone *openChapter) -{ - return openChapter->size - openChapter->deleted; -} - -/**********************************************************************/ -void resetOpenChapter(OpenChapterZone *openChapter) -{ - openChapter->size = 0; - openChapter->deleted = 0; - - memset(openChapter->records, 0, recordsSize(openChapter)); - memset(openChapter->slots, 0, slotsSize(openChapter->slotCount)); -} - -/**********************************************************************/ -static UdsChunkRecord *probeChapterSlots(OpenChapterZone *openChapter, - const UdsChunkName *name, - unsigned int *slotPtr, - unsigned int *recordNumberPtr) -{ - unsigned int slots = openChapter->slotCount; - unsigned int probe = nameToHashSlot(name, slots); - unsigned int firstSlot = 0; - - UdsChunkRecord *record; - unsigned int probeSlot; - unsigned int recordNumber; - unsigned int probeAttempts; - - for (probeAttempts = 1; ; ++probeAttempts) { - probeSlot = firstSlot + probe; - recordNumber = openChapter->slots[probeSlot].recordNumber; - - // If the hash slot is empty, we've reached the end of a chain without - // finding the record and should terminate the search. - if (recordNumber == 0) { - record = NULL; - break; - } - - // If the name of the record referenced by the slot matches and has not - // been deleted, then we've found the requested name. 
- record = &openChapter->records[recordNumber]; - if ((memcmp(&record->name, name, UDS_CHUNK_NAME_SIZE) == 0) - && !openChapter->slots[recordNumber].recordDeleted) { - break; - } - - // Quadratic probing: advance the probe by 1, 2, 3, etc. and try again. - // This performs better than linear probing and works best for 2^N slots. - probe += probeAttempts; - if (probe >= slots) { - probe = probe % slots; - } - } - - // These NULL checks will be optimized away in callers who don't care about - // the values when this function is inlined. - if (slotPtr != NULL) { - *slotPtr = probeSlot; - } - if (recordNumberPtr != NULL) { - *recordNumberPtr = recordNumber; - } - - return record; -} - -/**********************************************************************/ -void searchOpenChapter(OpenChapterZone *openChapter, - const UdsChunkName *name, - UdsChunkData *metadata, - bool *found) -{ - UdsChunkRecord *record = probeChapterSlots(openChapter, name, NULL, NULL); - - if (record == NULL) { - *found = false; - } else { - *found = true; - if (metadata != NULL) { - *metadata = record->data; - } - } -} - -/**********************************************************************/ -int putOpenChapter(OpenChapterZone *openChapter, - const UdsChunkName *name, - const UdsChunkData *metadata, - unsigned int *remaining) -{ - unsigned int slot; - UdsChunkRecord *record = probeChapterSlots(openChapter, name, &slot, NULL); - - if (record != NULL) { - record->data = *metadata; - *remaining = openChapter->capacity - openChapter->size; - return UDS_SUCCESS; - } - - if (openChapter->size >= openChapter->capacity) { - return makeUnrecoverable(UDS_VOLUME_OVERFLOW); - } - - unsigned int recordNumber = ++openChapter->size; - openChapter->slots[slot].recordNumber = recordNumber; - record = &openChapter->records[recordNumber]; - record->name = *name; - record->data = *metadata; - - *remaining = openChapter->capacity - openChapter->size; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void removeFromOpenChapter(OpenChapterZone *openChapter, - const UdsChunkName *name, - bool *removed) -{ - unsigned int recordNumber; - UdsChunkRecord *record - = probeChapterSlots(openChapter, name, NULL, &recordNumber); - - if (record == NULL) { - *removed = false; - return; - } - - // Set the deleted flag on the recordNumber in the slot array so search - // won't find it and close won't index it. - openChapter->slots[recordNumber].recordDeleted = true; - openChapter->deleted += 1; - *removed = true; -} - -/**********************************************************************/ -void freeOpenChapter(OpenChapterZone *openChapter) -{ - if (openChapter != NULL) { - FREE(openChapter->records); - FREE(openChapter); - } -} diff --git a/uds/openChapterZone.h b/uds/openChapterZone.h deleted file mode 100644 index cecee4b..0000000 --- a/uds/openChapterZone.h +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/openChapterZone.h#1 $ - */ - -#ifndef OPEN_CHAPTER_ZONE_H -#define OPEN_CHAPTER_ZONE_H 1 - -#include "common.h" -#include "geometry.h" -#include "typeDefs.h" - -/** - * OpenChapterZone is the mutable, in-memory representation of one zone's - * section of an Albireo index chapter. - * - *
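 - * A single-zone round trip through this interface might look like the
 - * following sketch (error handling elided; geometry, name, and metadata
 - * are assumed to exist in the caller):
 - *
 - *   OpenChapterZone *zone;
 - *   unsigned int remaining;
 - *   UdsChunkData data;
 - *   bool found;
 - *   makeOpenChapter(geometry, 1, &zone);
 - *   putOpenChapter(zone, &name, &metadata, &remaining);
 - *   searchOpenChapter(zone, &name, &data, &found);  // found == true
 - *   resetOpenChapter(zone);                         // reopen as empty
 - *   freeOpenChapter(zone);
 - *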
In addition to providing the same access to records as an on-disk - * chapter, the open chapter zone must allow records to be added or - * modified. It must provide a way to generate the on-disk representation - * without excessive work. It does that by accumulating records in the order - * they are added (maintaining temporal locality), and referencing them (as - * record numbers) from hash slots selected from the name. If the metadata for - * a name changes, the record field is just modified in place. - * - *
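 - * Schematically, a lookup is a two-step indirection (a sketch that
 - * ignores collisions and deletion):
 - *
 - *   slot   = nameToHashSlot(name, zone->slotCount);
 - *   number = zone->slots[slot].recordNumber;   // 0 means "empty slot"
 - *   record = &zone->records[number];           // records are 1-based
 - *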
Storage for the records (names and metadata) is allocated when the zone - * is created. It keeps no references to the data passed to it, and performs - * no additional allocation when adding records. Opening a new chapter simply - * marks it as being empty. - * - *
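 - * In steady state, chapter turnover therefore costs no allocation; a
 - * sketch of the per-chapter cycle (fillZoneWithRecords and
 - * writeClosedChapter are hypothetical caller names):
 - *
 - *   for (;;) {
 - *     fillZoneWithRecords(zone);    // putOpenChapter() until full
 - *     writeClosedChapter(zone);     // see closeOpenChapter()
 - *     resetOpenChapter(zone);       // memset only, no malloc/free
 - *   }
 - *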
Records are stored in a flat array. To allow a value of zero in a - * hash slot to indicate that the slot is empty, records are numbered starting - * at one (1-based). Since C arrays are 0-based, the records array contains - * enough space for N+1 records, and the record that starts at array index - * zero is never used or referenced. - * - *
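 - * Concretely, the first insertion into an empty zone behaves like this
 - * sketch of putOpenChapter() (slot and newRecord are placeholders):
 - *
 - *   unsigned int recordNumber = ++zone->size;      // == 1, never 0
 - *   zone->slots[slot].recordNumber = recordNumber;
 - *   zone->records[recordNumber] = newRecord;       // records[0] stays unused
 - *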
The array of hash slots is actually two arrays, superimposed: an - * array of record numbers, indexed by hash value, and an array of deleted - * flags, indexed by record number. This overlay is possible because the - * number of hash slots always exceeds the number of records, and is done - * simply to save on memory. - **/ - -enum { - OPEN_CHAPTER_RECORD_NUMBER_BITS = 23, - OPEN_CHAPTER_MAX_RECORD_NUMBER = (1 << OPEN_CHAPTER_RECORD_NUMBER_BITS) - 1 -}; - -typedef struct { - /** If non-zero, the record number addressed by this hash slot */ - unsigned int recordNumber : OPEN_CHAPTER_RECORD_NUMBER_BITS; - /** If true, the record at the index of this hash slot was deleted */ - bool recordDeleted : 1; -} __attribute__((packed)) Slot; - -typedef struct openChapterZone { - /** Maximum number of records that can be stored */ - unsigned int capacity; - /** Number of records stored */ - unsigned int size; - /** Number of deleted records */ - unsigned int deleted; - /** Record data, stored as (name, metadata), 1-based */ - UdsChunkRecord *records; - /** The number of slots in the chapter zone hash table. */ - unsigned int slotCount; - /** Hash table, referencing virtual record numbers */ - Slot slots[]; -} OpenChapterZone; - -/** - * Allocate an open chapter zone. - * - * @param geometry the geometry of the volume - * @param zoneCount the total number of open chapter zones - * @param openChapterPtr a pointer to hold the new open chapter - * - * @return UDS_SUCCESS or an error code - **/ -int makeOpenChapter(const Geometry *geometry, - unsigned int zoneCount, - OpenChapterZone **openChapterPtr) - __attribute__((warn_unused_result)); - -/** - * Return the number of records in the open chapter zone that have not been - * deleted. - * - * @return the number of non-deleted records - **/ -size_t openChapterSize(const OpenChapterZone *openChapter) - __attribute__((warn_unused_result)); - -/** - * Open a chapter by marking it empty. - * - * @param openChapter The chapter to open - **/ -void resetOpenChapter(OpenChapterZone *openChapter); - -/** - * Search the open chapter for a chunk name. - * - * @param openChapter The chapter to search - * @param name The name of the desired chunk - * @param metadata The holder for the metadata associated with the - * chunk, if found (or NULL) - * @param found A pointer which will be set to true if the chunk - * name was found - **/ -void searchOpenChapter(OpenChapterZone *openChapter, - const UdsChunkName *name, - UdsChunkData *metadata, - bool *found); - -/** - * Put a record into the open chapter. - * - * @param openChapter The chapter into which to put the record - * @param name The name of the record - * @param metadata The record data - * @param remaining Pointer to an integer set to the number of additional - * records that can be added to this chapter - * - * @return UDS_SUCCESS or an error code - **/ -int putOpenChapter(OpenChapterZone *openChapter, - const UdsChunkName *name, - const UdsChunkData *metadata, - unsigned int *remaining) - __attribute__((warn_unused_result)); - -/** - * Remove a record from the open chapter. - * - * @param openChapter The chapter from which to remove the record - * @param name The name of the record - * @param removed Pointer to bool set to true if the - * record was found - **/ -void removeFromOpenChapter(OpenChapterZone *openChapter, - const UdsChunkName *name, - bool *removed); - -/** - * Clean up an open chapter and its memory. 
- * - * @param openChapter the chapter to destroy - **/ -void freeOpenChapter(OpenChapterZone *openChapter); - -#endif /* OPEN_CHAPTER_ZONE_H */ diff --git a/uds/pageCache.c b/uds/pageCache.c deleted file mode 100644 index b2db9a5..0000000 --- a/uds/pageCache.c +++ /dev/null @@ -1,719 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/pageCache.c#6 $ - */ - -#include "pageCache.h" - -#include "atomicDefs.h" -#include "cacheCounters.h" -#include "chapterIndex.h" -#include "compiler.h" -#include "errors.h" -#include "geometry.h" -#include "hashUtils.h" -#include "indexConfig.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "recordPage.h" -#include "stringUtils.h" -#include "threads.h" -#include "zone.h" - -/**********************************************************************/ -int assertPageInCache(PageCache *cache, CachedPage *page) -{ - int result = ASSERT((page->cp_physicalPage < cache->numIndexEntries), - "physicalPage %u is valid (< %u)", - page->cp_physicalPage, cache->numIndexEntries); - if (result != UDS_SUCCESS) { - return result; - } - - uint16_t pageIndex = cache->index[page->cp_physicalPage]; - return ASSERT((pageIndex < cache->numCacheEntries) - && (&cache->cache[pageIndex] == page), - "page is at expected location in cache"); -} - -/** - * Clear a cache page. Note: this does not clear readPending - a read could - * still be pending and the read thread needs to be able to proceed and restart - * the requests regardless. This page will still be marked invalid, but it - * won't get reused (see getLeastRecentPage()) until the readPending flag - * is cleared. This is a valid case, e.g. the chapter gets forgotten and - * replaced with a new one in LRU. Restarting the requests will lead them to - * not find the records in the MI. - * - * @param cache the cache - * @param page the cached page to clear - * - **/ -static void clearPage(PageCache *cache, CachedPage *page) -{ - page->cp_physicalPage = cache->numIndexEntries; - WRITE_ONCE(page->cp_lastUsed, 0); -} - -/** - * Get a page from the cache, but with no stats - * - * @param cache the cache - * @param physicalPage the physical page to get - * @param queueIndex the index of the page in the read queue if - * queued, -1 otherwise - * @param pagePtr a pointer to hold the page - * - * @return UDS_SUCCESS or an error code - **/ -__attribute__((warn_unused_result)) -static int getPageNoStats(PageCache *cache, - unsigned int physicalPage, - int *queueIndex, - CachedPage **pagePtr) -{ - /* - * ASSERTION: We are either a zone thread holding a searchPendingCounter, - * or we are any thread holding the readThreadsMutex. - * - * Holding only a searchPendingCounter is the most frequent case. 
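 - *
 - * A zone thread is expected to bracket this lookup roughly as in the
 - * following sketch (see beginPendingSearch() and endPendingSearch() in
 - * pageCache.h):
 - *
 - *   beginPendingSearch(cache, physicalPage, zoneNumber);
 - *   // ... getPageNoStats() and the page search happen here ...
 - *   endPendingSearch(cache, zoneNumber);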
- */ - - int result = ASSERT((physicalPage < cache->numIndexEntries), - "physical page %u is invalid", physicalPage); - if (result != UDS_SUCCESS) { - return result; - } - - /* - * It would be unlikely that the compiler turns the usage of indexValue into - * two reads of cache->index, but it would be possible and very bad if those - * reads did not return the same bits. - */ - uint16_t indexValue = READ_ONCE(cache->index[physicalPage]); - bool queued = (indexValue & VOLUME_CACHE_QUEUED_FLAG) != 0; - uint16_t index = indexValue & ~VOLUME_CACHE_QUEUED_FLAG; - - if (!queued && (index < cache->numCacheEntries)) { - *pagePtr = &cache->cache[index]; - /* - * We have acquired access to the cached page, but unless we hold the - * readThreadsMutex, we need a read memory barrier now. The corresponding - * write memory barrier is in putPageInCache. - */ - smp_rmb(); - } else { - *pagePtr = NULL; - } - if (queueIndex != NULL) { - *queueIndex = queued ? index : -1; - } - return UDS_SUCCESS; -} - -/** - * Wait for all pending searches on a page in the cache to complete - * - * @param cache the page cache - * @param physicalPage the page to check searches on - **/ -static void waitForPendingSearches(PageCache *cache, unsigned int physicalPage) -{ - /* - * We hold the readThreadsMutex. We are waiting for threads that do not hold - * the readThreadsMutex. Those threads have "locked" their targeted page by - * setting the searchPendingCounter. The corresponding write memory barrier - * is in beginPendingSearch. - */ - smp_mb(); - - InvalidateCounter initialCounters[MAX_ZONES]; - unsigned int i; - for (i = 0; i < cache->zoneCount; i++) { - initialCounters[i] = getInvalidateCounter(cache, i); - } - for (i = 0; i < cache->zoneCount; i++) { - if (searchPending(initialCounters[i]) - && (pageBeingSearched(initialCounters[i]) == physicalPage)) { - // There is an active search using the physical page. - // We need to wait for the search to finish. - while (initialCounters[i] == getInvalidateCounter(cache, i)) { - yieldScheduler(); - } - } - } -} - -/** - * Invalidate a cache page - * - * @param cache the cache - * @param page the cached page - * @param reason the reason for invalidation, for stats - * - * @return UDS_SUCCESS or an error code - **/ -__attribute__((warn_unused_result)) -static int invalidatePageInCache(PageCache *cache, - CachedPage *page, - InvalidationReason reason) -{ - // We hold the readThreadsMutex. - if (page == NULL) { - return UDS_SUCCESS; - } - - if (page->cp_physicalPage != cache->numIndexEntries) { - switch (reason) { - case INVALIDATION_EVICT: - cache->counters.evictions++; - break; - case INVALIDATION_EXPIRE: - cache->counters.expirations++; - break; - default: - break; - } - - if (reason != INVALIDATION_ERROR) { - int result = assertPageInCache(cache, page); - if (result != UDS_SUCCESS) { - return result; - } - } - - WRITE_ONCE(cache->index[page->cp_physicalPage], cache->numCacheEntries); - waitForPendingSearches(cache, page->cp_physicalPage); - } - - clearPage(cache, page); - - return UDS_SUCCESS; -} - -/**********************************************************************/ -int findInvalidateAndMakeLeastRecent(PageCache *cache, - unsigned int physicalPage, - QueuedRead *readQueue, - InvalidationReason reason, - bool mustFind) -{ - // We hold the readThreadsMutex. - if (cache == NULL) { - return UDS_SUCCESS; - } - - CachedPage *page; - int queuedIndex = -1; - int result - = getPageNoStats(cache, physicalPage, - ((readQueue != NULL) ? 
&queuedIndex : NULL), &page); - if (result != UDS_SUCCESS) { - return result; - } - - if (page == NULL) { - result = ASSERT(!mustFind, "found page"); - if (result != UDS_SUCCESS) { - return result; - } - - if (queuedIndex > -1) { - logDebug("setting pending read to invalid"); - readQueue[queuedIndex].invalid = true; - } - return UDS_SUCCESS; - } - - // Invalidate the page and unmap it from the cache. - result = invalidatePageInCache(cache, page, reason); - if (result != UDS_SUCCESS) { - return result; - } - - // Move the cached page to the least recently used end of the list - // so it will be replaced before any page with valid data. - WRITE_ONCE(page->cp_lastUsed, 0); - - return UDS_SUCCESS; -} - -/**********************************************************************/ -__attribute__((warn_unused_result)) -static int initializePageCache(PageCache *cache, - const Geometry *geometry, - unsigned int chaptersInCache, - unsigned int readQueueMaxSize, - unsigned int zoneCount) -{ - cache->geometry = geometry; - cache->numIndexEntries = geometry->pagesPerVolume + 1; - cache->numCacheEntries = chaptersInCache * geometry->recordPagesPerChapter; - cache->readQueueMaxSize = readQueueMaxSize; - cache->zoneCount = zoneCount; - atomic64_set(&cache->clock, 1); - - int result = ALLOCATE(readQueueMaxSize, QueuedRead, - "volume read queue", &cache->readQueue); - if (result != UDS_SUCCESS) { - return result; - } - - result = ALLOCATE(cache->zoneCount, SearchPendingCounter, - "Volume Cache Zones", &cache->searchPendingCounters); - if (result != UDS_SUCCESS) { - return result; - } - - result = ASSERT((cache->numCacheEntries <= VOLUME_CACHE_MAX_ENTRIES), - "requested cache size, %u, within limit %u", - cache->numCacheEntries, VOLUME_CACHE_MAX_ENTRIES); - if (result != UDS_SUCCESS) { - return result; - } - - result = ALLOCATE(cache->numIndexEntries, uint16_t, "page cache index", - &cache->index); - if (result != UDS_SUCCESS) { - return result; - } - - // Initialize index values to invalid values. 
- unsigned int i; - for (i = 0; i < cache->numIndexEntries; i++) { - cache->index[i] = cache->numCacheEntries; - } - - result = ALLOCATE(cache->numCacheEntries, CachedPage, - "page cache cache", &cache->cache); - if (result != UDS_SUCCESS) { - return result; - } - - for (i = 0; i < cache->numCacheEntries; i++) { - CachedPage *page = &cache->cache[i]; - result = initializeVolumePage(geometry, &page->cp_pageData); - if (result != UDS_SUCCESS) { - return result; - } - clearPage(cache, page); - } - - return UDS_SUCCESS; -} - -/*********************************************************************/ -int makePageCache(const Geometry *geometry, - unsigned int chaptersInCache, - unsigned int readQueueMaxSize, - unsigned int zoneCount, - PageCache **cachePtr) -{ - if (chaptersInCache < 1) { - return logWarningWithStringError(UDS_BAD_STATE, - "cache size must be" - " at least one chapter"); - } - if (readQueueMaxSize <= 0) { - return logWarningWithStringError(UDS_INVALID_ARGUMENT, - "read queue max size must be" - " greater than 0"); - } - if (zoneCount < 1) { - return logWarningWithStringError(UDS_INVALID_ARGUMENT, - "cache must have at least one zone"); - } - - PageCache *cache; - int result = ALLOCATE(1, PageCache, "volume cache", &cache); - if (result != UDS_SUCCESS) { - return result; - } - - result = initializePageCache(cache, geometry, chaptersInCache, - readQueueMaxSize, zoneCount); - if (result != UDS_SUCCESS) { - freePageCache(cache); - return result; - } - - *cachePtr = cache; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void freePageCache(PageCache *cache) -{ - if (cache == NULL) { - return; - } - if (cache->cache != NULL) { - unsigned int i; - for (i = 0; i < cache->numCacheEntries; i++) { - destroyVolumePage(&cache->cache[i].cp_pageData); - } - } - FREE(cache->index); - FREE(cache->cache); - FREE(cache->searchPendingCounters); - FREE(cache->readQueue); - FREE(cache); -} - -/**********************************************************************/ -int invalidatePageCacheForChapter(PageCache *cache, - unsigned int chapter, - unsigned int pagesPerChapter, - InvalidationReason reason) -{ - // We hold the readThreadsMutex. - if ((cache == NULL) || (cache->cache == NULL)) { - return UDS_SUCCESS; - } - - int result; - unsigned int i; - for (i = 0; i < pagesPerChapter; i++) { - unsigned int physicalPage = 1 + (pagesPerChapter * chapter) + i; - result = findInvalidateAndMakeLeastRecent(cache, physicalPage, - cache->readQueue, reason, false); - if (result != UDS_SUCCESS) { - return result; - } - } - - return UDS_SUCCESS; -} - -/*********************************************************************/ -void makePageMostRecent(PageCache *cache, CachedPage *page) -{ - // ASSERTION: We are either a zone thread holding a searchPendingCounter, - // or we are any thread holding the readThreadsMutex. - if (atomic64_read(&cache->clock) != READ_ONCE(page->cp_lastUsed)) { - WRITE_ONCE(page->cp_lastUsed, atomic64_inc_return(&cache->clock)); - } -} - -/** - * Get the least recent valid page from the cache. - * - * @param cache the cache - * @param pagePtr a pointer to hold the new page (will be set to NULL - * if the page was not found) - * - * @return UDS_SUCCESS or an error code - **/ -__attribute__((warn_unused_result)) -static int getLeastRecentPage(PageCache *cache, CachedPage **pagePtr) -{ - // We hold the readThreadsMutex. - int oldestIndex = 0; - // Our first candidate is any page that does have a pending read. 
We ensure - // above that there are more entries than read threads, so there must be one. - unsigned int i; - for (i = 0;; i++) { - if (i >= cache->numCacheEntries) { - // This should never happen. - return ASSERT(false, "oldest page is not NULL"); - } - if (!cache->cache[i].cp_readPending) { - oldestIndex = i; - break; - } - } - // Now find the least recently used page that does not have a pending read. - for (i = 0; i < cache->numCacheEntries; i++) { - if (!cache->cache[i].cp_readPending - && (READ_ONCE(cache->cache[i].cp_lastUsed) - <= READ_ONCE(cache->cache[oldestIndex].cp_lastUsed))) { - oldestIndex = i; - } - } - *pagePtr = &cache->cache[oldestIndex]; - return UDS_SUCCESS; -} - -/***********************************************************************/ -int getPageFromCache(PageCache *cache, - unsigned int physicalPage, - int probeType, - CachedPage **pagePtr) -{ - // ASSERTION: We are in a zone thread. - // ASSERTION: We holding a searchPendingCounter or the readThreadsMutex. - if (cache == NULL) { - return logWarningWithStringError(UDS_BAD_STATE, - "cannot get page with NULL cache"); - } - - // Get the cache page from the index - CachedPage *page; - int queueIndex = -1; - int result = getPageNoStats(cache, physicalPage, &queueIndex, &page); - if (result != UDS_SUCCESS) { - return result; - } - - CacheResultKind cacheResult = ((page != NULL) - ? CACHE_RESULT_HIT - : ((queueIndex != -1) - ? CACHE_RESULT_QUEUED - : CACHE_RESULT_MISS)); - incrementCacheCounter(&cache->counters, probeType, cacheResult); - - if (pagePtr != NULL) { - *pagePtr = page; - } - return UDS_SUCCESS; -} - -/***********************************************************************/ -int enqueueRead(PageCache *cache, Request *request, unsigned int physicalPage) -{ - // We hold the readThreadsMutex. - uint16_t first = cache->readQueueFirst; - uint16_t last = cache->readQueueLast; - uint16_t next = (last + 1) % cache->readQueueMaxSize; - uint16_t readQueuePos; - - if ((cache->index[physicalPage] & VOLUME_CACHE_QUEUED_FLAG) == 0) { - /* Not seen before, add this to the read queue and mark it as queued */ - if (next == first) { - /* queue is full */ - return UDS_SUCCESS; - } - /* fill the read queue entry */ - cache->readQueue[last].physicalPage = physicalPage; - cache->readQueue[last].invalid = false; - - /* point the cache index to it */ - readQueuePos = last; - WRITE_ONCE(cache->index[physicalPage], - readQueuePos | VOLUME_CACHE_QUEUED_FLAG); - cache->readQueue[readQueuePos].requestList.first = NULL; - cache->readQueue[readQueuePos].requestList.last = NULL; - /* bump the last pointer */ - cache->readQueueLast = next; - } else { - /* It's already queued, just add on to it */ - readQueuePos = cache->index[physicalPage] & ~VOLUME_CACHE_QUEUED_FLAG; - } - - int result = ASSERT((readQueuePos < cache->readQueueMaxSize), - "queue is not overfull"); - if (result != UDS_SUCCESS) { - return result; - } - - request->nextRequest = NULL; - if (cache->readQueue[readQueuePos].requestList.first == NULL) { - cache->readQueue[readQueuePos].requestList.first = request; - } else { - cache->readQueue[readQueuePos].requestList.last->nextRequest = request; - } - cache->readQueue[readQueuePos].requestList.last = request; - return UDS_QUEUED; -} - -/***********************************************************************/ -bool reserveReadQueueEntry(PageCache *cache, - unsigned int *queuePos, - Request **firstRequest, - unsigned int *physicalPage, - bool *invalid) -{ - // We hold the readThreadsMutex. 
- uint16_t lastRead = cache->readQueueLastRead; - - // No items to dequeue - if (lastRead == cache->readQueueLast) { - return false; - } - - unsigned int pageNo = cache->readQueue[lastRead].physicalPage; - bool isInvalid = cache->readQueue[lastRead].invalid; - - uint16_t indexValue = cache->index[pageNo]; - bool queued = (indexValue & VOLUME_CACHE_QUEUED_FLAG) != 0; - - // ALB-1429 ... need to check to see if its still queued before resetting - if (isInvalid && queued) { - // invalidate cache index slot - WRITE_ONCE(cache->index[pageNo], cache->numCacheEntries); - } - - // If a sync read has taken this page, set invalid to true so we don't - // overwrite, we simply just requeue requests. - if (!queued) { - isInvalid = true; - } - - cache->readQueue[lastRead].reserved = true; - - *queuePos = lastRead; - *firstRequest = cache->readQueue[lastRead].requestList.first; - *physicalPage = pageNo; - *invalid = isInvalid; - cache->readQueueLastRead = (lastRead + 1) % cache->readQueueMaxSize; - - return true; -} - -/************************************************************************/ -void releaseReadQueueEntry(PageCache *cache, unsigned int queuePos) -{ - // We hold the readThreadsMutex. - cache->readQueue[queuePos].reserved = false; - - uint16_t lastRead = cache->readQueueLastRead; - - // Move the readQueueFirst pointer along when we can - while ((cache->readQueueFirst != lastRead) - && (!cache->readQueue[cache->readQueueFirst].reserved)) { - cache->readQueueFirst = - (cache->readQueueFirst + 1) % cache->readQueueMaxSize; - } -} - -/***********************************************************************/ -int selectVictimInCache(PageCache *cache, - CachedPage **pagePtr) -{ - // We hold the readThreadsMutex. - if (cache == NULL) { - return logWarningWithStringError(UDS_BAD_STATE, - "cannot put page in NULL cache"); - } - - CachedPage *page = NULL; - int result = getLeastRecentPage(cache, &page); - if (result != UDS_SUCCESS) { - return result; - } - - result = ASSERT((page != NULL), "least recent page was not NULL"); - if (result != UDS_SUCCESS) { - return result; - } - - // If the page is currently being pointed to by the page map, clear - // it from the page map, and update cache stats - if (page->cp_physicalPage != cache->numIndexEntries) { - cache->counters.evictions++; - WRITE_ONCE(cache->index[page->cp_physicalPage], cache->numCacheEntries); - waitForPendingSearches(cache, page->cp_physicalPage); - } - - page->cp_readPending = true; - - *pagePtr = page; - - return UDS_SUCCESS; -} - -/***********************************************************************/ -int putPageInCache(PageCache *cache, - unsigned int physicalPage, - CachedPage *page) -{ - // We hold the readThreadsMutex. 
- if (cache == NULL) { - return logWarningWithStringError(UDS_BAD_STATE, - "cannot complete page in NULL cache"); - } - - int result = ASSERT((page != NULL), "page to install exists"); - if (result != UDS_SUCCESS) { - return result; - } - - result = ASSERT((page->cp_readPending), - "page to install has a pending read"); - if (result != UDS_SUCCESS) { - return result; - } - - clearPage(cache, page); - - page->cp_physicalPage = physicalPage; - - // Figure out the index into the cache array using pointer arithmetic - uint16_t value = page - cache->cache; - result = ASSERT((value < cache->numCacheEntries), "cache index is valid"); - if (result != UDS_SUCCESS) { - return result; - } - - makePageMostRecent(cache, page); - - page->cp_readPending = false; - - /* - * We hold the readThreadsMutex, but we must have a write memory barrier - * before making the CachedPage available to the readers that do not hold the - * mutex. The corresponding read memory barrier is in getPageNoStats. - */ - smp_wmb(); - - // Point the page map to the new page. Will clear queued flag - WRITE_ONCE(cache->index[physicalPage], value); - - return UDS_SUCCESS; -} - -/***********************************************************************/ -void cancelPageInCache(PageCache *cache, - unsigned int physicalPage, - CachedPage *page) -{ - // We hold the readThreadsMutex. - if (cache == NULL) { - logWarning("cannot cancel page in NULL cache"); - return; - } - - int result = ASSERT((page != NULL), "page to install exists"); - if (result != UDS_SUCCESS) { - return; - } - - result = ASSERT((page->cp_readPending), - "page to install has a pending read"); - if (result != UDS_SUCCESS) { - return; - } - - clearPage(cache, page); - page->cp_readPending = false; - - // Clear the page map for the new page. Will clear queued flag - WRITE_ONCE(cache->index[physicalPage], cache->numCacheEntries); -} - -/**********************************************************************/ -size_t getPageCacheSize(PageCache *cache) -{ - if (cache == NULL) { - return 0; - } - return sizeof(DeltaIndexPage) * cache->numCacheEntries; -} - diff --git a/uds/pageCache.h b/uds/pageCache.h deleted file mode 100644 index d639b4a..0000000 --- a/uds/pageCache.h +++ /dev/null @@ -1,504 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/jasper/src/uds/pageCache.h#5 $ - */ - -#ifndef PAGE_CACHE_H -#define PAGE_CACHE_H - -#include "atomicDefs.h" -#include "cacheCounters.h" -#include "chapterIndex.h" -#include "common.h" -#include "compiler.h" -#include "indexConfig.h" -#include "opaqueTypes.h" -#include "permassert.h" -#include "request.h" -#include "volumeStore.h" - -typedef struct requestList { - Request *first; - Request *last; -} RequestList; - -typedef struct cachedPage { - /* whether this page is currently being read asynchronously */ - bool cp_readPending; - /* if equal to numCacheEntries, the page is invalid */ - unsigned int cp_physicalPage; - /* the value of the volume clock when this page was last used */ - int64_t cp_lastUsed; - /* the cache page data */ - struct volume_page cp_pageData; - /* the chapter index page. This is here, even for record pages */ - DeltaIndexPage cp_indexPage; -} CachedPage; - -enum { - VOLUME_CACHE_MAX_ENTRIES = (UINT16_MAX >> 1), - VOLUME_CACHE_QUEUED_FLAG = (1 << 15), - VOLUME_CACHE_DEFAULT_MAX_QUEUED_READS = 4096 -}; - -typedef struct queuedRead { - /* whether this queue entry is invalid */ - bool invalid; - /* whether this queue entry has a pending read on it */ - bool reserved; - /* physical page to read */ - unsigned int physicalPage; - /* list of requests waiting on a queued read */ - RequestList requestList; -} QueuedRead; - -// Reason for invalidating a cache entry, used for gathering statistics -typedef enum invalidationReason { - INVALIDATION_EVICT, // cache is full, goodbye - INVALIDATION_EXPIRE, // your chapter is being overwritten - INVALIDATION_ERROR, // error happened; don't try to use data - INVALIDATION_INIT_SHUTDOWN -} InvalidationReason; - -/* - * Value stored atomically in a SearchPendingCounter. The low order 32 bits is - * the physical page number of the cached page being read. The high order 32 - * bits is a sequence number. - * - * An InvalidateCounter is only written by its zone thread by calling the - * beginPendingSearch or endPendingSearch methods. - * - * Any other thread that is accessing an InvalidateCounter is reading the value - * in the waitForPendingSearches method. - */ -typedef int64_t InvalidateCounter; -// Fields of InvalidateCounter. -// These must be 64 bit, so an enum cannot be not used. -#define PAGE_FIELD ((long)UINT_MAX) // The page number field -#define COUNTER_LSB (PAGE_FIELD + 1L) // The LSB of the counter field - -typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) { - atomic64_t atomicValue; -} SearchPendingCounter; - -typedef struct pageCache { - // Geometry governing the volume - const Geometry *geometry; - // The number of zones - unsigned int zoneCount; - // The number of index entries - unsigned int numIndexEntries; - // The max number of cached entries - uint16_t numCacheEntries; - // The index used to quickly access page in cache - top bit is a 'queued' - // flag - uint16_t *index; - // The cache - CachedPage *cache; - // A counter for each zone to keep track of when a search is occurring - // within that zone. - SearchPendingCounter *searchPendingCounters; - // Queued reads, as a circular array, with first and last indexes - QueuedRead *readQueue; - // Cache counters for stats. This is the first field of a PageCache that is - // not constant after the struct is initialized. - CacheCounters counters; - /** - * Entries are enqueued at readQueueLast. - * To 'reserve' entries, we get the entry pointed to by readQueueLastRead - * and increment last read. 
This is done with a lock so if another reader - * thread reserves a read, it will grab the next one. After every read - * is completed, the reader thread calls releaseReadQueueEntry which - * increments readQueueFirst until it is equal to readQueueLastRead, but only - * if the value pointed to by readQueueFirst is no longer pending. - * This means that if n reads are outstanding, readQueueFirst may not - * be incremented until the last of the reads finishes. - * - * First Last - * || | | | | | || - * LR (1) (2) - * - * Read thread 1 increments last read (1), then read thread 2 increments it - * (2). When each read completes, it checks to see if it can increment first, - * when all concurrent reads have completed, readQueueFirst should equal - * readQueueLastRead. - **/ - uint16_t readQueueFirst; - uint16_t readQueueLastRead; - uint16_t readQueueLast; - // The size of the read queue - unsigned int readQueueMaxSize; - // Page access counter - atomic64_t clock; -} PageCache; - -/** - * Allocate a cache for a volume. - * - * @param geometry The geometry governing the volume - * @param chaptersInCache The size (in chapters) of the page cache - * @param readQueueMaxSize The maximum size of the read queue - * @param zoneCount The number of zones in the index - * @param cachePtr A pointer to hold the new page cache - * - * @return UDS_SUCCESS or an error code - **/ -int makePageCache(const Geometry *geometry, - unsigned int chaptersInCache, - unsigned int readQueueMaxSize, - unsigned int zoneCount, - PageCache **cachePtr) - __attribute__((warn_unused_result)); - -/** - * Clean up a volume's cache - * - * @param cache the volumecache - **/ -void freePageCache(PageCache *cache); - -/** - * Invalidates a page cache for a particular chapter - * - * @param cache the page cache - * @param chapter the chapter - * @param pagesPerChapter the number of pages per chapter - * @param reason the reason for invalidation - * - * @return UDS_SUCCESS or an error code - **/ -int invalidatePageCacheForChapter(PageCache *cache, - unsigned int chapter, - unsigned int pagesPerChapter, - InvalidationReason reason) - __attribute__((warn_unused_result)); - -/** - * Find a page, invalidate it, and make its memory the least recent. This - * method is only exposed for the use of unit tests. - * - * @param cache The cache containing the page - * @param physicalPage The id of the page to invalidate - * @param readQueue The queue of pending reads (may be NULL) - * @param reason The reason for the invalidation, for stats - * @param mustFind If true, it is an error if the page - * can't be found - * - * @return UDS_SUCCESS or an error code - **/ -int findInvalidateAndMakeLeastRecent(PageCache *cache, - unsigned int physicalPage, - QueuedRead *readQueue, - InvalidationReason reason, - bool mustFind); - -/** - * Make the page the most recent in the cache - * - * @param cache the page cache - * @param pagePtr the page to make most recent - * - * @return UDS_SUCCESS or an error code - **/ -void makePageMostRecent(PageCache *cache, CachedPage *pagePtr); - -/** - * Verifies that a page is in the cache. This method is only exposed for the - * use of unit tests. - * - * @param cache the cache to verify - * @param page the page to find - * - * @return UDS_SUCCESS or an error code - **/ -int assertPageInCache(PageCache *cache, CachedPage *page) - __attribute__((warn_unused_result)); - -/** - * Gets a page from the cache. 
- * - * @param [in] cache the page cache - * @param [in] physicalPage the page number - * @param [in] probeType the type of cache access being done (CacheProbeType - * optionally OR'ed with CACHE_PROBE_IGNORE_FAILURE) - * @param [out] pagePtr the found page - * - * @return UDS_SUCCESS or an error code - **/ -int getPageFromCache(PageCache *cache, - unsigned int physicalPage, - int probeType, - CachedPage **pagePtr) - __attribute__((warn_unused_result)); - -/** - * Enqueue a read request - * - * @param cache the page cache - * @param request the request that depends on the read - * @param physicalPage the physicalPage for the request - * - * @return UDS_QUEUED if the page was queued - * UDS_SUCCESS if the queue was full - * an error code if there was an error - **/ -int enqueueRead(PageCache *cache, Request *request, unsigned int physicalPage) - __attribute__((warn_unused_result)); - -/** - * Reserves a queued read for future dequeuing, but does not remove it from - * the queue. Must call releaseReadQueueEntry to complete the process - * - * @param cache the page cache - * @param queuePos the position in the read queue for this pending read - * @param firstRequests list of requests for the pending read - * @param physicalPage the physicalPage for the requests - * @param invalid whether or not this entry is invalid - * - * @return UDS_SUCCESS or an error code - **/ -bool reserveReadQueueEntry(PageCache *cache, - unsigned int *queuePos, - Request **firstRequests, - unsigned int *physicalPage, - bool *invalid); - -/** - * Releases a read from the queue, allowing it to be reused by future - * enqueues - * - * @param cache the page cache - * @param queuePos queue entry position - * - * @return UDS_SUCCESS or an error code - **/ -void releaseReadQueueEntry(PageCache *cache, - unsigned int queuePos); - -/** - * Check for the page cache read queue being empty. - * - * @param cache the page cache for which to check the read queue. - * - * @return true if the read queue for cache is empty, false otherwise. - **/ -static INLINE bool readQueueIsEmpty(PageCache *cache) -{ - return (cache->readQueueFirst == cache->readQueueLast); -} - -/** - * Check for the page cache read queue being full. - * - * @param cache the page cache for which to check the read queue. - * - * @return true if the read queue for cache is full, false otherwise. - **/ -static INLINE bool readQueueIsFull(PageCache *cache) -{ - return (cache->readQueueFirst == - (cache->readQueueLast + 1) % cache->readQueueMaxSize); -} - -/** - * Selects a page in the cache to be used for a read. - * - * This will clear the pointer in the page map and - * set readPending to true on the cache page - * - * @param cache the page cache - * @param pagePtr the page to add - * - * @return UDS_SUCCESS or an error code - **/ -int selectVictimInCache(PageCache *cache, - CachedPage **pagePtr) - __attribute__((warn_unused_result)); - -/** - * Completes an async page read in the cache, so that - * the page can now be used for incoming requests. - * - * This will invalidate the old cache entry and point - * the page map for the new page to this entry - * - * @param cache the page cache - * @param physicalPage the page number - * @param page the page to complete processing on - * - * @return UDS_SUCCESS or an error code - **/ -int putPageInCache(PageCache *cache, - unsigned int physicalPage, - CachedPage *page) - __attribute__((warn_unused_result)); - -/** - * Cancels an async page read in the cache, so that - * the page can now be used for incoming requests. 
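 - *
 - * A read thread is expected to pair this with selectVictimInCache() and
 - * putPageInCache() roughly as in this sketch (readPageData is a
 - * hypothetical stand-in for the actual page read):
 - *
 - *   selectVictimInCache(cache, &page);
 - *   if (readPageData(volume, physicalPage, page) == UDS_SUCCESS) {
 - *     putPageInCache(cache, physicalPage, page);
 - *   } else {
 - *     cancelPageInCache(cache, physicalPage, page);
 - *   }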
- * - * This will invalidate the old cache entry and clear - * the read queued flag on the page map entry, if it - * was set. - * - * @param cache the page cache - * @param physicalPage the page number to clear the queued read flag on - * @param page the page to cancel processing on - * - * @return UDS_SUCCESS or an error code - **/ -void cancelPageInCache(PageCache *cache, - unsigned int physicalPage, - CachedPage *page); - -/** - * Get the page cache size - * - * @param cache the page cache - * - * @return the size of the page cache - **/ -size_t getPageCacheSize(PageCache *cache) - __attribute__((warn_unused_result)); - - -/** - * Read the InvalidateCounter for the given zone. - * - * @param cache the page cache - * @param zoneNumber the zone number - * - * @return the InvalidateCounter value - **/ -static INLINE InvalidateCounter getInvalidateCounter(PageCache *cache, - unsigned int zoneNumber) -{ - return atomic64_read(&cache->searchPendingCounters[zoneNumber].atomicValue); -} - -/** - * Write the InvalidateCounter for the given zone. - * - * @param cache the page cache - * @param zoneNumber the zone number - * @param invalidateCounter the InvalidateCounter value to write - **/ -static INLINE void setInvalidateCounter(PageCache *cache, - unsigned int zoneNumber, - InvalidateCounter invalidateCounter) -{ - atomic64_set(&cache->searchPendingCounters[zoneNumber].atomicValue, - invalidateCounter); -} - -/** - * Return the physical page number of the page being searched. The return - * value is only valid if searchPending indicates that a search is in progress. - * - * @param counter the InvalidateCounter value to check - * - * @return the page that the zone is searching - **/ -static INLINE unsigned int pageBeingSearched(InvalidateCounter counter) -{ - return counter & PAGE_FIELD; -} - -/** - * Determines whether a given value indicates that a search is occuring. - * - * @param invalidateCounter the InvalidateCounter value to check - * - * @return true if a search is pending, false otherwise - **/ -static INLINE bool searchPending(InvalidateCounter invalidateCounter) -{ - return (invalidateCounter & COUNTER_LSB) != 0; -} - -/** - * Determines whether there is a search occuring for the given zone. - * - * @param cache the page cache - * @param zoneNumber the zone number - * - * @return true if a search is pending, false otherwise - **/ -static INLINE bool isSearchPending(PageCache *cache, - unsigned int zoneNumber) -{ - return searchPending(getInvalidateCounter(cache, zoneNumber)); -} - -/** - * Increment the counter for the specified zone to signal that a search has - * begun. Also set which page is being searched. The searchPendingCounters - * are protecting read access to pages indexed by the cache. This is the - * "lock" action. - * - * @param cache the page cache - * @param physicalPage the page that the zone is searching - * @param zoneNumber the zone number - **/ -static INLINE void beginPendingSearch(PageCache *cache, - unsigned int physicalPage, - unsigned int zoneNumber) -{ - InvalidateCounter invalidateCounter = getInvalidateCounter(cache, - zoneNumber); - invalidateCounter &= ~PAGE_FIELD; - invalidateCounter |= physicalPage; - invalidateCounter += COUNTER_LSB; - setInvalidateCounter(cache, zoneNumber, invalidateCounter); - ASSERT_LOG_ONLY(searchPending(invalidateCounter), - "Search is pending for zone %u", zoneNumber); - /* - * This memory barrier ensures that the write to the invalidate counter is - * seen by other threads before this threads accesses the cached page. 
The - * corresponding read memory barrier is in waitForPendingSearches. - */ - smp_mb(); -} - -/** - * Increment the counter for the specified zone to signal that a search has - * finished. We do not need to reset the page since we only should ever look - * at the page value if the counter indicates a search is ongoing. The - * searchPendingCounters are protecting read access to pages indexed by the - * cache. This is the "unlock" action. - * - * @param cache the page cache - * @param zoneNumber the zone number - **/ -static INLINE void endPendingSearch(PageCache *cache, - unsigned int zoneNumber) -{ - // This memory barrier ensures that this thread completes reads of the - // cached page before other threads see the write to the invalidate counter. - smp_mb(); - - InvalidateCounter invalidateCounter = getInvalidateCounter(cache, - zoneNumber); - ASSERT_LOG_ONLY(searchPending(invalidateCounter), - "Search is pending for zone %u", zoneNumber); - invalidateCounter += COUNTER_LSB; - setInvalidateCounter(cache, zoneNumber, invalidateCounter); -} - -#endif /* PAGE_CACHE_H */ diff --git a/uds/permassert.c b/uds/permassert.c deleted file mode 100644 index 0c8afeb..0000000 --- a/uds/permassert.c +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/permassert.c#1 $ - */ - -#include "permassert.h" -#include "permassertInternals.h" - -#include "errors.h" - -/*****************************************************************************/ -int assertionFailed(const char *expressionString, - int code, - const char *fileName, - int lineNumber, - const char *format, - ...) -{ - va_list args; - va_start(args, format); - handleAssertionFailure(expressionString, fileName, lineNumber, format, args); - va_end(args); - - return code; -} - -/*****************************************************************************/ -int assertionFailedLogOnly(const char *expressionString, - const char *fileName, - int lineNumber, - const char *format, - ...) -{ - va_list args; - va_start(args, format); - handleAssertionFailure(expressionString, fileName, lineNumber, format, args); - va_end(args); - - return UDS_ASSERTION_FAILED; -} diff --git a/uds/permassert.h b/uds/permassert.h deleted file mode 100644 index d04336b..0000000 --- a/uds/permassert.h +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/permassert.h#1 $ - */ - -#ifndef PERMASSERT_H -#define PERMASSERT_H - -#include "compiler.h" -#include "errors.h" -#include "uds-error.h" - -#define STRINGIFY(X) #X -#define STRINGIFY_VALUE(X) STRINGIFY(X) - -/* - * A hack to apply the "warn if unused" attribute to an integral expression. - * - * Since GCC doesn't propagate the warn_unused_result attribute to - * conditional expressions incorporating calls to functions with that - * attribute, this function can be used to wrap such an expression. - * With optimization enabled, this function contributes no additional - * instructions, but the warn_unused_result attribute still applies to - * the code calling it. - * - * @param value The value to return - * - * @return The supplied value - */ -__attribute__((warn_unused_result)) -static INLINE int mustUse(int value) -{ - return value; -} - -/* - * A replacement for assert() from assert.h. - * - * @param expr The boolean expression being asserted - * @param code The error code to return on non-fatal assertion - * failure - * @param format A printf() style format for the message to log on - * assertion failure - * @param arguments Any additional arguments required by the format - * - * @return UDS_SUCCESS If expr is true, code if expr is false and - * exitOnAssertionFailure is false. When exitOnAssertionFailure - * is true and expr is false, the program will exit from within - * this macro. - */ -#define ASSERT_WITH_ERROR_CODE(expr, code, ...) \ - mustUse(__builtin_expect(!!(expr), 1) \ - ? UDS_SUCCESS \ - : assertionFailed(STRINGIFY(expr), code, __FILE__, __LINE__, \ - __VA_ARGS__)) - -/* - * A replacement for assert() from assert.h. - * - * @param expr The boolean expression being asserted - * @param format A printf() style format for the message to log on - * assertion failure - * @param arguments Any additional arguments required by the format - * - * @return UDS_SUCCESS If expr is true, UDS_ASSERTION_FAILED if expr is - * false and exitOnAssertionFailure is false. When - * exitOnAssertionFailure is true and expr is false, the - * program will exit from within this macro. - */ -#define ASSERT(expr, ...) \ - ASSERT_WITH_ERROR_CODE(expr, UDS_ASSERTION_FAILED, __VA_ARGS__) - -/* - * A replacement for assert() which logs on failure, but does not return an - * error code. This should be used sparingly. If the expression is false and - * exitOnAssertionFailure is true, the program will exit from within this macro. - * - * @param expr The boolean expression being asserted - * @param format A printf() syle format for the message to log on - * assertion failure - * @param arguments Any additional arguments required by the format - */ -#define ASSERT_LOG_ONLY(expr, ...) \ - (__builtin_expect(!!(expr), 1) \ - ? UDS_SUCCESS \ - : assertionFailedLogOnly(STRINGIFY(expr), __FILE__, __LINE__, __VA_ARGS__)) - -/* - * This macro is a convenient wrapper for ASSERT(false, ...). - */ -#define ASSERT_FALSE(...) 
\ - ASSERT(false, __VA_ARGS__) - -#define STATIC_ASSERT(expr) \ - do { \ - switch (0) { \ - case 0: \ - case expr: \ - ; \ - default: \ - ; \ - } \ - } while(0) - -#define STATIC_ASSERT_SIZEOF(type, expectedSize) \ - STATIC_ASSERT(sizeof(type) == (expectedSize)) - -/** - * Set whether or not to exit on an assertion failure. - * - * @param shouldExit If true assertion failures will cause - * the program to exit - * - * @return The previous setting - **/ -bool setExitOnAssertionFailure(bool shouldExit); - -/** - * Log an assertion failure. - * - * @param expressionString The assertion - * @param errorCode The error code to return - * @param fileName The file in which the assertion appears - * @param lineNumber The line number on which the assertion - * appears - * @param format A printf() style format describing the - * assertion - * - * @return The supplied errorCode unless exitOnAssertionFailure is - * true, in which case the process will be aborted - **/ -int assertionFailed(const char *expressionString, - int errorCode, - const char *fileName, - int lineNumber, - const char *format, - ...) - __attribute__((format(printf, 5, 6), warn_unused_result)); - -/** - * Log an assertion failure. This function is different from - * assertionFailed() in that its return value may be ignored, and so should - * only be used in cases where the return value will be ignored. - * - * @param expressionString The assertion - * @param fileName The file in which the assertion appears - * @param lineNumber The line number on which the assertion - * appears - * @param format A printf() style format describing the - * assertion - * - * @return UDS_ASSERTION_FAILED unless exitOnAssertionFailure is - * true, in which case the process will be aborted - **/ -int assertionFailedLogOnly(const char *expressionString, - const char *fileName, - int lineNumber, - const char *format, - ...) - __attribute__((format(printf, 4, 5))); - -#endif /* PERMASSERT_H */ diff --git a/uds/permassertInternals.h b/uds/permassertInternals.h deleted file mode 100644 index f0a3b95..0000000 --- a/uds/permassertInternals.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/jasper/src/uds/permassertInternals.h#1 $ - */ - -#ifndef PERMASSERT_INTERNALS_H -#define PERMASSERT_INTERNALS_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -void handleAssertionFailure(const char *expressionString, - const char *fileName, - int lineNumber, - const char *format, - va_list args) - __attribute__((format(printf, 4, 0))); - -#ifdef __cplusplus -} /* extern "C" */ -#endif - -#endif /* PERMASSERT_INTERNALS_H */ diff --git a/uds/permassertLinuxKernel.c b/uds/permassertLinuxKernel.c deleted file mode 100644 index 67f66d9..0000000 --- a/uds/permassertLinuxKernel.c +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/kernelLinux/uds/permassertLinuxKernel.c#1 $ - */ - -#include "logger.h" -#include "permassert.h" -#include "permassertInternals.h" - -/**********************************************************************/ -__attribute__((format(printf, 4, 0))) -void handleAssertionFailure(const char *expressionString, - const char *fileName, - int lineNumber, - const char *format, - va_list args) -{ - logEmbeddedMessage(LOG_ERR, "assertion \"", format, args, - "\" (%s) failed at %s:%d", - expressionString, fileName, lineNumber); - logBacktrace(LOG_ERR); -} diff --git a/uds/random.c b/uds/random.c deleted file mode 100644 index acad146..0000000 --- a/uds/random.c +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/jasper/src/uds/random.c#2 $ - */ - -#include "random.h" - -#include "permassert.h" - -/*****************************************************************************/ -unsigned int randomInRange(unsigned int lo, unsigned int hi) -{ - return lo + random() % (hi - lo + 1); -} - -/*****************************************************************************/ -void randomCompileTimeAssertions(void) -{ - STATIC_ASSERT((((uint64_t) RAND_MAX + 1) & RAND_MAX) == 0); -} - -#ifndef __KERNEL__ -/*****************************************************************************/ -void fillRandomly(void *ptr, size_t len) -{ - uint64_t randNum = 0; - uint64_t randMask = 0; - const uint64_t multiplier = (uint64_t) RAND_MAX + 1; - - byte *bp = ptr; - for (size_t i = 0; i < len; ++i) { - if (randMask < 0xff) { - randNum = randNum * multiplier + random(); - randMask = randMask * multiplier + RAND_MAX; - } - bp[i] = randNum & 0xff; - randNum >>= 8; - randMask >>= 8; - } -} -#endif diff --git a/uds/random.h b/uds/random.h deleted file mode 100644 index f5d2f49..0000000 --- a/uds/random.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/random.h#2 $ - */ - -#ifndef RANDOM_H -#define RANDOM_H - -#ifdef __KERNEL__ -#include -#else -#include -#endif - -#include "compiler.h" -#include "typeDefs.h" - -/** - * Get random unsigned integer in a given range - * - * @param lo Minimum unsigned integer value - * @param hi Maximum unsigned integer value - * - * @return unsigned integer in the interval [lo,hi] - **/ -unsigned int randomInRange(unsigned int lo, unsigned int hi); - -/** - * Special function wrapper required for compile-time assertions. This - * function will fail to compile if RAND_MAX is not of the form 2^n - 1. - **/ -void randomCompileTimeAssertions(void); - -/** - * Fill bytes with random data. - * - * @param ptr where to store bytes - * @param len number of bytes to write - **/ -#ifdef __KERNEL__ -static INLINE void fillRandomly(void *ptr, size_t len) -{ - prandom_bytes(ptr, len); -} -#else -void fillRandomly(void *ptr, size_t len); -#endif - -#ifdef __KERNEL__ -#define RAND_MAX 2147483647 - -/** - * Random number generator - * - * @return a random number in the rand 0 to RAND_MAX - **/ -static INLINE long random(void) -{ - long value; - fillRandomly(&value, sizeof(value)); - return value & RAND_MAX; -} -#endif - -#endif /* RANDOM_H */ diff --git a/uds/recordPage.c b/uds/recordPage.c deleted file mode 100644 index f4c2572..0000000 --- a/uds/recordPage.c +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
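/*
 * Editor's illustrative sketch (not part of the original random.h): typical
 * use of the helpers declared above. The buffer size and the value range are
 * arbitrary; only randomInRange() and fillRandomly() come from random.h.
 */
#include "random.h"

static void exampleRandomUsage(void)
{
  /* Pick a value uniformly from the inclusive range [0, 1023]. */
  unsigned int choice = randomInRange(0, 1023);

  /* Fill a small buffer with random bytes; in user space this consumes one
   * byte at a time from the 64-bit pool maintained by fillRandomly(), and in
   * the kernel it is a thin wrapper around prandom_bytes(). */
  byte buffer[16];
  fillRandomly(buffer, sizeof(buffer));

  (void) choice;
  (void) buffer;
}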
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/recordPage.c#3 $ - */ - -#include "recordPage.h" - -#include "permassert.h" - -/**********************************************************************/ -static unsigned int encodeTree(byte recordPage[], - const UdsChunkRecord *sortedPointers[], - unsigned int nextRecord, - unsigned int node, - unsigned int nodeCount) -{ - if (node < nodeCount) { - unsigned int child = (2 * node) + 1; - nextRecord = encodeTree(recordPage, sortedPointers, nextRecord, - child, nodeCount); - - // In-order traversal: copy the contents of the next record - // into the page at the node offset. - memcpy(&recordPage[node * BYTES_PER_RECORD], - sortedPointers[nextRecord], - BYTES_PER_RECORD); - ++nextRecord; - - nextRecord = encodeTree(recordPage, sortedPointers, nextRecord, - child + 1, nodeCount); - } - return nextRecord; -} - -/**********************************************************************/ -int encodeRecordPage(const Volume *volume, - const UdsChunkRecord records[], - byte recordPage[]) -{ - unsigned int recordsPerPage = volume->geometry->recordsPerPage; - const UdsChunkRecord **recordPointers = volume->recordPointers; - - // Build an array of record pointers. We'll sort the pointers by the block - // names in the records, which is less work than sorting the record values. - unsigned int i; - for (i = 0; i < recordsPerPage; i++) { - recordPointers[i] = &records[i]; - } - - STATIC_ASSERT(offsetof(UdsChunkRecord, name) == 0); - int result = radixSort(volume->radixSorter, (const byte **) recordPointers, - recordsPerPage, UDS_CHUNK_NAME_SIZE); - if (result != UDS_SUCCESS) { - return result; - } - - // Use the sorted pointers to copy the records from the chapter to the - // record page in tree order. - encodeTree(recordPage, recordPointers, 0, 0, recordsPerPage); - return UDS_SUCCESS; -} - -/**********************************************************************/ -bool searchRecordPage(const byte recordPage[], - const UdsChunkName *name, - const Geometry *geometry, - UdsChunkData *metadata) -{ - // The record page is just an array of chunk records. - const UdsChunkRecord *records = (const UdsChunkRecord *) recordPage; - - // The array of records is sorted by name and stored as a binary tree in - // heap order, so the root of the tree is the first array element. - unsigned int node = 0; - while (node < geometry->recordsPerPage) { - const UdsChunkRecord *record = &records[node]; - int result = memcmp(name, &record->name, UDS_CHUNK_NAME_SIZE); - if (result == 0) { - if (metadata != NULL) { - *metadata = record->data; - } - return true; - } - // The children of node N are in the heap at indexes 2N+1 and 2N+2. - node = ((2 * node) + ((result < 0) ? 
1 : 2)); - } - return false; -} diff --git a/uds/recordPage.h b/uds/recordPage.h deleted file mode 100644 index ecf9ddc..0000000 --- a/uds/recordPage.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/recordPage.h#2 $ - */ - -#ifndef RECORDPAGE_H -#define RECORDPAGE_H 1 - -#include "common.h" -#include "volume.h" - -/** - * Generate the on-disk encoding of a record page from the list of records - * in the open chapter representation. - * - * @param volume The volume - * @param records The records to be encoded - * @param recordPage The record page - * - * @return UDS_SUCCESS or an error code - **/ -int encodeRecordPage(const Volume *volume, - const UdsChunkRecord records[], - byte recordPage[]); - -/** - * Find the metadata for a given block name in this page. - * - * @param recordPage The record page - * @param name The block name to look for - * @param geometry The geometry of the volume - * @param metadata an array in which to place the metadata of the - * record, if one was found - * - * @return true if the record was found - **/ -bool searchRecordPage(const byte recordPage[], - const UdsChunkName *name, - const Geometry *geometry, - UdsChunkData *metadata); - -#endif /* RECORDPAGE_H */ diff --git a/uds/regionIdentifiers.h b/uds/regionIdentifiers.h deleted file mode 100644 index ff72b19..0000000 --- a/uds/regionIdentifiers.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
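/*
 * Editor's illustrative sketch (not part of the original recordPage files):
 * the heap-ordered binary tree layout used by encodeRecordPage() and
 * searchRecordPage() above, demonstrated on plain integers. The children of
 * node N live at array indexes 2N+1 and 2N+2, so an in-order walk of the
 * array visits the keys in sorted order and a search can descend without any
 * explicit pointers. The function name and sample values are hypothetical.
 */
#include <stdbool.h>

static bool exampleHeapOrderedSearch(const int tree[],
                                     unsigned int nodeCount,
                                     int key)
{
  unsigned int node = 0;
  while (node < nodeCount) {
    if (key == tree[node]) {
      return true;
    }
    /* Left child for smaller keys, right child for larger keys. */
    node = (2 * node) + ((key < tree[node]) ? 1 : 2);
  }
  return false;
}

/*
 * For instance, the sorted keys 10..70 are stored as { 40, 20, 60, 10, 30,
 * 50, 70 }: 40 is the root, 20 and 60 are its children, and an in-order
 * traversal of that array yields 10, 20, 30, 40, 50, 60, 70, which is the
 * property encodeTree() exploits when copying sorted records into the page.
 */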
- * - * $Id: //eng/uds-releases/jasper/src/uds/regionIdentifiers.h#1 $ - */ - -#ifndef REGION_IDENTIFIERS_H -#define REGION_IDENTIFIERS_H - -enum { - RH_TYPE_FREE = 0, // unused - RH_TYPE_SUPER = 1, - RH_TYPE_SAVE = 2, - RH_TYPE_CHECKPOINT = 3, - RH_TYPE_UNSAVED = 4, - - RL_KIND_SCRATCH = 0, // uninitialized or scrapped - RL_KIND_HEADER = 1, // for self-referential items - RL_KIND_CONFIG = 100, - RL_KIND_INDEX = 101, - RL_KIND_SEAL = 102, - RL_KIND_VOLUME = 201, - RL_KIND_SAVE = 202, - RL_KIND_INDEX_PAGE_MAP = 301, - RL_KIND_MASTER_INDEX = 302, - RL_KIND_OPEN_CHAPTER = 303, - RL_KIND_INDEX_STATE = 401, // not saved as region - - RL_SOLE_INSTANCE = 65535, -}; - -typedef unsigned int RegionType; -typedef unsigned int RegionKind; - -#endif // REGION_IDENTIFIERS_H diff --git a/uds/request.c b/uds/request.c deleted file mode 100644 index c994181..0000000 --- a/uds/request.c +++ /dev/null @@ -1,258 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/request.c#6 $ - */ - -#include "request.h" - -#include "indexRouter.h" -#include "indexSession.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "requestQueue.h" - -/**********************************************************************/ -int udsStartChunkOperation(UdsRequest *udsRequest) -{ - if (udsRequest->callback == NULL) { - return UDS_CALLBACK_REQUIRED; - } - switch (udsRequest->type) { - case UDS_DELETE: - case UDS_POST: - case UDS_QUERY: - case UDS_UPDATE: - break; - default: - return UDS_INVALID_OPERATION_TYPE; - } - memset(udsRequest->private, 0, sizeof(udsRequest->private)); - Request *request = (Request *)udsRequest; - - int result = getIndexSession(request->session); - if (result != UDS_SUCCESS) { - return sansUnrecoverable(result); - } - - request->found = false; - request->action = (RequestAction) request->type; - request->isControlMessage = false; - request->unbatched = false; - request->router = request->session->router; - - enqueueRequest(request, STAGE_TRIAGE); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int launchZoneControlMessage(RequestAction action, - ZoneMessage message, - unsigned int zone, - IndexRouter *router) -{ - Request *request; - int result = ALLOCATE(1, Request, __func__, &request); - if (result != UDS_SUCCESS) { - return result; - } - - request->router = router; - request->isControlMessage = true; - request->unbatched = true; - request->action = action; - request->zoneNumber = zone; - request->zoneMessage = message; - - enqueueRequest(request, STAGE_INDEX); - return UDS_SUCCESS; -} - -/**********************************************************************/ -void freeRequest(Request *request) -{ - if (request != NULL) { - FREE(request); - } -} - 
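/*
 * Editor's illustrative sketch (not part of the original request.c): how a
 * client would drive udsStartChunkOperation() above. The session setup and
 * the callback body are hypothetical, and the UdsChunkCallback signature is
 * assumed from the public uds.h API; udsStartChunkOperation() itself rejects
 * a request with no callback (UDS_CALLBACK_REQUIRED) or with a type other
 * than UDS_POST, UDS_UPDATE, UDS_QUERY, or UDS_DELETE
 * (UDS_INVALID_OPERATION_TYPE).
 */
static void exampleChunkCallback(UdsRequest *udsRequest)
{
  /* Runs on the callback thread once the index has finished the request. */
  if (udsRequest->status != UDS_SUCCESS) {
    /* handle or log the failure */
  }
}

static int examplePostChunk(struct uds_index_session *session,
                            const UdsChunkName       *name)
{
  UdsRequest udsRequest = {
    .callback  = exampleChunkCallback,
    .session   = session,
    .type      = UDS_POST,
    .chunkName = *name,
  };
  return udsStartChunkOperation(&udsRequest);
}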
-/**********************************************************************/ -static RequestQueue *getNextStageQueue(Request *request, - RequestStage nextStage) -{ - if (nextStage == STAGE_CALLBACK) { - return request->session->callbackQueue; - } - - // Local and remote index routers handle the rest of the pipeline - // differently, so delegate the choice of queue to the router. - return selectIndexRouterQueue(request->router, request, nextStage); -} - -/**********************************************************************/ -static void handleRequestErrors(Request *request) -{ - // XXX Use the router's callback function to hand back the error - // and clean up the request? (Possible thread issues doing that.) - - freeRequest(request); -} - -/**********************************************************************/ -void enqueueRequest(Request *request, RequestStage nextStage) -{ - RequestQueue *nextQueue = getNextStageQueue(request, nextStage); - if (nextQueue == NULL) { - handleRequestErrors(request); - return; - } - - requestQueueEnqueue(nextQueue, request); -} - -/* - * This function pointer allows unit test code to intercept the slow-lane - * requeuing of a request. - */ -static RequestRestarter requestRestarter = NULL; - -/**********************************************************************/ -void restartRequest(Request *request) -{ - request->requeued = true; - if (requestRestarter == NULL) { - enqueueRequest(request, STAGE_INDEX); - } else { - requestRestarter(request); - } -} - -/**********************************************************************/ -void setRequestRestarter(RequestRestarter restarter) -{ - requestRestarter = restarter; -} - -/**********************************************************************/ -static INLINE void increment_once(uint64_t *countPtr) -{ - WRITE_ONCE(*countPtr, READ_ONCE(*countPtr) + 1); -} - -/**********************************************************************/ -void updateRequestContextStats(Request *request) -{ - /* - * We don't need any synchronization since the context stats are only - * modified from the single callback thread. - * - * We increment either 2 or 3 counters in this method. - * - * XXX We always increment the "requests" counter. But there is no code - * that uses the value stored in this counter. - * - * We always increment exactly one of these counters (unless there is an - * error in the code, which never happens): - * postsFound postsNotFound - * updatesFound updatesNotFound - * deletionsFound deletionsNotFound - * queriesFound queriesNotFound - * - * XXX In the case of post request that were found in the index, we increment - * exactly one of these counters. But there is no code that uses the - * value stored in these counters. 
- * inMemoryPostsFound - * densePostsFound - * sparsePostsFound - */ - - SessionStats *sessionStats = &request->session->stats; - - increment_once(&sessionStats->requests); - bool found = (request->location != LOC_UNAVAILABLE); - - switch (request->action) { - case REQUEST_INDEX: - if (found) { - increment_once(&sessionStats->postsFound); - - if (request->location == LOC_IN_OPEN_CHAPTER) { - increment_once(&sessionStats->postsFoundOpenChapter); - } else if (request->location == LOC_IN_DENSE) { - increment_once(&sessionStats->postsFoundDense); - } else if (request->location == LOC_IN_SPARSE) { - increment_once(&sessionStats->postsFoundSparse); - } - } else { - increment_once(&sessionStats->postsNotFound); - } - break; - - case REQUEST_UPDATE: - if (found) { - increment_once(&sessionStats->updatesFound); - } else { - increment_once(&sessionStats->updatesNotFound); - } - break; - - case REQUEST_DELETE: - if (found) { - increment_once(&sessionStats->deletionsFound); - } else { - increment_once(&sessionStats->deletionsNotFound); - } - break; - - case REQUEST_QUERY: - if (found) { - increment_once(&sessionStats->queriesFound); - } else { - increment_once(&sessionStats->queriesNotFound); - } - break; - - default: - request->status = ASSERT(false, "unknown next action in request: %d", - request->action); - } -} - -/**********************************************************************/ -void enterCallbackStage(Request *request) -{ - if (!request->isControlMessage) { - if (isUnrecoverable(request->status)) { - // Unrecoverable errors must disable the index session - disableIndexSession(request->session); - // The unrecoverable state is internal and must not sent to the client. - request->status = sansUnrecoverable(request->status); - } - - // Handle asynchronous client callbacks in the designated thread. - enqueueRequest(request, STAGE_CALLBACK); - } else { - /* - * Asynchronous control messages are complete when they are executed. - * There should be nothing they need to do on the callback thread. The - * message has been completely processed, so just free it. - */ - freeRequest(request); - } -} diff --git a/uds/request.h b/uds/request.h deleted file mode 100644 index fb6250e..0000000 --- a/uds/request.h +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/request.h#7 $ - */ - -#ifndef REQUEST_H -#define REQUEST_H - -#include "cacheCounters.h" -#include "common.h" -#include "compiler.h" -#include "opaqueTypes.h" -#include "threads.h" -#include "timeUtils.h" -#include "uds.h" -#include "util/funnelQueue.h" - -/** - * RequestAction values indicate what action, command, or query is to be - * performed when processing a Request instance. 
- **/ -typedef enum { - // Map the API's UdsCallbackType values directly to a corresponding action. - REQUEST_INDEX = UDS_POST, - REQUEST_UPDATE = UDS_UPDATE, - REQUEST_DELETE = UDS_DELETE, - REQUEST_QUERY = UDS_QUERY, - - REQUEST_CONTROL, - - // REQUEST_SPARSE_CACHE_BARRIER is the action for the control request used - // by localIndexRouter. - REQUEST_SPARSE_CACHE_BARRIER, - - // REQUEST_ANNOUNCE_CHAPTER_CLOSED is the action for the control - // request used by an indexZone to signal the other zones that it - // has closed the current open chapter. - REQUEST_ANNOUNCE_CHAPTER_CLOSED, -} RequestAction; - -/** - * The block's rough location in the index, if any. - **/ -typedef enum { - /* the block doesn't exist or the location isn't available */ - LOC_UNAVAILABLE, - /* if the block was found in the open chapter */ - LOC_IN_OPEN_CHAPTER, - /* if the block was found in the dense part of the index */ - LOC_IN_DENSE, - /* if the block was found in the sparse part of the index */ - LOC_IN_SPARSE -} IndexRegion; - -/** - * Abstract request pipeline stages, which can also be viewed as stages in the - * life-cycle of a request. - **/ -typedef enum { - STAGE_TRIAGE, - STAGE_INDEX, - STAGE_CALLBACK, -} RequestStage; - -/** - * Control message fields for the barrier messages used to coordinate the - * addition of a chapter to the sparse chapter index cache. - **/ -typedef struct barrierMessageData { - /** virtual chapter number of the chapter index to add to the sparse cache */ - uint64_t virtualChapter; -} BarrierMessageData; - -/** - * Control message fields for the chapter closed messages used to inform - * lagging zones of the first zone to close a given open chapter. - **/ -typedef struct chapterClosedMessageData { - /** virtual chapter number of the chapter which was closed */ - uint64_t virtualChapter; -} ChapterClosedMessageData; - -/** - * Union of the all the zone control message fields. The RequestAction field - * (or launch function argument) selects which of the members is valid. - **/ -typedef union zoneMessageData { - BarrierMessageData barrier; // for REQUEST_SPARSE_CACHE_BARRIER - ChapterClosedMessageData chapterClosed; // for REQUEST_ANNOUNCE_CHAPTER_CLOSED -} ZoneMessageData; - -typedef struct zoneMessage { - /** the index to which the message is directed */ - struct index *index; - /** the message specific data */ - ZoneMessageData data; -} ZoneMessage; - -/** - * Request context for queuing throughout the uds pipeline - * - * XXX Note that the typedef for this struct defines "Request", and that this - * should therefore be "struct request". However, this conflicts with the - * Linux kernel which also has a "struct request". This is a workaround so - * that we can make upstreaming progress. The real solution is to expose - * this structure as the true "struct uds_request" and do a lot of - * renaming. - **/ -struct internalRequest { - /* - * The first part of this structure must be exactly parallel to the - * UdsRequest structure, which is part of the public UDS API. 
- */ - UdsChunkName chunkName; // hash value - UdsChunkData oldMetadata; // metadata from index - UdsChunkData newMetadata; // metadata from request - UdsChunkCallback *callback; // callback method when complete - struct uds_index_session *session; // The public index session - UdsCallbackType type; // the type of request - int status; // success or error code for this request - bool found; // True if the block was found in index - bool update; // move record to newest chapter if found - - /* - * The remainder of this structure is private to the UDS implementation. - */ - FunnelQueueEntry requestQueueLink; // for lock-free request queue - Request *nextRequest; - IndexRouter *router; - - // Data for control message requests - ZoneMessage zoneMessage; - bool isControlMessage; - - bool unbatched; // if true, must wake worker when enqueued - bool requeued; - RequestAction action; // the action for the index to perform - unsigned int zoneNumber; // the zone for this request to use - IndexRegion location; // if and where the block was found - - bool slLocationKnown; // slow lane has determined a location - IndexRegion slLocation; // location determined by slowlane -}; - -typedef void (*RequestRestarter)(Request *); - -/** - * Make an asynchronous control message for an index zone and enqueue it for - * processing. - * - * @param action The control action to perform - * @param message The message to send - * @param zone The zone number of the zone to receive the message - * @param router The index router responsible for handling the message - * - * @return UDS_SUCCESS or an error code - **/ -int launchZoneControlMessage(RequestAction action, - ZoneMessage message, - unsigned int zone, - IndexRouter *router) - __attribute__((warn_unused_result)); - -/** - * Free an index request. - * - * @param request The request to free - **/ -void freeRequest(Request *request); - -/** - * Enqueue a request for the next stage of the pipeline. If there is more than - * one possible queue for a stage, this function uses the request to decide - * which queue should handle it. - * - * @param request The request to enqueue - * @param nextStage The next stage of the pipeline to process the request - **/ -void enqueueRequest(Request *request, RequestStage nextStage); - -/** - * A method to restart delayed requests. - * - * @param request The request to restart - **/ -void restartRequest(Request *request); - -/** - * Set the function pointer which is used to restart requests. - * This is needed by albserver code and is used as a test hook by the unit - * tests. - * - * @param restarter The function to call to restart requests. - **/ -void setRequestRestarter(RequestRestarter restarter); - -/** - * Enter the callback stage of processing for a request, notifying the waiting - * thread if the request is synchronous, freeing the request if it is an - * asynchronous control message, or placing it on the callback queue if it is - * an asynchronous client request. - * - * @param request the request which has completed execution - **/ -void enterCallbackStage(Request *request); - -/** - * Update the context statistics to reflect the successful completion of a - * client request. - * - * @param request a client request that has successfully completed execution - **/ -void updateRequestContextStats(Request *request); - -/** - * Compute the CacheProbeType value reflecting the request and page type. 
- * - * @param request The request being processed, or NULL - * @param isIndexPage Whether the cache probe will be for an index page - * - * @return the cache probe type enumeration - **/ -static INLINE CacheProbeType cacheProbeType(Request *request, - bool isIndexPage) -{ - if ((request != NULL) && request->requeued) { - return isIndexPage ? CACHE_PROBE_INDEX_RETRY : CACHE_PROBE_RECORD_RETRY; - } else { - return isIndexPage ? CACHE_PROBE_INDEX_FIRST : CACHE_PROBE_RECORD_FIRST; - } -} -#endif /* REQUEST_H */ diff --git a/uds/requestQueue.h b/uds/requestQueue.h deleted file mode 100644 index 5bf7ef6..0000000 --- a/uds/requestQueue.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/requestQueue.h#1 $ - */ - -#ifndef REQUEST_QUEUE_H -#define REQUEST_QUEUE_H - -#include "opaqueTypes.h" -#include "typeDefs.h" - -/* void return value because this function will process its own errors */ -typedef void RequestQueueProcessor(Request *); - -/** - * Allocate a new request processing queue and start a worker thread to - * consume and service requests in the queue. - * - * @param queueName the name of the queue and the worker thread - * @param processOne the function the worker will invoke on each request - * @param queuePtr a pointer to receive the new queue - * - * @return UDS_SUCCESS or an error code - **/ -int makeRequestQueue(const char *queueName, - RequestQueueProcessor *processOne, - RequestQueue **queuePtr) - __attribute__((warn_unused_result)); - -/** - * Add a request to the end of the queue for processing by the worker thread. - * If the requeued flag is set on the request, it will be processed before - * any non-requeued requests under most circumstances. - * - * @param queue the request queue that should process the request - * @param request the request to be processed on the queue's worker thread - **/ -void requestQueueEnqueue(RequestQueue *queue, Request *request); - -/** - * Shut down the request queue worker thread, then destroy and free the queue. - * - * @param queue the queue to shut down and free - **/ -void requestQueueFinish(RequestQueue *queue); - -#endif /* REQUEST_QUEUE_H */ diff --git a/uds/requestQueueKernel.c b/uds/requestQueueKernel.c deleted file mode 100644 index a53ff12..0000000 --- a/uds/requestQueueKernel.c +++ /dev/null @@ -1,389 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/kernelLinux/uds/requestQueueKernel.c#3 $ - */ - -#include "requestQueue.h" - -#include - -#include "atomicDefs.h" -#include "compiler.h" -#include "logger.h" -#include "request.h" -#include "memoryAlloc.h" -#include "threads.h" -#include "util/funnelQueue.h" - -/* - * Ordering: - * - * Multiple retry requests or multiple non-retry requests enqueued from - * a single producer thread will be processed in the order enqueued. - * - * Retry requests will generally be processed before normal requests. - * - * HOWEVER, a producer thread can enqueue a retry request (generally given - * higher priority) and then enqueue a normal request, and they can get - * processed in the reverse order. The checking of the two internal queues is - * very simple and there's a potential race with the producer regarding the - * "priority" handling. If an ordering guarantee is needed, it can be added - * without much difficulty, it just makes the code a bit more complicated. - * - * If requests are enqueued while the processing of another request is - * happening, and the enqueuing operations complete while the request - * processing is still in progress, then the retry request(s) *will* - * get processed next. (This is used for testing.) - */ - -/** - * Time constants, all in units of nanoseconds. - **/ -enum { - ONE_NANOSECOND = 1, - ONE_MICROSECOND = 1000 * ONE_NANOSECOND, - ONE_MILLISECOND = 1000 * ONE_MICROSECOND, - ONE_SECOND = 1000 * ONE_MILLISECOND, - - /** The initial time to wait after waiting with no timeout */ - DEFAULT_WAIT_TIME = 20 * ONE_MICROSECOND, - - /** The minimum time to wait when waiting with a timeout */ - MINIMUM_WAIT_TIME = DEFAULT_WAIT_TIME / 2, - - /** The maximimum time to wait when waiting with a timeout */ - MAXIMUM_WAIT_TIME = ONE_MILLISECOND -}; - -/** - * Batch size tuning constants. These are compared to the number of requests - * that have been processed since the worker thread last woke up. - **/ -enum { - MINIMUM_BATCH = 32, // wait time increases if batches are smaller than this - MAXIMUM_BATCH = 64 // wait time decreases if batches are larger than this -}; - -struct requestQueue { - /* Wait queue for synchronizing producers and consumer */ - struct wait_queue_head wqhead; - /* function to process 1 request */ - RequestQueueProcessor *processOne; - /* new incoming requests */ - FunnelQueue *mainQueue; - /* old requests to retry first */ - FunnelQueue *retryQueue; - /* thread id of the worker thread */ - Thread thread; - /* true if the worker was started */ - bool started; - /* when true, requests can be enqueued */ - bool alive; - /* A flag set when the worker is waiting without a timeout */ - atomic_t dormant; -}; - -/*****************************************************************************/ -/** - * Poll the underlying lock-free queues for a request to process. Must only be - * called by the worker thread. 
- * - * @param queue the RequestQueue being serviced - * - * @return a dequeued request, or NULL if no request was available - **/ -static INLINE Request *pollQueues(RequestQueue *queue) -{ - // The retry queue has higher priority. - FunnelQueueEntry *entry = funnelQueuePoll(queue->retryQueue); - if (entry != NULL) { - return container_of(entry, Request, requestQueueLink); - } - - // The main queue has lower priority. - entry = funnelQueuePoll(queue->mainQueue); - if (entry != NULL) { - return container_of(entry, Request, requestQueueLink); - } - - // No entry found. - return NULL; -} - -/*****************************************************************************/ -/** - * Check if the underlying lock-free queues appear not just not to have any - * requests available right now, but also not to be in the intermediate state - * of getting requests added. Must only be called by the worker thread. - * - * @param queue the RequestQueue being serviced - * - * @return true iff both funnel queues are idle - **/ -static INLINE bool areQueuesIdle(RequestQueue *queue) -{ - return (isFunnelQueueIdle(queue->retryQueue) && - isFunnelQueueIdle(queue->mainQueue)); -} - -/*****************************************************************************/ -/** - * Remove the next request to be processed from the queue. Must only be called - * by the worker thread. - * - * @param queue the queue from which to remove an entry - * @param requestPtr the next request is returned here, or a NULL pointer to - * indicate that there will be no more requests - * @param waitedPtr return a boolean to indicate that we need to wait - * - * @return True when there is a next request, or when we know that there will - * never be another request. False when we must wait for a request. - **/ -static INLINE bool dequeueRequest(RequestQueue *queue, - Request **requestPtr, - bool *waitedPtr) -{ - // Because of batching, we expect this to be the most common code path. - Request *request = pollQueues(queue); - if (request != NULL) { - // Return because we found a request - *requestPtr = request; - return true; - } - - if (!READ_ONCE(queue->alive)) { - // Return because we see that shutdown is happening - *requestPtr = NULL; - return true; - } - - // Return indicating that we need to wait. - *requestPtr = NULL; - *waitedPtr = true; - return false; -} - -/*****************************************************************************/ -static void requestQueueWorker(void *arg) -{ - RequestQueue *queue = (RequestQueue *) arg; - unsigned long timeBatch = DEFAULT_WAIT_TIME; - bool dormant = atomic_read(&queue->dormant); - long currentBatch = 0; - - for (;;) { - Request *request; - bool waited = false; - if (dormant) { - /* - * Sleep/wakeup protocol: - * - * The enqueue operation updates "newest" in the - * funnel queue via xchg which is a memory barrier, - * and later checks "dormant" to decide whether to do - * a wakeup of the worker thread. - * - * The worker thread, when deciding to go to sleep, - * sets "dormant" and then examines "newest" to decide - * if the funnel queue is idle. In dormant mode, the - * last examination of "newest" before going to sleep - * is done inside the wait_event_interruptible macro, - * after a point where (one or more) memory barriers - * have been issued. (Preparing to sleep uses spin - * locks.) Even if the "next" field update isn't - * visible yet to make the entry accessible, its - * existence will kick the worker thread out of - * dormant mode and back into timer-based mode. 
- * - * So the two threads should agree on the ordering of - * the updating of the two fields. - */ - wait_event_interruptible(queue->wqhead, - dequeueRequest(queue, &request, &waited) || - !areQueuesIdle(queue)); - } else { - wait_event_interruptible_hrtimeout(queue->wqhead, - dequeueRequest(queue, &request, - &waited), - ns_to_ktime(timeBatch)); - } - - if (likely(request != NULL)) { - // We got a request. - currentBatch++; - queue->processOne(request); - } else if (!READ_ONCE(queue->alive)) { - // We got no request and we know we are shutting down. - break; - } - - if (dormant) { - // We've been roused from dormancy. Clear the flag so enqueuers can stop - // broadcasting (no fence needed for this transition). - atomic_set(&queue->dormant, false); - dormant = false; - // Reset the timeout back to the default since we don't know how long - // we've been asleep and we also want to be responsive to a new burst. - timeBatch = DEFAULT_WAIT_TIME; - } else if (waited) { - // We waited for this request to show up. Adjust the wait time if the - // last batch of requests was too small or too large.. - if (currentBatch < MINIMUM_BATCH) { - // Adjust the wait time if the last batch of requests was too small. - timeBatch += timeBatch / 4; - if (timeBatch >= MAXIMUM_WAIT_TIME) { - // The timeout is getting long enough that we need to switch into - // dormant mode. - atomic_set(&queue->dormant, true); - dormant = true; - } - } else if (currentBatch > MAXIMUM_BATCH) { - // Adjust the wait time if the last batch of requests was too large. - timeBatch -= timeBatch / 4; - if (timeBatch < MINIMUM_WAIT_TIME) { - // But if the producer is very fast or the scheduler doesn't wake up - // up promptly, waiting for very short times won't make the batches - // smaller. - timeBatch = MINIMUM_WAIT_TIME; - } - } - // And we must now start a new batch count - currentBatch = 0; - } - } - - /* - * Ensure that we see any requests that were guaranteed to have been fully - * enqueued before shutdown was flagged. The corresponding write barrier - * is in requestQueueFinish. - */ - smp_rmb(); - - // Process every request that is still in the queue, and never wait for any - // new requests to show up. 
- for (;;) { - Request *request = pollQueues(queue); - if (request == NULL) { - break; - } - queue->processOne(request); - } -} - -/**********************************************************************/ -int makeRequestQueue(const char *queueName, - RequestQueueProcessor *processOne, - RequestQueue **queuePtr) -{ - RequestQueue *queue; - int result = ALLOCATE(1, RequestQueue, __func__, &queue); - if (result != UDS_SUCCESS) { - return result; - } - queue->processOne = processOne; - queue->alive = true; - atomic_set(&queue->dormant, false); - init_waitqueue_head(&queue->wqhead); - - result = makeFunnelQueue(&queue->mainQueue); - if (result != UDS_SUCCESS) { - requestQueueFinish(queue); - return result; - } - - result = makeFunnelQueue(&queue->retryQueue); - if (result != UDS_SUCCESS) { - requestQueueFinish(queue); - return result; - } - - result = createThread(requestQueueWorker, queue, queueName, &queue->thread); - if (result != UDS_SUCCESS) { - requestQueueFinish(queue); - return result; - } - - queue->started = true; - smp_mb(); - *queuePtr = queue; - return UDS_SUCCESS; -} - -/**********************************************************************/ -static INLINE void wakeUpWorker(RequestQueue *queue) -{ - // This is the code sequence recommended in - smp_mb(); - if (waitqueue_active(&queue->wqhead)) { - wake_up(&queue->wqhead); - } -} - -/**********************************************************************/ -void requestQueueEnqueue(RequestQueue *queue, Request *request) -{ - bool unbatched = request->unbatched; - funnelQueuePut(request->requeued ? queue->retryQueue : queue->mainQueue, - &request->requestQueueLink); - - /* - * We must wake the worker thread when it is dormant (waiting with no - * timeout). An atomic load (read fence) isn't needed here since we know the - * queue operation acts as one. - */ - if (atomic_read(&queue->dormant) || unbatched) { - wakeUpWorker(queue); - } -} - -/**********************************************************************/ -void requestQueueFinish(RequestQueue *queue) -{ - if (queue == NULL) { - return; - } - - /* - * This memory barrier ensures that any requests we queued will be seen. The - * point is that when dequeueRequest sees the following update to the alive - * flag, it will also be able to see any change we made to a next field in - * the FunnelQueue entry. The corresponding read barrier is in - * requestQueueWorker. - */ - smp_wmb(); - - // Mark the queue as dead. - WRITE_ONCE(queue->alive, false); - - if (queue->started) { - // Wake the worker so it notices that it should exit. - wakeUpWorker(queue); - - // Wait for the worker thread to finish processing any additional pending - // work and exit. - int result = joinThreads(queue->thread); - if (result != UDS_SUCCESS) { - logWarningWithStringError(result, "Failed to join worker thread"); - } - } - - freeFunnelQueue(queue->mainQueue); - freeFunnelQueue(queue->retryQueue); - FREE(queue); -} diff --git a/uds/searchList.c b/uds/searchList.c deleted file mode 100644 index ec2ef70..0000000 --- a/uds/searchList.c +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
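/*
 * Editor's illustrative sketch (not part of the original files): the
 * RequestQueue lifecycle declared in requestQueue.h and implemented above.
 * The worker body is hypothetical; makeRequestQueue(), requestQueueEnqueue(),
 * and requestQueueFinish() are the entry points being demonstrated.
 */
static void exampleProcessOne(Request *request)
{
  /* The processor owns all error handling, which is why RequestQueueProcessor
   * returns void; here it simply hands the request to the callback stage. */
  enterCallbackStage(request);
}

static int exampleQueueLifecycle(Request *request)
{
  RequestQueue *queue;
  int result = makeRequestQueue("exampleWorker", exampleProcessOne, &queue);
  if (result != UDS_SUCCESS) {
    return result;
  }

  /* Hand one request to the worker thread; a request with requeued set would
   * be placed on the higher-priority retry queue instead. */
  requestQueueEnqueue(queue, request);

  /* Wake the worker, let it drain everything still queued, then join the
   * thread and free both funnel queues. */
  requestQueueFinish(queue);
  return UDS_SUCCESS;
}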
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/searchList.c#2 $ - */ - -#include "searchList.h" - -#include "errors.h" -#include "logger.h" -#include "memoryAlloc.h" - -/**********************************************************************/ -int makeSearchList(unsigned int capacity, - SearchList **listPtr) -{ - if (capacity == 0) { - return logErrorWithStringError(UDS_INVALID_ARGUMENT, - "search list must have entries"); - } - if (capacity > UINT8_MAX) { - return logErrorWithStringError(UDS_INVALID_ARGUMENT, - "search list capacity must fit in 8 bits"); - } - - // We need three temporary entry arrays for purgeSearchList(). Allocate them - // contiguously with the main array. - unsigned int bytes = (sizeof(SearchList) + (4 * capacity * sizeof(uint8_t))); - SearchList *list; - int result = allocateCacheAligned(bytes, "search list", &list); - if (result != UDS_SUCCESS) { - return result; - } - - list->capacity = capacity; - list->firstDeadEntry = 0; - - // Fill in the indexes of the chapter index cache entries. These will be - // only ever be permuted as the search list is used. - uint8_t i; - for (i = 0; i < capacity; i++) { - list->entries[i] = i; - } - - *listPtr = list; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void freeSearchList(SearchList **listPtr) -{ - FREE(*listPtr); - *listPtr = NULL; -} - -/**********************************************************************/ -void purgeSearchList(SearchList *searchList, - const CachedChapterIndex chapters[], - uint64_t oldestVirtualChapter) -{ - if (searchList->firstDeadEntry == 0) { - // There are no live entries in the list to purge. - return; - } - - /* - * Partition the previously-alive entries in the list into three temporary - * lists, keeping the current LRU search order within each list. The element - * array was allocated with enough space for all four lists. - */ - uint8_t *entries = &searchList->entries[0]; - uint8_t *alive = &entries[searchList->capacity]; - uint8_t *skipped = &alive[searchList->capacity]; - uint8_t *dead = &skipped[searchList->capacity]; - unsigned int nextAlive = 0; - unsigned int nextSkipped = 0; - unsigned int nextDead = 0; - - int i; - for (i = 0; i < searchList->firstDeadEntry; i++) { - uint8_t entry = entries[i]; - const CachedChapterIndex *chapter = &chapters[entry]; - if ((chapter->virtualChapter < oldestVirtualChapter) - || (chapter->virtualChapter == UINT64_MAX)) { - dead[nextDead++] = entry; - } else if (chapter->skipSearch) { - skipped[nextSkipped++] = entry; - } else { - alive[nextAlive++] = entry; - } - } - - // Copy the temporary lists back to the search list so we wind up with - // [ alive, alive, skippable, new-dead, new-dead, old-dead, old-dead ] - memcpy(entries, alive, nextAlive); - entries += nextAlive; - - memcpy(entries, skipped, nextSkipped); - entries += nextSkipped; - - memcpy(entries, dead, nextDead); - // The first dead entry is now the start of the copied dead list. 
- searchList->firstDeadEntry = (nextAlive + nextSkipped); -} diff --git a/uds/searchList.h b/uds/searchList.h deleted file mode 100644 index 25d99e9..0000000 --- a/uds/searchList.h +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/searchList.h#1 $ - */ - -#ifndef SEARCH_LIST_H -#define SEARCH_LIST_H - -#include "cachedChapterIndex.h" -#include "compiler.h" -#include "stringUtils.h" -#include "typeDefs.h" - -/** - * A SearchList represents the permutations of the sparse chapter index cache - * entry array. Those permutations express an ordering on the chapter indexes, - * from most recently accessed to least recently accessed, which is the order - * in which the indexes should be searched and the reverse order in which they - * should be evicted from the cache (LRU cache replacement policy). - * - * Cache entries that are dead (virtualChapter == UINT64_MAX) are kept as a - * suffix of the list, avoiding the need to even iterate over them to search, - * and ensuring that dead entries are replaced before any live entries are - * evicted. - * - * The search list is intended to be instantated for each zone thread, - * avoiding any need for synchronization. The structure is allocated on a - * cache boundary to avoid false sharing of memory cache lines between zone - * threads. - **/ -typedef struct searchList { - /** The number of cached chapter indexes and search list entries */ - uint8_t capacity; - - /** The index in the entries array of the first dead cache entry */ - uint8_t firstDeadEntry; - - /** The chapter array indexes representing the chapter search order */ - uint8_t entries[]; -} SearchList; - -/** - * SearchListIterator captures the fields needed to iterate over the live - * entries in a search list and return the CachedChapterIndex pointers that - * the search code actually wants to deal with. - **/ -typedef struct { - /** The search list defining the chapter search iteration order */ - SearchList *list; - - /** The index of the next entry to return from the search list */ - unsigned int nextEntry; - - /** The cached chapters that are referenced by the search list */ - CachedChapterIndex *chapters; -} SearchListIterator; - -/** - * Allocate and initialize a new chapter cache search list with the same - * capacity as the cache. The index of each entry in the cache will appear - * exactly once in the array. All the chapters in the cache are assumed to be - * initially dead, so firstDeadEntry will be zero and no chapters will be - * returned when the search list is iterated. 
- * - * @param [in] capacity the number of entries in the search list - * @param [out] listPtr a pointer in which to return the new search list - **/ -int makeSearchList(unsigned int capacity, - SearchList **listPtr) - __attribute__((warn_unused_result)); - -/** - * Free a search list and null out the reference to it. - * - * @param listPtr the reference to the search list to free - **/ -void freeSearchList(SearchList **listPtr); - -/** - * Copy the contents of one search list to another. - * - * @param source the list to copy - * @param target the list to replace - **/ -static INLINE void copySearchList(const SearchList *source, - SearchList *target) -{ - *target = *source; - memcpy(target->entries, source->entries, source->capacity); -} - -/** - * Prepare to iterate over the live cache entries a search list. - * - * @param list the list defining the live chapters and the search order - * @param chapters the chapter index entries to return from getNextChapter() - * - * @return an iterator positioned at the start of the search list - **/ -static INLINE SearchListIterator -iterateSearchList(SearchList *list, CachedChapterIndex chapters[]) -{ - SearchListIterator iterator = { - .list = list, - .nextEntry = 0, - .chapters = chapters, - }; - return iterator; -} - -/** - * Check if the search list iterator has another entry to return. - * - * @param iterator the search list iterator - * - * @return true if getNextChapter() may be called - **/ -static INLINE bool hasNextChapter(const SearchListIterator *iterator) -{ - return (iterator->nextEntry < iterator->list->firstDeadEntry); -} - -/** - * Return a pointer to the next live chapter in the search list iteration and - * advance the iterator. This must only be called when hasNextChapter() - * returns true. - * - * @param iterator the search list iterator - * - * @return a pointer to the next live chapter index in the search list order - **/ -static INLINE CachedChapterIndex *getNextChapter(SearchListIterator *iterator) -{ - return &iterator->chapters[iterator->list->entries[iterator->nextEntry++]]; -} - -/** - * Rotate the pointers in a prefix of a search list downwards by one item, - * pushing elements deeper into the list and moving a new chapter to the start - * of the search list. This is the "make most recent" operation on the search - * list. - * - * If the search list provided is [ 0 1 2 3 4 ] and the prefix - * length is 4, then 3 is being moved to the front. - * The search list after the call will be [ 3 0 1 2 4 ] and the - * function will return 3. - * - * @param searchList the chapter index search list to rotate - * @param prefixLength the length of the prefix of the list to rotate - * - * @return the array index of the chapter cache entry that is now at the front - * of the search list - **/ -static INLINE uint8_t rotateSearchList(SearchList *searchList, - uint8_t prefixLength) -{ - // Grab the value of the last entry in the list prefix. - uint8_t mostRecent = searchList->entries[prefixLength - 1]; - - if (prefixLength > 1) { - // Push the first N-1 entries down by one entry, overwriting the entry - // we just grabbed. - memmove(&searchList->entries[1], - &searchList->entries[0], - prefixLength - 1); - - // We now have a hole at the front of the list in which we can place the - // rotated entry. - searchList->entries[0] = mostRecent; - } - - // This function is also used to move a dead chapter to the front of the - // list, in which case the suffix of dead chapters was pushed down too. 
- if (searchList->firstDeadEntry < prefixLength) { - searchList->firstDeadEntry += 1; - } - - return mostRecent; -} - -/** - * Purge invalid cache entries, marking them as dead and moving them to the - * end of the search list, then push any chapters that have skipSearch set - * down so they follow all the remaining live, valid chapters in the search - * list. This effectively sorts the search list into three regions--active, - * skippable, and dead--while maintaining the LRU ordering that already - * existed (a stable sort). - * - * This operation must only be called during the critical section in - * updateSparseCache() since it effectively changes cache membership. - * - * @param searchList the chapter index search list to purge - * @param chapters the chapter index cache entries - * @param oldestVirtualChapter the oldest virtual chapter - **/ -void purgeSearchList(SearchList *searchList, - const CachedChapterIndex chapters[], - uint64_t oldestVirtualChapter); - -#endif /* SEARCH_LIST_H */ diff --git a/uds/sparseCache.c b/uds/sparseCache.c deleted file mode 100644 index f816d12..0000000 --- a/uds/sparseCache.c +++ /dev/null @@ -1,535 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/sparseCache.c#3 $ - */ - -/** - * The sparse chapter index cache is implemented as a simple array of cache - * entries. Since the cache is small (seven chapters by default), searching - * for a specific virtual chapter is implemented as a linear search. The cache - * replacement policy is least-recently-used (LRU). Again, size of the cache - * allows the LRU order to be maintained by shifting entries in an array list. - * - * The most important property of this cache is the absence of synchronization - * for read operations. Safe concurrent access to the cache by the zone - * threads is controlled by the triage queue and the barrier requests it - * issues to the zone queues. The set of cached chapters does not and must not - * change between the carefully coordinated calls to updateSparseCache() from - * the zone threads. - * - * The critical invariant for that coordination is the cache membership must - * not change between those updates; the calls to sparseCacheContains() from - * the zone threads must all receive the same results for any virtual chapter - * number. To ensure that critical invariant, state changes such as "that - * virtual chapter is no longer in the volume" and "skip searching that - * chapter because it has had too many cache misses" are represented - * separately from the cache membership information (the virtual chapter - * number). 
- * - * As a result of this invariant, we have the guarantee that every zone thread - * will call updateSparseCache() once and exactly once to request a chapter - * that is not in the cache, and the serialization of the barrier requests - * from the triage queue ensures they will all request the same chapter - * number. This means the only synchronization we need can be provided by a - * pair of thread barriers used only in the updateSparseCache() call, - * providing a critical section where a single zone thread can drive the cache - * update while all the other zone threads are known to be blocked, waiting in - * the second barrier. Outside that critical section, all the zone threads - * implicitly hold a shared lock. Inside it, the "captain" (the thread that - * was uniquely flagged when passing through the first barrier) holds an - * exclusive lock. No other threads may access or modify the cache, except for - * accessing cache statistics and similar queries. - * - * Cache statistics must only be modified by a single thread, conventionally - * the zone zero thread. All fields that might be frequently updated by that - * thread are kept in separate cache-aligned structures so they will not cause - * cache contention via "false sharing" with the fields that are frequently - * accessed by all of the zone threads. - * - * LRU order is kept independently by each zone thread, and each zone uses its - * own list for searching and cache membership queries. The zone zero list is - * used to decide which chapter to evict when the cache is updated, and its - * search list is copied to the other threads at that time. - * - * The virtual chapter number field of the cache entry is the single field - * indicating whether a chapter is a member of the cache or not. The value - * UINT64_MAX is used to represent a null, undefined, or wildcard - * chapter number. When present in the virtual chapter number field - * CachedChapterIndex, it indicates that the cache entry is dead, and all - * the other fields of that entry (other than immutable pointers to cache - * memory) are undefined and irrelevant. Any cache entry that is not marked as - * dead is fully defined and a member of the cache--sparseCacheContains() - * must always return true for any virtual chapter number that appears in any - * of the cache entries. - * - * A chapter index that is a member of the cache may be marked for different - * treatment (disabling search) between calls to updateSparseCache() in two - * different ways. When a chapter falls off the end of the volume, its virtual - * chapter number will be less that the oldest virtual chapter number. Since - * that chapter is no longer part of the volume, there's no point in continuing - * to search that chapter index. Once invalidated, that virtual chapter will - * still be considered a member of the cache, but it will no longer be searched - * for matching chunk names. - * - * The second mechanism for disabling search is the heuristic based on keeping - * track of the number of consecutive search misses in a given chapter index. - * Once that count exceeds a threshold, the skipSearch flag will be set to - * true, causing the chapter to be skipped in the fallback search of the - * entire cache, but still allowing it to be found when searching for a hook - * in that specific chapter. Finding a hook will clear the skipSearch flag, - * once again allowing the non-hook searches to use the cache entry. 
Again, - * regardless of the state of the skipSearch flag, the virtual chapter must - * still considered to be a member of the cache for sparseCacheContains(). - * - * Barrier requests and the sparse chapter index cache are also described in - * - * https://intranet.permabit.com/wiki/Chapter_Index_Cache_supports_concurrent_access - * - * and in a message to the albireo mailing list on 5/28/2011 titled "true - * barriers with a hook resolution queue". - **/ - -#include "sparseCache.h" - -#include "cachedChapterIndex.h" -#include "chapterIndex.h" -#include "common.h" -#include "index.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "searchList.h" -#include "threads.h" -#include "zone.h" - -enum { - /** The number of consecutive search misses that will disable searching */ - SKIP_SEARCH_THRESHOLD = 20000, - - /** a named constant to use when identifying zone zero */ - ZONE_ZERO = 0 -}; - -/** - * These counter values are essentially fields of the SparseCache, but are - * segregated into this structure because they are frequently modified. We - * group them and align them to keep them on different cache lines from the - * cache fields that are accessed far more often than they are updated. - **/ -typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) sparseCacheCounters { - /** the total number of virtual chapter probes that succeeded */ - uint64_t chapterHits; - - /** the total number of virtual chapter probes that failed */ - uint64_t chapterMisses; - - /** the total number of cache searches that found a possible match */ - uint64_t searchHits; - - /** the total number of cache searches that found no matches */ - uint64_t searchMisses; - - /** the number of cache entries that fell off the end of the volume */ - uint64_t invalidations; - - /** the number of cache entries that were evicted while still valid */ - uint64_t evictions; -} SparseCacheCounters; - -/** - * This is the private structure definition of a SparseCache. - **/ -struct sparseCache { - /** the number of cache entries, which is the size of the chapters array */ - unsigned int capacity; - - /** the number of zone threads using the cache */ - unsigned int zoneCount; - - /** the geometry governing the volume */ - const Geometry *geometry; - - /** the number of search misses in zone zero that will disable searching */ - unsigned int skipSearchThreshold; - - /** pointers to the cache-aligned chapter search order for each zone */ - SearchList *searchLists[MAX_ZONES]; - - /** the thread barriers used to synchronize the zone threads for update */ - Barrier beginCacheUpdate; - Barrier endCacheUpdate; - - /** frequently-updated counter fields (cache-aligned) */ - SparseCacheCounters counters; - - /** the counted array of chapter index cache entries (cache-aligned) */ - CachedChapterIndex chapters[]; -}; - -/** - * Initialize a sparse chapter index cache. 
- * - * @param cache the sparse cache to initialize - * @param geometry the geometry governing the volume - * @param capacity the number of chapters the cache will hold - * @param zoneCount the number of zone threads using the cache - * - * @return UDS_SUCCESS or an error code - **/ -__attribute__((warn_unused_result)) -static int initializeSparseCache(SparseCache *cache, - const Geometry *geometry, - unsigned int capacity, - unsigned int zoneCount) -{ - cache->geometry = geometry; - cache->capacity = capacity; - cache->zoneCount = zoneCount; - - // Scale down the skip threshold by the number of zones since we count the - // chapter search misses only in zone zero. - cache->skipSearchThreshold = (SKIP_SEARCH_THRESHOLD / zoneCount); - - int result = initializeBarrier(&cache->beginCacheUpdate, zoneCount); - if (result != UDS_SUCCESS) { - return result; - } - result = initializeBarrier(&cache->endCacheUpdate, zoneCount); - if (result != UDS_SUCCESS) { - return result; - } - unsigned int i; - for (i = 0; i < capacity; i++) { - result = initializeCachedChapterIndex(&cache->chapters[i], geometry); - if (result != UDS_SUCCESS) { - return result; - } - } - - // Allocate each zone's independent LRU order. - for (i = 0; i < zoneCount; i++) { - result = makeSearchList(capacity, &cache->searchLists[i]); - if (result != UDS_SUCCESS) { - return result; - } - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int makeSparseCache(const Geometry *geometry, - unsigned int capacity, - unsigned int zoneCount, - SparseCache **cachePtr) -{ - unsigned int bytes - = (sizeof(SparseCache) + (capacity * sizeof(CachedChapterIndex))); - - SparseCache *cache; - int result = allocateCacheAligned(bytes, "sparse cache", &cache); - if (result != UDS_SUCCESS) { - return result; - } - - result = initializeSparseCache(cache, geometry, capacity, zoneCount); - if (result != UDS_SUCCESS) { - freeSparseCache(cache); - return result; - } - - *cachePtr = cache; - return UDS_SUCCESS; -} - -/**********************************************************************/ -size_t getSparseCacheMemorySize(const SparseCache *cache) -{ - // Count the DeltaIndexPage as cache memory, but ignore all other overhead. - size_t pageSize = (sizeof(DeltaIndexPage) + cache->geometry->bytesPerPage); - size_t chapterSize = (pageSize * cache->geometry->indexPagesPerChapter); - return (cache->capacity * chapterSize); -} - -/** - * Update counters to reflect a chapter access hit and clear the skipSearch - * flag on the chapter, if set. - * - * @param cache the cache to update - * @param chapter the cache entry to update - **/ -static void scoreChapterHit(SparseCache *cache, - CachedChapterIndex *chapter) -{ - cache->counters.chapterHits += 1; - setSkipSearch(chapter, false); -} - -/** - * Update counters to reflect a chapter access miss. - * - * @param cache the cache to update - **/ -static void scoreChapterMiss(SparseCache *cache) -{ - cache->counters.chapterMisses += 1; -} - -/** - * Check if the cache entry that is about to be replaced is already dead, and - * if it's not, add to tally of evicted or invalidated cache entries. 
- * - * @param zone the zone used to find the oldest chapter - * @param cache the cache to update - * @param chapter the cache entry about to be replaced - **/ -static void scoreEviction(IndexZone *zone, - SparseCache *cache, - CachedChapterIndex *chapter) -{ - if (chapter->virtualChapter == UINT64_MAX) { - return; - } - if (chapter->virtualChapter < zone->oldestVirtualChapter) { - cache->counters.invalidations += 1; - } else { - cache->counters.evictions += 1; - } -} - -/** - * Update counters to reflect a cache search hit. This bumps the hit - * count, clears the miss count, and clears the skipSearch flag. - * - * @param cache the cache to update - * @param chapter the cache entry to update - **/ -static void scoreSearchHit(SparseCache *cache, - CachedChapterIndex *chapter) -{ - cache->counters.searchHits += 1; - chapter->counters.searchHits += 1; - chapter->counters.consecutiveMisses = 0; - setSkipSearch(chapter, false); -} - -/** - * Update counters to reflect a cache search miss. This bumps the consecutive - * miss count, and if it goes over skipSearchThreshold, sets the skipSearch - * flag on the chapter. - * - * @param cache the cache to update - * @param chapter the cache entry to update - **/ -static void scoreSearchMiss(SparseCache *cache, - CachedChapterIndex *chapter) -{ - cache->counters.searchMisses += 1; - chapter->counters.searchMisses += 1; - chapter->counters.consecutiveMisses += 1; - if (chapter->counters.consecutiveMisses > cache->skipSearchThreshold) { - setSkipSearch(chapter, true); - } -} - -/**********************************************************************/ -void freeSparseCache(SparseCache *cache) -{ - if (cache == NULL) { - return; - } - - unsigned int i; - for (i = 0; i < cache->zoneCount; i++) { - freeSearchList(&cache->searchLists[i]); - } - - for (i = 0; i < cache->capacity; i++) { - CachedChapterIndex *chapter = &cache->chapters[i]; - destroyCachedChapterIndex(chapter); - } - - destroyBarrier(&cache->beginCacheUpdate); - destroyBarrier(&cache->endCacheUpdate); - FREE(cache); -} - - -/**********************************************************************/ -bool sparseCacheContains(SparseCache *cache, - uint64_t virtualChapter, - unsigned int zoneNumber) -{ - /* - * The correctness of the barriers depends on the invariant that between - * calls to updateSparseCache(), the answers this function returns must - * never vary--the result for a given chapter must be identical across - * zones. That invariant must be maintained even if the chapter falls off - * the end of the volume, or if searching it is disabled because of too many - * search misses. - */ - - // Get the chapter search order for this zone thread. - SearchListIterator iterator - = iterateSearchList(cache->searchLists[zoneNumber], cache->chapters); - while (hasNextChapter(&iterator)) { - CachedChapterIndex *chapter = getNextChapter(&iterator); - if (virtualChapter == chapter->virtualChapter) { - if (zoneNumber == ZONE_ZERO) { - scoreChapterHit(cache, chapter); - } - - // Move the chapter to the front of the search list. - rotateSearchList(iterator.list, iterator.nextEntry); - return true; - } - } - - // The specified virtual chapter isn't cached. 
- if (zoneNumber == ZONE_ZERO) { - scoreChapterMiss(cache); - } - return false; -} - -/**********************************************************************/ -int updateSparseCache(IndexZone *zone, uint64_t virtualChapter) -{ - const Index *index = zone->index; - SparseCache *cache = index->volume->sparseCache; - - // If the chapter is already in the cache, we don't need to do a thing - // except update the search list order, which this check does. - if (sparseCacheContains(cache, virtualChapter, zone->id)) { - return UDS_SUCCESS; - } - - // Wait for every zone thread to have reached its corresponding barrier - // request and invoked this function before starting to modify the cache. - enterBarrier(&cache->beginCacheUpdate, NULL); - - /* - * This is the start of the critical section: the zone zero thread is - * captain, effectively holding an exclusive lock on the sparse cache. All - * the other zone threads must do nothing between the two barriers. They - * will wait at the endCacheUpdate barrier for the captain to finish the - * update. - */ - - int result = UDS_SUCCESS; - if (zone->id == ZONE_ZERO) { - // Purge invalid chapters from the LRU search list. - SearchList *zoneZeroList = cache->searchLists[ZONE_ZERO]; - purgeSearchList(zoneZeroList, cache->chapters, zone->oldestVirtualChapter); - - // First check that the desired chapter is still in the volume. If it's - // not, the hook fell out of the index and there's nothing to do for it. - if (virtualChapter >= index->oldestVirtualChapter) { - // Evict the least recently used live chapter, or replace a dead cache - // entry, all by rotating the the last list entry to the front. - CachedChapterIndex *victim - = &cache->chapters[rotateSearchList(zoneZeroList, cache->capacity)]; - - // Check if the victim is already dead, and if it's not, add to the - // tally of evicted or invalidated cache entries. - scoreEviction(zone, cache, victim); - - // Read the index page bytes and initialize the page array. - result = cacheChapterIndex(victim, virtualChapter, index->volume); - } - - // Copy the new search list state to all the other zone threads so they'll - // get the result of pruning and see the new chapter. - unsigned int z; - for (z = 1; z < cache->zoneCount; z++) { - copySearchList(zoneZeroList, cache->searchLists[z]); - } - } - - // This is the end of the critical section. All cache invariants must have - // been restored--it will be shared/read-only again beyond the barrier. - - enterBarrier(&cache->endCacheUpdate, NULL); - return result; -} - - -/**********************************************************************/ -int searchSparseCache(IndexZone *zone, - const UdsChunkName *name, - uint64_t *virtualChapterPtr, - int *recordPagePtr) -{ - Volume *volume = zone->index->volume; - SparseCache *cache = volume->sparseCache; - unsigned int zoneNumber = zone->id; - // If the caller did not specify a virtual chapter, search the entire cache. - bool searchAll = (*virtualChapterPtr == UINT64_MAX); - unsigned int chaptersSearched = 0; - - // Get the chapter search order for this zone thread, searching the chapters - // from most recently hit to least recently hit. - SearchListIterator iterator - = iterateSearchList(cache->searchLists[zoneNumber], cache->chapters); - while (hasNextChapter(&iterator)) { - CachedChapterIndex *chapter = getNextChapter(&iterator); - - // Skip chapters no longer cached, or that have too many search misses. 
- if (shouldSkipChapterIndex(zone, chapter, *virtualChapterPtr)) { - continue; - } - - int result = searchCachedChapterIndex(chapter, cache->geometry, - volume->indexPageMap, name, - recordPagePtr); - if (result != UDS_SUCCESS) { - return result; - } - chaptersSearched += 1; - - // Did we find an index entry for the name? - if (*recordPagePtr != NO_CHAPTER_INDEX_ENTRY) { - if (zoneNumber == ZONE_ZERO) { - scoreSearchHit(cache, chapter); - } - - // Move the chapter to the front of the search list. - rotateSearchList(iterator.list, iterator.nextEntry); - - // Return a matching entry as soon as it is found. It might be a false - // collision that has a true match in another chapter, but that's a very - // rare case and not worth the extra search cost or complexity. - *virtualChapterPtr = chapter->virtualChapter; - return UDS_SUCCESS; - } - - if (zoneNumber == ZONE_ZERO) { - scoreSearchMiss(cache, chapter); - } - - if (!searchAll) { - // We just searched the virtual chapter the caller specified and there - // was no match, so we're done. - break; - } - } - - // The name was not found in the cache. - *recordPagePtr = NO_CHAPTER_INDEX_ENTRY; - return UDS_SUCCESS; -} diff --git a/uds/sparseCache.h b/uds/sparseCache.h deleted file mode 100644 index 09c4a1c..0000000 --- a/uds/sparseCache.h +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/sparseCache.h#1 $ - */ - -#ifndef SPARSE_CACHE_H -#define SPARSE_CACHE_H - -#include "cacheCounters.h" -#include "geometry.h" -#include "indexZone.h" -#include "typeDefs.h" - -/** - * SparseCache is the cache of entire chapter indexes from sparse chapters - * used for searching for chunks after all other search paths have failed. It - * contains only complete chapter indexes; record pages from sparse chapters - * and single index pages used for resolving hooks are kept in the volume page - * cache. - * - * Searching the cache is an unsynchronized operation. Changing the contents - * of the cache is a coordinated process requiring the coordinated - * participation of all zone threads via the careful use of barrier messages - * sent to all the index zones by the triage queue worker thread. - **/ -typedef struct sparseCache SparseCache; - -// Bare declaration to avoid include dependency loops. -struct index; - -/** - * Allocate and initialize a sparse chapter index cache. 
- * - * @param [in] geometry the geometry governing the volume - * @param [in] capacity the number of chapters the cache will hold - * @param [in] zoneCount the number of zone threads using the cache - * @param [out] cachePtr a pointer in which to return the new cache - * - * @return UDS_SUCCESS or an error code - **/ -int makeSparseCache(const Geometry *geometry, - unsigned int capacity, - unsigned int zoneCount, - SparseCache **cachePtr) - __attribute__((warn_unused_result)); - -/** - * Destroy and free a sparse chapter index cache. - * - * @param cache the cache to free - **/ -void freeSparseCache(SparseCache *cache); - -/** - * Get the number of bytes of memory used by a sparse chapter cache. - * - * @param cache the cache to measure - **/ -size_t getSparseCacheMemorySize(const SparseCache *cache); - - -/** - * Check whether a sparse chapter index is present in the chapter cache. This - * is only intended for use by the zone threads. - * - * @param cache the cache to search for the virtual chapter - * @param virtualChapter the virtual chapter number of the chapter index - * @param zoneNumber the zone number of the calling thread - * - * @return true iff the sparse chapter index is cached - **/ -bool sparseCacheContains(SparseCache *cache, - uint64_t virtualChapter, - unsigned int zoneNumber); - -/** - * Update the sparse cache to contain a chapter index. - * - * This function must be called by all the zone threads with the same chapter - * numbers to correctly enter the thread barriers used to synchronize the - * cache updates. - * - * @param zone the index zone - * @param virtualChapter the virtual chapter number of the chapter index - * - * @return UDS_SUCCESS or an error code if the chapter index could not be - * read or decoded - **/ -int updateSparseCache(IndexZone *zone, uint64_t virtualChapter) - __attribute__((warn_unused_result)); - - -/** - * Search the cached sparse chapter indexes for a chunk name, returning a - * virtual chapter number and record page number that may contain the name. - * - * @param [in] zone the zone containing the volume, sparse - * chapter index cache and the index page - * number map - * @param [in] name the chunk name to search for - * @param [in,out] virtualChapterPtr If UINT64_MAX on input, - * search all cached chapters, else search - * the specified virtual chapter, if cached. - * On output, if a match was found, set to - * the virtual chapter number of the match, - * otherwise set to UINT64_MAX on a miss. - * @param [out] recordPagePtr the record page number of a match, else - * NO_CHAPTER_INDEX_ENTRY if nothing matched - * - * @return UDS_SUCCESS or an error code - **/ -int searchSparseCache(IndexZone *zone, - const UdsChunkName *name, - uint64_t *virtualChapterPtr, - int *recordPagePtr) - __attribute__((warn_unused_result)); - -#endif /* SPARSE_CACHE_H */ diff --git a/uds/stringLinuxKernel.c b/uds/stringLinuxKernel.c deleted file mode 100644 index bf0a255..0000000 --- a/uds/stringLinuxKernel.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
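To tie the sparseCache.h interface above together, here is a minimal sketch of how a zone thread might call searchSparseCache(); lookupName() is an illustrative name, the zone is assumed to exist already, and NO_CHAPTER_INDEX_ENTRY is assumed to come from the chapter index headers.

#include "chapterIndex.h" /* assumed to declare NO_CHAPTER_INDEX_ENTRY */
#include "sparseCache.h"

static int lookupName(IndexZone *zone, const UdsChunkName *name)
{
  // UINT64_MAX asks searchSparseCache() to try every cached chapter.
  uint64_t virtualChapter = UINT64_MAX;
  int recordPage = NO_CHAPTER_INDEX_ENTRY;
  int result = searchSparseCache(zone, name, &virtualChapter, &recordPage);
  if (result != UDS_SUCCESS) {
    return result;
  }
  if (recordPage == NO_CHAPTER_INDEX_ENTRY) {
    // No cached chapter index has an entry that could match this name.
    return UDS_SUCCESS;
  }
  // Otherwise virtualChapter and recordPage say where a matching record may
  // live; a real caller would go on to read that record page from the volume.
  return UDS_SUCCESS;
}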
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/kernelLinux/uds/stringLinuxKernel.c#1 $ - */ - -#include - -#include "errors.h" -#include "logger.h" -#include "stringUtils.h" - -/**********************************************************************/ -int stringToSignedLong(const char *nptr, long *num) -{ - while (*nptr == ' ') { - nptr++; - } - return kstrtol(nptr, 10, num) ? UDS_INVALID_ARGUMENT : UDS_SUCCESS; -} - -/**********************************************************************/ -int stringToUnsignedLong(const char *nptr, unsigned long *num) -{ - while (*nptr == ' ') { - nptr++; - } - if (*nptr == '+') { - nptr++; - } - return kstrtoul(nptr, 10, num) ? UDS_INVALID_ARGUMENT : UDS_SUCCESS; -} - -/*****************************************************************************/ -char *nextToken(char *str, const char *delims, char **state) -{ - char *sp = str ? str : *state; - while (*sp && strchr(delims, *sp)) { - ++sp; - } - if (!*sp) { - return NULL; - } - char *ep = sp; - while (*ep && !strchr(delims, *ep)) { - ++ep; - } - if (*ep) { - *ep++ = '\0'; - } - *state = ep; - return sp; -} - -/*****************************************************************************/ -int parseUint64(const char *str, uint64_t *num) -{ - unsigned long value = *num; - int result = stringToUnsignedLong(str, &value); - *num = value; - return result; -} diff --git a/uds/stringUtils.c b/uds/stringUtils.c deleted file mode 100644 index 93d7da1..0000000 --- a/uds/stringUtils.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/stringUtils.c#2 $ - */ - -#include "stringUtils.h" - -#include "errors.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "uds.h" - -/*****************************************************************************/ -int allocSprintf(const char *what, char **strp, const char *fmt, ...) -{ - if (strp == NULL) { - return UDS_INVALID_ARGUMENT; - } - va_list args; -#ifdef __KERNEL__ - // We want the memory allocation to use our own ALLOCATE/FREE wrappers. - va_start(args, fmt); - int count = vsnprintf(NULL, 0, fmt, args) + 1; - va_end(args); - int result = ALLOCATE(count, char, what, strp); - if (result == UDS_SUCCESS) { - va_start(args, fmt); - vsnprintf(*strp, count, fmt, args); - va_end(args); - } -#else - va_start(args, fmt); - int result = vasprintf(strp, fmt, args) == -1 ? 
ENOMEM : UDS_SUCCESS; - va_end(args); -#endif - if ((result != UDS_SUCCESS) && (what != NULL)) { - logError("cannot allocate %s", what); - } - return result; -} - -/*****************************************************************************/ -int wrapVsnprintf(const char *what, char *buf, size_t bufSize, - int error, const char *fmt, va_list ap, size_t *needed) -{ - if (buf == NULL) { - static char nobuf[1]; - buf = nobuf; - bufSize = 0; - } - int n = vsnprintf(buf, bufSize, fmt, ap); - if (n < 0) { - return logErrorWithStringError(UDS_UNEXPECTED_RESULT, - "%s: vsnprintf failed", what); - } - if (needed) { - *needed = n; - } - if (((size_t) n >= bufSize) && (buf != NULL) && (error != UDS_SUCCESS)) { - return logErrorWithStringError(error, "%s: string too long", what); - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int fixedSprintf(const char *what, - char *buf, - size_t bufSize, - int error, - const char *fmt, - ...) -{ - if (buf == NULL) { - return UDS_INVALID_ARGUMENT; - } - va_list args; - va_start(args, fmt); - int result = wrapVsnprintf(what, buf, bufSize, error, fmt, args, NULL); - va_end(args); - return result; -} - -/*****************************************************************************/ -char *vAppendToBuffer(char *buffer, - char *bufEnd, - const char *fmt, - va_list args) -{ - size_t n = vsnprintf(buffer, bufEnd - buffer, fmt, args); - if (n >= (size_t) (bufEnd - buffer)) { - buffer = bufEnd; - } else { - buffer += n; - } - return buffer; -} - -/*****************************************************************************/ -char *appendToBuffer(char *buffer, char *bufEnd, const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - char *pos = vAppendToBuffer(buffer, bufEnd, fmt, ap); - va_end(ap); - return pos; -} - -/*****************************************************************************/ -int stringToSignedInt(const char *nptr, int *num) -{ - long value; - int result = stringToSignedLong(nptr, &value); - if (result != UDS_SUCCESS) { - return result; - } - if ((value < INT_MIN) || (value > INT_MAX)) { - return ERANGE; - } - *num = (int) value; - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int stringToUnsignedInt(const char *nptr, unsigned int *num) -{ - unsigned long value; - int result = stringToUnsignedLong(nptr, &value); - if (result != UDS_SUCCESS) { - return result; - } - if (value > UINT_MAX) { - return ERANGE; - } - *num = (unsigned int) value; - return UDS_SUCCESS; -} diff --git a/uds/stringUtils.h b/uds/stringUtils.h deleted file mode 100644 index bd685bb..0000000 --- a/uds/stringUtils.h +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
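A small usage sketch of the helpers defined above; formatLabel() is illustrative and not part of UDS. It builds a message in a fixed buffer with appendToBuffer() and then copies it to the heap with allocSprintf().

#include "memoryAlloc.h" /* FREE() */
#include "stringUtils.h"
#include "uds.h"         /* UDS_SUCCESS */

static int formatLabel(unsigned int zone, uint64_t chapter)
{
  char buf[64];
  char *end = buf + sizeof(buf);
  // appendToBuffer() never writes past 'end'; overlong output is silently
  // truncated and the returned position is clamped to 'end'.
  char *pos = appendToBuffer(buf, end, "zone %u", zone);
  pos = appendToBuffer(pos, end, ", chapter %llu",
                       (unsigned long long) chapter);

  // allocSprintf() allocates exactly enough space for the formatted string.
  char *label = NULL;
  int result = allocSprintf(__func__, &label, "sparse cache %s", buf);
  if (result != UDS_SUCCESS) {
    return result;
  }
  FREE(label);
  return UDS_SUCCESS;
}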
- * - * $Id: //eng/uds-releases/jasper/src/uds/stringUtils.h#2 $ - */ - -#ifndef STRING_UTILS_H -#define STRING_UTILS_H - -#include -#ifdef __KERNEL__ -#include -#include -#else -#include // for vsnprintf -#include // for strtol -#include -#include -#endif - -#include "compiler.h" -#include "typeDefs.h" - -/** - * Convert a boolean value to its corresponding "true" or "false" string. - * - * @param value The boolean value to convert - * - * @return "true" if value is true, "false" otherwise. - **/ -static INLINE const char *boolToString(bool value) -{ - return (value ? "true" : "false"); -} - -/** - * Allocate a string built according to format (our version of asprintf). - * - * @param [in] what A description of what is being allocated, for error - * logging; if NULL doesn't log anything. - * @param [out] strp The pointer in which to store the allocated string. - * @param [in] fmt The sprintf format parameter. - * - * @return UDS_SUCCESS, or the appropriately translated asprintf error - **/ -int allocSprintf(const char *what, char **strp, const char *fmt, ...) - __attribute__((format(printf, 3, 4), warn_unused_result)); - -/** - * Write a printf-style string into a fixed-size buffer, returning - * errors if it would not fit. (our version of snprintf) - * - * @param [in] what A description of what is being written, for error - * logging; if NULL doesn't log anything. - * @param [out] buf The target buffer - * @param [in] bufSize The size of buf - * @param [in] error Error code to return on overflow - * @param [in] fmt The sprintf format parameter. - * @return UDS_SUCCESS or error - **/ -int fixedSprintf(const char *what, char *buf, size_t bufSize, - int error, const char *fmt, ...) - __attribute__((format(printf, 5, 6), warn_unused_result)); - -/** - * Write printf-style string into an existing buffer, returning a specified - * error code if it would not fit, and setting ``needed`` to the amount of - * space that would be required. - * - * @param [in] what A description of what is being written, for logging. - * @param [in] buf The buffer in which to write the string, or NULL to - * merely determine the required space. - * @param [in] bufSize The size of buf. - * @param [in] error The error code to return for exceeding the specified - * space, UDS_SUCCESS if no logging required. - * @param [in] fmt The sprintf format specification. - * @param [in] ap The variable argument pointer (see ). - * @param [out] needed If non-NULL, the actual amount of string space - * required, which may be smaller or larger than bufSize. - * - * @return UDS_SUCCESS if the string fits, the value of the error parameter if - * the string does not fit and a buffer was supplied, or - * UDS_UNEXPECTED_RESULT if vsnprintf fails in some other undocumented - * way. - **/ -int wrapVsnprintf(const char *what, char *buf, size_t bufSize, - int error, const char *fmt, va_list ap, size_t *needed) - __attribute__((format(printf, 5, 0), warn_unused_result)); - -/** - * Helper to append a string to a buffer. - * - * @param buffer the place at which to append the string - * @param bufEnd pointer to the end of the buffer - * @param fmt a printf format string - * - * @return the updated buffer position after the append - * - * if insufficient space is available, the contents are silently truncated - **/ -char *appendToBuffer(char *buffer, char *bufEnd, const char *fmt, ...) - __attribute__((format(printf, 3, 4))); - -/** - * Variable-arglist helper to append a string to a buffer. 
- * - * @param buffer the place at which to append the string - * @param bufEnd pointer to the end of the buffer - * @param fmt a printf format string - * @param args printf arguments - * - * @return the updated buffer position after the append - * - * if insufficient space is available, the contents are silently truncated - **/ -char *vAppendToBuffer(char *buffer, - char *bufEnd, - const char *fmt, - va_list args) - __attribute__((format(printf, 3, 0))); - -/** - * Our version of strtok_r, since some platforma apparently don't define it. - * - * @param str On first call, the string to tokenize. On subsequent - * calls, NULL. - * @param delims The set of delimiter characters. - * @param statePtr The address of a variable which holds the state of - * the tokenization between calls to nextToken. - * - * @return the next token if any, or NULL - **/ -char *nextToken(char *str, const char *delims, char **statePtr); - -/** - * Parse a string representing a decimal uint64_t. - * - * @param str The string. - * @param num Where to put the number. - * - * @return UDS_SUCCESS or the error UDS_INVALID_ARGUMENT if the string - * is not in the correct format. - **/ -int parseUint64(const char *str, uint64_t *num) - __attribute__((warn_unused_result)); - -/** - * Attempt to convert a string to an integer (base 10) - * - * @param nptr Pointer to string to convert - * @param num The resulting integer - * - * @return UDS_SUCCESS or an error code - **/ -int stringToSignedInt(const char *nptr, int *num) - __attribute__((warn_unused_result)); - -/** - * Attempt to convert a string to a long integer (base 10) - * - * @param nptr Pointer to string to convert - * @param num The resulting long integer - * - * @return UDS_SUCCESS or an error code - **/ -int stringToSignedLong(const char *nptr, long *num) - __attribute__((warn_unused_result)); - -/** - * Attempt to convert a string to an unsigned integer (base 10). - * - * @param nptr Pointer to string to convert - * @param num The resulting unsigned integer - * - * @return UDS_SUCCESS or an error code - **/ -int stringToUnsignedInt(const char *nptr, unsigned int *num) - __attribute__((warn_unused_result)); - -/** - * Attempt to convert a string to an unsigned long integer (base 10). - * - * @param nptr Pointer to string to convert - * @param num The resulting long unsigned integer - * - * @return UDS_SUCCESS or an error code - **/ -int stringToUnsignedLong(const char *nptr, unsigned long *num) - __attribute__((warn_unused_result)); - -#endif /* STRING_UTILS_H */ diff --git a/uds/sysfs.c b/uds/sysfs.c deleted file mode 100644 index b2009d7..0000000 --- a/uds/sysfs.c +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
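A short sketch of nextToken() and parseUint64() used together; sumFields() is an invented name, and, like strtok_r(), nextToken() modifies the writable string it is given.

#include "stringUtils.h"
#include "uds.h" /* UDS_SUCCESS */

static int sumFields(char *line, uint64_t *total)
{
  *total = 0;
  char *state = NULL;
  char *token;
  for (token = nextToken(line, ", ", &state);
       token != NULL;
       token = nextToken(NULL, ", ", &state)) {
    uint64_t value = 0;
    int result = parseUint64(token, &value);
    if (result != UDS_SUCCESS) {
      return result;
    }
    *total += value;
  }
  return UDS_SUCCESS;
}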
- * - * $Id: //eng/uds-releases/jasper/kernelLinux/uds/sysfs.c#4 $ - */ - -#include "sysfs.h" - -#include -#include -#include - -#include "logger.h" -#include "memoryAlloc.h" -#include "stringUtils.h" -#include "uds.h" - -static struct { - struct kobject kobj; // /sys/uds - struct kobject parameterKobj; // /sys/uds/parameter - // These flags are used to ensure a clean shutdown - bool flag; // /sys/uds - bool parameterFlag; // /sys/uds/parameter -} objectRoot; - -/**********************************************************************/ -static char *bufferToString(const char *buf, size_t length) -{ - char *string; - if (ALLOCATE(length + 1, char, __func__, &string) != UDS_SUCCESS) { - return NULL; - } - memcpy(string, buf, length); - string[length] = '\0'; - if (string[length - 1] == '\n') { - string[length - 1] = '\0'; - } - return string; -} - -/**********************************************************************/ -// This is the code for a directory in the /sys/ tree that -// contains no regular files (only subdirectories). -/**********************************************************************/ - -/**********************************************************************/ -static void emptyRelease(struct kobject *kobj) -{ - // Many of our sysfs share this release function that does nothing. -} - -/**********************************************************************/ -static ssize_t emptyShow(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - return 0; -} - -/**********************************************************************/ -static ssize_t emptyStore(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t length) -{ - return length; -} - -static struct sysfs_ops emptyOps = { - .show = emptyShow, - .store = emptyStore, -}; - -static struct attribute *emptyAttrs[] = { - NULL, -}; - -static struct kobj_type emptyObjectType = { - .release = emptyRelease, - .sysfs_ops = &emptyOps, - .default_attrs = emptyAttrs, -}; - - -/**********************************************************************/ -// This is the the code for the /sys//parameter directory. -// -//

/log_level UDS_LOG_LEVEL -// -/**********************************************************************/ - -typedef struct { - struct attribute attr; - const char *(*showString)(void); - void (*storeString)(const char *); -} ParameterAttribute; - -/**********************************************************************/ -static ssize_t parameterShow(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - ParameterAttribute *pa = container_of(attr, ParameterAttribute, attr); - if (pa->showString != NULL) { - return sprintf(buf, "%s\n", pa->showString()); - } else { - return -EINVAL; - } -} - -/**********************************************************************/ -static ssize_t parameterStore(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t length) -{ - ParameterAttribute *pa = container_of(attr, ParameterAttribute, attr); - char *string = bufferToString(buf, length); - if (string == NULL) { - return -ENOMEM; - } - int result = UDS_SUCCESS; - if (pa->storeString != NULL) { - pa->storeString(string); - } else { - return -EINVAL; - } - FREE(string); - return result == UDS_SUCCESS ? length : result; -} - -/**********************************************************************/ - -static const char *parameterShowLogLevel(void) -{ - return priorityToString(getLogLevel()); -} - -/**********************************************************************/ - -static void parameterStoreLogLevel(const char *string) -{ - setLogLevel(stringToPriority(string)); -} - -/**********************************************************************/ - -static ParameterAttribute logLevelAttr = { - .attr = { .name = "log_level", .mode = 0600 }, - .showString = parameterShowLogLevel, - .storeString = parameterStoreLogLevel, -}; - -static struct attribute *parameterAttrs[] = { - &logLevelAttr.attr, - NULL, -}; - -static struct sysfs_ops parameterOps = { - .show = parameterShow, - .store = parameterStore, -}; - -static struct kobj_type parameterObjectType = { - .release = emptyRelease, - .sysfs_ops = ¶meterOps, - .default_attrs = parameterAttrs, -}; - -/**********************************************************************/ -int initSysfs(void) -{ - memset(&objectRoot, 0, sizeof(objectRoot)); - kobject_init(&objectRoot.kobj, &emptyObjectType); - int result = kobject_add(&objectRoot.kobj, NULL, THIS_MODULE->name); - if (result == 0) { - objectRoot.flag = true; - kobject_init(&objectRoot.parameterKobj, ¶meterObjectType); - result = kobject_add(&objectRoot.parameterKobj, &objectRoot.kobj, - "parameter"); - if (result == 0) { - objectRoot.parameterFlag = true; - } - } - if (result != 0) { - putSysfs(); - } - return result; -} - -/**********************************************************************/ -void putSysfs() -{ - if (objectRoot.parameterFlag) { - kobject_put(&objectRoot.parameterKobj); - } - if (objectRoot.flag) { - kobject_put(&objectRoot.kobj); - } -} diff --git a/uds/sysfs.h b/uds/sysfs.h deleted file mode 100644 index d5f9ccf..0000000 --- a/uds/sysfs.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
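The ParameterAttribute pattern above makes adding another tunable mostly mechanical. The sketch below is hypothetical: the "example" parameter and its show/store helpers are invented, and only the structure layout and the registration step mirror the real log_level attribute.

static const char *parameterShowExample(void)
{
  return "example-value";
}

static void parameterStoreExample(const char *string)
{
  // A real parameter would parse 'string' and apply the new setting here.
  (void) string;
}

static ParameterAttribute exampleAttr = {
  .attr        = { .name = "example", .mode = 0600 },
  .showString  = parameterShowExample,
  .storeString = parameterStoreExample,
};

// To expose it under the parameter directory, exampleAttr.attr would also be
// added to the parameterAttrs[] array above, just before its NULL terminator.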
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/kernelLinux/uds/sysfs.h#1 $ - */ - -#ifndef SYSFS_H -#define SYSFS_H - -/** - * Called when the module is loaded to initialize the /sys/\ - * tree. - * - * @return 0 on success, or non-zero on error - **/ -int initSysfs(void); - -/** - * Called when the module is being unloaded to terminate the - * /sys/\ tree. - **/ -void putSysfs(void); - -#endif /* SYSFS_H */ diff --git a/uds/threadCondVarLinuxKernel.c b/uds/threadCondVarLinuxKernel.c deleted file mode 100644 index e3c1517..0000000 --- a/uds/threadCondVarLinuxKernel.c +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/kernelLinux/uds/threadCondVarLinuxKernel.c#2 $ - */ - -#include "threads.h" -#include "timeUtils.h" -#include "uds-error.h" - -/**********************************************************************/ -int initCond(CondVar *cv) -{ - cv->eventCount = NULL; - return makeEventCount(&cv->eventCount); -} - -/**********************************************************************/ -int signalCond(CondVar *cv) -{ - eventCountBroadcast(cv->eventCount); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int broadcastCond(CondVar *cv) -{ - eventCountBroadcast(cv->eventCount); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int waitCond(CondVar *cv, Mutex *mutex) -{ - EventToken token = eventCountPrepare(cv->eventCount); - unlockMutex(mutex); - eventCountWait(cv->eventCount, token, NULL); - lockMutex(mutex); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int timedWaitCond(CondVar *cv, Mutex *mutex, RelTime timeout) -{ - EventToken token = eventCountPrepare(cv->eventCount); - unlockMutex(mutex); - bool happened = eventCountWait(cv->eventCount, token, &timeout); - lockMutex(mutex); - return happened ? UDS_SUCCESS : ETIMEDOUT; -} - -/**********************************************************************/ -int destroyCond(CondVar *cv) -{ - freeEventCount(cv->eventCount); - cv->eventCount = NULL; - return UDS_SUCCESS; -} diff --git a/uds/threadOnce.c b/uds/threadOnce.c deleted file mode 100644 index 62149ca..0000000 --- a/uds/threadOnce.c +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/threadOnce.c#1 $ - */ - -#include "errors.h" -#include "threads.h" - -enum { - ONCE_NOT_DONE = 0, - ONCE_IN_PROGRESS = 1, - ONCE_COMPLETE = 2, -}; - -/*****************************************************************************/ -int performOnce(OnceState *once, void (*function)(void)) -{ - for (;;) { - switch (atomic_cmpxchg(once, ONCE_NOT_DONE, ONCE_IN_PROGRESS)) { - case ONCE_NOT_DONE: - function(); - atomic_set_release(once, ONCE_COMPLETE); - return UDS_SUCCESS; - case ONCE_IN_PROGRESS: - yieldScheduler(); - break; - case ONCE_COMPLETE: - return UDS_SUCCESS; - default: - return UDS_BAD_STATE; - } - } -} diff --git a/uds/threadOnce.h b/uds/threadOnce.h deleted file mode 100644 index 58b6da3..0000000 --- a/uds/threadOnce.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/threadOnce.h#1 $ - */ - -#ifndef THREAD_ONCE_H -#define THREAD_ONCE_H - -#include "atomicDefs.h" - -#define ONCE_STATE_INITIALIZER ATOMIC_INIT(0) - -typedef atomic_t OnceState; - -/** - * Thread safe once only initialization. - * - * @param onceState pointer to object to record that initialization - * has been performed - * @param initFunction called if onceState does not indicate - * initialization has been performed - * - * @return UDS_SUCCESS or error code - * - * @note Generally the following declaration of onceState is performed in - * at file scope: - * - * static OnceState onceState = ONCE_STATE_INITIALIZER; - **/ -int performOnce(OnceState *onceState, void (*initFunction) (void)); - -#endif /* THREAD_ONCE_H */ diff --git a/uds/threadRegistry.c b/uds/threadRegistry.c deleted file mode 100644 index c37e77a..0000000 --- a/uds/threadRegistry.c +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
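A minimal sketch of the intended performOnce() usage, following the file-scope declaration note in threadOnce.h above; openIndexLayer() and initGlobalState() are hypothetical names.

#include "threads.h" /* OnceState, performOnce(), and the error codes */

static OnceState initOnce = ONCE_STATE_INITIALIZER;

static void initGlobalState(void)
{
  // One-time setup would go here; performOnce() guarantees it runs exactly
  // once even if many threads race into openIndexLayer() below.
}

static int openIndexLayer(void)
{
  int result = performOnce(&initOnce, initGlobalState);
  if (result != UDS_SUCCESS) {
    return result;
  }
  // The caller can now rely on the one-time setup having completed.
  return UDS_SUCCESS;
}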
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/kernelLinux/uds/threadRegistry.c#1 $ - */ - -#include "threadRegistry.h" - -#include -#include - -#include "permassert.h" - -/* - * We need to be careful when using other facilities that may use - * threadRegistry functions in their normal operation. For example, - * we do not want to invoke the logger while holding a lock. - */ - -/*****************************************************************************/ -void registerThread(ThreadRegistry *registry, - RegisteredThread *newThread, - const void *pointer) -{ - INIT_LIST_HEAD(&newThread->links); - newThread->pointer = pointer; - newThread->task = current; - - bool foundIt = false; - RegisteredThread *thread; - write_lock(®istry->lock); - list_for_each_entry(thread, ®istry->links, links) { - if (thread->task == current) { - // This should not have been there. - // We'll complain after releasing the lock. - list_del_init(&thread->links); - foundIt = true; - break; - } - } - list_add_tail(&newThread->links, ®istry->links); - write_unlock(®istry->lock); - ASSERT_LOG_ONLY(!foundIt, "new thread not already in registry"); -} - -/*****************************************************************************/ -void unregisterThread(ThreadRegistry *registry) -{ - bool foundIt = false; - RegisteredThread *thread; - write_lock(®istry->lock); - list_for_each_entry(thread, ®istry->links, links) { - if (thread->task == current) { - list_del_init(&thread->links); - foundIt = true; - break; - } - } - write_unlock(®istry->lock); - ASSERT_LOG_ONLY(foundIt, "thread found in registry"); -} - -/*****************************************************************************/ -void initializeThreadRegistry(ThreadRegistry *registry) -{ - INIT_LIST_HEAD(®istry->links); - rwlock_init(®istry->lock); -} - -/*****************************************************************************/ -const void *lookupThread(ThreadRegistry *registry) -{ - const void *result = NULL; - read_lock(®istry->lock); - RegisteredThread *thread; - list_for_each_entry(thread, ®istry->links, links) { - if (thread->task == current) { - result = thread->pointer; - break; - } - } - read_unlock(®istry->lock); - return result; -} diff --git a/uds/threadRegistry.h b/uds/threadRegistry.h deleted file mode 100644 index ec1832d..0000000 --- a/uds/threadRegistry.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/kernelLinux/uds/threadRegistry.h#1 $ - */ - -#ifndef THREAD_REGISTRY_H -#define THREAD_REGISTRY_H 1 - -#include -#include - -/* - * We don't expect this set to ever get really large, so a linked list - * is adequate. - */ - -typedef struct threadRegistry { - struct list_head links; - rwlock_t lock; -} ThreadRegistry; - -typedef struct registeredThread { - struct list_head links; - const void *pointer; - struct task_struct *task; -} RegisteredThread; - -/*****************************************************************************/ - -/** - * Initialize a registry of threads and associated data pointers. - * - * @param registry The registry to initialize - **/ -void initializeThreadRegistry(ThreadRegistry *registry); - -/** - * Register the current thread and associate it with a data pointer. - * - * This call will log messages if the thread is already registered. - * - * @param registry The thread registry - * @param newThread RegisteredThread structure to use for the current thread - * @param pointer The value to associated with the current thread - **/ -void registerThread(ThreadRegistry *registry, - RegisteredThread *newThread, - const void *pointer); - -/** - * Remove the registration for the current thread. - * - * A message may be logged if the thread was not registered. - * - * @param registry The thread registry - **/ -void unregisterThread(ThreadRegistry *registry); - -/** - * Fetch a pointer that may have been registered for the current - * thread. If the thread is not registered, a null pointer is - * returned. - * - * @param registry The thread registry - * - * @return the registered pointer, if any, or NULL - **/ -const void *lookupThread(ThreadRegistry *registry); - -#endif /* THREAD_REGISTRY_H */ diff --git a/uds/threads.h b/uds/threads.h deleted file mode 100644 index 793355c..0000000 --- a/uds/threads.h +++ /dev/null @@ -1,453 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
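A minimal sketch of the registry interface above; WorkerContext, workerRegistry, and runWorker() are illustrative, and initializeThreadRegistry() is assumed to have been called once before any thread registers itself.

#include "threadRegistry.h"

typedef struct workerContext {
  int zoneNumber;
} WorkerContext;

static ThreadRegistry workerRegistry;

static void runWorker(WorkerContext *context)
{
  // The RegisteredThread must stay in scope for as long as the thread
  // remains registered.
  RegisteredThread self;
  registerThread(&workerRegistry, &self, context);

  // Anywhere on this thread, code that can see only the registry can
  // recover the pointer that was registered above.
  const WorkerContext *found = lookupThread(&workerRegistry);
  (void) found;

  unregisterThread(&workerRegistry);
}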
- * - * $Id: //eng/uds-releases/jasper/src/uds/threads.h#4 $ - */ - -#ifndef THREADS_H -#define THREADS_H - -#include "compiler.h" -#include "threadOnce.h" -#include "timeUtils.h" -#include "uds-error.h" - -#ifdef __KERNEL__ -#include -#include -#include -#include "util/eventCount.h" -#else -#include -#include -#include -#endif - -#ifdef __KERNEL__ -typedef struct { EventCount *eventCount; } CondVar; -typedef struct mutex Mutex; -typedef struct semaphore Semaphore; -typedef struct kernelThread *Thread; -typedef pid_t ThreadId; - -typedef struct { - Semaphore mutex; // Mutex for this barrier object - Semaphore wait; // Semaphore for threads waiting at the barrier - int arrived; // Number of threads which have arrived - int threadCount; // Total number of threads using this barrier -} Barrier; -#else -typedef pthread_barrier_t Barrier; -typedef pthread_cond_t CondVar; -typedef pthread_mutex_t Mutex; -typedef sem_t Semaphore; -typedef pthread_t Thread; -typedef pid_t ThreadId; - -#ifndef NDEBUG -#define MUTEX_INITIALIZER PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP -#else -#define MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER -#endif - -extern const bool DO_ASSERTIONS; -#endif - -#ifdef __KERNEL__ -/** - * Apply a function to every thread that we have created. - * - * @param applyFunc The function to apply - * @param argument The first argument to applyFunc - * - **/ -void applyToThreads(void applyFunc(void *, struct task_struct *), - void *argument); -#endif - -/** - * Create a thread, logging any cause of failure. - * - * @param threadFunc function to run in new thread - * @param threadData private data for new thread - * @param name name of the new thread - * @param newThread where to store the new thread id - * - * @return success or failure indication - **/ -int createThread(void (*threadFunc)(void *), - void *threadData, - const char *name, - Thread *newThread) - __attribute__((warn_unused_result)); - -/** - * Retrieve the current numbers of cores. - * - * This is either the total number or the number of cores that this - * process has been limited to. - * - * @return number of cores - **/ -unsigned int getNumCores(void); - -/** - * Return the id of the current thread. - * - * @return the thread id - **/ -ThreadId getThreadId(void) __attribute__((warn_unused_result)); - -#ifndef __KERNEL__ -/** - * Get the name of the current thread. - * - * @param name a buffer of size at least 16 to write the name to - **/ -void getThreadName(char *name); -#endif - -/** - * Wait for termination of another thread. - * - * - * @param th The thread for which to wait. - * - * @return UDS_SUCCESS or error code - **/ -int joinThreads(Thread th); - -#ifdef __KERNEL__ -/** - * Exit the current thread. This is a kernel-only function that is intended to - * be an alternative to using BUG() or BUG_ON(). - **/ -__attribute__((noreturn)) -void exitThread(void); -#endif - -/** - * Initialize a thread synchronization barrier (also known as a rendezvous). - * - * @param barrier the barrier to initialize - * @param threadCount the number of threads that must enter the barrier before - * any threads are permitted to leave it - * - * @return UDS_SUCCESS or an error code - **/ -int initializeBarrier(Barrier *barrier, unsigned int threadCount) - __attribute__((warn_unused_result)); - -/** - * Destroy a thread synchronization barrier. 
- * - * @param barrier the barrier to destroy - * - * @return UDS_SUCCESS or an error code - **/ -int destroyBarrier(Barrier *barrier); - -/** - * Enter a thread synchronization barrier, waiting for the configured number - * of threads to have entered before exiting the barrier. Exactly one thread - * will be arbitrarily selected to be flagged as the "winner" of a barrier. - * - * @param barrier the barrier to enter - * @param winner if non-NULL, a pointer to the flag indicating whether the - * calling thread was the unique winner - * - * @return UDS_SUCCESS or an error code - **/ -int enterBarrier(Barrier *barrier, bool *winner); - -/** - * Initialize a condition variable with default attributes. - * - * @param cond condition variable to init - * - * @return UDS_SUCCESS or error code - **/ -int initCond(CondVar *cond) __attribute__((warn_unused_result)); - -/** - * Signal a condition variable. - * - * @param cond condition variable to signal - * - * @return UDS_SUCCESS or error code - **/ -int signalCond(CondVar *cond); - -/** - * Broadcast a condition variable. - * - * @param cond condition variable to broadcast - * - * @return UDS_SUCCESS or error code - **/ -int broadcastCond(CondVar *cond); - -/** - * Wait on a condition variable. - * - * @param cond condition variable to wait on - * @param mutex mutex to release while waiting - * - * @return UDS_SUCCESS or error code - **/ -int waitCond(CondVar *cond, Mutex *mutex); - -/** - * Wait on a condition variable with a timeout. - * - * @param cond condition variable to wait on - * @param mutex mutex to release while waiting - * @param timeout the relative time until the timeout expires - * - * @return error code (ETIMEDOUT if the deadline is hit) - **/ -int timedWaitCond(CondVar *cond, Mutex *mutex, RelTime timeout); - -/** - * Destroy a condition variable. - * - * @param cond condition variable to destroy - * - * @return UDS_SUCCESS or error code - **/ -int destroyCond(CondVar *cond); - -#ifndef __KERNEL__ -/** - * Initialize a mutex, optionally asserting if the mutex initialization fails. - * This function should only be called directly in places where making - * assertions is not safe. - * - * @param mutex the mutex to initialize - * @param assertOnError if true, an error initializing the - * mutex will make an assertion - * - * @return UDS_SUCCESS or an error code - **/ -int initializeMutex(Mutex *mutex, bool assertOnError); -#endif - -/** - * Initialize the default type (error-checking during development) mutex. - * - * @param mutex the mutex to initialize - * - * @return UDS_SUCCESS or an error code - **/ -__attribute__((warn_unused_result)) -#ifdef __KERNEL__ -static INLINE int initMutex(Mutex *mutex) -{ - mutex_init(mutex); - return UDS_SUCCESS; -} -#else -int initMutex(Mutex *mutex); -#endif - -/** - * Destroy a mutex (with error checking during development). - * - * @param mutex mutex to destroy - * - * @return UDS_SUCCESS or error code - **/ -#ifdef __KERNEL__ -static INLINE int destroyMutex(Mutex *mutex) -{ - return UDS_SUCCESS; -} -#else -int destroyMutex(Mutex *mutex); -#endif - -/** - * Lock a mutex, with optional error checking during development. - * - * @param mutex mutex to lock - **/ -#ifdef __KERNEL__ -static INLINE void lockMutex(Mutex *mutex) -{ - mutex_lock(mutex); -} -#else -void lockMutex(Mutex *mutex); -#endif - -/** - * Unlock a mutex, with optional error checking during development. 
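A minimal sketch of the usual mutex/condition-variable wait pattern these declarations support (illustrative only; it assumes initMutex() and initCond() have already been called, and the ready flag is hypothetical):

    #include "threads.h"

    static Mutex mutex;
    static CondVar cond;
    static bool ready;

    static void waitUntilReady(void)
    {
      lockMutex(&mutex);
      while (!ready) {          /* re-check the predicate after every wakeup */
        waitCond(&cond, &mutex);
      }
      unlockMutex(&mutex);
    }

    static void markReady(void)
    {
      lockMutex(&mutex);
      ready = true;
      broadcastCond(&cond);     /* wake every waiter */
      unlockMutex(&mutex);
    }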
- * - * @param mutex mutex to unlock - **/ -#ifdef __KERNEL__ -static INLINE void unlockMutex(Mutex *mutex) -{ - mutex_unlock(mutex); -} -#else -void unlockMutex(Mutex *mutex); -#endif - -/** - * Initialize a semaphore used among threads in the same process. - * - * @param semaphore the semaphore to initialize - * @param value the initial value of the semaphore - * - * @return UDS_SUCCESS or an error code - **/ -__attribute__((warn_unused_result)) -#ifdef __KERNEL__ -static INLINE int initializeSemaphore(Semaphore *semaphore, unsigned int value) -{ - sema_init(semaphore, value); - return UDS_SUCCESS; -} -#else -int initializeSemaphore(Semaphore *semaphore, unsigned int value); -#endif - -/** - * Destroy a semaphore used among threads in the same process. - * - * @param semaphore the semaphore to destroy - * - * @return UDS_SUCCESS or an error code - **/ -#ifdef __KERNEL__ -static INLINE int destroySemaphore(Semaphore *semaphore) -{ - return UDS_SUCCESS; -} -#else -int destroySemaphore(Semaphore *semaphore); -#endif - -/** - * Acquire a permit from a semaphore, waiting if none are currently available. - * - * @param semaphore the semaphore to acquire - **/ -#ifdef __KERNEL__ -static INLINE void acquireSemaphore(Semaphore *semaphore) -{ - // Do not use down(semaphore). Instead use down_interruptible so that we do - // not get 120 second stall messages in kern.log. - while (down_interruptible(semaphore) != 0) { - } -} -#else -void acquireSemaphore(Semaphore *semaphore); -#endif - -/** - * Attempt to acquire a permit from a semaphore. - * - * If a permit is available, it is claimed and the function immediately - * returns true. If a timeout is zero or negative, the function immediately - * returns false. Otherwise, this will wait either a permit to become - * available (returning true) or the relative timeout to expire (returning - * false). - * - * @param semaphore the semaphore to decrement - * @param timeout the relative time until the timeout expires - * - * @return true if a permit was acquired, otherwise false - **/ -__attribute__((warn_unused_result)) -#ifdef __KERNEL__ -static INLINE bool attemptSemaphore(Semaphore *semaphore, RelTime timeout) -{ - if (timeout <= 0) { - // No timeout, just try to grab the semaphore. - return down_trylock(semaphore) == 0; - } else { - unsigned int jiffies = usecs_to_jiffies(relTimeToMicroseconds(timeout)); - return down_timeout(semaphore, jiffies) == 0; - } -} -#else -bool attemptSemaphore(Semaphore *semaphore, RelTime timeout); -#endif - -/** - * Release a semaphore, incrementing the number of available permits. - * - * @param semaphore the semaphore to increment - **/ -#ifdef __KERNEL__ -static INLINE void releaseSemaphore(Semaphore *semaphore) -{ - up(semaphore); -} -#else -void releaseSemaphore(Semaphore *semaphore); -#endif - -/** - * Yield the time slice in the given thread. - * - * @return UDS_SUCCESS or an error code - **/ -int yieldScheduler(void); - -#ifndef __KERNEL__ -/** - * Allocate a thread specific key for thread specific data. - * - * @param key points to location for new key - * @param destr_function destructor function called when thread exits - * - * @return UDS_SUCCESS or error code - **/ -int createThreadKey(pthread_key_t *key, void (*destr_function) (void *)); - -/** - * Delete a thread specific key for thread specific data. - * - * @param key key to delete - * - * @return UDS_SUCCESS or error code - **/ -int deleteThreadKey(pthread_key_t key); - -/** - * Set pointer for thread specific data. 
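The semaphore calls above compose in the obvious way; a minimal sketch (illustrative only) that waits up to one second for a permit and releases it afterwards:

    static Semaphore permits;  /* set up earlier with initializeSemaphore(&permits, count) */

    static bool takePermit(void)
    {
      /* Returns false if no permit became available within one second. */
      return attemptSemaphore(&permits, secondsToRelTime(1));
    }

    static void returnPermit(void)
    {
      releaseSemaphore(&permits);
    }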
- * - * @param key key to be associated with pointer - * @param pointer data associated with key - * - * @return UDS_SUCCESS or error code - **/ -int setThreadSpecific(pthread_key_t key, const void *pointer); - -/** - * Get pointer for thread specific data. - * - * @param key key identifying the thread specific data - **/ -void *getThreadSpecific(pthread_key_t key); -#endif - -#endif /* THREADS_H */ diff --git a/uds/threadsLinuxKernel.c b/uds/threadsLinuxKernel.c deleted file mode 100644 index 7ac972d..0000000 --- a/uds/threadsLinuxKernel.c +++ /dev/null @@ -1,220 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/kernelLinux/uds/threadsLinuxKernel.c#4 $ - */ - -#include -#include -#include - -#include "memoryAlloc.h" -#include "logger.h" -#include "threads.h" -#include "uds-error.h" - -static struct hlist_head kernelThreadList; -static struct mutex kernelThreadMutex; -static OnceState kernelThreadOnce; - -typedef struct kernelThread { - void (*threadFunc)(void *); - void *threadData; - struct hlist_node threadLinks; - struct task_struct *threadTask; - struct completion threadDone; -} KernelThread; - -/**********************************************************************/ -static void kernelThreadInit(void) -{ - mutex_init(&kernelThreadMutex); -} - -/**********************************************************************/ -static int threadStarter(void *arg) -{ - KernelThread *kt = arg; - kt->threadTask = current; - performOnce(&kernelThreadOnce, kernelThreadInit); - mutex_lock(&kernelThreadMutex); - hlist_add_head(&kt->threadLinks, &kernelThreadList); - mutex_unlock(&kernelThreadMutex); - RegisteredThread allocatingThread; - registerAllocatingThread(&allocatingThread, NULL); - kt->threadFunc(kt->threadData); - unregisterAllocatingThread(); - complete(&kt->threadDone); - return 0; -} - -/**********************************************************************/ -int createThread(void (*threadFunc)(void *), - void *threadData, - const char *name, - Thread *newThread) -{ - char *nameColon = strchr(name, ':'); - char *myNameColon = strchr(current->comm, ':'); - KernelThread *kt; - int result = ALLOCATE(1, KernelThread, __func__, &kt); - if (result != UDS_SUCCESS) { - logWarning("Error allocating memory for %s", name); - return result; - } - kt->threadFunc = threadFunc; - kt->threadData = threadData; - init_completion(&kt->threadDone); - struct task_struct *thread; - /* - * Start the thread, with an appropriate thread name. - * - * If the name supplied contains a colon character, use that name. This - * causes uds module threads to have names like "uds:callbackW" and the main - * test runner thread to be named "zub:runtest". 
- * - * Otherwise if the current thread has a name containing a colon character, - * prefix the name supplied with the name of the current thread up to (and - * including) the colon character. Thus when the "kvdo0:dedupeQ" thread - * opens an index session, all the threads associated with that index will - * have names like "kvdo0:foo". - * - * Otherwise just use the name supplied. This should be a rare occurrence. - */ - if ((nameColon == NULL) && (myNameColon != NULL)) { - thread = kthread_run(threadStarter, kt, "%.*s:%s", - (int) (myNameColon - current->comm), current->comm, - name); - } else { - thread = kthread_run(threadStarter, kt, "%s", name); - } - if (IS_ERR(thread)) { - FREE(kt); - return UDS_ENOTHREADS; - } - *newThread = kt; - return UDS_SUCCESS; -} -/**********************************************************************/ -int joinThreads(Thread kt) -{ - while (wait_for_completion_interruptible(&kt->threadDone) != 0) { - } - mutex_lock(&kernelThreadMutex); - hlist_del(&kt->threadLinks); - mutex_unlock(&kernelThreadMutex); - FREE(kt); - return UDS_SUCCESS; -} - -/**********************************************************************/ -void applyToThreads(void applyFunc(void *, struct task_struct *), - void *argument) -{ - KernelThread *kt; - performOnce(&kernelThreadOnce, kernelThreadInit); - mutex_lock(&kernelThreadMutex); - hlist_for_each_entry(kt, &kernelThreadList, threadLinks) { - applyFunc(argument, kt->threadTask); - } - mutex_unlock(&kernelThreadMutex); -} - -/**********************************************************************/ -void exitThread(void) -{ - KernelThread *kt; - struct completion *completion = NULL; - performOnce(&kernelThreadOnce, kernelThreadInit); - mutex_lock(&kernelThreadMutex); - hlist_for_each_entry(kt, &kernelThreadList, threadLinks) { - if (kt->threadTask == current) { - completion = &kt->threadDone; - break; - } - } - mutex_unlock(&kernelThreadMutex); - unregisterAllocatingThread(); - complete_and_exit(completion, 1); -} - -/**********************************************************************/ -ThreadId getThreadId(void) -{ - return current->pid; -} - -/**********************************************************************/ -unsigned int getNumCores(void) -{ - return num_online_cpus(); -} - -/**********************************************************************/ -int initializeBarrier(Barrier *barrier, unsigned int threadCount) -{ - barrier->arrived = 0; - barrier->threadCount = threadCount; - int result = initializeSemaphore(&barrier->mutex, 1); - if (result != UDS_SUCCESS) { - return result; - } - return initializeSemaphore(&barrier->wait, 0); -} - -/**********************************************************************/ -int destroyBarrier(Barrier *barrier) -{ - int result = destroySemaphore(&barrier->mutex); - if (result != UDS_SUCCESS) { - return result; - } - return destroySemaphore(&barrier->wait); -} - -/**********************************************************************/ -int enterBarrier(Barrier *barrier, bool *winner) -{ - acquireSemaphore(&barrier->mutex); - bool lastThread = ++barrier->arrived == barrier->threadCount; - if (lastThread) { - // This is the last thread to arrive, so wake up the others - int i; - for (i = 1; i < barrier->threadCount; i++) { - releaseSemaphore(&barrier->wait); - } - // Then reinitialize for the next cycle - barrier->arrived = 0; - releaseSemaphore(&barrier->mutex); - } else { - // This is NOT the last thread to arrive, so just wait - releaseSemaphore(&barrier->mutex); - 
acquireSemaphore(&barrier->wait); - } - if (winner != NULL) { - *winner = lastThread; - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int yieldScheduler(void) -{ - yield(); - return UDS_SUCCESS; -} diff --git a/uds/timeUtils.c b/uds/timeUtils.c deleted file mode 100644 index ddf3b2b..0000000 --- a/uds/timeUtils.c +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/timeUtils.c#4 $ - */ - -#include "stringUtils.h" -#include "timeUtils.h" - -#ifdef __KERNEL__ -#include -#include // for getnstimeofday on Vivid -#else -#include -#endif - -#ifndef __KERNEL__ -static const struct timespec invalidTime = { - .tv_sec = -1, - .tv_nsec = LONG_MAX -}; - -static const long BILLION = 1000 * 1000 * 1000; -#endif - -#ifndef __KERNEL__ -/*****************************************************************************/ -AbsTime currentTime(clockid_t clock) -{ - struct timespec ts; - if (clock_gettime(clock, &ts) != 0) { - ts = invalidTime; - } - return ts; -} -#endif - -#ifndef __KERNEL__ -/*****************************************************************************/ -/** - * Return a time offset from the specified time. - * - * @param time A time. - * @param reltime The relative time - * - * @return the sum of the time and the offset, possibly rounded up to the - * next representable instant. - * - * @note timeDifference(a, deltaTime(a, n)) may only be approx == -n - * depending on the system-specific time resolution - **/ -static AbsTime deltaTime(AbsTime time, RelTime reltime) -{ - if (!isValidTime(time)) { - return time; - } - if ((reltime >= 0) && (reltime < 10 * BILLION)) { - reltime += time.tv_nsec; - while (reltime >= BILLION) { - reltime -= BILLION; - time.tv_sec++; - } - time.tv_nsec = reltime; - return time; - } - // may not be accurate for times before the Epoch... - // (is the ns time positive or negative for negative time_t?) 
- int64_t ns = time.tv_sec * BILLION + time.tv_nsec; - if ((ns < INT64_MIN / 2) || - (ns > INT64_MAX / 2) || - (reltime < INT64_MIN / 2) || - (reltime > INT64_MAX / 2)) { - return invalidTime; - } - ns += reltime; - return (AbsTime) { .tv_sec = ns / BILLION, .tv_nsec = ns % BILLION }; -} -#endif - -#ifndef __KERNEL__ -/*****************************************************************************/ -AbsTime futureTime(clockid_t clock, RelTime reltime) -{ - return deltaTime(currentTime(clock), reltime); -} -#endif - -#ifndef __KERNEL__ -/*****************************************************************************/ -bool isValidTime(AbsTime time) -{ - if (time.tv_nsec < 0 || time.tv_nsec >= BILLION) { - return false; - } - return true; -} -#endif - -/*****************************************************************************/ -uint64_t nowUsec(void) -{ -#ifdef __KERNEL__ - static const AbsTime epoch = 0; -#else - static const AbsTime epoch = { 0, 0 }; -#endif - return relTimeToMicroseconds(timeDifference(currentTime(CLOCK_REALTIME), - epoch)); -} - - - -#ifndef __KERNEL__ -/*****************************************************************************/ -RelTime timeDifference(AbsTime a, AbsTime b) -{ - if (isValidTime(a) && isValidTime(b)) { - int64_t ans = a.tv_sec * BILLION + a.tv_nsec; - int64_t bns = b.tv_sec * BILLION + b.tv_nsec; - return ans - bns; - } else if (isValidTime(a)) { - return INT64_MAX; - } else if (isValidTime(b)) { - return INT64_MIN; - } else { - return 0; - } -} -#endif diff --git a/uds/timeUtils.h b/uds/timeUtils.h deleted file mode 100644 index 8d159f4..0000000 --- a/uds/timeUtils.h +++ /dev/null @@ -1,282 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/timeUtils.h#5 $ - */ - -#ifndef TIME_UTILS_H -#define TIME_UTILS_H - -#include "compiler.h" -#include "typeDefs.h" - -#ifdef __KERNEL__ -#include -#include -#else -#include -#include -#endif - -// Absolute time. -#ifdef __KERNEL__ -typedef int64_t AbsTime; -#else -typedef struct timespec AbsTime; -#endif - -// Relative time, the length of a time interval, or the difference between -// two times. A signed 64-bit number of nanoseconds. -typedef int64_t RelTime; - -#ifndef __KERNEL__ -/** - * Return true if the time is valid. - * - * @param time a time - * - * @return true if the time is valid - * - * @note an invalid time is generally returned from a failed attempt - * to get the time from the system - **/ -bool isValidTime(AbsTime time); -#endif - -/** - * Return the current time according to the specified clock type. 
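A small sketch of the elapsed-time measurement these functions are built for (illustrative only):

    AbsTime start = currentTime(CLOCK_MONOTONIC);
    /* ... the work being timed ... */
    RelTime elapsed = timeDifference(currentTime(CLOCK_MONOTONIC), start);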
- * - * @param clock Either CLOCK_REALTIME or CLOCK_MONOTONIC - * - * @return the current time according to the clock in question - * - * @note the precision of the clock is system specific - **/ -#ifdef __KERNEL__ -static INLINE AbsTime currentTime(clockid_t clock) -{ - // clock is always a constant, so gcc reduces this to a single call - return clock == CLOCK_MONOTONIC ? ktime_get_ns() : ktime_get_real_ns(); -} -#else -AbsTime currentTime(clockid_t clock); -#endif - -#ifndef __KERNEL__ -/** - * Return the timestamp a certain number of nanoseconds in the future. - * - * @param clock Either CLOCK_REALTIME or CLOCK_MONOTONIC - * @param reltime The relative time to the clock value - * - * @return the timestamp for that time (potentially rounded to the next - * representable instant for the system in question) - **/ -AbsTime futureTime(clockid_t clock, RelTime reltime); -#endif - -/** - * Return the difference between two timestamps. - * - * @param a A time - * @param b Another time, based on the same clock as a. - * - * @return the relative time between the two timestamps - **/ -#ifdef __KERNEL__ -static INLINE RelTime timeDifference(AbsTime a, AbsTime b) -{ - return a - b; -} -#else -RelTime timeDifference(AbsTime a, AbsTime b); -#endif - - - -/** - * Convert seconds to a RelTime value - * - * @param seconds A number of seconds - * - * @return the equivalent number of seconds as a RelTime - **/ -static INLINE RelTime secondsToRelTime(int64_t seconds) -{ - return (RelTime) seconds * (1000 * 1000 * 1000); -} - -/** - * Convert milliseconds to a RelTime value - * - * @param milliseconds A number of milliseconds - * - * @return the equivalent number of milliseconds as a RelTime - **/ -static INLINE RelTime millisecondsToRelTime(int64_t milliseconds) -{ - return (RelTime) milliseconds * (1000 * 1000); -} - -/** - * Convert microseconds to a RelTime value - * - * @param microseconds A number of microseconds - * - * @return the equivalent number of microseconds as a RelTime - **/ -static INLINE RelTime microsecondsToRelTime(int64_t microseconds) -{ - return (RelTime) microseconds * 1000; -} - -/** - * Convert nanoseconds to a RelTime value - * - * @param nanoseconds A number of nanoseconds - * - * @return the equivalent number of nanoseconds as a RelTime - **/ -static INLINE RelTime nanosecondsToRelTime(int64_t nanoseconds) -{ - return (RelTime) nanoseconds; -} - -/** - * Convert a RelTime value to milliseconds - * - * @param reltime The relative time - * - * @return the equivalent number of milliseconds - **/ -static INLINE int64_t relTimeToSeconds(RelTime reltime) -{ - return reltime / (1000 * 1000 * 1000); -} - -/** - * Convert a RelTime value to milliseconds - * - * @param reltime The relative time - * - * @return the equivalent number of milliseconds - **/ -static INLINE int64_t relTimeToMilliseconds(RelTime reltime) -{ - return reltime / (1000 * 1000); -} - -/** - * Convert a RelTime value to microseconds - * - * @param reltime The relative time - * - * @return the equivalent number of microseconds - **/ -static INLINE int64_t relTimeToMicroseconds(RelTime reltime) -{ - return reltime / 1000; -} - -/** - * Convert a RelTime value to nanoseconds - * - * @param reltime The relative time - * - * @return the equivalent number of nanoseconds - **/ -static INLINE int64_t relTimeToNanoseconds(RelTime reltime) -{ - return reltime; -} - -/** - * Return the wall clock time in microseconds. 
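For example (a sketch only, with cond and mutex as in the earlier threads.h sketch), a 500 millisecond timeout for timedWaitCond() or attemptSemaphore() is expressed as:

    RelTime timeout = millisecondsToRelTime(500);
    int result = timedWaitCond(&cond, &mutex, timeout);  /* ETIMEDOUT if it expires */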
The actual value is time - * since the epoch (see "man gettimeofday"), but the typical use is to call - * this twice and compute the difference, giving the elapsed time between - * the two calls. - * - * @return the time in microseconds - **/ -uint64_t nowUsec(void) __attribute__((warn_unused_result)); - -/** - * Convert from an AbsTime to a time_t - * - * @param time an AbsTime time - * - * @return a time_t time - **/ -static INLINE time_t asTimeT(AbsTime time) -{ -#ifdef __KERNEL__ - return time / 1000000000; -#else - return time.tv_sec; -#endif -} - -/** - * Convert from a time_t to an AbsTime, - * - * @param time a time_t time - * - * @return an AbsTime time - **/ -static INLINE AbsTime fromTimeT(time_t time) -{ -#ifdef __KERNEL__ - return time * 1000000000; -#else - AbsTime abs; - abs.tv_sec = time; - abs.tv_nsec = 0; - return abs; -#endif -} - -#ifndef __KERNEL__ -/** - * Convert from an AbsTime to a struct timespec - * - * @param time an AbsTime time - * - * @return a time_t time - **/ -static INLINE struct timespec asTimeSpec(AbsTime time) -{ - return time; -} -#endif - -#ifndef __KERNEL__ -/** - * Convert from an AbsTime to a struct timeval - * - * @param time an AbsTime time - * - * @return a time_t time - **/ -static INLINE struct timeval asTimeVal(AbsTime time) -{ - struct timeval tv = { time.tv_sec, time.tv_nsec / 1000 }; - return tv; -} -#endif - -#endif /* TIME_UTILS_H */ diff --git a/uds/typeDefs.h b/uds/typeDefs.h deleted file mode 100644 index 927bd23..0000000 --- a/uds/typeDefs.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/kernelLinux/uds/typeDefs.h#1 $ - */ - -#ifndef LINUX_KERNEL_TYPE_DEFS_H -#define LINUX_KERNEL_TYPE_DEFS_H - -/* - * General system type definitions. This file is parallel to the other - * typeDefs.h files in this project. We pick up what we can from the system - * include files, and explicitly define the other things we need. - */ - -#include -#include -#include - -#define CHAR_BIT 8 - -#define INT64_MAX (9223372036854775807L) -#define UCHAR_MAX ((unsigned char)~0ul) -#define UINT8_MAX ((uint8_t)~0ul) -#define UINT16_MAX ((uint16_t)~0ul) -#define UINT64_MAX ((uint64_t)~0ul) - -// Some recent versions of define this for us -#ifndef SIZE_MAX -#define SIZE_MAX ((size_t)~0ul) -#endif - -#define PRId64 "lld" -#define PRIu16 "u" -#define PRIu32 "u" -#define PRIu64 "llu" - -typedef unsigned long uintmax_t; -#define PRIuMAX "lu" - -typedef unsigned char byte; - -#endif /* LINUX_KERNEL_TYPE_DEFS_H */ diff --git a/uds/uds-block.h b/uds/uds-block.h deleted file mode 100644 index e1b8e61..0000000 --- a/uds/uds-block.h +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/uds-block.h#1 $ - */ - -/** - * @file - * @brief Definitions for the UDS block interface - **/ -#ifndef UDS_BLOCK_H -#define UDS_BLOCK_H - -#include "uds.h" - -/** General UDS block constants. */ -enum { - /** The maximum metadata size for a block. */ - UDS_MAX_BLOCK_DATA_SIZE = UDS_MAX_METADATA_SIZE -}; - -/** - * Metadata to associate with a blockName. - **/ -struct udsChunkData { - unsigned char data[UDS_MAX_BLOCK_DATA_SIZE]; -}; - -/** - * Represents a block address on disk. - * - * #UdsBlockAddress objects allow the Application Software and UDS - * to refer to specific disk blocks. It might be, for instance, the - * logical block address divided by the block size. - * - * These objects are stored persistently in the index and are also cached. - * Therefore, make every effort to ensure that these objects are as small as - * possible. - **/ -typedef void *UdsBlockAddress; - -/** @{ */ -/** @name Deduplication */ - -typedef struct udsRequest UdsRequest; - -/** - * Callback function invoked to inform the Application Software that an - * operation started by #udsStartChunkOperation has completed. - * - * @param [in] request The operation that finished. When the callback - * function is called, this UdsRequest structure can be - * reused or freed. - **/ -typedef void UdsChunkCallback(UdsRequest *request); - -/** - * Request structure passed to #udsStartChunkOperation to begin an operation, - * and returned to the Application Software when the callback function is - * invoked. - **/ -struct udsRequest { - /* - * The name of the block. - * Set before starting an operation. - * Unchanged at time of callback. - */ - UdsChunkName chunkName; - /* - * The metadata found in the index that was associated with the block - * (sometimes called the canonical address). - * Set before the callback. - */ - struct udsChunkData oldMetadata; - /* - * The new metadata to associate with the name of the block (sometimes called - * the duplicate address). - * Set before starting a #UDS_POST or #UDS_QUERY operation. - * Unchanged at time of callback. - */ - struct udsChunkData newMetadata; - /* - * The callback method to be invoked when the operation finishes. - * Set before starting an operation. - * Unchanged at time of callback. - */ - UdsChunkCallback *callback; - /* - * The index session. - * Set before starting an operation. - * Unchanged at time of callback. - */ - struct uds_index_session *session; - /* - * The operation type, which is one of #UDS_DELETE, #UDS_POST, #UDS_QUERY or - * #UDS_UPDATE. - * Set before starting an operation. - * Unchanged at time of callback. - */ - UdsCallbackType type; - /* - * The operation status, which is either #UDS_SUCCESS or an error code. - * Set before the callback. 
- */ - int status; - /* - * If true, the name of the block was found in the index. - * Set before the callback. - */ - bool found; - /* - * If true, move the entry to the end of the deduplication window. - * Set before starting a #UDS_QUERY operation. - * Unchanged at time of callback. - */ - bool update; - long private[25]; -}; - -/** - * Start a UDS index chunk operation. The request type field must - * be set to the type of operation. This is an asynchronous interface to the - * block-oriented UDS API. The callback is invoked upon completion. - * - * The #UDS_DELETE operation type deletes the mapping for a particular block. - * #UDS_DELETE is typically used when UDS provides invalid advice. - * - * The #UDS_POST operation type indexes a block name and associates it with a - * particular address. The caller provides the block's name. UDS then checks - * this name against its index. - *
- * <ul>
- *   <li>If the block is new, it is stored in the index.</li>
- *   <li>If the block is a duplicate of an indexed block, UDS returns the
- *       canonical block address via the callback.</li>
- * </ul>
- *
- * The #UDS_QUERY operation type checks to see if a block name exists in the
- * index. The caller provides the block's name. UDS then checks
- * this name against its index.
- * <ul>
- *   <li>If the block is new, no action is taken.</li>
- *
- *   <li>If the block is a duplicate of an indexed block, UDS returns the
- *       canonical block address via the callback. If the update
- *       field is set, the entry is moved to the end of the deduplication
- *       window.</li>
- * </ul>
- * - * The #UDS_UPDATE operation type updates the mapping for a particular block. - * #UDS_UPDATE is typically used if the callback function provides invalid - * advice. - * - * @param [in] request The operation. The type, - * chunkName, newMetadata, - * context, callback, and - * update fields must be set. At callback - * time, the oldMetadata, - * status, and found fields will - * be set. - * - * @return Either #UDS_SUCCESS or an error code - **/ -UDS_ATTR_WARN_UNUSED_RESULT -int udsStartChunkOperation(UdsRequest *request); -/** @} */ - -#endif /* UDS_BLOCK_H */ diff --git a/uds/uds-error.h b/uds/uds-error.h deleted file mode 100644 index 7658982..0000000 --- a/uds/uds-error.h +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/uds-error.h#3 $ - */ - -/** - * @file - * @brief UDS error code definitions - **/ -#ifndef UDS_ERROR_H -#define UDS_ERROR_H - - -/** - * Valid return status codes for API routines. - **/ -enum udsStatusCodes { - /** Successful return */ - UDS_SUCCESS = 0, - - /** Used as a base value for reporting errors */ - UDS_ERROR_CODE_BASE = 1024, - /** The UDS library is not initialized */ - UDS_UNINITIALIZED = UDS_ERROR_CODE_BASE + 0, - /** The UDS library is shutting down */ - UDS_SHUTTINGDOWN = UDS_ERROR_CODE_BASE + 1, - /** Could not load scanner modules */ - UDS_EMODULE_LOAD = UDS_ERROR_CODE_BASE + 2, - /** Could not create a new thread */ - UDS_ENOTHREADS = UDS_ERROR_CODE_BASE + 3, - /** Could not find the specified library context */ - UDS_NOCONTEXT = UDS_ERROR_CODE_BASE + 4, - /** The specified library context is disabled */ - UDS_DISABLED = UDS_ERROR_CODE_BASE + 5, - /** Some saved index component is corrupt */ - UDS_CORRUPT_COMPONENT = UDS_ERROR_CODE_BASE + 6, - UDS_CORRUPT_FILE = UDS_CORRUPT_COMPONENT, - /** Unknown error */ - UDS_UNKNOWN_ERROR = UDS_ERROR_CODE_BASE + 7, - /** Unused */ - UDS_UNUSED_CODE_8 = UDS_ERROR_CODE_BASE + 8, - /** Unused */ - UDS_UNUSED_CODE_9 = UDS_ERROR_CODE_BASE + 9, - /** The index configuration or volume format is no longer supported */ - UDS_UNSUPPORTED_VERSION = UDS_ERROR_CODE_BASE + 10, - /** Index session not available */ - UDS_NO_INDEXSESSION = UDS_ERROR_CODE_BASE + 11, - /** Index data in memory is corrupt */ - UDS_CORRUPT_DATA = UDS_ERROR_CODE_BASE + 12, - /** Short read due to truncated file */ - UDS_SHORT_READ = UDS_ERROR_CODE_BASE + 13, - /** Unused */ - UDS_UNUSED_CODE_14 = UDS_ERROR_CODE_BASE + 14, - /** Internal resource limits exceeded */ - UDS_RESOURCE_LIMIT_EXCEEDED = UDS_ERROR_CODE_BASE + 15, - /** Memory overflow due to storage failure */ - UDS_VOLUME_OVERFLOW = UDS_ERROR_CODE_BASE + 16, - /** Unused */ - UDS_UNUSED_CODE_17 = UDS_ERROR_CODE_BASE + 17, - /** Unused */ - UDS_UNUSED_CODE_18 = UDS_ERROR_CODE_BASE + 18, - /** 
Unused */ - UDS_UNUSED_CODE_19 = UDS_ERROR_CODE_BASE + 19, - /** Configuration pointer required */ - UDS_CONF_PTR_REQUIRED = UDS_ERROR_CODE_BASE + 20, - /** Index stats pointer required */ - UDS_INDEX_STATS_PTR_REQUIRED = UDS_ERROR_CODE_BASE + 21, - /** Context stats pointer required */ - UDS_CONTEXT_STATS_PTR_REQUIRED = UDS_ERROR_CODE_BASE + 22, - /** Unused */ - UDS_UNUSED_CODE_23 = UDS_ERROR_CODE_BASE + 23, - /** Unused */ - UDS_UNUSED_CODE_24 = UDS_ERROR_CODE_BASE + 24, - /** Unused */ - UDS_UNUSED_CODE_25 = UDS_ERROR_CODE_BASE + 25, - /** Unused */ - UDS_UNUSED_CODE_26 = UDS_ERROR_CODE_BASE + 26, - /** Unused */ - UDS_UNUSED_CODE_27 = UDS_ERROR_CODE_BASE + 27, - /** Memory configuration not supported */ - UDS_INVALID_MEMORY_SIZE = UDS_ERROR_CODE_BASE + 28, - /** Unused */ - UDS_UNUSED_CODE_29 = UDS_ERROR_CODE_BASE + 29, - /** Index name required */ - UDS_INDEX_NAME_REQUIRED = UDS_ERROR_CODE_BASE + 30, - /** Configuration required */ - UDS_CONF_REQUIRED = UDS_ERROR_CODE_BASE + 31, - /** Unused */ - UDS_UNUSED_CODE_32 = UDS_ERROR_CODE_BASE + 32, - /** Unused */ - UDS_UNUSED_CODE_33 = UDS_ERROR_CODE_BASE + 33, - /** Unused */ - UDS_UNUSED_CODE_34 = UDS_ERROR_CODE_BASE + 34, - /** Unused */ - UDS_UNUSED_CODE_35 = UDS_ERROR_CODE_BASE + 35, - /** Unused */ - UDS_UNUSED_CODE_36 = UDS_ERROR_CODE_BASE + 36, - /** Essential files for index not found */ - UDS_NO_INDEX = UDS_ERROR_CODE_BASE + 37, - /** Checkpoint frequency out of range */ - UDS_BAD_CHECKPOINT_FREQUENCY = UDS_ERROR_CODE_BASE + 38, - /** Wrong type of index configuration */ - UDS_WRONG_INDEX_CONFIG = UDS_ERROR_CODE_BASE + 39, - /** Unused */ - UDS_UNUSED_CODE_40 = UDS_ERROR_CODE_BASE + 40, - /** Unused */ - UDS_UNUSED_CODE_41 = UDS_ERROR_CODE_BASE + 41, - /** Unused */ - UDS_UNUSED_CODE_42 = UDS_ERROR_CODE_BASE + 42, - /** Unused */ - UDS_UNUSED_CODE_43 = UDS_ERROR_CODE_BASE + 43, - /** Premature end of file in scanned file */ - UDS_END_OF_FILE = UDS_ERROR_CODE_BASE + 44, - /** Attempt to access unsaved index */ - UDS_INDEX_NOT_SAVED_CLEANLY = UDS_ERROR_CODE_BASE + 45, - /** Unused */ - UDS_UNUSED_CODE_46 = UDS_ERROR_CODE_BASE + 46, - /** There is not sufficient space to create the index */ - UDS_INSUFFICIENT_INDEX_SPACE = UDS_ERROR_CODE_BASE + 47, - /** Unused */ - UDS_UNUSED_CODE_48 = UDS_ERROR_CODE_BASE + 48, - /** Unused */ - UDS_UNUSED_CODE_49 = UDS_ERROR_CODE_BASE + 49, - /** Index is suspended */ - UDS_SUSPENDED = UDS_ERROR_CODE_BASE + 50, - /** Unused */ - UDS_UNUSED_CODE_51 = UDS_ERROR_CODE_BASE + 51, - /** Index session is already initialized */ - UDS_INDEXSESSION_IN_USE = UDS_ERROR_CODE_BASE + 52, - /** Callback required */ - UDS_CALLBACK_REQUIRED = UDS_ERROR_CODE_BASE + 53, - /** Wrong operation type */ - UDS_INVALID_OPERATION_TYPE = UDS_ERROR_CODE_BASE + 54, - /** One more than the last UDS_ERROR_CODE */ - UDS_ERROR_CODE_LAST, - /** One more than this block can use */ - UDS_ERROR_CODE_BLOCK_END = UDS_ERROR_CODE_BASE + 1024 -}; - -#endif /* UDS_ERROR_H */ diff --git a/uds/uds-platform.h b/uds/uds-platform.h deleted file mode 100644 index 0df39ef..0000000 --- a/uds/uds-platform.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/uds-platform.h#1 $ - */ - -/** - * @file - * @brief Platform definitions for albireo - **/ -#ifndef UDS_PLATFORM_H -#define UDS_PLATFORM_H - - -#ifdef __KERNEL__ -#include -#else -#include -#include -#include -#include -#include -#endif - -#endif /* UDS_PLATFORM_H */ diff --git a/uds/uds.h b/uds/uds.h deleted file mode 100644 index 42e2863..0000000 --- a/uds/uds.h +++ /dev/null @@ -1,528 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/uds.h#2 $ - */ - -/** - * @mainpage UDS API Reference - *
Copyright (c) 2020 Red Hat, Inc.
- **/ - -/** - * @file - * @brief General UDS definitions - **/ -#ifndef UDS_H -#define UDS_H - -#include "uds-platform.h" - -#ifdef UDS_DISABLE_ATTR_WARN_UNUSED_RESULT -#define UDS_ATTR_WARN_UNUSED_RESULT -#else -#define UDS_ATTR_WARN_UNUSED_RESULT __attribute__((warn_unused_result)) -#endif - -/** - * Valid request types as described in callbacks. - **/ -typedef enum { - /** - * Callback type for operations that post mappings to the UDS - * index. When the chunk-hash being added already exists, the - * existing metadata is not overwritten. Regardless, the - * recency of the chunk is updated. - **/ - UDS_POST, - - /** - * Callback type for operations that update mappings in the UDS - * index. If the indicated entry does not have any mapping in the - * index, one is created. In either case, the recency of - * the chunk is updated. - **/ - UDS_UPDATE, - - /** - * Callback type for operations that delete mappings from the - * UDS index. */ - UDS_DELETE, - - /** - * Callback type for operations that query mappings in the UDS - * index. When a mapping is found, the recency of the mapping - * is updated unless it's the no-update call. - **/ - UDS_QUERY -} UdsCallbackType; - -/** - * Valid types for opening an index. - **/ -typedef enum { - /** - * Load an existing index. If the index was not saved cleanly, try to - * recover and rebuild the index. - **/ - UDS_LOAD = 0, - - /** - * Create a new index. - **/ - UDS_CREATE = 1, - - /** - * Load an existing index, but only if it was cleanly saved. - **/ - UDS_NO_REBUILD = 2, -} UdsOpenIndexType; - -/** General UDS constants. */ -enum { - /** The chunk name size in bytes (128 bits = 16 bytes). */ - UDS_CHUNK_NAME_SIZE = 16, - /** The maximum metadata size in bytes. */ - UDS_MAX_METADATA_SIZE = 16, -}; - -/** - * Type representing memory configuration which is either a positive - * integer number of gigabytes or one of the three special constants - * for configurations which are smaller than 1 gigabyte. - **/ -typedef unsigned int UdsMemoryConfigSize; - -extern const UdsMemoryConfigSize UDS_MEMORY_CONFIG_256MB; -extern const UdsMemoryConfigSize UDS_MEMORY_CONFIG_512MB; -extern const UdsMemoryConfigSize UDS_MEMORY_CONFIG_768MB; - -/** - * The maximum configurable amount of memory. - **/ -extern const UdsMemoryConfigSize UDS_MEMORY_CONFIG_MAX; - -/** The name (hash) of a chunk. */ -typedef struct udsChunkName { - /** The name (hash) of a chunk. */ - unsigned char name[UDS_CHUNK_NAME_SIZE]; -} UdsChunkName; - -/** - * An active index session. - **/ -struct uds_index_session; - -/** - * The data used to configure a new index. - **/ -typedef struct udsConfiguration *UdsConfiguration; -typedef uint64_t UdsNonce; - -/** - * The data used to configure a new index session. - **/ -struct uds_parameters { - // Tne number of threads used to process index requests. - int zone_count; - // The number of threads used to read volume pages. - int read_threads; - // The number of chapters to write between checkpoints. - int checkpoint_frequency; -}; -#define UDS_PARAMETERS_INITIALIZER { \ - .zone_count = 0, \ - .read_threads = 2, \ - .checkpoint_frequency = 0, \ - } - -/** - * Index statistics - * - * These statistics capture the current index characteristics, - * including resource usage. 
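A minimal sketch of how the session parameters above are typically filled in (illustrative values):

    struct uds_parameters params = UDS_PARAMETERS_INITIALIZER;
    params.zone_count = 4;     /* four request-processing threads */
    /* params.read_threads stays at its default of 2 */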
- **/ -typedef struct udsIndexStats { - /** The total number of chunk names stored in the index */ - uint64_t entriesIndexed; - /** An estimate of the index's memory usage */ - uint64_t memoryUsed; - /** The number of collisions recorded in the master index */ - uint64_t collisions; - /** The number of entries discarded from the index since index startup */ - uint64_t entriesDiscarded; - /** The number of checkpoints done this session */ - uint64_t checkpoints; -} UdsIndexStats; - -/** - * Context statistics - * - * These statistics capture a library context's characteristics either since - * it was initialized or since its statistics were last reset, whichever - * is more recent. - **/ -typedef struct udsContextStats { - /** The time at which context statistics were last fetched */ - time_t currentTime; - /** - * The number of post calls since context statistics were last reset that - * found an existing entry - **/ - uint64_t postsFound; - /** - * The number of post calls since context statistics were last reset that - * added an entry - **/ - uint64_t postsNotFound; - /** - * The number of post calls since context statistics were last reset that - * found an existing entry is current enough to only exist in memory and not - * have been commited to disk yet. - **/ - uint64_t inMemoryPostsFound; - /** - * The number of post calls since context statistics were last reset that - * found an existing entry in the dense portion of the index. - **/ - uint64_t densePostsFound; - /** - * The number of post calls since context statistics were last reset that - * found an existing entry in the sparse portion of the index (if one - * exists). - **/ - uint64_t sparsePostsFound; - /** - * The number of update calls since context statistics were last reset that - * updated an existing entry - **/ - uint64_t updatesFound; - /** - * The number of update calls since context statistics were last reset that - * added a new entry - **/ - uint64_t updatesNotFound; - /** - * The number of delete requests since context statistics were last reset - * that deleted an existing entry - **/ - uint64_t deletionsFound; - /** - * The number of delete requests since context statistics were last reset - * that did nothing. - **/ - uint64_t deletionsNotFound; - /** - * The number of query calls since context statistics were last reset that - * found existing entry - **/ - uint64_t queriesFound; - /** - * The number of query calls since context statistics were last reset that - * did not find an entry - **/ - uint64_t queriesNotFound; - /** - * The total number of library requests (the sum of posts, updates, - * deletions, and queries) since context - * statistics were last reset - **/ - uint64_t requests; -} UdsContextStats; - -/** - * Initializes an index configuration. - * - * @param [out] conf The new configuration - * @param [in] memGB The maximum memory allocation, in GB - * - * @return Either #UDS_SUCCESS or an error code - **/ -UDS_ATTR_WARN_UNUSED_RESULT -int udsInitializeConfiguration(UdsConfiguration *conf, - UdsMemoryConfigSize memGB); - -/** - * Sets or clears an index configuration's sparse indexing settings. - * - * @param [in,out] conf The configuration to change - * @param [in] sparse If true, request a sparse - * index; if false, request - * a default index. - * - **/ -void udsConfigurationSetSparse(UdsConfiguration conf, bool sparse); - -/** - * Tests whether an index configuration specifies sparse indexing. 
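A minimal configuration setup using the calls above (illustrative only; error handling abbreviated):

    UdsConfiguration conf;
    int result = udsInitializeConfiguration(&conf, 1);  /* a 1 GB memory configuration */
    if (result == UDS_SUCCESS) {
      udsConfigurationSetSparse(conf, true);             /* request a sparse index */
    }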
- * - * @param [in] conf The configuration to check - * - * @return Returns true if the configuration - * is sparse, or false if not - **/ -UDS_ATTR_WARN_UNUSED_RESULT -bool udsConfigurationGetSparse(UdsConfiguration conf); - -/** - * Sets an index configuration's nonce. - * - * @param [in,out] conf The configuration to change - * @param [in] nonce The 64 bit nonce. - * - **/ -void udsConfigurationSetNonce(UdsConfiguration conf, UdsNonce nonce); - -/** - * Gets an index configuration's nonce. - * - * @param [in] conf The configuration to check - * - * @return The 64 bit nonce. - **/ -UDS_ATTR_WARN_UNUSED_RESULT -UdsNonce udsConfigurationGetNonce(UdsConfiguration conf); - -/** - * Fetches a configuration's maximum memory allocation. - * - * @param [in] conf The configuration to check - * - * @return The amount of memory allocated, in GB - **/ -UDS_ATTR_WARN_UNUSED_RESULT -UdsMemoryConfigSize udsConfigurationGetMemory(UdsConfiguration conf); - -/** - * Fetches a configuration's chapters per volume value. - * - * @param [in] conf The configuration to check - * - * @return The number of chapters per volume - **/ -UDS_ATTR_WARN_UNUSED_RESULT -unsigned int udsConfigurationGetChaptersPerVolume(UdsConfiguration conf); - -/** - * Frees memory used by a configuration. - * - * @param [in,out] conf The configuration for which memory is being freed - **/ -void udsFreeConfiguration(UdsConfiguration conf); - -/** - * Compute the size required to store the index on persistent storage. This - * size is valid for any index stored in a single file or on a single block - * device. This size should be used when configuring a block device on which - * to store an index. - * - * @param [in] config A UdsConfiguration for an index. - * @param [in] numCheckpoints The maximum number of checkpoints. - * @param [out] indexSize The number of bytes required to store - * the index. - * - * @return UDS_SUCCESS or an error code. - **/ -UDS_ATTR_WARN_UNUSED_RESULT -int udsComputeIndexSize(const UdsConfiguration config, - unsigned int numCheckpoints, - uint64_t *indexSize); - -/** - * Opens an index session. - * - * Creates a session for an index. #udsOpenIndex must be called before - * the index can be used. - * - * Destroy the session with #udsDestroyIndexSession. - * - * @param [out] session A pointer to the new session - * - * @return Either #UDS_SUCCESS or an error code - **/ -UDS_ATTR_WARN_UNUSED_RESULT -int udsCreateIndexSession(struct uds_index_session **session); - -/** - * Fetches the UDS library version. - * - * @return The library version - **/ -UDS_ATTR_WARN_UNUSED_RESULT -const char *udsGetVersion(void); - -#ifdef __KERNEL__ -/** - * The name argument to #udsOpenIndex is a text string that names the index. - * The name should have the form "path", where path is the name of the block - * device. The path should not contain white space. The names can optionally - * contain size and/or offset options which give the number of bytes in the - * index and the byte offset to the start of the index. For example, the name - * "/dev/sda8 offset=409600 size=2048000000" is an index that is stored in - * 2040000000 bytes of /dev/sda8 starting at byte 409600. - **/ -#else -/** - * The name argument to #udsOpenIndex is a text string that names the index. - * The name should have the form "path", where path is the name of the file or - * block device. The path should not contain white space. 
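For instance (a sketch only; the checkpoint count of 0 is just an illustrative value), the storage a configuration will need can be computed before anything is opened:

    uint64_t indexSize;
    int result = udsComputeIndexSize(conf, 0, &indexSize);
    /* on UDS_SUCCESS, indexSize holds the number of bytes the index will occupy */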
The name can - * optionally contain size and/or offset options which give the number of bytes - * in the index and the byte offset to the start of the index. For example, - * the name "/dev/sda8 offset=409600 size=2048000000" is an index that is - * stored in 2040000000 bytes of /dev/sda8 starting at byte 409600. - **/ -#endif - -/** - * Opens an index with an existing session. This operation will fail if the - * index session is suspended, or if there is already an open index. - * - * The index should be closed with #udsCloseIndex. - * - * @param openType The type of open, which is one of #UDS_LOAD, #UDS_CREATE, - * or #UDS_NO_REBUILD. - * @param name The name of the index - * @param params The index session parameters. If NULL, the default - * session parameters will be used. - * @param conf The index configuration - * @param session The index session - * - * @return Either #UDS_SUCCESS or an error code - **/ -UDS_ATTR_WARN_UNUSED_RESULT -int udsOpenIndex(UdsOpenIndexType openType, - const char *name, - const struct uds_parameters *params, - UdsConfiguration conf, - struct uds_index_session *session); - -/** - * Waits until all callbacks for index operations are complete, and prevents - * new index operations from starting. Index operations will return - * UDS_SUSPENDED until #udsResumeIndexSession is called. Optionally saves all - * index data before returning. - * - * @param session The session to suspend - * @param save Whether to save index data - * - * @return Either #UDS_SUCCESS or an error code - **/ -UDS_ATTR_WARN_UNUSED_RESULT -int udsSuspendIndexSession(struct uds_index_session *session, bool save); - -/** - * Allows new index operations for an index, whether it was suspended or not. - * - * @param session The session to resume - * - * @return Either #UDS_SUCCESS or an error code - **/ -UDS_ATTR_WARN_UNUSED_RESULT -int udsResumeIndexSession(struct uds_index_session *session); - -/** - * Waits until all callbacks for index operations are complete. - * - * @param [in] session The session to flush - * - * @return Either #UDS_SUCCESS or an error code - **/ -UDS_ATTR_WARN_UNUSED_RESULT -int udsFlushIndexSession(struct uds_index_session *session); - -/** - * Closes an index. This operation will fail if the index session is - * suspended. - * - * Saves changes to the index so that #udsOpenIndex can re-open it. - * - * @param [in] session The session containing the index to close - * - * @return Either #UDS_SUCCESS or an error code - **/ -UDS_ATTR_WARN_UNUSED_RESULT -int udsCloseIndex(struct uds_index_session *session); - -/** - * Destroys an index session. - * - * Saves changes to the index and closes the index if one is open. - * Use #udsDestroyIndexSession for index sessions created by - * #udsCreateIndexSession. - * - * @param [in] session The session to destroy - * - * @return Either #UDS_SUCCESS or an error code - **/ -int udsDestroyIndexSession(struct uds_index_session *session); - -/** - * Returns the configuration for the given index session. - * - * @param [in] session The session - * @param [out] conf The index configuration - * - * @return Either #UDS_SUCCESS or an error code - **/ -UDS_ATTR_WARN_UNUSED_RESULT -int udsGetIndexConfiguration(struct uds_index_session *session, - UdsConfiguration *conf); - -/** - * Fetches index statistics for the given index session. 
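Putting the session calls above together, a minimal create/open/close sequence looks roughly like this (illustrative only; error handling abbreviated, conf as created earlier):

    struct uds_index_session *session;
    int result = udsCreateIndexSession(&session);
    if (result == UDS_SUCCESS) {
      result = udsOpenIndex(UDS_CREATE, "/dev/sda8 offset=409600 size=2048000000",
                            NULL, conf, session);
      if (result == UDS_SUCCESS) {
        /* ... issue udsStartChunkOperation() requests here ... */
        result = udsCloseIndex(session);
      }
      udsDestroyIndexSession(session);
    }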
- * - * @param [in] session The session - * @param [out] stats The index statistics structure to fill - * - * @return Either #UDS_SUCCESS or an error code - **/ -UDS_ATTR_WARN_UNUSED_RESULT -int udsGetIndexStats(struct uds_index_session *session, UdsIndexStats *stats); - -/** - * Fetches index session statistics for the given index session. - * - * @param [in] session The session - * @param [out] stats The context statistics structure to fill - * - * @return Either #UDS_SUCCESS or an error code - **/ -UDS_ATTR_WARN_UNUSED_RESULT -int udsGetIndexSessionStats(struct uds_index_session *session, - UdsContextStats *stats); - -/** - * Convert an error code to a string. - * - * @param errnum The error code - * @param buf The buffer to hold the error string - * @param buflen The length of the buffer - * - * @return A pointer to buf - **/ -UDS_ATTR_WARN_UNUSED_RESULT -const char *udsStringError(int errnum, char *buf, size_t buflen); - -/** - * Suggested buffer size for udsStringError. - **/ -enum { - UDS_STRING_ERROR_BUFSIZE = 128 -}; - -#endif /* UDS_H */ diff --git a/uds/udsMain.c b/uds/udsMain.c deleted file mode 100644 index 8d4f411..0000000 --- a/uds/udsMain.c +++ /dev/null @@ -1,306 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/udsMain.c#12 $ - */ - -#include "uds.h" - -#include "config.h" -#include "geometry.h" -#include "indexLayout.h" -#include "indexRouter.h" -#include "indexSession.h" -#include "loadType.h" -#include "logger.h" -#include "memoryAlloc.h" - -const UdsMemoryConfigSize UDS_MEMORY_CONFIG_MAX = 1024; -const UdsMemoryConfigSize UDS_MEMORY_CONFIG_256MB = (UdsMemoryConfigSize) -256; -const UdsMemoryConfigSize UDS_MEMORY_CONFIG_512MB = (UdsMemoryConfigSize) -512; -const UdsMemoryConfigSize UDS_MEMORY_CONFIG_768MB = (UdsMemoryConfigSize) -768; - -/* - * =========================================================================== - * UDS system management - * =========================================================================== - */ - -/**********************************************************************/ -int udsInitializeConfiguration(UdsConfiguration *userConfig, - UdsMemoryConfigSize memGB) -{ - if (userConfig == NULL) { - return logErrorWithStringError(UDS_CONF_PTR_REQUIRED, - "received a NULL config pointer"); - } - - /* Set the configuration parameters that change with memory size. If you - * change these values, you should also: - * - * Change Configuration_x1, which tests these values and expects to see them - * - * Bump the index configuration version number. This bump ensures that - * the test infrastructure will be forced to test the new configuration. 
- */ - - unsigned int chaptersPerVolume, recordPagesPerChapter; - if (memGB == UDS_MEMORY_CONFIG_256MB) { - chaptersPerVolume = DEFAULT_CHAPTERS_PER_VOLUME; - recordPagesPerChapter = SMALL_RECORD_PAGES_PER_CHAPTER; - } else if (memGB == UDS_MEMORY_CONFIG_512MB) { - chaptersPerVolume = DEFAULT_CHAPTERS_PER_VOLUME; - recordPagesPerChapter = 2 * SMALL_RECORD_PAGES_PER_CHAPTER; - } else if (memGB == UDS_MEMORY_CONFIG_768MB) { - chaptersPerVolume = DEFAULT_CHAPTERS_PER_VOLUME; - recordPagesPerChapter = 3 * SMALL_RECORD_PAGES_PER_CHAPTER; - } else if (memGB == 1) { - chaptersPerVolume = DEFAULT_CHAPTERS_PER_VOLUME; - recordPagesPerChapter = DEFAULT_RECORD_PAGES_PER_CHAPTER; - } else if ((memGB > 1) && (memGB <= UDS_MEMORY_CONFIG_MAX)) { - chaptersPerVolume = memGB * DEFAULT_CHAPTERS_PER_VOLUME; - recordPagesPerChapter = DEFAULT_RECORD_PAGES_PER_CHAPTER; - } else { - return UDS_INVALID_MEMORY_SIZE; - } - - int result = ALLOCATE(1, struct udsConfiguration, "udsConfiguration", - userConfig); - if (result != UDS_SUCCESS) { - return result; - } - - (*userConfig)->recordPagesPerChapter = recordPagesPerChapter; - (*userConfig)->chaptersPerVolume = chaptersPerVolume; - (*userConfig)->sparseChaptersPerVolume = DEFAULT_SPARSE_CHAPTERS_PER_VOLUME; - (*userConfig)->cacheChapters = DEFAULT_CACHE_CHAPTERS; - (*userConfig)->checkpointFrequency = DEFAULT_CHECKPOINT_FREQUENCY; - (*userConfig)->masterIndexMeanDelta = DEFAULT_MASTER_INDEX_MEAN_DELTA; - (*userConfig)->bytesPerPage = DEFAULT_BYTES_PER_PAGE; - (*userConfig)->sparseSampleRate = DEFAULT_SPARSE_SAMPLE_RATE; - (*userConfig)->nonce = 0; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void udsConfigurationSetSparse(UdsConfiguration userConfig, bool sparse) -{ - bool prevSparse = (userConfig->sparseChaptersPerVolume != 0); - if (sparse == prevSparse) { - // nothing to do - return; - } - - unsigned int prevChaptersPerVolume = userConfig->chaptersPerVolume; - if (sparse) { - // Index 10TB with 4K blocks, 95% sparse, fit in dense (1TB) footprint - userConfig->chaptersPerVolume = 10 * prevChaptersPerVolume; - userConfig->sparseChaptersPerVolume = 9 * prevChaptersPerVolume - + prevChaptersPerVolume / 2; - userConfig->sparseSampleRate = 32; - } else { - userConfig->chaptersPerVolume = prevChaptersPerVolume / 10; - userConfig->sparseChaptersPerVolume = 0; - userConfig->sparseSampleRate = 0; - } -} - -/**********************************************************************/ -bool udsConfigurationGetSparse(UdsConfiguration userConfig) -{ - return userConfig->sparseChaptersPerVolume > 0; -} - -/**********************************************************************/ -void udsConfigurationSetNonce(UdsConfiguration userConfig, UdsNonce nonce) -{ - userConfig->nonce = nonce; -} - -/**********************************************************************/ -UdsNonce udsConfigurationGetNonce(UdsConfiguration userConfig) -{ - return userConfig->nonce; -} - -/**********************************************************************/ -unsigned int udsConfigurationGetMemory(UdsConfiguration userConfig) -{ - enum { - CHAPTERS = DEFAULT_CHAPTERS_PER_VOLUME, - SMALL_PAGES = CHAPTERS * SMALL_RECORD_PAGES_PER_CHAPTER, - LARGE_PAGES = CHAPTERS * DEFAULT_RECORD_PAGES_PER_CHAPTER - }; - unsigned int pages = (userConfig->chaptersPerVolume - * userConfig->recordPagesPerChapter); - if (userConfig->sparseChaptersPerVolume != 0) { - pages /= 10; - } - switch (pages) { - case SMALL_PAGES: return UDS_MEMORY_CONFIG_256MB; - case 2 * SMALL_PAGES: 
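/*
 * A small sketch (not from the original source) of how the sparse toggle
 * interacts with the memory-size accessor: enabling sparse indexing
 * multiplies the chapter count by ten, while udsConfigurationGetMemory()
 * still reports the original memory footprint because it divides the page
 * count back down for sparse configurations.
 */
static int sparseConfigExample(void)
{
  UdsConfiguration config;
  int result = udsInitializeConfiguration(&config, 2);   // 2 GB dense index
  if (result != UDS_SUCCESS) {
    return result;
  }
  udsConfigurationSetSparse(config, true);                // now ~95% sparse
  // Still reports 2: the sparse chapters do not count against memory.
  unsigned int memGB = udsConfigurationGetMemory(config);
  bool sparse = udsConfigurationGetSparse(config);        // true
  udsFreeConfiguration(config);
  return (sparse && (memGB == 2)) ? UDS_SUCCESS : UDS_BAD_STATE;
}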
return UDS_MEMORY_CONFIG_512MB; - case 3 * SMALL_PAGES: return UDS_MEMORY_CONFIG_768MB; - default: return pages / LARGE_PAGES; - } -} - -/**********************************************************************/ -unsigned int -udsConfigurationGetChaptersPerVolume(UdsConfiguration userConfig) -{ - return userConfig->chaptersPerVolume; -} - -/**********************************************************************/ -void udsFreeConfiguration(UdsConfiguration userConfig) -{ - FREE(userConfig); -} - -/**********************************************************************/ -int udsCreateIndexSession(struct uds_index_session **session) -{ - if (session == NULL) { - return UDS_NO_INDEXSESSION; - } - - struct uds_index_session *indexSession = NULL; - int result = makeEmptyIndexSession(&indexSession); - if (result != UDS_SUCCESS) { - return result; - } - - *session = indexSession; - return UDS_SUCCESS; -} - -/**********************************************************************/ -static -int initializeIndexSessionWithLayout(struct uds_index_session *indexSession, - IndexLayout *layout, - const struct uds_parameters *userParams, - LoadType loadType) -{ - int result = ((loadType == LOAD_CREATE) - ? writeIndexConfig(layout, &indexSession->userConfig) - : verifyIndexConfig(layout, &indexSession->userConfig)); - if (result != UDS_SUCCESS) { - return result; - } - - Configuration *indexConfig; - result = makeConfiguration(&indexSession->userConfig, &indexConfig); - if (result != UDS_SUCCESS) { - logErrorWithStringError(result, "Failed to allocate config"); - return result; - } - - // Zero the stats for the new index. - memset(&indexSession->stats, 0, sizeof(indexSession->stats)); - - result = makeIndexRouter(layout, indexConfig, userParams, loadType, - &indexSession->loadContext, enterCallbackStage, - &indexSession->router); - freeConfiguration(indexConfig); - if (result != UDS_SUCCESS) { - logErrorWithStringError(result, "Failed to make router"); - return result; - } - - logUdsConfiguration(&indexSession->userConfig); - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int initializeIndexSession(struct uds_index_session *indexSession, - const char *name, - const struct uds_parameters *userParams, - LoadType loadType) -{ - IndexLayout *layout; - int result = makeIndexLayout(name, loadType == LOAD_CREATE, - &indexSession->userConfig, &layout); - if (result != UDS_SUCCESS) { - return result; - } - - result = initializeIndexSessionWithLayout(indexSession, layout, userParams, - loadType); - putIndexLayout(&layout); - return result; -} - -/**********************************************************************/ -int udsOpenIndex(UdsOpenIndexType openType, - const char *name, - const struct uds_parameters *userParams, - UdsConfiguration userConfig, - struct uds_index_session *session) -{ - if (name == NULL) { - return UDS_INDEX_NAME_REQUIRED; - } - if (userConfig == NULL) { - return UDS_CONF_REQUIRED; - } - if (session == NULL) { - return UDS_NO_INDEXSESSION; - } - - int result = startLoadingIndexSession(session); - if (result != UDS_SUCCESS) { - return result; - } - - session->userConfig = *userConfig; - - // Map the external openType to the internal loadType - LoadType loadType = openType == UDS_CREATE ? LOAD_CREATE - : openType == UDS_NO_REBUILD ? 
LOAD_LOAD - : LOAD_REBUILD; - logNotice("%s: %s", getLoadType(loadType), name); - - result = initializeIndexSession(session, name, userParams, loadType); - if (result != UDS_SUCCESS) { - logErrorWithStringError(result, "Failed %s", getLoadType(loadType)); - saveAndFreeIndex(session); - } - - finishLoadingIndexSession(session, result); - return sansUnrecoverable(result); -} - -/**********************************************************************/ -const char *udsGetVersion(void) -{ -#ifdef UDS_VERSION - return UDS_VERSION; -#else - return "internal version"; -#endif -} - -/**********************************************************************/ -const char *udsStringError(int errnum, char *buf, size_t buflen) -{ - if (buf == NULL) { - return NULL; - } - - return stringError(errnum, buf, buflen); -} diff --git a/uds/udsModule.c b/uds/udsModule.c deleted file mode 100644 index 007f1a8..0000000 --- a/uds/udsModule.c +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/kernelLinux/uds/udsModule.c#32 $ - */ - -#include - -#include "buffer.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "murmur/MurmurHash3.h" -#include "sysfs.h" -#include "timeUtils.h" -#include "uds.h" -#include "uds-block.h" -#include "util/funnelQueue.h" - -/**********************************************************************/ -static int __init dedupeInit(void) -{ - memoryInit(); - logInfo("loaded version %s", UDS_VERSION); - initSysfs(); - return 0; -} - -/**********************************************************************/ -static void __exit dedupeExit(void) -{ - putSysfs(); - memoryExit(); - logInfo("unloaded version %s", UDS_VERSION); -} - -/**********************************************************************/ -module_init(dedupeInit); -module_exit(dedupeExit); - -EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_256MB); -EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_512MB); -EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_768MB); -EXPORT_SYMBOL_GPL(UDS_MEMORY_CONFIG_MAX); -EXPORT_SYMBOL_GPL(udsInitializeConfiguration); -EXPORT_SYMBOL_GPL(udsComputeIndexSize); -EXPORT_SYMBOL_GPL(udsConfigurationSetNonce); -EXPORT_SYMBOL_GPL(udsConfigurationGetNonce); -EXPORT_SYMBOL_GPL(udsConfigurationSetSparse); -EXPORT_SYMBOL_GPL(udsConfigurationGetSparse); -EXPORT_SYMBOL_GPL(udsConfigurationGetMemory); -EXPORT_SYMBOL_GPL(udsConfigurationGetChaptersPerVolume); -EXPORT_SYMBOL_GPL(udsFreeConfiguration); -EXPORT_SYMBOL_GPL(udsGetVersion); -EXPORT_SYMBOL_GPL(udsCreateIndexSession); -EXPORT_SYMBOL_GPL(udsOpenIndex); -EXPORT_SYMBOL_GPL(udsSuspendIndexSession); -EXPORT_SYMBOL_GPL(udsResumeIndexSession); -EXPORT_SYMBOL_GPL(udsCloseIndex); -EXPORT_SYMBOL_GPL(udsDestroyIndexSession); -EXPORT_SYMBOL_GPL(udsFlushIndexSession); -EXPORT_SYMBOL_GPL(udsGetIndexConfiguration); 
-EXPORT_SYMBOL_GPL(udsGetIndexStats); -EXPORT_SYMBOL_GPL(udsGetIndexSessionStats); -EXPORT_SYMBOL_GPL(udsStringError); -EXPORT_SYMBOL_GPL(udsStartChunkOperation); - -EXPORT_SYMBOL_GPL(allocSprintf); -EXPORT_SYMBOL_GPL(allocateMemory); -EXPORT_SYMBOL_GPL(allocateMemoryNowait); -EXPORT_SYMBOL_GPL(assertionFailed); -EXPORT_SYMBOL_GPL(assertionFailedLogOnly); -EXPORT_SYMBOL_GPL(availableSpace); -EXPORT_SYMBOL_GPL(bufferLength); -EXPORT_SYMBOL_GPL(bufferUsed); -EXPORT_SYMBOL_GPL(clearBuffer); -EXPORT_SYMBOL_GPL(compactBuffer); -EXPORT_SYMBOL_GPL(contentLength); -EXPORT_SYMBOL_GPL(copyBytes); -EXPORT_SYMBOL_GPL(currentTime); -EXPORT_SYMBOL_GPL(duplicateString); -EXPORT_SYMBOL_GPL(ensureAvailableSpace); -EXPORT_SYMBOL_GPL(equalBuffers); -EXPORT_SYMBOL_GPL(fixedSprintf); -EXPORT_SYMBOL_GPL(freeBuffer); -EXPORT_SYMBOL_GPL(freeFunnelQueue); -EXPORT_SYMBOL_GPL(freeMemory); -EXPORT_SYMBOL_GPL(funnelQueuePoll); -EXPORT_SYMBOL_GPL(getBoolean); -EXPORT_SYMBOL_GPL(getBufferContents); -EXPORT_SYMBOL_GPL(getByte); -EXPORT_SYMBOL_GPL(getBytesFromBuffer); -EXPORT_SYMBOL_GPL(getMemoryStats); -EXPORT_SYMBOL_GPL(getUInt16BEFromBuffer); -EXPORT_SYMBOL_GPL(getUInt16LEFromBuffer); -EXPORT_SYMBOL_GPL(getUInt16LEsFromBuffer); -EXPORT_SYMBOL_GPL(getUInt32BEFromBuffer); -EXPORT_SYMBOL_GPL(getUInt32BEsFromBuffer); -EXPORT_SYMBOL_GPL(getUInt32LEFromBuffer); -EXPORT_SYMBOL_GPL(getUInt64BEsFromBuffer); -EXPORT_SYMBOL_GPL(getUInt64LEFromBuffer); -EXPORT_SYMBOL_GPL(getUInt64LEsFromBuffer); -EXPORT_SYMBOL_GPL(growBuffer); -EXPORT_SYMBOL_GPL(hasSameBytes); -EXPORT_SYMBOL_GPL(isFunnelQueueEmpty); -EXPORT_SYMBOL_GPL(makeBuffer); -EXPORT_SYMBOL_GPL(makeFunnelQueue); -EXPORT_SYMBOL_GPL(MurmurHash3_x64_128); -EXPORT_SYMBOL_GPL(nowUsec); -EXPORT_SYMBOL_GPL(peekByte); -EXPORT_SYMBOL_GPL(putBoolean); -EXPORT_SYMBOL_GPL(putBuffer); -EXPORT_SYMBOL_GPL(putByte); -EXPORT_SYMBOL_GPL(putBytes); -EXPORT_SYMBOL_GPL(putInt64LEIntoBuffer); -EXPORT_SYMBOL_GPL(putUInt16BEIntoBuffer); -EXPORT_SYMBOL_GPL(putUInt16LEIntoBuffer); -EXPORT_SYMBOL_GPL(putUInt16LEsIntoBuffer); -EXPORT_SYMBOL_GPL(putUInt32BEIntoBuffer); -EXPORT_SYMBOL_GPL(putUInt32BEsIntoBuffer); -EXPORT_SYMBOL_GPL(putUInt32LEIntoBuffer); -EXPORT_SYMBOL_GPL(putUInt64BEsIntoBuffer); -EXPORT_SYMBOL_GPL(putUInt64LEIntoBuffer); -EXPORT_SYMBOL_GPL(putUInt64LEsIntoBuffer); -EXPORT_SYMBOL_GPL(reallocateMemory); -EXPORT_SYMBOL_GPL(registerAllocatingThread); -EXPORT_SYMBOL_GPL(reportMemoryUsage); -EXPORT_SYMBOL_GPL(resetBufferEnd); -EXPORT_SYMBOL_GPL(rewindBuffer); -EXPORT_SYMBOL_GPL(skipForward); -EXPORT_SYMBOL_GPL(uncompactedAmount); -EXPORT_SYMBOL_GPL(unregisterAllocatingThread); -EXPORT_SYMBOL_GPL(wrapBuffer); -EXPORT_SYMBOL_GPL(zeroBytes); - -/**********************************************************************/ - - -/**********************************************************************/ - -MODULE_DESCRIPTION("deduplication engine"); -MODULE_AUTHOR("Red Hat, Inc."); -MODULE_LICENSE("GPL"); -MODULE_VERSION(UDS_VERSION); diff --git a/uds/util/eventCount.c b/uds/util/eventCount.c deleted file mode 100644 index 7efeac6..0000000 --- a/uds/util/eventCount.c +++ /dev/null @@ -1,317 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/util/eventCount.c#2 $ - */ - -/** - * This EventCount implementation uses a posix semaphore for portability, - * although a futex would be slightly superior to use and easy to substitute. - * It is designed to make signalling as cheap as possible, since that is the - * code path likely triggered on most updates to a lock-free data structure. - * Waiters are likely going to sleep, so optimizing for that case isn't - * necessary. - * - * The critical field is the state, which is really two fields that can be - * atomically updated in unison: an event counter and a waiter count. Every - * call to eventCountPrepare() issues a wait token by atomically incrementing - * the waiter count. The key invariant is a strict accounting of the number of - * tokens issued. Every token returned by eventCountPrepare() is a contract - * that the caller will call acquireSemaphore() and a signaller will call - * releaseSemaphore(), each exactly once. Atomic updates to the state field - * ensure that each token is counted once and that tokens are not lost. - * Cancelling a token attempts to take a fast-path by simply decrementing the - * waiters field, but if the token has already been claimed by a signaller, - * the canceller must still wait on the semaphore to consume the transferred - * token. - * - * The state field is 64 bits, partitioned into a 16-bit waiter field and a - * 48-bit counter. We are unlikely to have 2^16 threads, much less 2^16 - * threads waiting on any single event transition. 2^48 microseconds is - * several years, so a token holder would have to wait that long for the - * counter to wrap around, and then call eventCountWait() at the exact right - * time to see the re-used counter, in order to lose a wakeup due to counter - * wrap-around. Using a 32-bit state field would greatly increase that chance, - * but if forced to do so, the implementation could likely tolerate it since - * callers are supposed to hold tokens for miniscule periods of time. - * Fortunately, x64 has 64-bit compare-and-swap, and the performance of - * interlocked 64-bit operations appears to be about the same as for 32-bit - * ones, so being paranoid and using 64 bits costs us nothing. - * - * Here are some sequences of calls and state transitions: - * - * action postcondition - * counter waiters semaphore - * initialized 0 0 0 - * prepare 0 1 0 - * wait (blocks) 0 1 0 - * signal 1 0 1 - * wait (unblocks) 1 0 0 - * - * signal (fast-path) 1 0 0 - * signal (fast-path) 1 0 0 - * - * prepare A 1 1 0 - * prepare B 1 2 0 - * signal 2 0 2 - * wait B (fast-path) 2 0 1 - * wait A (fast-path) 2 0 0 - * - * prepare 2 1 0 - * cancel (fast-path) 2 0 0 - * - * prepare 2 1 0 - * signal 3 0 1 - * cancel (must wait) 3 0 0 - * - * The EventCount structure is aligned, sized, and allocated to cache line - * boundaries to avoid any false sharing between the EventCount and other - * shared state. The state field and semaphore should fit on a single cache - * line. 
The instrumentation counters increase the size of the structure so it - * rounds up to use two (64-byte x86) cache lines. - * - * XXX Need interface to access or display instrumentation counters. - **/ - -#include "eventCount.h" - -#include "atomicDefs.h" -#include "common.h" -#include "compiler.h" -#include "cpu.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "threads.h" - -enum { - ONE_WAITER = 1, // value used to increment the waiters field - ONE_EVENT = (1 << 16), // value used to increment the event counter - WAITERS_MASK = (ONE_EVENT - 1), // bit mask to access the waiters field - EVENTS_MASK = ~WAITERS_MASK, // bit mask to access the event counter -}; - -struct eventCount { - // Atomically mutable state: - // low 16 bits: the number of wait tokens not posted to the semaphore - // high 48 bits: current event counter - atomic64_t state; - - // Semaphore used to block threads when waiting is required. - Semaphore semaphore; - - // Instrumentation counters. - - // Declare alignment so we don't share a cache line. -} __attribute__((aligned(CACHE_LINE_BYTES))); - -/** - * Test the event field in two tokens for equality. - * - * @return true iff the tokens contain the same event field value - **/ -static INLINE bool sameEvent(EventToken token1, EventToken token2) -{ - return ((token1 & EVENTS_MASK) == (token2 & EVENTS_MASK)); -} - -/**********************************************************************/ -void eventCountBroadcast(EventCount *ec) -{ - - // Even if there are no waiters (yet), we will need a memory barrier. - smp_mb(); - - uint64_t waiters; - uint64_t state = atomic64_read(&ec->state); - uint64_t oldState = state; - do { - // Check if there are any tokens that have not yet been been transferred - // to the semaphore. This is the fast no-waiters path. - waiters = (state & WAITERS_MASK); - if (waiters == 0) { - // Fast path first time through--no need to signal or post if there are - // no observers. - return; - } - - /* - * Attempt to atomically claim all the wait tokens and bump the event count - * using an atomic compare-and-swap. This operation contains a memory - * barrier. - */ - EventToken newState = ((state & ~WAITERS_MASK) + ONE_EVENT); - oldState = state; - state = atomic64_cmpxchg(&ec->state, oldState, newState); - // The cmpxchg fails when we lose a race with a new waiter or another - // signaller, so try again. - } while (unlikely(state != oldState)); - - - /* - * Wake the waiters by posting to the semaphore. This effectively transfers - * the wait tokens to the semaphore. There's sadly no bulk post for posix - * semaphores, so we've got to loop to do them all. - */ - while (waiters-- > 0) { - releaseSemaphore(&ec->semaphore); - } -} - -/** - * Attempt to cancel a prepared wait token by decrementing the - * number of waiters in the current state. This can only be done - * safely if the event count hasn't been bumped. - * - * @param ec the event count on which the wait token was issued - * @param token the wait to cancel - * - * @return true if the wait was cancelled, false if the caller must - * still wait on the semaphore - **/ -static INLINE bool fastCancel(EventCount *ec, EventToken token) -{ - EventToken currentToken = atomic64_read(&ec->state); - while (sameEvent(currentToken, token)) { - // Try to decrement the waiter count via compare-and-swap as if we had - // never prepared to wait. 
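/*
 * Illustrative helpers (not in the original source) showing how the packed
 * state word decomposes using the masks defined above: the low 16 bits count
 * outstanding wait tokens and the high 48 bits hold the event counter.
 */
static INLINE uint64_t waitersInState(uint64_t state)
{
  return state & WAITERS_MASK;
}

static INLINE uint64_t eventsInState(uint64_t state)
{
  return (state & EVENTS_MASK) / ONE_EVENT;
}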
- EventToken et = atomic64_cmpxchg(&ec->state, currentToken, - currentToken - 1); - if (et == currentToken) { - return true; - } - currentToken = et; - } - return false; -} - -/** - * Consume a token from the semaphore, waiting (with an optional timeout) if - * one is not currently available. Also attempts to count the number of times - * we'll actually have to wait because there are no tokens (permits) available - * in the semaphore, and the number of times the wait times out. - * - * @param ec the event count instance - * @param timeout an optional timeout value to pass to attemptSemaphore() - * - * @return true if a token was consumed, otherwise false only if a timeout - * was specified and we timed out - **/ -static bool consumeWaitToken(EventCount *ec, const RelTime *timeout) -{ - // Try to grab a token without waiting. - if (attemptSemaphore(&ec->semaphore, 0)) { - return true; - } - - - if (timeout == NULL) { - acquireSemaphore(&ec->semaphore); - } else if (!attemptSemaphore(&ec->semaphore, *timeout)) { - return false; - } - return true; -} - -/**********************************************************************/ -int makeEventCount(EventCount **ecPtr) -{ - // The event count will be allocated on a cache line boundary so there will - // not be false sharing of the line with any other data structure. - EventCount *ec = NULL; - int result = ALLOCATE(1, EventCount, "event count", &ec); - if (result != UDS_SUCCESS) { - return result; - } - - atomic64_set(&ec->state, 0); - result = initializeSemaphore(&ec->semaphore, 0); - if (result != UDS_SUCCESS) { - FREE(ec); - return result; - } - - *ecPtr = ec; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void freeEventCount(EventCount *ec) -{ - if (ec == NULL) { - return; - } - destroySemaphore(&ec->semaphore); - FREE(ec); -} - -/**********************************************************************/ -EventToken eventCountPrepare(EventCount *ec) -{ - return atomic64_add_return(ONE_WAITER, &ec->state); -} - -/**********************************************************************/ -void eventCountCancel(EventCount *ec, EventToken token) -{ - // Decrement the waiter count if the event hasn't been signalled. - if (fastCancel(ec, token)) { - return; - } - // A signaller has already transferred (or promised to transfer) our token - // to the semaphore, so we must consume it from the semaphore by waiting. - eventCountWait(ec, token, NULL); -} - -/**********************************************************************/ -bool eventCountWait(EventCount *ec, EventToken token, const RelTime *timeout) -{ - - for (;;) { - // Wait for a signaller to transfer our wait token to the semaphore. - if (!consumeWaitToken(ec, timeout)) { - // The wait timed out, so we must cancel the token instead. Try to - // decrement the waiter count if the event hasn't been signalled. - if (fastCancel(ec, token)) { - return false; - } - /* - * We timed out, but a signaller came in before we could cancel the - * wait. We have no choice but to wait for the semaphore to be posted. - * Since signaller has promised to do it, the wait will be short. The - * timeout and the signal happened at about the same time, so either - * outcome could be returned. It's simpler to ignore the timeout. - */ - timeout = NULL; - continue; - } - - // A wait token has now been consumed from the semaphore. - - // Stop waiting if the count has changed since the token was acquired. 
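/*
 * A hedged sketch (not part of the original file) of the intended calling
 * discipline: every token from eventCountPrepare() is passed to exactly one
 * of eventCountCancel() or eventCountWait(). The condition-check callback is
 * a caller-supplied placeholder.
 */
typedef bool ConditionCheck(void *context);

static void waitForCondition(EventCount *ec, ConditionCheck *check, void *context)
{
  while (!check(context)) {
    EventToken token = eventCountPrepare(ec);
    if (check(context)) {
      // The condition became true between the check and the prepare;
      // give the token back instead of waiting.
      eventCountCancel(ec, token);
      break;
    }
    // With a NULL timeout this returns only after a broadcast.
    eventCountWait(ec, token, NULL);
  }
}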
- if (!sameEvent(token, atomic64_read(&ec->state))) { - return true; - } - - // We consumed someone else's wait token. Put it back in the semaphore, - // which will wake another waiter, hopefully one who can stop waiting. - releaseSemaphore(&ec->semaphore); - - // Attempt to give an earlier waiter a shot at the semaphore. - yieldScheduler(); - } -} diff --git a/uds/util/eventCount.h b/uds/util/eventCount.h deleted file mode 100644 index e3f2a33..0000000 --- a/uds/util/eventCount.h +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/util/eventCount.h#1 $ - */ - -#ifndef EVENT_COUNT_H -#define EVENT_COUNT_H - -#include "timeUtils.h" -#include "typeDefs.h" - -/** - * An EventCount is a lock-free equivalent of a condition variable. - * - * Using an EventCount, a lock-free producer/consumer can wait for a state - * change (adding an item to an empty queue, for example) without spinning or - * falling back on the use of mutex-based locks. Signalling is cheap when - * there are no waiters (a memory fence), and preparing to wait is - * also inexpensive (an atomic add instruction). - * - * A lock-free producer should call eventCountBroadcast() after any mutation - * to the lock-free data structure that a consumer might be waiting on. The - * consumers should poll for work like this: - * - * for (;;) { - * // Fast path--no additional cost to consumer. - * if (lockFreeDequeue(&item)) { - * return item; - * } - * // Two-step wait: get current token and poll state, either cancelling - * // the wait or waiting for the token to be signalled. - * EventToken token = eventCountPrepare(ec); - * if (lockFreeDequeue(&item)) { - * eventCountCancel(ec, token); - * return item; - * } - * eventCountWait(ec, token, NULL); - * // State has changed, but must check condition again, so loop. - * } - * - * Once eventCountPrepare() is called, the caller should neither dally, sleep, - * nor perform long-running or blocking actions before passing the token to - * eventCountCancel() or eventCountWait(). The implementation is optimized for - * a short polling window, and will not perform well if there are outstanding - * tokens that have been signalled but not waited upon. - **/ - -typedef struct eventCount EventCount; - -typedef unsigned int EventToken; - -/** - * Allocate and initialize an EventCount. - * - * @param ecPtr a pointer to hold the new EventCount - **/ -__attribute__((warn_unused_result)) -int makeEventCount(EventCount **ecPtr); - -/** - * Free an EventCount. It must no longer be in use. - * - * @param ec the EventCount to free - **/ -void freeEventCount(EventCount *ec); - -/** - * Wake all threads that are waiting for the next event. 
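/*
 * For symmetry with the consumer loop shown in the comment above, a minimal
 * producer-side sketch: publish the new state first, then broadcast.
 * WorkItem and lockFreeEnqueue() are placeholders for the client's own
 * lock-free structure, not part of this interface.
 */
typedef struct workItem WorkItem;        // placeholder client type
void lockFreeEnqueue(WorkItem *item);    // placeholder client publication step

static void produceItem(EventCount *ec, WorkItem *item)
{
  lockFreeEnqueue(item);    // make the mutation visible to consumers first
  eventCountBroadcast(ec);  // then wake any threads waiting on the change
}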
- * - * @param ec the EventCount to signal - **/ -void eventCountBroadcast(EventCount *ec); - -/** - * Prepare to wait for the EventCount to change by capturing a token of its - * current state. The caller MUST eventually either call eventCountWait() or - * eventCountCancel() exactly once for each token obtained. - * - * @param ec the EventCount on which to prepare to wait - * - * @return an EventToken to be passed to the next eventCountWait() call - **/ -EventToken eventCountPrepare(EventCount *ec) - __attribute__((warn_unused_result)); - -/** - * Cancel a wait token that has been prepared but not waited upon. This must - * be called after eventCountPrepare() when eventCountWait() is not going to - * be invoked on the token. - * - * @param ec the EventCount from which a wait token was obtained - * @param token the wait token that will never be passed to eventCountWait() - **/ -void eventCountCancel(EventCount *ec, EventToken token); - -/** - * Check if the current event count state corresponds to the provided token, - * and if it is, wait for a signal that the state has changed. If an optional - * timeout is provided, the wait will terminate after the timeout has elapsed. - * Timing out automatically cancels the wait token, so callers must not - * attempt to cancel the token on timeout. - * - * @param ec the EventCount on which to wait - * @param token the EventToken returned by eventCountPrepare() - * @param timeout either NULL or a relative timeout for the wait operation - * - * @return true if the state has already changed or if signalled, otherwise - * false if a timeout was provided and the wait timed out - **/ -bool eventCountWait(EventCount *ec, EventToken token, const RelTime *timeout); - -#endif /* EVENT_COUNT_H */ diff --git a/uds/util/funnelQueue.c b/uds/util/funnelQueue.c deleted file mode 100644 index 017e405..0000000 --- a/uds/util/funnelQueue.c +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/util/funnelQueue.c#2 $ - */ - -#include "funnelQueue.h" - -#include "memoryAlloc.h" -#include "permassert.h" -#include "uds.h" - -/**********************************************************************/ -int makeFunnelQueue(FunnelQueue **queuePtr) -{ - // Allocate the queue on a cache line boundary so the producer and consumer - // fields in the structure will land on separate cache lines. - FunnelQueue *queue; - int result = ALLOCATE(1, FunnelQueue, "funnel queue", &queue); - if (result != UDS_SUCCESS) { - return result; - } - - // Initialize the stub entry and put it in the queue, establishing the - // invariant that queue->newest and queue->oldest are never null. 
- queue->stub.next = NULL; - queue->newest = &queue->stub; - queue->oldest = &queue->stub; - - *queuePtr = queue; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void freeFunnelQueue(FunnelQueue *queue) -{ - FREE(queue); -} - -/**********************************************************************/ -static FunnelQueueEntry *getOldest(FunnelQueue *queue) -{ - /* - * Barrier requirements: We need a read barrier between reading a "next" - * field pointer value and reading anything it points to. There's an - * accompanying barrier in funnelQueuePut between its caller setting up the - * entry and making it visible. - */ - FunnelQueueEntry *oldest = queue->oldest; - FunnelQueueEntry *next = oldest->next; - - if (oldest == &queue->stub) { - // When the oldest entry is the stub and it has no successor, the queue is - // logically empty. - if (next == NULL) { - return NULL; - } - // The stub entry has a successor, so the stub can be dequeued and ignored - // without breaking the queue invariants. - oldest = next; - queue->oldest = oldest; - smp_read_barrier_depends(); - next = oldest->next; - } - - // We have a non-stub candidate to dequeue. If it lacks a successor, we'll - // need to put the stub entry back on the queue first. - if (next == NULL) { - FunnelQueueEntry *newest = queue->newest; - if (oldest != newest) { - // Another thread has already swung queue->newest atomically, but not - // yet assigned previous->next. The queue is really still empty. - return NULL; - } - - // Put the stub entry back on the queue, ensuring a successor will - // eventually be seen. - funnelQueuePut(queue, &queue->stub); - - // Check again for a successor. - next = oldest->next; - if (next == NULL) { - // We lost a race with a producer who swapped queue->newest before we - // did, but who hasn't yet updated previous->next. Try again later. - return NULL; - } - } - return oldest; -} - -/**********************************************************************/ -FunnelQueueEntry *funnelQueuePoll(FunnelQueue *queue) -{ - FunnelQueueEntry *oldest = getOldest(queue); - if (oldest == NULL) { - return oldest; - } - - /* - * Dequeue the oldest entry and return it. Only one consumer thread may call - * this function, so no locking, atomic operations, or fences are needed; - * queue->oldest is owned by the consumer and oldest->next is never used by - * a producer thread after it is swung from NULL to non-NULL. - */ - queue->oldest = oldest->next; - /* - * Make sure the caller sees the proper stored data for this entry. - * - * Since we've already fetched the entry pointer we stored in - * "queue->oldest", this also ensures that on entry to the next call we'll - * properly see the dependent data. - */ - smp_rmb(); - /* - * If "oldest" is a very light-weight work item, we'll be looking - * for the next one very soon, so prefetch it now. - */ - prefetchAddress(queue->oldest, true); - oldest->next = NULL; - return oldest; -} - -/**********************************************************************/ -bool isFunnelQueueEmpty(FunnelQueue *queue) -{ - return getOldest(queue) == NULL; -} - -/**********************************************************************/ -bool isFunnelQueueIdle(FunnelQueue *queue) -{ - /* - * Oldest is not the stub, so there's another entry, though if next is - * NULL we can't retrieve it yet. 
- */ - if (queue->oldest != &queue->stub) { - return false; - } - - /* - * Oldest is the stub, but newest has been updated by _put(); either - * there's another, retrievable entry in the list, or the list is - * officially empty but in the intermediate state of having an entry - * added. - * - * Whether anything is retrievable depends on whether stub.next has - * been updated and become visible to us, but for idleness we don't - * care. And due to memory ordering in _put(), the update to newest - * would be visible to us at the same time or sooner. - */ - if (queue->newest != &queue->stub) { - return false; - } - - // Otherwise, we're idle. - return true; -} diff --git a/uds/util/funnelQueue.h b/uds/util/funnelQueue.h deleted file mode 100644 index 083d00b..0000000 --- a/uds/util/funnelQueue.h +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/util/funnelQueue.h#2 $ - */ - -#ifndef FUNNEL_QUEUE_H -#define FUNNEL_QUEUE_H - -#include "atomicDefs.h" -#include "compiler.h" -#include "cpu.h" -#include "typeDefs.h" - -/** - * A FunnelQueue is a simple lock-free (almost) queue that accepts entries - * from multiple threads (multi-producer) and delivers them to a single thread - * (single-consumer). "Funnel" is an attempt to evoke the image of requests - * from more than one producer being "funneled down" to a single consumer. - * - * This is an unsynchronized but thread-safe data structure when used as - * intended. There is no mechanism to ensure that only one thread is consuming - * from the queue, so if that is done mistakenly, it will not be trapped, and - * the resulting behavior is undefined. Clients must not directly access or - * manipulate the internals, which are only exposed for the purpose of - * allowing the very simple enqueue operation to be in-lined. - * - * The implementation requires that a FunnelQueueEntry structure (a link - * pointer) be embedded in the queue entries, and pointers to those structures - * are used exclusively by the queue. No macros are defined to template the - * queue, so the offset of the FunnelQueueEntry in the records placed in the - * queue must all have a fixed offset so the client can derive their structure - * pointer from the entry pointer returned by funnelQueuePoll(). - * - * Callers are wholly responsible for allocating and freeing the entries. - * Entries may be freed as soon as they are returned since this queue is not - * susceptible to the "ABA problem" present in many lock-free data structures. - * The queue is dynamically allocated to ensure cache-line alignment, but no - * other dynamic allocation is used. - * - * The algorithm is not actually 100% lock-free. 
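/*
 * An illustrative sketch (not from the original header) of the embedding
 * contract described above: the client structure embeds a FunnelQueueEntry
 * at a fixed offset and recovers itself from the pointer returned by
 * funnelQueuePoll(). The ClientItem type and the offsetof-based macro are
 * assumptions for the example.
 */
#include <stddef.h>   /* offsetof */

typedef struct {
  FunnelQueueEntry queueEntry;  // link used exclusively by the queue
  int payload;                  // client data travels alongside the link
} ClientItem;

#define entryToClientItem(entry) \
  ((ClientItem *) ((char *) (entry) - offsetof(ClientItem, queueEntry)))

static int drainQueue(FunnelQueue *queue)
{
  // Single consumer thread only: poll until nothing is retrievable.
  int total = 0;
  FunnelQueueEntry *entry;
  while ((entry = funnelQueuePoll(queue)) != NULL) {
    total += entryToClientItem(entry)->payload;
  }
  return total;
}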
There is a single point in - * funnelQueuePut() at which a pre-empted producer will prevent the consumers - * from seeing items added to the queue by later producers, and only if the - * queue is short enough or the consumer fast enough for it to reach what was - * the end of the queue at the time of the pre-empt. - * - * The consumer function, funnelQueuePoll(), will return NULL when the queue - * is empty. To wait for data to consume, spin (if safe) or combine the queue - * with an EventCount to signal the presence of new entries. - **/ - -/** - * The queue link structure that must be embedded in client entries. - **/ -typedef struct funnelQueueEntry { - // The next (newer) entry in the queue. - struct funnelQueueEntry * volatile next; -} FunnelQueueEntry; - -/** - * The dynamically allocated queue structure, which is aligned to a cache line - * boundary when allocated. This should be consider opaque; it is exposed here - * so funnelQueuePut() can be in-lined. - **/ -typedef struct __attribute__((aligned(CACHE_LINE_BYTES))) funnelQueue { - // The producers' end of the queue--an atomically exchanged pointer that - // will never be NULL. - FunnelQueueEntry * volatile newest; - - // The consumer's end of the queue. Owned by the consumer and never NULL. - FunnelQueueEntry *oldest __attribute__((aligned(CACHE_LINE_BYTES))); - - // A re-usable dummy entry used to provide the non-NULL invariants above. - FunnelQueueEntry stub; -} FunnelQueue; - -/** - * Construct and initialize a new, empty queue. - * - * @param queuePtr a pointer in which to store the queue - * - * @return UDS_SUCCESS or an error code - **/ -int makeFunnelQueue(FunnelQueue **queuePtr) - __attribute__((warn_unused_result)); - -/** - * Free a queue. - * - * This will not free any entries in the queue. The caller must ensure that - * either the queue will be empty or that any entries in the queue will not be - * leaked by dropping the references from queue. - * - * @param queue the queue to free - **/ -void freeFunnelQueue(FunnelQueue *queue); - -/** - * Put an entry on the end of the queue. - * - * The entry pointer must be to the FunnelQueueEntry embedded in the caller's - * data structure. The caller must be able to derive the address of the start - * of their data structure from the pointer that passed in here, so every - * entry in the queue must have the FunnelQueueEntry at the same offset within - * the client's structure. - * - * @param queue the queue on which to place the entry - * @param entry the entry to be added to the queue - **/ -static INLINE void funnelQueuePut(FunnelQueue *queue, FunnelQueueEntry *entry) -{ - /* - * Barrier requirements: All stores relating to the entry ("next" pointer, - * containing data structure fields) must happen before the previous->next - * store making it visible to the consumer. Also, the entry's "next" field - * initialization to NULL must happen before any other producer threads can - * see the entry (the xchg) and try to update the "next" field. - * - * xchg implements a full barrier. - */ - entry->next = NULL; - /* - * The xchg macro in the PPC kernel calls a function that takes a void* - * argument, triggering a warning about dropping the volatile qualifier. 
- */ -#pragma GCC diagnostic push -#if __GNUC__ >= 5 -#pragma GCC diagnostic ignored "-Wdiscarded-qualifiers" -#endif - FunnelQueueEntry *previous = xchg(&queue->newest, entry); -#pragma GCC diagnostic pop - // Pre-empts between these two statements hide the rest of the queue from - // the consumer, preventing consumption until the following assignment runs. - previous->next = entry; -} - -/** - * Poll a queue, removing the oldest entry if the queue is not empty. This - * function must only be called from a single consumer thread. - * - * @param queue the queue from which to remove an entry - * - * @return the oldest entry in the queue, or NULL if the queue is empty. - **/ -FunnelQueueEntry *funnelQueuePoll(FunnelQueue *queue) - __attribute__((warn_unused_result)); - -/** - * Check whether the funnel queue is empty or not. This function must only be - * called from a single consumer thread, as with funnelQueuePoll. - * - * If the queue is in a transition state with one or more entries being added - * such that the list view is incomplete, it may not be possible to retrieve an - * entry with the funnelQueuePoll() function. In such states this function will - * report an empty indication. - * - * @param queue the queue which to check for entries. - * - * @return true iff queue contains no entry which can be retrieved - **/ -bool isFunnelQueueEmpty(FunnelQueue *queue) - __attribute__((warn_unused_result)); - -/** - * Check whether the funnel queue is idle or not. This function must only be - * called from a single consumer thread, as with funnel_queue_poll. - * - * If the queue has entries available to be retrieved, it is not idle. If the - * queue is in a transition state with one or more entries being added such - * that the list view is incomplete, it may not be possible to retrieve an - * entry with the funnel_queue_poll() function, but the queue will not be - * considered idle. - * - * @param queue the queue which to check for entries. - * - * @return true iff queue contains no entry which can be retrieved nor is - * known to be having an entry added - **/ -bool isFunnelQueueIdle(FunnelQueue *queue) - __attribute__((warn_unused_result)); - -#endif /* FUNNEL_QUEUE_H */ diff --git a/uds/util/radixSort.c b/uds/util/radixSort.c deleted file mode 100644 index cae4f90..0000000 --- a/uds/util/radixSort.c +++ /dev/null @@ -1,354 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/util/radixSort.c#2 $ - */ - -/* - * Radix sort is implemented using an American Flag sort, an unstable, - * in-place 8-bit radix exchange sort. - * - * Adapted from the algorithm in the paper by Peter M. McIlroy, Keith Bostic, - * and M. Douglas McIlroy, "Engineering Radix Sort". 
- * http://www.usenix.org/publications/compsystems/1993/win_mcilroy.pdf - */ - -#include "radixSort.h" - -#include "compiler.h" -#include "memoryAlloc.h" -#include "stringUtils.h" -#include "typeDefs.h" -#include "uds.h" - -enum { - // Piles smaller than this are handled with a simple insertion sort. - INSERTION_SORT_THRESHOLD = 12 -}; - -// Sort keys are pointers to immutable fixed-length arrays of bytes. -typedef const uint8_t * Key; - -/** - * The keys are separated into piles based on the byte in each - * keys at the current offset, so the number of keys with each - * byte must be counted. - **/ -typedef struct { - uint16_t used; // number of non-empty bins - uint16_t first; // index (key byte) of the first non-empty bin - uint16_t last; // index (key byte) of the last non-empty bin - uint32_t size[256]; // size[byte] == # of occurrences of byte -} Histogram; - -/** - * Sub-tasks are manually managed on a stack, both for performance - * and to put a logarithmic bound on the stack space needed. - **/ -typedef struct { - Key *firstKey; // Pointers to first and last keys to sort, inclusive. - Key *lastKey; - uint16_t offset; // The offset into the key at which to continue sorting. - uint16_t length; // The number of bytes remaining in the sort keys. -} Task; - -struct radixSorter { - unsigned int count; - Histogram bins; - Key *pile[256]; - Task *endOfStack; - Task isList[256]; - Task stack[]; -}; - -/** - * Compare a segment of two fixed-length keys starting an offset. - * - * @param key1 the first key - * @param key2 the second key - * @param offset the offset into the keys of the first byte to compare - * @param length the number of bytes remaining in each key - **/ -static INLINE int compare(Key key1, Key key2, uint16_t offset, uint16_t length) -{ - return memcmp(&key1[offset], &key2[offset], length); -} - -/** - * Insert the next unsorted key into an array of sorted keys. - * - * @param task the description of the keys being sorted - * @param next the pointer to the unsorted key to insert into - * the array of sorted key pointers preceding it - **/ -static INLINE void insertKey(const Task task, Key *next) -{ - // Pull the unsorted key out, freeing up the array slot. - Key unsorted = *next; - // Compare the key to the preceding sorted entries, shifting - // down the ones that are larger. - while ((--next >= task.firstKey) - && (compare(unsorted, next[0], task.offset, task.length) < 0)) { - next[1] = next[0]; - } - // Insert the key into the last slot that was cleared, sorting it. - next[1] = unsorted; -} - -/** - * Sort a range of key segments using an insertion sort. This simple sort is - * faster than the 256-way radix sort when the number of keys to sort is - * small. - * - * @param task the description of the keys to sort - **/ -static INLINE void insertionSort(const Task task) -{ - // (firstKey .. firstKey) is trivially sorted. Repeatedly insert the next - // key into the sorted list of keys preceding it, and voila! - Key *next; - for (next = task.firstKey + 1; next <= task.lastKey; next++) { - insertKey(task, next); - } -} - -/** - * Push a sorting task onto the task stack, increasing the stack pointer. 
- **/ -static INLINE void pushTask(Task **stackPointer, - Key *firstKey, - uint32_t count, - uint16_t offset, - uint16_t length) -{ - Task *task = (*stackPointer)++; - task->firstKey = firstKey; - task->lastKey = &firstKey[count - 1]; - task->offset = offset; - task->length = length; -} - -/**********************************************************************/ -static INLINE void swapKeys(Key *a, Key *b) -{ - Key c = *a; - *a = *b; - *b = c; -} - -/** - * Count the number of times each byte value appears in in the arrays of keys - * to sort at the current offset, keeping track of the number of non-empty - * bins, and the index of the first and last non-empty bin. - * - * @param task the description of the keys to sort - * @param bins the histogram bins receiving the counts - **/ -static INLINE void measureBins(const Task task, Histogram *bins) -{ - // Set bogus values that will will be replaced by min and max, respectively. - bins->first = UINT8_MAX; - bins->last = 0; - - // Subtle invariant: bins->used and bins->size[] are zero because the - // sorting code clears it all out as it goes. Even though this structure is - // re-used, we don't need to pay to zero it before starting a new tally. - - Key *keyPtr; - for (keyPtr = task.firstKey; keyPtr <= task.lastKey; keyPtr++) { - // Increment the count for the byte in the key at the current offset. - uint8_t bin = (*keyPtr)[task.offset]; - uint32_t size = ++bins->size[bin]; - - // Track non-empty bins when the count transitions from zero to one. - if (size == 1) { - bins->used += 1; - if (bin < bins->first) { - bins->first = bin; - } - if (bin > bins->last) { - bins->last = bin; - } - } - } -} - -/** - * Convert the bin sizes to pointers to where each pile goes. - * - * pile[0] = firstKey + bin->size[0], - * pile[1] = pile[0] + bin->size[1], etc. - * - * After the keys are moved to the appropriate pile, we'll need to sort - * each of the piles by the next radix position. A new task is put on the - * stack for each pile containing lots of keys, or a new task is is put on - * the list for each pile containing few keys. - * - * @param stack pointer the top of the stack - * @param endOfStack the end of the stack - * @param list pointer the head of the list - * @param pile array that will be filled pointers to the end of each pile - * @param bins the histogram of the sizes of each pile - * @param firstKey the first key of the stack - * @param offset the next radix position to sort by - * @param length the number of bytes remaining in the sort keys - * - * @return UDS_SUCCESS or an error code - **/ -static INLINE int pushBins(Task **stack, - Task *endOfStack, - Task **list, - Key *pile[], - Histogram *bins, - Key *firstKey, - uint16_t offset, - uint16_t length) -{ - Key *pileStart = firstKey; - int bin; - for (bin = bins->first; ; bin++) { - uint32_t size = bins->size[bin]; - // Skip empty piles. - if (size == 0) { - continue; - } - // There's no need to sort empty keys. 
- if (length > 0) { - if (size > INSERTION_SORT_THRESHOLD) { - if (*stack >= endOfStack) { - return UDS_BAD_STATE; - } - pushTask(stack, pileStart, size, offset, length); - } else if (size > 1) { - pushTask(list, pileStart, size, offset, length); - } - } - pileStart += size; - pile[bin] = pileStart; - if (--bins->used == 0) { - break; - } - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int makeRadixSorter(unsigned int count, RadixSorter **sorter) -{ - unsigned int stackSize = count / INSERTION_SORT_THRESHOLD; - RadixSorter *radixSorter; - int result = ALLOCATE_EXTENDED(RadixSorter, stackSize, Task, __func__, - &radixSorter); - if (result != UDS_SUCCESS) { - return result; - } - radixSorter->count = count; - radixSorter->endOfStack = radixSorter->stack + stackSize; - *sorter = radixSorter; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void freeRadixSorter(RadixSorter *sorter) -{ - FREE(sorter); -} - -/**********************************************************************/ -int radixSort(RadixSorter *sorter, - const unsigned char *keys[], - unsigned int count, - unsigned short length) -{ - // All zero-length keys are identical and therefore already sorted. - if ((count == 0) || (length == 0)) { - return UDS_SUCCESS; - } - - // The initial task is to sort the entire length of all the keys. - Task start = { - .firstKey = keys, - .lastKey = &keys[count - 1], - .offset = 0, - .length = length, - }; - - if (count <= INSERTION_SORT_THRESHOLD) { - insertionSort(start); - return UDS_SUCCESS; - } - - if (count > sorter->count) { - return UDS_INVALID_ARGUMENT; - } - - Histogram *bins = &sorter->bins; - Key **pile = sorter->pile; - Task *sp = sorter->stack; - - /* - * Repeatedly consume a sorting task from the stack and process it, pushing - * new sub-tasks onto to the stack for each radix-sorted pile. When all - * tasks and sub-tasks have been processed, the stack will be empty and all - * the keys in the starting task will be fully sorted. - */ - for (*sp = start; sp >= sorter->stack; sp--) { - const Task task = *sp; - measureBins(task, bins); - - // Now that we know how large each bin is, generate pointers for each of - // the piles and push a new task to sort each pile by the next radix byte. - Task *lp = sorter->isList; - int result = pushBins(&sp, sorter->endOfStack, &lp, pile, bins, - task.firstKey, task.offset + 1, task.length - 1); - if (result != UDS_SUCCESS) { - memset(bins, 0, sizeof(*bins)); - return result; - } - // Now bins->used is zero again. - - // Don't bother processing the last pile--when piles 0..N-1 are all in - // place, then pile N must also be in place. - Key *end = task.lastKey - bins->size[bins->last]; - bins->size[bins->last] = 0; - - Key *fence; - for (fence = task.firstKey; fence <= end; ) { - uint8_t bin; - Key key = *fence; - // The radix byte of the key tells us which pile it belongs in. Swap it - // for an unprocessed item just below that pile, and repeat. - while (--pile[bin = key[task.offset]] > fence) { - swapKeys(pile[bin], &key); - } - // The pile reached the fence. Put the key at the bottom of that pile. - // completing it, and advance the fence to the next pile. - *fence = key; - fence += bins->size[bin]; - bins->size[bin] = 0; - } - // Now bins->size[] is all zero again. - - // When the number of keys in a task gets small enough, its faster to use - // an insertion sort than to keep subdividing into tiny piles. 
- while (--lp >= sorter->isList) { - insertionSort(*lp); - } - } - return UDS_SUCCESS; -} diff --git a/uds/util/radixSort.h b/uds/util/radixSort.h deleted file mode 100644 index 55f19ba..0000000 --- a/uds/util/radixSort.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/util/radixSort.h#1 $ - */ - -#ifndef RADIX_SORT_H -#define RADIX_SORT_H - -/* - * The implementation uses one large object allocated on the heap. This - * large object can be reused as many times as desired. There is no - * further heap usage by the sorting. - */ -typedef struct radixSorter RadixSorter; - -/** - * Reserve the heap storage needed by the radixSort routine. The amount of - * heap space is logarithmically proportional to the number of keys. - * - * @param count The maximum number of keys to be sorted - * @param sorter The RadixSorter object is returned here - * - * @return UDS_SUCCESS or an error code - **/ -int makeRadixSorter(unsigned int count, RadixSorter **sorter) - __attribute__((warn_unused_result)); - -/** - * Free the heap storage needed by the radixSort routine. - * - * @param sorter The RadixSorter object to free - **/ -void freeRadixSorter(RadixSorter *sorter); - -/** - * Sort pointers to fixed-length keys (arrays of bytes) using a radix sort. - * - * The sort implementation is unstable--relative ordering of equal keys is not - * preserved. The implementation does not use any heap allocation. - * - * @param [in] sorter the heap storage used by the sorting - * @param keys the array of key pointers to sort (modified in place) - * @param [in] count the number of keys - * @param [in] length the length of every key, in bytes - * - * @return UDS_SUCCESS or an error code - **/ -int radixSort(RadixSorter *sorter, - const unsigned char *keys[], - unsigned int count, - unsigned short length) - __attribute__((warn_unused_result)); - -#endif /* RADIX_SORT_H */ diff --git a/uds/volume.c b/uds/volume.c deleted file mode 100644 index 4f320c5..0000000 --- a/uds/volume.c +++ /dev/null @@ -1,1383 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
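/*
 * A short usage sketch (not from the original sources) for the radixSort.h
 * interface above: allocate a sorter sized for the largest batch, sort an
 * array of pointers to fixed-length keys in place, and free the sorter.
 * The key values are arbitrary illustrative bytes.
 */
static int sortThreeKeys(void)
{
  static const unsigned char keyA[4] = { 0x03, 0x01, 0x04, 0x01 };
  static const unsigned char keyB[4] = { 0x02, 0x07, 0x01, 0x08 };
  static const unsigned char keyC[4] = { 0x01, 0x04, 0x01, 0x04 };
  const unsigned char *keys[] = { keyC, keyA, keyB };

  RadixSorter *sorter;
  int result = makeRadixSorter(3, &sorter);
  if (result != UDS_SUCCESS) {
    return result;
  }
  // Sorts the pointer array in place; relative order of equal keys may change.
  result = radixSort(sorter, keys, 3, 4);
  freeRadixSorter(sorter);
  // keys[] now points to keyC, keyB, keyA (ascending byte order).
  return result;
}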
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/volume.c#23 $ - */ - -#include "volume.h" - -#include "cacheCounters.h" -#include "chapterIndex.h" -#include "compiler.h" -#include "errors.h" -#include "geometry.h" -#include "hashUtils.h" -#include "indexConfig.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "recordPage.h" -#include "request.h" -#include "sparseCache.h" -#include "stringUtils.h" -#include "threads.h" - -enum { - MAX_BAD_CHAPTERS = 100, // max number of contiguous bad chapters - DEFAULT_VOLUME_READ_THREADS = 2, // Default number of reader threads - MAX_VOLUME_READ_THREADS = 16, // Maximum number of reader threads -}; - -/**********************************************************************/ -static unsigned int getReadThreads(const struct uds_parameters *userParams) -{ - unsigned int readThreads = (userParams == NULL - ? DEFAULT_VOLUME_READ_THREADS - : userParams->read_threads); - if (readThreads < 1) { - readThreads = 1; - } - if (readThreads > MAX_VOLUME_READ_THREADS) { - readThreads = MAX_VOLUME_READ_THREADS; - } - return readThreads; -} - -/**********************************************************************/ -static INLINE unsigned int mapToPageNumber(Geometry *geometry, - unsigned int physicalPage) -{ - return ((physicalPage - 1) % geometry->pagesPerChapter); -} - -/**********************************************************************/ -static INLINE unsigned int mapToChapterNumber(Geometry *geometry, - unsigned int physicalPage) -{ - return ((physicalPage - 1) / geometry->pagesPerChapter); -} - -/**********************************************************************/ -static INLINE bool isRecordPage(Geometry *geometry, unsigned int physicalPage) -{ - return (((physicalPage - 1) % geometry->pagesPerChapter) - >= geometry->indexPagesPerChapter); -} - -/**********************************************************************/ -static INLINE unsigned int getZoneNumber(Request *request) -{ - return (request == NULL) ? 0 : request->zoneNumber; -} - -/**********************************************************************/ -int mapToPhysicalPage(const Geometry *geometry, int chapter, int page) -{ - // Page zero is the header page, so the first index page in the - // first chapter is physical page one. - return (1 + (geometry->pagesPerChapter * chapter) + page); -} - -/**********************************************************************/ -static void waitForReadQueueNotFull(Volume *volume, Request *request) -{ - unsigned int zoneNumber = getZoneNumber(request); - InvalidateCounter invalidateCounter = getInvalidateCounter(volume->pageCache, - zoneNumber); - if (searchPending(invalidateCounter)) { - // Increment the invalidate counter to avoid deadlock where the reader - // threads cannot make progress because they are waiting on the counter - // and the index thread cannot because the read queue is full. - endPendingSearch(volume->pageCache, zoneNumber); - } - - while (readQueueIsFull(volume->pageCache)) { - logDebug("Waiting until read queue not full"); - signalCond(&volume->readThreadsCond); - waitCond(&volume->readThreadsReadDoneCond, &volume->readThreadsMutex); - } - - if (searchPending(invalidateCounter)) { - // Increment again so we get back to an odd value. 
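[Editor's note: the static helpers above encode the volume layout, where physical page 0 is the header and each chapter then contributes pagesPerChapter pages, index pages first and record pages after them. A small round-trip check of that arithmetic, written as if inside volume.c and using made-up geometry values:]

    /* Hypothetical geometry: 256 pages per chapter, the first 26 being index pages. */
    Geometry g = { .pagesPerChapter = 256, .indexPagesPerChapter = 26 };
    int physical = mapToPhysicalPage(&g, 3, 40);              /* 1 + 3*256 + 40 == 809 */
    unsigned int chapter = mapToChapterNumber(&g, physical);  /* (809 - 1) / 256 == 3  */
    unsigned int page = mapToPageNumber(&g, physical);        /* (809 - 1) % 256 == 40 */
    bool record = isRecordPage(&g, physical);                 /* 40 >= 26, a record page */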
- beginPendingSearch(volume->pageCache, pageBeingSearched(invalidateCounter), - zoneNumber); - } -} - -/**********************************************************************/ -int enqueuePageRead(Volume *volume, Request *request, int physicalPage) -{ - // Don't allow new requests if we are shutting down, but make sure - // to process any requests that are still in the pipeline. - if ((volume->readerState & READER_STATE_EXIT) != 0) { - logInfo("failed to queue read while shutting down"); - return UDS_SHUTTINGDOWN; - } - - // Mark the page as queued in the volume cache, for chapter invalidation to - // be able to cancel a read. - // If we are unable to do this because the queues are full, flush them first - int result; - while ((result = enqueueRead(volume->pageCache, request, physicalPage)) - == UDS_SUCCESS) { - logDebug("Read queues full, waiting for reads to finish"); - waitForReadQueueNotFull(volume, request); - } - - if (result == UDS_QUEUED) { - /* signal a read thread */ - signalCond(&volume->readThreadsCond); - } - - return result; -} - -/**********************************************************************/ -static INLINE void waitToReserveReadQueueEntry(Volume *volume, - unsigned int *queuePos, - Request **requestList, - unsigned int *physicalPage, - bool *invalid) -{ - while (((volume->readerState & READER_STATE_EXIT) == 0) - && (((volume->readerState & READER_STATE_STOP) != 0) - || !reserveReadQueueEntry(volume->pageCache, queuePos, - requestList, physicalPage, invalid))) { - waitCond(&volume->readThreadsCond, &volume->readThreadsMutex); - } -} - -/**********************************************************************/ -static int initChapterIndexPage(const Volume *volume, - byte *indexPage, - unsigned int chapter, - unsigned int indexPageNumber, - DeltaIndexPage *chapterIndexPage) -{ - Geometry *geometry = volume->geometry; - - int result = initializeChapterIndexPage(chapterIndexPage, geometry, - indexPage, volume->nonce); - if (volume->lookupMode == LOOKUP_FOR_REBUILD) { - return result; - } - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, - "Reading chapter index page for chapter %u" - " page %u", - chapter, indexPageNumber); - } - - IndexPageBounds bounds; - result = getListNumberBounds(volume->indexPageMap, chapter, - indexPageNumber, &bounds); - if (result != UDS_SUCCESS) { - return result; - } - - uint64_t ciVirtual = chapterIndexPage->virtualChapterNumber; - unsigned int ciChapter = mapToPhysicalChapter(geometry, ciVirtual); - if ((chapter == ciChapter) - && (bounds.lowestList == chapterIndexPage->lowestListNumber) - && (bounds.highestList == chapterIndexPage->highestListNumber)) { - return UDS_SUCCESS; - } - - logWarning("Index page map updated to %llu", - getLastUpdate(volume->indexPageMap)); - logWarning("Page map expects that chapter %u page %u has range %u to %u, " - "but chapter index page has chapter %" PRIu64 - " with range %u to %u", - chapter, indexPageNumber, bounds.lowestList, bounds.highestList, - ciVirtual, chapterIndexPage->lowestListNumber, - chapterIndexPage->highestListNumber); - return ASSERT_WITH_ERROR_CODE(false, - UDS_CORRUPT_DATA, - "index page map mismatch with chapter index"); -} - -/**********************************************************************/ -static int initializeIndexPage(const Volume *volume, - unsigned int physicalPage, - CachedPage *page) -{ - unsigned int chapter = mapToChapterNumber(volume->geometry, physicalPage); - unsigned int indexPageNumber = mapToPageNumber(volume->geometry, - physicalPage); - int 
result = initChapterIndexPage(volume, getPageData(&page->cp_pageData), - chapter, indexPageNumber, - &page->cp_indexPage); - return result; -} - -/**********************************************************************/ -static void readThreadFunction(void *arg) -{ - Volume *volume = arg; - unsigned int queuePos; - Request *requestList; - unsigned int physicalPage; - bool invalid = false; - - logDebug("reader starting"); - lockMutex(&volume->readThreadsMutex); - while (true) { - waitToReserveReadQueueEntry(volume, &queuePos, &requestList, &physicalPage, - &invalid); - if ((volume->readerState & READER_STATE_EXIT) != 0) { - break; - } - - volume->busyReaderThreads++; - - bool recordPage = isRecordPage(volume->geometry, physicalPage); - - CachedPage *page = NULL; - int result = UDS_SUCCESS; - if (!invalid) { - // Find a place to put the read queue page we reserved above. - result = selectVictimInCache(volume->pageCache, &page); - if (result == UDS_SUCCESS) { - unlockMutex(&volume->readThreadsMutex); - result = readVolumePage(&volume->volumeStore, physicalPage, - &page->cp_pageData); - if (result != UDS_SUCCESS) { - logWarning("Error reading page %u from volume", physicalPage); - cancelPageInCache(volume->pageCache, physicalPage, page); - } - lockMutex(&volume->readThreadsMutex); - } else { - logWarning("Error selecting cache victim for page read"); - } - - if (result == UDS_SUCCESS) { - if (!volume->pageCache->readQueue[queuePos].invalid) { - if (!recordPage) { - result = initializeIndexPage(volume, physicalPage, page); - if (result != UDS_SUCCESS) { - logWarning("Error initializing chapter index page"); - cancelPageInCache(volume->pageCache, physicalPage, page); - } - } - - if (result == UDS_SUCCESS) { - result = putPageInCache(volume->pageCache, physicalPage, page); - if (result != UDS_SUCCESS) { - logWarning("Error putting page %u in cache", physicalPage); - cancelPageInCache(volume->pageCache, physicalPage, page); - } - } - } else { - logWarning("Page %u invalidated after read", physicalPage); - cancelPageInCache(volume->pageCache, physicalPage, page); - invalid = true; - } - } - } else { - logDebug("Requeuing requests for invalid page"); - } - - if (invalid) { - result = UDS_SUCCESS; - page = NULL; - } - - while (requestList != NULL) { - Request *request = requestList; - requestList = request->nextRequest; - - /* - * If we've read in a record page, we're going to do an immediate search, - * in an attempt to speed up processing when we requeue the request, so - * that it doesn't have to go back into the getRecordFromZone code again. - * However, if we've just read in an index page, we don't want to search. - * We want the request to be processed again and getRecordFromZone to be - * run. We have added new fields in request to allow the index code to - * know whether it can stop processing before getRecordFromZone is called - * again. 
- */ - if ((result == UDS_SUCCESS) && (page != NULL) && recordPage) { - if (searchRecordPage(getPageData(&page->cp_pageData), - &request->chunkName, volume->geometry, - &request->oldMetadata)) { - request->slLocation = LOC_IN_DENSE; - } else { - request->slLocation = LOC_UNAVAILABLE; - } - request->slLocationKnown = true; - } - - // reflect any read failures in the request status - request->status = result; - restartRequest(request); - } - - releaseReadQueueEntry(volume->pageCache, queuePos); - - volume->busyReaderThreads--; - broadcastCond(&volume->readThreadsReadDoneCond); - } - unlockMutex(&volume->readThreadsMutex); - logDebug("reader done"); -} - -/**********************************************************************/ -static int readPageLocked(Volume *volume, - Request *request, - unsigned int physicalPage, - bool syncRead, - CachedPage **pagePtr) -{ - syncRead |= ((volume->lookupMode == LOOKUP_FOR_REBUILD) - || (request == NULL) - || (request->session == NULL)); - - int result = UDS_SUCCESS; - - CachedPage *page = NULL; - if (syncRead) { - // Find a place to put the page. - result = selectVictimInCache(volume->pageCache, &page); - if (result != UDS_SUCCESS) { - logWarning("Error selecting cache victim for page read"); - return result; - } - result = readVolumePage(&volume->volumeStore, physicalPage, - &page->cp_pageData); - if (result != UDS_SUCCESS) { - logWarning("Error reading page %u from volume", physicalPage); - cancelPageInCache(volume->pageCache, physicalPage, page); - return result; - } - if (!isRecordPage(volume->geometry, physicalPage)) { - result = initializeIndexPage(volume, physicalPage, page); - if (result != UDS_SUCCESS) { - if (volume->lookupMode != LOOKUP_FOR_REBUILD) { - logWarning("Corrupt index page %u", physicalPage); - } - cancelPageInCache(volume->pageCache, physicalPage, page); - return result; - } - } - result = putPageInCache(volume->pageCache, physicalPage, page); - if (result != UDS_SUCCESS) { - logWarning("Error putting page %u in cache", physicalPage); - cancelPageInCache(volume->pageCache, physicalPage, page); - return result; - } - } else { - result = enqueuePageRead(volume, request, physicalPage); - if (result != UDS_SUCCESS) { - return result; - } - } - - *pagePtr = page; - - return UDS_SUCCESS; -} - -/**********************************************************************/ -int getPageLocked(Volume *volume, - Request *request, - unsigned int physicalPage, - CacheProbeType probeType, - CachedPage **pagePtr) -{ - CachedPage *page = NULL; - int result = getPageFromCache(volume->pageCache, physicalPage, probeType, - &page); - if (result != UDS_SUCCESS) { - return result; - } - if (page == NULL) { - result = readPageLocked(volume, request, physicalPage, true, &page); - if (result != UDS_SUCCESS) { - return result; - } - } else if (getZoneNumber(request) == 0) { - // Only 1 zone is responsible for updating LRU - makePageMostRecent(volume->pageCache, page); - } - - *pagePtr = page; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int getPageProtected(Volume *volume, - Request *request, - unsigned int physicalPage, - CacheProbeType probeType, - CachedPage **pagePtr) -{ - CachedPage *page = NULL; - int result = getPageFromCache(volume->pageCache, physicalPage, - probeType | CACHE_PROBE_IGNORE_FAILURE, - &page); - if (result != UDS_SUCCESS) { - return result; - } - - unsigned int zoneNumber = getZoneNumber(request); - // If we didn't find a page we need to enqueue a read for it, in which - // case we need 
to grab the mutex. - if (page == NULL) { - endPendingSearch(volume->pageCache, zoneNumber); - lockMutex(&volume->readThreadsMutex); - - /* - * Do the lookup again while holding the read mutex (no longer the fast - * case so this should be ok to repeat). We need to do this because an - * page may have been added to the page map by the reader thread between - * the time searched above and the time we went to actually try to enqueue - * it below. This could result in us enqueuing another read for an page - * which is already in the cache, which would mean we end up with two - * entries in the cache for the same page. - */ - result - = getPageFromCache(volume->pageCache, physicalPage, probeType, &page); - if (result != UDS_SUCCESS) { - /* - * In non-success cases (anything not UDS_SUCCESS, meaning both - * UDS_QUEUED and "real" errors), the caller doesn't get a - * handle on a cache page, so it can't continue the search, and - * we don't need to prevent other threads from messing with the - * cache. - * - * However, we do need to set the "search pending" flag because - * the callers expect it to always be set on return, even if - * they can't actually do the search. - * - * Doing the calls in this order ought to be faster, since we - * let other threads have the reader thread mutex (which can - * require a syscall) ASAP, and set the "search pending" state - * that can block the reader thread as the last thing. - */ - unlockMutex(&volume->readThreadsMutex); - beginPendingSearch(volume->pageCache, physicalPage, zoneNumber); - return result; - } - - // If we found the page now, we can release the mutex and proceed - // as if this were the fast case. - if (page != NULL) { - /* - * If we found a page (*pagePtr != NULL and return - * UDS_SUCCESS), then we're telling the caller where to look for - * the cache page, and need to switch to "reader thread - * unlocked" and "search pending" state in careful order so no - * other thread can mess with the data before our caller gets to - * look at it. - */ - beginPendingSearch(volume->pageCache, physicalPage, zoneNumber); - unlockMutex(&volume->readThreadsMutex); - } - } - - if (page == NULL) { - result = readPageLocked(volume, request, physicalPage, false, &page); - if (result != UDS_SUCCESS) { - /* - * This code path is used frequently in the UDS_QUEUED case, so - * the performance gain from unlocking first, while "search - * pending" mode is off, turns out to be significant in some - * cases. - */ - unlockMutex(&volume->readThreadsMutex); - beginPendingSearch(volume->pageCache, physicalPage, zoneNumber); - return result; - } - - // See above re: ordering requirement. - beginPendingSearch(volume->pageCache, physicalPage, zoneNumber); - unlockMutex(&volume->readThreadsMutex); - } else { - if (getZoneNumber(request) == 0 ) { - // Only 1 zone is responsible for updating LRU - makePageMostRecent(volume->pageCache, page); - } - } - - *pagePtr = page; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int getPage(Volume *volume, - unsigned int chapter, - unsigned int pageNumber, - CacheProbeType probeType, - byte **dataPtr, - DeltaIndexPage **indexPagePtr) -{ - unsigned int physicalPage - = mapToPhysicalPage(volume->geometry, chapter, pageNumber); - - lockMutex(&volume->readThreadsMutex); - CachedPage *page = NULL; - int result = getPageLocked(volume, NULL, physicalPage, probeType, &page); - unlockMutex(&volume->readThreadsMutex); - - if (dataPtr != NULL) { - *dataPtr = (page != NULL) ? 
getPageData(&page->cp_pageData) : NULL; - } - if (indexPagePtr != NULL) { - *indexPagePtr = (page != NULL) ? &page->cp_indexPage : NULL; - } - return result; -} - -/** - * Search for a chunk name in a cached index page or chapter index, returning - * the record page number from a chapter index match. - * - * @param volume the volume containing the index page to search - * @param request the request originating the search (may be NULL for - * a direct query from volume replay) - * @param name the name of the block or chunk - * @param chapter the chapter to search - * @param indexPageNumber the index page number of the page to search - * @param recordPageNumber pointer to return the chapter record page number - * (value will be NO_CHAPTER_INDEX_ENTRY if the name - * was not found) - * - * @return UDS_SUCCESS or an error code - **/ -static int searchCachedIndexPage(Volume *volume, - Request *request, - const UdsChunkName *name, - unsigned int chapter, - unsigned int indexPageNumber, - int *recordPageNumber) -{ - unsigned int zoneNumber = getZoneNumber(request); - unsigned int physicalPage - = mapToPhysicalPage(volume->geometry, chapter, indexPageNumber); - - /* - * Make sure the invalidate counter is updated before we try and read from - * the page map. This prevents this thread from reading a page in the - * page map which has already been marked for invalidation by the reader - * thread, before the reader thread has noticed that the invalidateCounter - * has been incremented. - */ - beginPendingSearch(volume->pageCache, physicalPage, zoneNumber); - - CachedPage *page = NULL; - int result = getPageProtected(volume, request, physicalPage, - cacheProbeType(request, true), &page); - if (result != UDS_SUCCESS) { - endPendingSearch(volume->pageCache, zoneNumber); - return result; - } - - result - = ASSERT_LOG_ONLY(searchPending(getInvalidateCounter(volume->pageCache, - zoneNumber)), - "Search is pending for zone %u", zoneNumber); - if (result != UDS_SUCCESS) { - return result; - } - - result = searchChapterIndexPage(&page->cp_indexPage, volume->geometry, name, - recordPageNumber); - endPendingSearch(volume->pageCache, zoneNumber); - return result; -} - -/**********************************************************************/ -int searchCachedRecordPage(Volume *volume, - Request *request, - const UdsChunkName *name, - unsigned int chapter, - int recordPageNumber, - UdsChunkData *duplicate, - bool *found) -{ - *found = false; - - if (recordPageNumber == NO_CHAPTER_INDEX_ENTRY) { - // No record for that name can exist in the chapter. - return UDS_SUCCESS; - } - - Geometry *geometry = volume->geometry; - int result = ASSERT(((recordPageNumber >= 0) - && ((unsigned int) recordPageNumber - < geometry->recordPagesPerChapter)), - "0 <= %d <= %u", - recordPageNumber, geometry->recordPagesPerChapter); - if (result != UDS_SUCCESS) { - return result; - } - - unsigned int pageNumber = geometry->indexPagesPerChapter + recordPageNumber; - - unsigned int zoneNumber = getZoneNumber(request); - int physicalPage - = mapToPhysicalPage(volume->geometry, chapter, pageNumber); - - /* - * Make sure the invalidate counter is updated before we try and read from - * the page map. This prevents this thread from reading a page in the page - * map which has already been marked for invalidation by the reader thread, - * before the reader thread has noticed that the invalidateCounter has been - * incremented. 
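[Editor's note: the invalidate-counter comment above, together with the ordering rules spelled out in getPageProtected, amounts to a simple bracket that every cached-page search follows. A condensed sketch of that bracket, restating the pattern used by searchCachedIndexPage and by the record-page search continuing below, with error handling trimmed:]

    /* Announce the search so reader threads will not invalidate the page under us. */
    beginPendingSearch(volume->pageCache, physicalPage, zoneNumber);
    CachedPage *page = NULL;
    int result = getPageProtected(volume, request, physicalPage,
                                  cacheProbeType(request, true), &page);
    if (result == UDS_SUCCESS) {
      /* ... search page->cp_indexPage or the record page data ... */
    }
    /* getPageProtected leaves "search pending" set even on failure, so always end it. */
    endPendingSearch(volume->pageCache, zoneNumber);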
- */ - beginPendingSearch(volume->pageCache, physicalPage, zoneNumber); - - CachedPage *recordPage; - result = getPageProtected(volume, request, physicalPage, - cacheProbeType(request, false), &recordPage); - if (result != UDS_SUCCESS) { - endPendingSearch(volume->pageCache, zoneNumber); - return result; - } - - if (searchRecordPage(getPageData(&recordPage->cp_pageData), name, geometry, - duplicate)) { - *found = true; - } - endPendingSearch(volume->pageCache, zoneNumber); - return UDS_SUCCESS; -} - -/**********************************************************************/ -int readChapterIndexFromVolume(const Volume *volume, - uint64_t virtualChapter, - struct volume_page volumePages[], - DeltaIndexPage indexPages[]) -{ - const Geometry *geometry = volume->geometry; - unsigned int physicalChapter = mapToPhysicalChapter(geometry, - virtualChapter); - int physicalPage = mapToPhysicalPage(geometry, physicalChapter, 0); - prefetchVolumePages(&volume->volumeStore, physicalPage, - geometry->indexPagesPerChapter); - - unsigned int i; - struct volume_page volumePage; - int result = initializeVolumePage(geometry, &volumePage); - for (i = 0; i < geometry->indexPagesPerChapter; i++) { - int result = readVolumePage(&volume->volumeStore, physicalPage + i, - &volumePages[i]); - if (result != UDS_SUCCESS) { - break; - } - byte *indexPage = getPageData(&volumePages[i]); - result = initChapterIndexPage(volume, indexPage, physicalChapter, i, - &indexPages[i]); - if (result != UDS_SUCCESS) { - break; - } - } - destroyVolumePage(&volumePage); - return result; -} - -/**********************************************************************/ -int searchVolumePageCache(Volume *volume, - Request *request, - const UdsChunkName *name, - uint64_t virtualChapter, - UdsChunkData *metadata, - bool *found) -{ - unsigned int physicalChapter - = mapToPhysicalChapter(volume->geometry, virtualChapter); - unsigned int indexPageNumber; - int result = findIndexPageNumber(volume->indexPageMap, name, physicalChapter, - &indexPageNumber); - if (result != UDS_SUCCESS) { - return result; - } - - int recordPageNumber; - result = searchCachedIndexPage(volume, request, name, physicalChapter, - indexPageNumber, &recordPageNumber); - if (result == UDS_SUCCESS) { - result = searchCachedRecordPage(volume, request, name, physicalChapter, - recordPageNumber, metadata, found); - } - - return result; -} - -/**********************************************************************/ -int forgetChapter(Volume *volume, - uint64_t virtualChapter, - InvalidationReason reason) -{ - logDebug("forgetting chapter %llu", virtualChapter); - unsigned int physicalChapter - = mapToPhysicalChapter(volume->geometry, virtualChapter); - lockMutex(&volume->readThreadsMutex); - int result - = invalidatePageCacheForChapter(volume->pageCache, physicalChapter, - volume->geometry->pagesPerChapter, - reason); - unlockMutex(&volume->readThreadsMutex); - return result; -} - -/** - * Donate index page data to the page cache for an index page that was just - * written to the volume. The caller must already hold the reader thread - * mutex. 
- * - * @param volume the volume - * @param physicalChapter the physical chapter number of the index page - * @param indexPageNumber the chapter page number of the index page - * @param scratchPage the index page data - **/ -static int donateIndexPageLocked(Volume *volume, - unsigned int physicalChapter, - unsigned int indexPageNumber, - struct volume_page *scratchPage) -{ - unsigned int physicalPage - = mapToPhysicalPage(volume->geometry, physicalChapter, indexPageNumber); - - // Find a place to put the page. - CachedPage *page = NULL; - int result = selectVictimInCache(volume->pageCache, &page); - if (result != UDS_SUCCESS) { - return result; - } - - // Exchange the scratch page with the cache page - swapVolumePages(&page->cp_pageData, scratchPage); - - result = initChapterIndexPage(volume, getPageData(&page->cp_pageData), - physicalChapter, indexPageNumber, - &page->cp_indexPage); - if (result != UDS_SUCCESS) { - logWarning("Error initialize chapter index page"); - cancelPageInCache(volume->pageCache, physicalPage, page); - return result; - } - - result = putPageInCache(volume->pageCache, physicalPage, page); - if (result != UDS_SUCCESS) { - logWarning("Error putting page %u in cache", physicalPage); - cancelPageInCache(volume->pageCache, physicalPage, page); - return result; - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -int writeIndexPages(Volume *volume, - int physicalPage, - OpenChapterIndex *chapterIndex, - byte **pages) -{ - Geometry *geometry = volume->geometry; - unsigned int physicalChapterNumber - = mapToPhysicalChapter(geometry, chapterIndex->virtualChapterNumber); - unsigned int deltaListNumber = 0; - - unsigned int indexPageNumber; - for (indexPageNumber = 0; - indexPageNumber < geometry->indexPagesPerChapter; - indexPageNumber++) { - int result = prepareToWriteVolumePage(&volume->volumeStore, - physicalPage + indexPageNumber, - &volume->scratchPage); - if (result != UDS_SUCCESS) { - return logWarningWithStringError(result, "failed to prepare index page"); - } - - // Pack as many delta lists into the index page as will fit. - unsigned int listsPacked; - bool lastPage = ((indexPageNumber + 1) == geometry->indexPagesPerChapter); - result = packOpenChapterIndexPage(chapterIndex, - getPageData(&volume->scratchPage), - deltaListNumber, lastPage, &listsPacked); - if (result != UDS_SUCCESS) { - return logWarningWithStringError(result, "failed to pack index page"); - } - - result = writeVolumePage(&volume->volumeStore, - physicalPage + indexPageNumber, - &volume->scratchPage); - if (result != UDS_SUCCESS) { - return logWarningWithStringError(result, - "failed to write chapter index page"); - } - - if (pages != NULL) { - memcpy(pages[indexPageNumber], getPageData(&volume->scratchPage), - geometry->bytesPerPage); - } - - // Tell the index page map the list number of the last delta list that was - // packed into the index page. - if (listsPacked == 0) { - logDebug("no delta lists packed on chapter %u page %u", - physicalChapterNumber, indexPageNumber); - } else { - deltaListNumber += listsPacked; - } - result = updateIndexPageMap(volume->indexPageMap, - chapterIndex->virtualChapterNumber, - physicalChapterNumber, - indexPageNumber, deltaListNumber - 1); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, - "failed to update index page map"); - } - - // Donate the page data for the index page to the page cache. 
- lockMutex(&volume->readThreadsMutex); - result = donateIndexPageLocked(volume, physicalChapterNumber, - indexPageNumber, &volume->scratchPage); - unlockMutex(&volume->readThreadsMutex); - if (result != UDS_SUCCESS) { - return result; - } - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int writeRecordPages(Volume *volume, - int physicalPage, - const UdsChunkRecord records[], - byte **pages) -{ - Geometry *geometry = volume->geometry; - // Skip over the index pages, which come before the record pages - physicalPage += geometry->indexPagesPerChapter; - // The record array from the open chapter is 1-based. - const UdsChunkRecord *nextRecord = &records[1]; - - unsigned int recordPageNumber; - for (recordPageNumber = 0; - recordPageNumber < geometry->recordPagesPerChapter; - recordPageNumber++) { - int result = prepareToWriteVolumePage(&volume->volumeStore, - physicalPage + recordPageNumber, - &volume->scratchPage); - if (result != UDS_SUCCESS) { - return logWarningWithStringError(result, - "failed to prepare record page"); - } - - // Sort the next page of records and copy them to the record page as a - // binary tree stored in heap order. - result = encodeRecordPage(volume, nextRecord, - getPageData(&volume->scratchPage)); - if (result != UDS_SUCCESS) { - return logWarningWithStringError(result, - "failed to encode record page %u", - recordPageNumber); - } - nextRecord += geometry->recordsPerPage; - - result = writeVolumePage(&volume->volumeStore, - physicalPage + recordPageNumber, - &volume->scratchPage); - if (result != UDS_SUCCESS) { - return logWarningWithStringError(result, - "failed to write chapter record page"); - } - - if (pages != NULL) { - memcpy(pages[recordPageNumber], getPageData(&volume->scratchPage), - geometry->bytesPerPage); - } - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int writeChapter(Volume *volume, - OpenChapterIndex *chapterIndex, - const UdsChunkRecord records[]) -{ - // Determine the position of the virtual chapter in the volume file. - Geometry *geometry = volume->geometry; - unsigned int physicalChapterNumber - = mapToPhysicalChapter(geometry, chapterIndex->virtualChapterNumber); - int physicalPage = mapToPhysicalPage(geometry, physicalChapterNumber, 0); - - // Pack and write the delta chapter index pages to the volume. - int result = writeIndexPages(volume, physicalPage, chapterIndex, NULL); - if (result != UDS_SUCCESS) { - return result; - } - // Sort and write the record pages to the volume. - result = writeRecordPages(volume, physicalPage, records, NULL); - if (result != UDS_SUCCESS) { - return result; - } - releaseVolumePage(&volume->scratchPage); - // Flush the data to permanent storage. 
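[Editor's note: one detail of writeRecordPages above is easy to miss. The open chapter's record array is 1-based and nextRecord advances by recordsPerPage for each page, so record page r presumably covers records[1 + r * recordsPerPage] through records[(r + 1) * recordsPerPage]; with a hypothetical 256 records per page, page 0 encodes records[1..256] and page 1 encodes records[257..512], which is consistent with the array passed to writeChapter being one slot larger than the number of records.]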
- return syncVolumeStore(&volume->volumeStore); -} - -/**********************************************************************/ -size_t getCacheSize(Volume *volume) -{ - size_t size = getPageCacheSize(volume->pageCache); - if (isSparse(volume->geometry)) { - size += getSparseCacheMemorySize(volume->sparseCache); - } - return size; -} - -/**********************************************************************/ -static int probeChapter(Volume *volume, - unsigned int chapterNumber, - uint64_t *virtualChapterNumber) -{ - const Geometry *geometry = volume->geometry; - unsigned int expectedListNumber = 0; - uint64_t lastVCN = UINT64_MAX; - - prefetchVolumePages(&volume->volumeStore, - mapToPhysicalPage(geometry, chapterNumber, 0), - geometry->indexPagesPerChapter); - - unsigned int i; - for (i = 0; i < geometry->indexPagesPerChapter; ++i) { - DeltaIndexPage *page; - int result = getPage(volume, chapterNumber, i, CACHE_PROBE_INDEX_FIRST, - NULL, &page); - if (result != UDS_SUCCESS) { - return result; - } - - uint64_t vcn = page->virtualChapterNumber; - if (lastVCN == UINT64_MAX) { - lastVCN = vcn; - } else if (vcn != lastVCN) { - logError("inconsistent chapter %u index page %u: expected vcn %" - PRIu64 ", got vcn %llu", - chapterNumber, i, lastVCN, vcn); - return UDS_CORRUPT_COMPONENT; - } - - if (expectedListNumber != page->lowestListNumber) { - logError("inconsistent chapter %u index page %u: expected list number %u" - ", got list number %u", - chapterNumber, i, expectedListNumber, page->lowestListNumber); - return UDS_CORRUPT_COMPONENT; - } - expectedListNumber = page->highestListNumber + 1; - - result = validateChapterIndexPage(page, geometry); - if (result != UDS_SUCCESS) { - return result; - } - } - - if (lastVCN == UINT64_MAX) { - logError("no chapter %u virtual chapter number determined", chapterNumber); - return UDS_CORRUPT_COMPONENT; - } - if (chapterNumber != lastVCN % geometry->chaptersPerVolume) { - logError("chapter %u vcn %llu is out of phase (%u)", - chapterNumber, lastVCN, geometry->chaptersPerVolume); - return UDS_CORRUPT_COMPONENT; - } - *virtualChapterNumber = lastVCN; - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int probeWrapper(void *aux, - unsigned int chapterNumber, - uint64_t *virtualChapterNumber) -{ - Volume *volume = aux; - int result = probeChapter(volume, chapterNumber, virtualChapterNumber); - if ((result == UDS_CORRUPT_COMPONENT) || (result == UDS_CORRUPT_DATA)) { - *virtualChapterNumber = UINT64_MAX; - return UDS_SUCCESS; - } - return result; -} - -/**********************************************************************/ -static int findRealEndOfVolume(Volume *volume, - unsigned int limit, - unsigned int *limitPtr) -{ - /* - * Start checking from the end of the volume. As long as we hit corrupt - * data, start skipping larger and larger amounts until we find real data. - * If we find real data, reduce the span and try again until we find - * the exact boundary. - */ - unsigned int span = 1; - unsigned int tries = 0; - while (limit > 0) { - unsigned int chapter = (span > limit) ? 
0 : limit - span; - uint64_t vcn = 0; - int result = probeChapter(volume, chapter, &vcn); - if (result == UDS_SUCCESS) { - if (span == 1) { - break; - } - span /= 2; - tries = 0; - } else if (result == UDS_CORRUPT_COMPONENT) { - limit = chapter; - if (++tries > 1) { - span *= 2; - } - } else { - return logErrorWithStringError(result, "cannot determine end of volume"); - } - } - - if (limitPtr != NULL) { - *limitPtr = limit; - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -int findVolumeChapterBoundaries(Volume *volume, - uint64_t *lowestVCN, - uint64_t *highestVCN, - bool *isEmpty) -{ - unsigned int chapterLimit = volume->geometry->chaptersPerVolume; - - int result = findRealEndOfVolume(volume, chapterLimit, &chapterLimit); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, "cannot find end of volume"); - } - - if (chapterLimit == 0) { - *lowestVCN = 0; - *highestVCN = 0; - *isEmpty = true; - return UDS_SUCCESS; - } - - *isEmpty = false; - return findVolumeChapterBoundariesImpl(chapterLimit, MAX_BAD_CHAPTERS, - lowestVCN, highestVCN, probeWrapper, - volume); -} - -/**********************************************************************/ -int findVolumeChapterBoundariesImpl(unsigned int chapterLimit, - unsigned int maxBadChapters, - uint64_t *lowestVCN, - uint64_t *highestVCN, - int (*probeFunc)(void *aux, - unsigned int chapter, - uint64_t *vcn), - void *aux) -{ - if (chapterLimit == 0) { - *lowestVCN = 0; - *highestVCN = 0; - return UDS_SUCCESS; - } - - /* - * This method assumes there is at most one run of contiguous bad chapters - * caused by unflushed writes. Either the bad spot is at the beginning and - * end, or somewhere in the middle. Wherever it is, the highest and lowest - * VCNs are adjacent to it. Otherwise the volume is cleanly saved and - * somewhere in the middle of it the highest VCN immediately preceeds the - * lowest one. - */ - - uint64_t firstVCN = UINT64_MAX; - - // doesn't matter if this results in a bad spot (UINT64_MAX) - int result = (*probeFunc)(aux, 0, &firstVCN); - if (result != UDS_SUCCESS) { - return UDS_SUCCESS; - } - - /* - * Binary search for end of the discontinuity in the monotonically - * increasing virtual chapter numbers; bad spots are treated as a span of - * UINT64_MAX values. In effect we're searching for the index of the - * smallest value less than firstVCN. In the case we go off the end it means - * that chapter 0 has the lowest vcn. - */ - - unsigned int leftChapter = 0; - unsigned int rightChapter = chapterLimit; - - while (leftChapter < rightChapter) { - unsigned int chapter = (leftChapter + rightChapter) / 2; - uint64_t probeVCN; - - result = (*probeFunc)(aux, chapter, &probeVCN); - if (result != UDS_SUCCESS) { - return result; - } - if (firstVCN <= probeVCN) { - leftChapter = chapter + 1; - } else { - rightChapter = chapter; - } - } - - uint64_t lowest = UINT64_MAX; - uint64_t highest = UINT64_MAX; - - result = ASSERT(leftChapter == rightChapter, "leftChapter == rightChapter"); - if (result != UDS_SUCCESS) { - return result; - } - - leftChapter %= chapterLimit; // in case we're at the end - - // At this point, leftChapter is the chapter with the lowest virtual chapter - // number. 
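[Editor's note: a worked example may make the comment above concrete; the numbers are invented but respect pcn == vcn % chaptersPerVolume. With chaptersPerVolume = 8 and physical chapters 0..7 holding virtual chapters 16, 17, 18, 11, 12, 13, 14, 15, firstVCN is 16; the binary search narrows to leftChapter = 3 (vcn 11, the lowest), and the backward scan that follows starts at physical chapter 2 and immediately finds vcn 18, the highest.]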
- - result = (*probeFunc)(aux, leftChapter, &lowest); - if (result != UDS_SUCCESS) { - return result; - } - - result = ASSERT((lowest != UINT64_MAX), "invalid lowest chapter"); - if (result != UDS_SUCCESS) { - return result; - } - - // We now circularly scan backwards, moving over any bad chapters until we - // find the chapter with the highest vcn (the first good chapter we - // encounter). - - unsigned int badChapters = 0; - - for (;;) { - rightChapter = (rightChapter + chapterLimit - 1) % chapterLimit; - result = (*probeFunc)(aux, rightChapter, &highest); - if (result != UDS_SUCCESS) { - return result; - } - if (highest != UINT64_MAX) { - break; - } - if (++badChapters >= maxBadChapters) { - logError("too many bad chapters in volume: %u", badChapters); - return UDS_CORRUPT_COMPONENT; - } - } - - *lowestVCN = lowest; - *highestVCN = highest; - return UDS_SUCCESS; -} - -/** - * Allocate a volume. - * - * @param config The configuration to use - * @param layout The index layout - * @param readQueueMaxSize The maximum size of the read queue - * @param zoneCount The number of zones to use - * @param newVolume A pointer to hold the new volume - * - * @return UDS_SUCCESS or an error code - **/ -__attribute__((warn_unused_result)) -static int allocateVolume(const Configuration *config, - IndexLayout *layout, - unsigned int readQueueMaxSize, - unsigned int zoneCount, - Volume **newVolume) -{ - Volume *volume; - int result = ALLOCATE(1, Volume, "volume", &volume); - if (result != UDS_SUCCESS) { - return result; - } - volume->nonce = getVolumeNonce(layout); - // It is safe to call freeVolume now to clean up and close the volume - - result = copyGeometry(config->geometry, &volume->geometry); - if (result != UDS_SUCCESS) { - freeVolume(volume); - return logWarningWithStringError(result, - "failed to allocate geometry: error"); - } - - // Need a buffer for each entry in the page cache - unsigned int reservedBuffers - = config->cacheChapters * config->geometry->recordPagesPerChapter; - // And a buffer for the chapter writer - reservedBuffers += 1; - // And a buffer for each entry in the sparse cache - if (isSparse(volume->geometry)) { - reservedBuffers - += config->cacheChapters * config->geometry->indexPagesPerChapter; - } - result = openVolumeStore(&volume->volumeStore, layout, reservedBuffers, - config->geometry->bytesPerPage); - if (result != UDS_SUCCESS) { - freeVolume(volume); - return result; - } - result = initializeVolumePage(config->geometry, &volume->scratchPage); - if (result != UDS_SUCCESS) { - freeVolume(volume); - return result; - } - - result = makeRadixSorter(config->geometry->recordsPerPage, - &volume->radixSorter); - if (result != UDS_SUCCESS) { - freeVolume(volume); - return result; - } - - result = ALLOCATE(config->geometry->recordsPerPage, const UdsChunkRecord *, - "record pointers", &volume->recordPointers); - if (result != UDS_SUCCESS) { - freeVolume(volume); - return result; - } - - if (isSparse(volume->geometry)) { - result = makeSparseCache(volume->geometry, config->cacheChapters, - zoneCount, &volume->sparseCache); - if (result != UDS_SUCCESS) { - freeVolume(volume); - return result; - } - } - result = makePageCache(volume->geometry, config->cacheChapters, - readQueueMaxSize, zoneCount, &volume->pageCache); - if (result != UDS_SUCCESS) { - freeVolume(volume); - return result; - } - result = makeIndexPageMap(volume->geometry, &volume->indexPageMap); - if (result != UDS_SUCCESS) { - freeVolume(volume); - return result; - } - - *newVolume = volume; - return UDS_SUCCESS; -} - 
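[Editor's note: the buffer reservation in allocateVolume is worth a quick worked example with made-up numbers. For a dense index with cacheChapters = 7 and 233 record pages per chapter, the code above reserves 7 * 233 + 1 = 1632 buffers, the extra one being for the chapter writer; a sparse geometry with 26 index pages per chapter would reserve 7 * 26 = 182 more for the sparse cache.]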
-/**********************************************************************/ -int makeVolume(const Configuration *config, - IndexLayout *layout, - const struct uds_parameters *userParams, - unsigned int readQueueMaxSize, - unsigned int zoneCount, - Volume **newVolume) -{ - unsigned int volumeReadThreads = getReadThreads(userParams); - - if (readQueueMaxSize <= volumeReadThreads) { - logError("Number of read threads must be smaller than read queue"); - return UDS_INVALID_ARGUMENT; - } - - Volume *volume = NULL; - int result = allocateVolume(config, layout, readQueueMaxSize, zoneCount, - &volume); - if (result != UDS_SUCCESS) { - return result; - } - result = initMutex(&volume->readThreadsMutex); - if (result != UDS_SUCCESS) { - freeVolume(volume); - return result; - } - result = initCond(&volume->readThreadsReadDoneCond); - if (result != UDS_SUCCESS) { - freeVolume(volume); - return result; - } - result = initCond(&volume->readThreadsCond); - if (result != UDS_SUCCESS) { - freeVolume(volume); - return result; - } - - // Start the reader threads. If this allocation succeeds, freeVolume knows - // that it needs to try and stop those threads. - result = ALLOCATE(volumeReadThreads, Thread, "reader threads", - &volume->readerThreads); - if (result != UDS_SUCCESS) { - freeVolume(volume); - return result; - } - unsigned int i; - for (i = 0; i < volumeReadThreads; i++) { - result = createThread(readThreadFunction, (void *) volume, "reader", - &volume->readerThreads[i]); - if (result != UDS_SUCCESS) { - freeVolume(volume); - return result; - } - // We only stop as many threads as actually got started. - volume->numReadThreads = i + 1; - } - - *newVolume = volume; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void freeVolume(Volume *volume) -{ - if (volume == NULL) { - return; - } - - // If readerThreads is NULL, then we haven't set up the reader threads. - if (volume->readerThreads != NULL) { - // Stop the reader threads. It is ok if there aren't any of them. - lockMutex(&volume->readThreadsMutex); - volume->readerState |= READER_STATE_EXIT; - broadcastCond(&volume->readThreadsCond); - unlockMutex(&volume->readThreadsMutex); - unsigned int i; - for (i = 0; i < volume->numReadThreads; i++) { - joinThreads(volume->readerThreads[i]); - } - FREE(volume->readerThreads); - volume->readerThreads = NULL; - } - - // Must close the volume store AFTER freeing the scratch page and the caches - destroyVolumePage(&volume->scratchPage); - freePageCache(volume->pageCache); - freeSparseCache(volume->sparseCache); - closeVolumeStore(&volume->volumeStore); - - destroyCond(&volume->readThreadsCond); - destroyCond(&volume->readThreadsReadDoneCond); - destroyMutex(&volume->readThreadsMutex); - freeIndexPageMap(volume->indexPageMap); - freeRadixSorter(volume->radixSorter); - FREE(volume->geometry); - FREE(volume->recordPointers); - FREE(volume); -} diff --git a/uds/volume.h b/uds/volume.h deleted file mode 100644 index 82aef00..0000000 --- a/uds/volume.h +++ /dev/null @@ -1,426 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/volume.h#14 $ - */ - -#ifndef VOLUME_H -#define VOLUME_H - -#include "cacheCounters.h" -#include "common.h" -#include "chapterIndex.h" -#include "indexConfig.h" -#include "indexLayout.h" -#include "indexPageMap.h" -#include "pageCache.h" -#include "request.h" -#include "sparseCache.h" -#include "uds.h" -#include "util/radixSort.h" -#include "volumeStore.h" - -typedef enum { - READER_STATE_RUN = 1, - READER_STATE_EXIT = 2, - READER_STATE_STOP = 4 -} ReaderState; - -typedef enum indexLookupMode { - /* Always do lookups in all chapters normally. */ - LOOKUP_NORMAL, - /* - * Don't do lookups in closed chapters; assume records not in the - * open chapter are always new. You don't want this normally; it's - * for programs like albfill. (Even then, with multiple runs using - * the same tag, we may actually duplicate older records, but if - * it's in a separate chapter it won't really matter.) - */ - LOOKUP_CURRENT_CHAPTER_ONLY, - /* - * Only do a subset of lookups needed when rebuilding an index. - * This cannot be set externally. - */ - LOOKUP_FOR_REBUILD -} IndexLookupMode; - -typedef struct volume { - /* The layout of the volume */ - Geometry *geometry; - /* The configuration of the volume */ - Configuration *config; - /* The access to the volume's backing store */ - struct volume_store volumeStore; - /* A single page used for writing to the volume */ - struct volume_page scratchPage; - /* The nonce used to save the volume */ - uint64_t nonce; - /* A single page's records, for sorting */ - const UdsChunkRecord **recordPointers; - /* For sorting record pages */ - RadixSorter *radixSorter; - /* The sparse chapter index cache */ - SparseCache *sparseCache; - /* The page cache */ - PageCache *pageCache; - /* The index page map maps delta list numbers to index page numbers */ - IndexPageMap *indexPageMap; - /* Mutex to sync between read threads and index thread */ - Mutex readThreadsMutex; - /* Condvar to indicate when read threads should start working */ - CondVar readThreadsCond; - /* Condvar to indicate when a read thread has finished a read */ - CondVar readThreadsReadDoneCond; - /* Threads to read data from disk */ - Thread *readerThreads; - /* Number of threads busy with reads */ - unsigned int busyReaderThreads; - /* The state of the reader threads */ - ReaderState readerState; - /* The lookup mode for the index */ - IndexLookupMode lookupMode; - /* Number of read threads to use (run-time parameter) */ - unsigned int numReadThreads; -} Volume; - -/** - * Create a volume. - * - * @param config The configuration to use. - * @param layout The index layout - * @param userParams The index session parameters. If NULL, the default - * session parameters will be used. - * @param readQueueMaxSize The maximum size of the read queue. - * @param zoneCount The number of zones to use. - * @param newVolume A pointer to hold a pointer to the new volume. 
- * - * @return UDS_SUCCESS or an error code - **/ -int makeVolume(const Configuration *config, - IndexLayout *layout, - const struct uds_parameters *userParams, - unsigned int readQueueMaxSize, - unsigned int zoneCount, - Volume **newVolume) - __attribute__((warn_unused_result)); - -/** - * Clean up a volume and its memory. - * - * @param volume The volume to destroy. - **/ -void freeVolume(Volume *volume); - -/** - * Enqueue a page read. - * - * @param volume the volume - * @param request the request to waiting on the read - * @param physicalPage the page number to read - * - * @return UDS_QUEUED if successful, or an error code - **/ -int enqueuePageRead(Volume *volume, Request *request, int physicalPage) - __attribute__((warn_unused_result)); - -/** - * Find the lowest and highest contiguous chapters and determine their - * virtual chapter numbers. - * - * @param [in] volume The volume to probe. - * @param [out] lowestVCN Pointer for lowest virtual chapter number. - * @param [out] highestVCN Pointer for highest virtual chapter number. - * @param [out] isEmpty Pointer to a bool indicating whether or not the - * volume is empty. - * - * @return UDS_SUCCESS, or an error code. - * - * @note This routine does something similar to a binary search to find - * the location in the volume file where the discontinuity of - * chapter numbers occurs. In a good save, the discontinuity is - * a sharp cliff, but if write failures occured during saving - * there may be one or more chapters which are partially written. - * - * @note This method takes advantage of the fact that the physical - * chapter number in which the index pages are found should have - * headers which state that the virtual chapter number are all - * identical and maintain the invariant that - * pcn == vcn % chaptersPerVolume. - **/ -int findVolumeChapterBoundaries(Volume *volume, - uint64_t *lowestVCN, - uint64_t *highestVCN, - bool *isEmpty) - __attribute__((warn_unused_result)); - -/** - * Find any matching metadata for the given name within a given physical - * chapter. - * - * @param volume The volume. - * @param request The request originating the search. - * @param name The block name of interest. - * @param virtualChapter The number of the chapter to search. - * @param metadata The old metadata for the name. - * @param found A pointer which will be set to - * true if a match was found. - * - * @return UDS_SUCCESS or an error - **/ -int searchVolumePageCache(Volume *volume, - Request *request, - const UdsChunkName *name, - uint64_t virtualChapter, - UdsChunkData *metadata, - bool *found) - __attribute__((warn_unused_result)); - -/** - * Fetch a record page from the cache or read it from the volume and search it - * for a chunk name. - * - * If a match is found, optionally returns the metadata from the stored - * record. If the requested record page is not cached, the page fetch may be - * asynchronously completed on the slow lane, in which case UDS_QUEUED will be - * returned and the request will be requeued for continued processing after - * the page is read and added to the cache. 
- * - * @param volume the volume containing the record page to search - * @param request the request originating the search (may be NULL for - * a direct query from volume replay) - * @param name the name of the block or chunk - * @param chapter the chapter to search - * @param recordPageNumber the record page number of the page to search - * @param duplicate an array in which to place the metadata of the - * duplicate, if one was found - * @param found a (bool *) which will be set to true if the chunk - * was found - * - * @return UDS_SUCCESS, UDS_QUEUED, or an error code - **/ -int searchCachedRecordPage(Volume *volume, - Request *request, - const UdsChunkName *name, - unsigned int chapter, - int recordPageNumber, - UdsChunkData *duplicate, - bool *found) - __attribute__((warn_unused_result)); - -/** - * Forget the contents of a chapter. Invalidates any cached state for the - * specified chapter. - * - * @param volume the volume containing the chapter - * @param chapter the virtual chapter number - * @param reason the reason for invalidation - * - * @return UDS_SUCCESS or an error code - **/ -int forgetChapter(Volume *volume, - uint64_t chapter, - InvalidationReason reason) - __attribute__((warn_unused_result)); - -/** - * Write a chapter's worth of index pages to a volume - * - * @param volume the volume containing the chapter - * @param physicalPage the page number in the volume for the chapter - * @param chapterIndex the populated delta chapter index - * @param pages pointer to array of page pointers. Used only in testing - * to return what data has been written to disk. - * - * @return UDS_SUCCESS or an error code - **/ -int writeIndexPages(Volume *volume, - int physicalPage, - OpenChapterIndex *chapterIndex, - byte **pages) -__attribute__((warn_unused_result)); - -/** - * Write a chapter's worth of record pages to a volume - * - * @param volume the volume containing the chapter - * @param physicalPage the page number in the volume for the chapter - * @param records a 1-based array of chunk records in the chapter - * @param pages pointer to array of page pointers. Used only in testing - * to return what data has been written to disk. - * - * @return UDS_SUCCESS or an error code - **/ -int writeRecordPages(Volume *volume, - int physicalPage, - const UdsChunkRecord records[], - byte **pages) -__attribute__((warn_unused_result)); - -/** - * Write the index and records from the most recently filled chapter to the - * volume. - * - * @param volume the volume containing the chapter - * @param chapterIndex the populated delta chapter index - * @param records a 1-based array of chunk records in the chapter - * - * @return UDS_SUCCESS or an error code - **/ -int writeChapter(Volume *volume, - OpenChapterIndex *chapterIndex, - const UdsChunkRecord records[]) - __attribute__((warn_unused_result)); - -/** - * Read all the index pages for a chapter from the volume and initialize an - * array of ChapterIndexPages to represent them. 
- * - * @param [in] volume the volume containing the chapter - * @param [in] virtualChapter the virtual chapter number of the index to read - * @param [out] volumePages an array to receive the raw index page data - * @param [out] indexPages an array of ChapterIndexPages to initialize - * - * @return UDS_SUCCESS or an error code - **/ -int readChapterIndexFromVolume(const Volume *volume, - uint64_t virtualChapter, - struct volume_page volumePages[], - DeltaIndexPage indexPages[]) - __attribute__((warn_unused_result)); - -/** - * Retrieve a page either from the cache (if we can) or from disk. If a read - * from disk is required, this is done immediately in the same thread and the - * page is then returned. - * - * The caller of this function must be holding the volume read mutex before - * calling this function. - * - * As a side-effect, the retrieved page will become the most recent page in - * the cache. - * - * This function is only exposed for the use of unit tests. - * - * @param volume The volume containing the page - * @param request The request originating the search - * @param physicalPage The physical page number - * @param probeType The type of cache access being done - * @param entryPtr A pointer to hold the retrieved cached entry - * - * @return UDS_SUCCESS or an error code - **/ -int getPageLocked(Volume *volume, - Request *request, - unsigned int physicalPage, - CacheProbeType probeType, - CachedPage **entryPtr) - __attribute__((warn_unused_result)); - -/** - * Retrieve a page either from the cache (if we can) or from disk. If a read - * from disk is required, the read request is enqueued for later processing - * by another thread. When that thread finally reads the page into the cache, - * a callback function is called to inform the caller the read is complete. - * - * The caller of this function should not be holding the volume read lock. - * Instead, the caller must call beingPendingSearch() for the given zone - * the request is being processed in. That state will be maintained or - * restored when the call returns, at which point the caller should call - * endPendingSearch(). - * - * As a side-effect, the retrieved page will become the most recent page in - * the cache. - * - * This function is only exposed for the use of unit tests. - * - * @param volume The volume containing the page - * @param request The request originating the search - * @param physicalPage The physical page number - * @param probeType The type of cache access being done - * @param entryPtr A pointer to hold the retrieved cached entry - * - * @return UDS_SUCCESS or an error code - **/ -int getPageProtected(Volume *volume, - Request *request, - unsigned int physicalPage, - CacheProbeType probeType, - CachedPage **entryPtr) - __attribute__((warn_unused_result)); - -/** - * Retrieve a page either from the cache (if we can) or from disk. If a read - * from disk is required, this is done immediately in the same thread and the - * page is then returned. - * - * The caller of this function must not be holding the volume read lock before - * calling this function. This method will grab that lock and release it - * when it returns. - * - * As a side-effect, the retrieved page will become the most recent page in - * the cache. - * - * This function should only be called by areas of the code that do not use - * multi-threading to access the volume. These include rebuild, volume - * explorer, and certain unit tests. 
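[Editor's note: as a quick illustration of the unsynchronized entry point described above, here is a sketch of how a single-threaded caller such as a rebuild pass might fetch one chapter index page; the chapter and page numbers are arbitrary and the error handling is abbreviated.]

    byte *data = NULL;
    DeltaIndexPage *indexPage = NULL;
    int result = getPage(volume, /* chapter */ 5, /* pageNumber */ 0,
                         CACHE_PROBE_INDEX_FIRST, &data, &indexPage);
    if (result == UDS_SUCCESS) {
      /* data is the raw page; indexPage is the parsed chapter index page. */
    }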
- * - * @param volume The volume containing the page - * @param chapter The number of the chapter containing the page - * @param pageNumber The number of the page - * @param probeType The type of cache access being done - * @param dataPtr Pointer to hold the retrieved page, NULL if not wanted - * @param indexPagePtr Pointer to hold the retrieved chapter index page, or - * NULL if not wanted - * - * @return UDS_SUCCESS or an error code - **/ -int getPage(Volume *volume, - unsigned int chapter, - unsigned int pageNumber, - CacheProbeType probeType, - byte **dataPtr, - DeltaIndexPage **indexPagePtr) - __attribute__((warn_unused_result)); - -/**********************************************************************/ -size_t getCacheSize(Volume *volume) __attribute__((warn_unused_result)); - -/**********************************************************************/ -int findVolumeChapterBoundariesImpl(unsigned int chapterLimit, - unsigned int maxBadChapters, - uint64_t *lowestVCN, - uint64_t *highestVCN, - int (*probeFunc)(void *aux, - unsigned int chapter, - uint64_t *vcn), - void *aux) - __attribute__((warn_unused_result)); - -/** - * Map a chapter number and page number to a phsical volume page number. - * - * @param geometry the layout of the volume - * @param chapter the chapter number of the desired page - * @param page the chapter page number of the desired page - * - * @return the physical page number - **/ -int mapToPhysicalPage(const Geometry *geometry, int chapter, int page) - __attribute__((warn_unused_result)); - -#endif /* VOLUME_H */ diff --git a/uds/volumeStore.c b/uds/volumeStore.c deleted file mode 100644 index 8b9f820..0000000 --- a/uds/volumeStore.c +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/uds-releases/jasper/src/uds/volumeStore.c#2 $ - */ - -#include "geometry.h" -#include "indexLayout.h" -#include "logger.h" -#include "uds-error.h" -#include "volumeStore.h" - - -/*****************************************************************************/ -void closeVolumeStore(struct volume_store *volumeStore) -{ -#ifdef __KERNEL__ - if (volumeStore->vs_client != NULL) { - dm_bufio_client_destroy(volumeStore->vs_client); - volumeStore->vs_client = NULL; - } -#else - if (volumeStore->vs_region != NULL) { - putIORegion(volumeStore->vs_region); - volumeStore->vs_region = NULL; - } -#endif -} - -/*****************************************************************************/ -void destroyVolumePage(struct volume_page *volumePage) -{ -#ifdef __KERNEL__ - releaseVolumePage(volumePage); -#else - FREE(volumePage->vp_data); - volumePage->vp_data = NULL; -#endif -} - -/*****************************************************************************/ -int initializeVolumePage(const struct geometry *geometry, - struct volume_page *volumePage) -{ -#ifdef __KERNEL__ - volumePage->vp_buffer = NULL; - return UDS_SUCCESS; -#else - return ALLOCATE_IO_ALIGNED(geometry->bytesPerPage, byte, __func__, - &volumePage->vp_data); -#endif -} - -/*****************************************************************************/ -int openVolumeStore(struct volume_store *volumeStore, - IndexLayout *layout, - unsigned int reservedBuffers __attribute__((unused)), - size_t bytesPerPage) -{ -#ifdef __KERNEL__ - return openVolumeBufio(layout, bytesPerPage, reservedBuffers, - &volumeStore->vs_client); -#else - volumeStore->vs_bytesPerPage = bytesPerPage; - return openVolumeRegion(layout, &volumeStore->vs_region); -#endif -} - -/*****************************************************************************/ -void prefetchVolumePages(const struct volume_store *vs __attribute__((unused)), - unsigned int physicalPage __attribute__((unused)), - unsigned int pageCount __attribute__((unused))) -{ -#ifdef __KERNEL__ - dm_bufio_prefetch(vs->vs_client, physicalPage, pageCount); -#else - // Nothing to do in user mode -#endif -} - -/*****************************************************************************/ -int prepareToWriteVolumePage(const struct volume_store *volumeStore - __attribute__((unused)), - unsigned int physicalPage - __attribute__((unused)), - struct volume_page *volumePage - __attribute__((unused))) -{ -#ifdef __KERNEL__ - releaseVolumePage(volumePage); - struct dm_buffer *buffer = NULL; - byte *data = dm_bufio_new(volumeStore->vs_client, physicalPage, &buffer); - if (IS_ERR(data)) { - return -PTR_ERR(data); - } - volumePage->vp_buffer = buffer; -#else - // Nothing to do in user mode -#endif - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int readVolumePage(const struct volume_store *volumeStore, - unsigned int physicalPage, - struct volume_page *volumePage) -{ -#ifdef __KERNEL__ - releaseVolumePage(volumePage); - byte *data = dm_bufio_read(volumeStore->vs_client, physicalPage, - &volumePage->vp_buffer); - if (IS_ERR(data)) { - return logWarningWithStringError(-PTR_ERR(data), - "error reading physical page %u", - physicalPage); - } -#else - off_t offset = (off_t) physicalPage * volumeStore->vs_bytesPerPage; - int result = readFromRegion(volumeStore->vs_region, offset, - getPageData(volumePage), - volumeStore->vs_bytesPerPage, NULL); - if (result != UDS_SUCCESS) { - return logWarningWithStringError(result, - "error reading physical page %u", - 
physicalPage); - } -#endif - return UDS_SUCCESS; -} - -/*****************************************************************************/ -void releaseVolumePage(struct volume_page *volumePage __attribute__((unused))) -{ -#ifdef __KERNEL__ - if (volumePage->vp_buffer != NULL) { - dm_bufio_release(volumePage->vp_buffer); - volumePage->vp_buffer = NULL; - } -#else - // Nothing to do in user mode -#endif -} - -/*****************************************************************************/ -void swapVolumePages(struct volume_page *volumePage1, - struct volume_page *volumePage2) -{ - struct volume_page temp = *volumePage1; - *volumePage1 = *volumePage2; - *volumePage2 = temp; -} - -/*****************************************************************************/ -int syncVolumeStore(const struct volume_store *volumeStore) -{ -#ifdef __KERNEL__ - int result = -dm_bufio_write_dirty_buffers(volumeStore->vs_client); -#else - int result = syncRegionContents(volumeStore->vs_region); -#endif - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, "cannot sync chapter to volume"); - } - return UDS_SUCCESS; -} - -/*****************************************************************************/ -int writeVolumePage(const struct volume_store *volumeStore, - unsigned int physicalPage, - struct volume_page *volumePage) -{ -#ifdef __KERNEL__ - dm_bufio_mark_buffer_dirty(volumePage->vp_buffer); - return UDS_SUCCESS; -#else - off_t offset = (off_t) physicalPage * volumeStore->vs_bytesPerPage; - return writeToRegion(volumeStore->vs_region, offset, getPageData(volumePage), - volumeStore->vs_bytesPerPage, - volumeStore->vs_bytesPerPage); -#endif -} diff --git a/uds/volumeStore.h b/uds/volumeStore.h deleted file mode 100644 index f475427..0000000 --- a/uds/volumeStore.h +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/volumeStore.h#2 $ - */ - -#ifndef VOLUME_STORE_H -#define VOLUME_STORE_H - -#include "common.h" -#include "compiler.h" -#include "memoryAlloc.h" - -#ifdef __KERNEL__ -#include -#else -#include "ioRegion.h" -#endif - -struct geometry; -struct indexLayout; - - -struct volume_store { -#ifdef __KERNEL__ - struct dm_bufio_client *vs_client; -#else - IORegion *vs_region; - size_t vs_bytesPerPage; -#endif -}; - - -struct volume_page { -#ifdef __KERNEL__ - struct dm_buffer *vp_buffer; -#else - byte *vp_data; -#endif -}; - -/** - * Close a volume store. - * - * @param volumeStore The volume store - **/ -void closeVolumeStore(struct volume_store *volumeStore); - -/** - * Uninitialize a volume page buffer. - * - * @param volumePage The volume page buffer - **/ -void destroyVolumePage(struct volume_page *volumePage); - -/** - * Get a pointer to the data contained in a volume page buffer. 
- * - * @param volumePage The volume page buffer - * - * @return the address of the data - **/ -__attribute__((warn_unused_result)) -static INLINE byte *getPageData(const struct volume_page *volumePage) -{ -#ifdef __KERNEL__ - return dm_bufio_get_block_data(volumePage->vp_buffer); -#else - return volumePage->vp_data; -#endif -} - -/** - * Initialize a volume page buffer. - * - * @param geometry The volume geometry - * @param volumePage The volume page buffer - * - * @return UDS_SUCCESS or an error status - **/ -int initializeVolumePage(const struct geometry *geometry, - struct volume_page *volumePage) - __attribute__((warn_unused_result)); - -/** - * Open a volume store. - * - * @param volumeStore The volume store - * @param layout The index layout - * @param reservedBuffers The number of buffers that can be reserved - * @param bytesPerPage The number of bytes in a volume page - **/ -int openVolumeStore(struct volume_store *volumeStore, - struct indexLayout *layout, - unsigned int reservedBuffers, - size_t bytesPerPage) - __attribute__((warn_unused_result)); - -/** - * Prefetch volume pages into memory. - * - * @param volumeStore The volume store - * @param physicalPage The volume page number of the first desired page - * @param pageCount The number of volume pages to prefetch - **/ -void prefetchVolumePages(const struct volume_store *volumeStore, - unsigned int physicalPage, - unsigned int pageCount); - -/** - * Prepare a buffer to write a page to the volume. - * - * @param volumeStore The volume store - * @param physicalPage The volume page number of the desired page - * @param volumePage The volume page buffer - * - * @return UDS_SUCCESS or an error code - **/ -int prepareToWriteVolumePage(const struct volume_store *volumeStore, - unsigned int physicalPage, - struct volume_page *volumePage) - __attribute__((warn_unused_result)); - -/** - * Read a page from a volume store. - * - * @param volumeStore The volume store - * @param physicalPage The volume page number of the desired page - * @param volumePage The volume page buffer - * - * @return UDS_SUCCESS or an error code - **/ -int readVolumePage(const struct volume_store *volumeStore, - unsigned int physicalPage, - struct volume_page *volumePage) - __attribute__((warn_unused_result)); - -/** - * Release a volume page buffer, because it will no longer be accessed before a - * call to readVolumePage or prepareToWriteVolumePage. - * - * @param volumePage The volume page buffer - **/ -void releaseVolumePage(struct volume_page *volumePage); - -/** - * Swap volume pages. This is used to put the contents of a newly written - * index page (in the scratch page) into the page cache. - * - * @param volumePage1 The volume page buffer - * @param volumePage2 The volume page buffer - **/ -void swapVolumePages(struct volume_page *volumePage1, - struct volume_page *volumePage2); - -/** - * Sync the volume store to storage. - * - * @param volumeStore The volume store - * - * @return UDS_SUCCESS or an error code - **/ -int syncVolumeStore(const struct volume_store *volumeStore) - __attribute__((warn_unused_result)); - -/** - * Write a page to a volume store. 
- * - * @param volumeStore The volume store - * @param physicalPage The volume page number of the desired page - * @param volumePage The volume page buffer - * - * @return UDS_SUCCESS or an error code - **/ -int writeVolumePage(const struct volume_store *volumeStore, - unsigned int physicalPage, - struct volume_page *volumePage) - __attribute__((warn_unused_result)); - -#endif /* VOLUME_STORE_H */ diff --git a/uds/zone.c b/uds/zone.c deleted file mode 100644 index cc07674..0000000 --- a/uds/zone.c +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/zone.c#4 $ - */ - -#include "zone.h" - -#include "logger.h" -#include "threads.h" - -/**********************************************************************/ -unsigned int getZoneCount(const struct uds_parameters *userParams) -{ - unsigned int zoneCount = (userParams == NULL) ? 0 : userParams->zone_count; - if (zoneCount == 0) { - zoneCount = getNumCores() / 2; - } - if (zoneCount < 1) { - zoneCount = 1; - } - if (zoneCount > MAX_ZONES) { - zoneCount = MAX_ZONES; - } - logInfo("Using %u indexing zone%s for concurrency.", zoneCount, - zoneCount == 1 ? "" : "s"); - return zoneCount; -} diff --git a/uds/zone.h b/uds/zone.h deleted file mode 100644 index 99daf40..0000000 --- a/uds/zone.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/uds-releases/jasper/src/uds/zone.h#2 $ - */ - -#ifndef ZONE_H -#define ZONE_H - -#include "uds.h" - -enum { - MAX_ZONES = 16, -}; - -/** - * Return the number of zones. - * - * @param userParams the index session parameters. If NULL, the default - * session parameters will be used. 
- * - * @return the number of zones - **/ -unsigned int getZoneCount(const struct uds_parameters *userParams) - __attribute__((warn_unused_result)); - -#endif /* ZONE_H */ diff --git a/vdo/Makefile b/vdo/Makefile deleted file mode 100644 index 816c219..0000000 --- a/vdo/Makefile +++ /dev/null @@ -1,31 +0,0 @@ -VDO_VERSION = 6.2.4.26 - -VDO_VERSION_MAJOR = $(word 1,$(subst ., ,$(VDO_VERSION))) -VDO_VERSION_MINOR = $(word 2,$(subst ., ,$(VDO_VERSION))) -VDO_VERSION_MICRO = $(word 3,$(subst ., ,$(VDO_VERSION))) - -SOURCES = $(addprefix base/,$(notdir $(wildcard $(src)/base/*.c))) -SOURCES += $(addprefix kernel/,$(notdir $(wildcard $(src)/kernel/*.c))) -OBJECTS = $(SOURCES:%.c=%.o) -INCLUDES = -I$(src)/base -I$(src)/kernel -I$(src)/../uds - -EXTRA_CFLAGS = -std=gnu99 \ - -fno-builtin-memset \ - -Werror \ - -Wframe-larger-than=400 \ - -Wno-declaration-after-statement \ - -DVDO_VERSION_MAJOR=$(VDO_VERSION_MAJOR) \ - -DVDO_VERSION_MINOR=$(VDO_VERSION_MINOR) \ - -DVDO_VERSION_MICRO=$(VDO_VERSION_MICRO) \ - -DCURRENT_VERSION=\"$(VDO_VERSION)\" \ - $(INCLUDES) - -CFLAGS_REMOVE_vdoPageCache.o= -std=gnu99 -CFLAGS_REMOVE_vio.o= -std=gnu99 - -CFLAGS_vdoPageCache.o= -std=gnu89 -CFLAGS_vio.o= -std=gnu89 - -obj-m += kvdo.o - -kvdo-objs = $(OBJECTS) diff --git a/vdo/base/actionManager.c b/vdo/base/actionManager.c deleted file mode 100644 index 664131d..0000000 --- a/vdo/base/actionManager.c +++ /dev/null @@ -1,399 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/actionManager.c#9 $ - */ - -#include "actionManager.h" - -#include "memoryAlloc.h" - -#include "adminState.h" -#include "completion.h" -#include "types.h" - -/** An action to be performed in each of a set of zones */ -typedef struct action Action; -struct action { - /** Whether this structure is in use */ - bool inUse; - /** The admin operation associated with this action */ - AdminStateCode operation; - /** - * The method to run on the initiator thread before the action is applied to - * each zone. 
- **/ - ActionPreamble *preamble; - /** The action to be performed in each zone */ - ZoneAction *zoneAction; - /** - * The method to run on the initiator thread after the action has been - * applied to each zone - **/ - ActionConclusion *conclusion; - /** The object to notify when the action is complete */ - VDOCompletion *parent; - /** The action specific context */ - void *context; - /** The action to perform after this one */ - Action *next; -}; - -struct actionManager { - /** The completion for performing actions */ - VDOCompletion completion; - /** The state of this action manager */ - AdminState state; - /** The two action slots*/ - Action actions[2]; - /** The current action slot */ - Action *currentAction; - /** The number of zones in which an action is to be applied */ - ZoneCount zones; - /** A function to schedule a default next action */ - ActionScheduler *scheduler; - /** - * A function to get the id of the thread on which to apply an action to a - * zone - **/ - ZoneThreadGetter *getZoneThreadID; - /** The ID of the thread on which actions may be initiated */ - ThreadID initiatorThreadID; - /** Opaque data associated with this action manager */ - void *context; - /** The zone currently being acted upon */ - ZoneCount actingZone; -}; - -/** - * Convert a generic VDOCompletion to a ActionManager. - * - * @param completion The completion to convert - * - * @return The completion as a ActionManager - **/ -static inline ActionManager *asActionManager(VDOCompletion *completion) -{ - STATIC_ASSERT(offsetof(ActionManager, completion) == 0); - assertCompletionType(completion->type, ACTION_COMPLETION); - return (ActionManager *) completion; -} - -/** - * An action scheduler which does not schedule an action. - * - *
- * Implements ActionScheduler.
- **/
-static bool noDefaultAction(void *context __attribute__((unused)))
-{
-  return false;
-}
-
-/**
- * A default preamble which does nothing.
- *
- * Implements ActionPreamble
- **/
-static void noPreamble(void *context __attribute__((unused)),
-                       VDOCompletion *completion)
-{
-  completeCompletion(completion);
-}
-
-/**
- * A default conclusion which does nothing.
- *
- *
Implements ActionConclusion. - **/ -static int noConclusion(void *context __attribute__((unused))) { - return VDO_SUCCESS; -} - -/**********************************************************************/ -int makeActionManager(ZoneCount zones, - ZoneThreadGetter *getZoneThreadID, - ThreadID initiatorThreadID, - void *context, - ActionScheduler *scheduler, - PhysicalLayer *layer, - ActionManager **managerPtr) -{ - ActionManager *manager; - int result = ALLOCATE(1, ActionManager, __func__, &manager); - if (result != VDO_SUCCESS) { - return result; - } - - *manager = (ActionManager) { - .zones = zones, - .scheduler = ((scheduler == NULL) ? noDefaultAction : scheduler), - .getZoneThreadID = getZoneThreadID, - .initiatorThreadID = initiatorThreadID, - .context = context, - }; - - manager->actions[0].next = &manager->actions[1]; - manager->currentAction = manager->actions[1].next = &manager->actions[0]; - - result = initializeEnqueueableCompletion(&manager->completion, - ACTION_COMPLETION, layer); - if (result != VDO_SUCCESS) { - freeActionManager(&manager); - return result; - } - - *managerPtr = manager; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freeActionManager(ActionManager **managerPtr) -{ - ActionManager *manager = *managerPtr; - if (manager == NULL) { - return; - } - - destroyEnqueueable(&manager->completion); - FREE(manager); - *managerPtr = NULL; -} - -/**********************************************************************/ -AdminStateCode getCurrentManagerOperation(ActionManager *manager) -{ - return manager->state.state; -} - -/**********************************************************************/ -void *getCurrentActionContext(ActionManager *manager) -{ - return (manager->currentAction->inUse - ? manager->currentAction->context : NULL); -} - -/**********************************************************************/ -static void finishActionCallback(VDOCompletion *completion); -static void applyToZone(VDOCompletion *completion); - -/** - * Get the thread ID for the current zone. - * - * @param manager The action manager - * - * @return The ID of the thread on which to run actions for the current zone - **/ -static ThreadID getActingZoneThreadID(ActionManager *manager) -{ - return manager->getZoneThreadID(manager->context, manager->actingZone); -} - -/** - * Prepare the manager's completion to run on the next zone. - * - * @param manager The action manager - **/ -static void prepareForNextZone(ActionManager *manager) -{ - prepareForRequeue(&manager->completion, applyToZone, - preserveErrorAndContinue, getActingZoneThreadID(manager), - manager->currentAction->parent); -} - -/** - * Prepare the manager's completion to run the conclusion on the initiator - * thread. - * - * @param manager The action manager - **/ -static void prepareForConclusion(ActionManager *manager) -{ - prepareForRequeue(&manager->completion, finishActionCallback, - preserveErrorAndContinue, manager->initiatorThreadID, - manager->currentAction->parent); -} - -/** - * Perform an action on the next zone if there is one. - * - * @param completion The action completion - **/ -static void applyToZone(VDOCompletion *completion) -{ - ActionManager *manager = asActionManager(completion); - ASSERT_LOG_ONLY((getCallbackThreadID() == getActingZoneThreadID(manager)), - "applyToZone() called on acting zones's thread"); - - ZoneCount zone = manager->actingZone++; - if (manager->actingZone == manager->zones) { - // We are about to apply to the last zone. 
Once that is finished, - // we're done, so go back to the initiator thread and finish up. - prepareForConclusion(manager); - } else { - // Prepare to come back on the next zone - prepareForNextZone(manager); - } - - manager->currentAction->zoneAction(manager->context, zone, completion); -} - -/** - * The error handler for preamble errors. - * - * @param completion The manager completion - **/ -static void handlePreambleError(VDOCompletion *completion) -{ - // Skip the zone actions since the preamble failed. - completion->callback = finishActionCallback; - preserveErrorAndContinue(completion); -} - -/** - * Launch the current action. - * - * @param manager The action manager - **/ -static void launchCurrentAction(ActionManager *manager) -{ - Action *action = manager->currentAction; - int result = startOperation(&manager->state, action->operation); - if (result != VDO_SUCCESS) { - if (action->parent != NULL) { - setCompletionResult(action->parent, result); - } - - // We aren't going to run the preamble, so don't run the conclusion - action->conclusion = noConclusion; - finishActionCallback(&manager->completion); - return; - } - - if (action->zoneAction == NULL) { - prepareForConclusion(manager); - } else { - manager->actingZone = 0; - prepareForRequeue(&manager->completion, applyToZone, handlePreambleError, - getActingZoneThreadID(manager), - manager->currentAction->parent); - } - - action->preamble(manager->context, &manager->completion); -} - -/**********************************************************************/ -bool scheduleDefaultAction(ActionManager *manager) -{ - // Don't schedule a default action if we are operating or not in normal - // operation. - return ((manager->state.state == ADMIN_STATE_NORMAL_OPERATION) - && manager->scheduler(manager->context)); -} - -/** - * Finish an action now that it has been applied to all zones. This - * callback is registered in applyToZone(). - * - * @param completion The action manager completion - **/ -static void finishActionCallback(VDOCompletion *completion) -{ - ActionManager *manager = asActionManager(completion); - Action action = *(manager->currentAction); - manager->currentAction->inUse = false; - manager->currentAction = manager->currentAction->next; - - // We need to check this now to avoid use-after-free issues if running the - // conclusion or notifying the parent results in the manager being freed. 
- bool hasNextAction = (manager->currentAction->inUse - || scheduleDefaultAction(manager)); - int result = action.conclusion(manager->context); - finishOperation(&manager->state); - if (action.parent != NULL) { - finishCompletion(action.parent, result); - } - - if (hasNextAction) { - launchCurrentAction(manager); - } -} - -/**********************************************************************/ -bool scheduleAction(ActionManager *manager, - ActionPreamble *preamble, - ZoneAction *zoneAction, - ActionConclusion *conclusion, - VDOCompletion *parent) -{ - return scheduleOperation(manager, ADMIN_STATE_OPERATING, preamble, - zoneAction, conclusion, parent); -} - -/**********************************************************************/ -bool scheduleOperation(ActionManager *manager, - AdminStateCode operation, - ActionPreamble *preamble, - ZoneAction *zoneAction, - ActionConclusion *conclusion, - VDOCompletion *parent) -{ - return scheduleOperationWithContext(manager, operation, preamble, zoneAction, - conclusion, NULL, parent); -} - -/**********************************************************************/ -bool scheduleOperationWithContext(ActionManager *manager, - AdminStateCode operation, - ActionPreamble *preamble, - ZoneAction *zoneAction, - ActionConclusion *conclusion, - void *context, - VDOCompletion *parent) -{ - ASSERT_LOG_ONLY((getCallbackThreadID() == manager->initiatorThreadID), - "action initiated from correct thread"); - Action *action; - if (!manager->currentAction->inUse) { - action = manager->currentAction; - } else if (!manager->currentAction->next->inUse) { - action = manager->currentAction->next; - } else { - if (parent != NULL) { - finishCompletion(parent, VDO_COMPONENT_BUSY); - } - - return false; - } - - *action = (Action) { - .inUse = true, - .operation = operation, - .preamble = (preamble == NULL) ? noPreamble : preamble, - .zoneAction = zoneAction, - .conclusion = (conclusion == NULL) ? noConclusion : conclusion, - .context = context, - .parent = parent, - .next = action->next, - }; - - if (action == manager->currentAction) { - launchCurrentAction(manager); - } - - return true; -} diff --git a/vdo/base/actionManager.h b/vdo/base/actionManager.h deleted file mode 100644 index 2e0ef13..0000000 --- a/vdo/base/actionManager.h +++ /dev/null @@ -1,249 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/actionManager.h#6 $ - */ - -#ifndef ACTION_MANAGER_H -#define ACTION_MANAGER_H - -#include "adminState.h" -#include "completion.h" -#include "types.h" - -/** - * ActionManager provides a generic mechanism for applying actions to - * multi-zone entities (such as the block map or slab depot). Each action - * manager is tied to a specific context for which it manages actions. 
The - * manager ensures that only one action is active on that context at a time, - * and supports at most one pending action. Calls to schedule an action when - * there is already a pending action will result in VDO_COMPONENT_BUSY errors. - * Actions may only be submitted to the action manager from a single thread - * (which thread is determined when the action manager is constructed). - * - * A scheduled action consists of four components: - * preamble: an optional method to be run on the initator thread before - * applying the action to all zones - * zoneAction: an optional method to be applied to each of the zones - * conclusion: an optional method to be run on the initiator thread once the - * per-zone method has been applied to all zones - * parent: an optional completion to be finished once the conclusion - * is done - * - * At least one of the three methods must be provided. - **/ - -/** - * A function which is to be applied asynchronously to a set of zones. - * - * @param context The object which holds the per-zone context for the - * action - * @param zoneNumber The number of zone to which the action is being applied - * @param parent The object to notify when the action is complete - **/ -typedef void ZoneAction(void *context, - ZoneCount zoneNumber, - VDOCompletion *parent); - -/** - * A function which is to be applied asynchronously on an action manager's - * initiator thread as the preamble of an action. - * - * @param context The object which holds the per-zone context for the action - * @param parent The object to notify when the action is complete - **/ -typedef void ActionPreamble(void *context, VDOCompletion *parent); - -/** - * A function which will run on the action manager's initiator thread as the - * conclusion of an action. - * - * @param context The object which holds the per-zone context for the action - * - * @return VDO_SUCCESS or an error - **/ -typedef int ActionConclusion(void *context); - -/** - * A function to schedule an action. - * - * @param context The object which holds the per-zone context for the action - * - * @return true if an action was scheduled - **/ -typedef bool ActionScheduler(void *context); - -/** - * Get the id of the thread associated with a given zone. - * - * @param context The action context - * @param zoneNumber The number of the zone for which the thread ID is desired - **/ -typedef ThreadID ZoneThreadGetter(void *context, ZoneCount zoneNumber); - -/** - * Make an action manager. - * - * @param [in] zones The number of zones to which actions will be - * applied - * @param [in] getZoneThreadID A function to get the thread id associated - * with a zone - * @param [in] initiatorThreadID The thread on which actions may initiated - * @param [in] context The object which holds the per-zone context - * for the action - * @param [in] scheduler A function to schedule a next action after an - * action concludes if there is no pending - * action (may be NULL) - * @param [in] layer The layer used to make completions - * @param [out] managerPtr A pointer to hold the new action manager - * - * @return VDO_SUCCESS or an error code - **/ -int makeActionManager(ZoneCount zones, - ZoneThreadGetter *getZoneThreadID, - ThreadID initiatorThreadID, - void *context, - ActionScheduler *scheduler, - PhysicalLayer *layer, - ActionManager **managerPtr) - __attribute__((warn_unused_result)); - -/** - * Destroy an action manager and null out the reference to it. 
- * - * @param managerPtr The reference to the manager to destroy - **/ -void freeActionManager(ActionManager **managerPtr); - -/** - * Get the current operation an action manager is performing. - * - * @param manager The manager to query - * - * @return The manager's current operation - **/ -AdminStateCode getCurrentManagerOperation(ActionManager *manager) - __attribute__((warn_unused_result)); - -/** - * Get the action-specific context for the operation an action manager is - * currently performing. - * - * @param manager The manager to query - * - * @return The action-specific context for the manager's current action or - * NULL if there is no context or no current action - **/ -void *getCurrentActionContext(ActionManager *manager) - __attribute__((warn_unused_result)); - -/** - * Attempt to schedule the default action. If the manager is not operating - * normally, the action will not be scheduled. - * - * @param manager The action manager - * - * @return true if an action was scheduled. - **/ -bool scheduleDefaultAction(ActionManager *manager); - -/** - * Schedule an action to be applied to all zones. The action will be launched - * immediately if there is no current action, or as soon as the current action - * completes. If there is already a pending action, this action will not be - * scheduled, and, if it has a parent, that parent will be notified. At least - * one of the preamble, zoneAction, or conclusion must not be NULL. - * - * @param manager The action manager to schedule the action on - * @param preamble A method to be invoked on the initiator thread once this - * action is started but before applying to each zone; may - * be NULL - * @param zoneAction The action to apply to each zone; may be NULL - * @param conclusion A method to be invoked back on the initiator thread once - * the action has been applied to all zones; may be NULL - * @param parent The object to notify once the action is complete or if - * the action can not be scheduled; may be NULL - * - * @return true if the action was scheduled - **/ -bool scheduleAction(ActionManager *manager, - ActionPreamble *preamble, - ZoneAction *zoneAction, - ActionConclusion *conclusion, - VDOCompletion *parent); - -/** - * Schedule an operation to be applied to all zones. The operation's action - * will be launched immediately if there is no current action, or as soon as - * the current action completes. If there is already a pending action, this - * operation will not be scheduled, and, if it has a parent, that parent will - * be notified. At least one of the preamble, zoneAction, or conclusion must - * not be NULL. - * - * @param manager The action manager to schedule the action on - * @param operation The operation this action will perform - * @param preamble A method to be invoked on the initiator thread once this - * action is started but before applying to each zone; may - * be NULL - * @param zoneAction The action to apply to each zone; may be NULL - * @param conclusion A method to be invoked back on the initiator thread once - * the action has been applied to all zones; may be NULL - * @param parent The object to notify once the action is complete or if - * the action can not be scheduled; may be NULL - * - * @return true if the action was scheduled - **/ -bool scheduleOperation(ActionManager *manager, - AdminStateCode operation, - ActionPreamble *preamble, - ZoneAction *zoneAction, - ActionConclusion *conclusion, - VDOCompletion *parent); - -/** - * Schedule an operation to be applied to all zones. 
The operation's action - * will be launched immediately if there is no current action, or as soon as - * the current action completes. If there is already a pending action, this - * operation will not be scheduled, and, if it has a parent, that parent will - * be notified. At least one of the preamble, zoneAction, or conclusion must - * not be NULL. - * - * @param manager The action manager to schedule the action on - * @param operation The operation this action will perform - * @param preamble A method to be invoked on the initiator thread once this - * action is started but before applying to each zone; may - * be NULL - * @param zoneAction The action to apply to each zone; may be NULL - * @param conclusion A method to be invoked back on the initiator thread once - * the action has been applied to all zones; may be NULL - * @param context An action-specific context which may be retrieved via - * getCurrentActionContext(); may be NULL - * @param parent The object to notify once the action is complete or if - * the action can not be scheduled; may be NULL - * - * @return true if the action was scheduled - **/ -bool scheduleOperationWithContext(ActionManager *manager, - AdminStateCode operation, - ActionPreamble *preamble, - ZoneAction *zoneAction, - ActionConclusion *conclusion, - void *context, - VDOCompletion *parent); - -#endif // ACTION_MANAGER_H diff --git a/vdo/base/adminCompletion.c b/vdo/base/adminCompletion.c deleted file mode 100644 index 5c5ed26..0000000 --- a/vdo/base/adminCompletion.c +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/adminCompletion.c#4 $ - */ - -#include "adminCompletion.h" - -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" - -#include "atomic.h" -#include "completion.h" -#include "types.h" -#include "vdoInternal.h" - -/**********************************************************************/ -void assertAdminOperationType(AdminCompletion *completion, - AdminOperationType expected) -{ - ASSERT_LOG_ONLY(completion->type == expected, - "admin operation type is %u instead of %u", - completion->type, expected); -} - -/**********************************************************************/ -AdminCompletion *adminCompletionFromSubTask(VDOCompletion *completion) -{ - STATIC_ASSERT(offsetof(AdminCompletion, completion) == 0); - assertCompletionType(completion->type, SUB_TASK_COMPLETION); - VDOCompletion *parent = completion->parent; - assertCompletionType(parent->type, ADMIN_COMPLETION); - return (AdminCompletion *) parent; -} - -/**********************************************************************/ -void assertAdminPhaseThread(AdminCompletion *adminCompletion, - const char *what, - const char *phaseNames[]) -{ - ThreadID expected = adminCompletion->getThreadID(adminCompletion); - ASSERT_LOG_ONLY((getCallbackThreadID() == expected), - "%s on correct thread for %s", - what, phaseNames[adminCompletion->phase]); -} - -/**********************************************************************/ -VDO *vdoFromAdminSubTask(VDOCompletion *completion, - AdminOperationType expected) -{ - AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); - assertAdminOperationType(adminCompletion, expected); - return adminCompletion->completion.parent; -} - -/**********************************************************************/ -int initializeAdminCompletion(VDO *vdo, AdminCompletion *adminCompletion) -{ - int result = initializeEnqueueableCompletion(&adminCompletion->completion, - ADMIN_COMPLETION, vdo->layer); - if (result != VDO_SUCCESS) { - return result; - } - - result = initializeEnqueueableCompletion(&adminCompletion->subTaskCompletion, - SUB_TASK_COMPLETION, vdo->layer); - if (result != VDO_SUCCESS) { - uninitializeAdminCompletion(adminCompletion); - return result; - } - - atomicStoreBool(&adminCompletion->busy, false); - - return VDO_SUCCESS; -} - -/**********************************************************************/ -void uninitializeAdminCompletion(AdminCompletion *adminCompletion) -{ - destroyEnqueueable(&adminCompletion->subTaskCompletion); - destroyEnqueueable(&adminCompletion->completion); -} - -/**********************************************************************/ -VDOCompletion *resetAdminSubTask(VDOCompletion *completion) -{ - AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); - resetCompletion(completion); - completion->callbackThreadID = adminCompletion->getThreadID(adminCompletion); - return completion; -} - -/**********************************************************************/ -void prepareAdminSubTaskOnThread(VDO *vdo, - VDOAction *callback, - VDOAction *errorHandler, - ThreadID threadID) -{ - prepareForRequeue(&vdo->adminCompletion.subTaskCompletion, callback, - errorHandler, threadID, &vdo->adminCompletion); -} - -/**********************************************************************/ -void prepareAdminSubTask(VDO *vdo, - VDOAction *callback, - VDOAction *errorHandler) -{ - AdminCompletion *adminCompletion = &vdo->adminCompletion; - prepareAdminSubTaskOnThread(vdo, callback, 
errorHandler, - adminCompletion->completion.callbackThreadID); -} - -/** - * Callback for admin operations which will notify the layer that the operation - * is complete. - * - * @param completion The admin completion - **/ -static void adminOperationCallback(VDOCompletion *completion) -{ - completion->layer->completeAdminOperation(completion->layer); -} - -/**********************************************************************/ -int performAdminOperation(VDO *vdo, - AdminOperationType type, - ThreadIDGetterForPhase *threadIDGetter, - VDOAction *action, - VDOAction *errorHandler) -{ - AdminCompletion *adminCompletion = &vdo->adminCompletion; - if (!compareAndSwapBool(&adminCompletion->busy, false, true)) { - return logErrorWithStringError(VDO_COMPONENT_BUSY, - "Can't start admin operation of type %u, " - "another operation is already in progress", - type); - } - - prepareCompletion(&adminCompletion->completion, adminOperationCallback, - adminOperationCallback, - getAdminThread(getThreadConfig(vdo)), vdo); - adminCompletion->type = type; - adminCompletion->getThreadID = threadIDGetter; - adminCompletion->phase = 0; - prepareAdminSubTask(vdo, action, errorHandler); - - PhysicalLayer *layer = vdo->layer; - layer->enqueue(adminCompletion->subTaskCompletion.enqueueable); - layer->waitForAdminOperation(layer); - int result = adminCompletion->completion.result; - atomicStoreBool(&adminCompletion->busy, false); - return result; -} diff --git a/vdo/base/adminCompletion.h b/vdo/base/adminCompletion.h deleted file mode 100644 index 50eeecd..0000000 --- a/vdo/base/adminCompletion.h +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/adminCompletion.h#4 $ - */ - -#ifndef ADMIN_COMPLETION_H -#define ADMIN_COMPLETION_H - -#include "atomic.h" -#include "completion.h" -#include "types.h" - -typedef enum adminOperationType { - ADMIN_OPERATION_UNKNOWN = 0, - ADMIN_OPERATION_GROW_LOGICAL, - ADMIN_OPERATION_GROW_PHYSICAL, - ADMIN_OPERATION_PREPARE_GROW_PHYSICAL, - ADMIN_OPERATION_LOAD, - ADMIN_OPERATION_RESUME, - ADMIN_OPERATION_SAVE, - ADMIN_OPERATION_SUSPEND, -} AdminOperationType; - -typedef struct adminCompletion AdminCompletion; - -/** - * A function which gets the ID of the thread on which the current phase of an - * admin operation should be run. 
- * - * @param adminCompletion The AdminCompletion - * - * @return The ID of the thread on which the current phase should be performed - **/ -typedef ThreadID ThreadIDGetterForPhase(AdminCompletion *adminCompletion); - -struct adminCompletion { - /** The completion */ - VDOCompletion completion; - /** The sub-task completion */ - VDOCompletion subTaskCompletion; - /** Whether this completion is in use */ - AtomicBool busy; - /** The operation type */ - AdminOperationType type; - /** Method to get the ThreadID for the current phase */ - ThreadIDGetterForPhase *getThreadID; - /** The current phase of the operation */ - uint32_t phase; -}; - -/** - * Check that an AdminCompletion's type is as expected. - * - * @param completion The AdminCompletion to check - * @param expected The expected type - **/ -void assertAdminOperationType(AdminCompletion *completion, - AdminOperationType expected); - -/** - * Convert the sub-task completion of an AdminCompletion to an AdminCompletion. - * - * @param completion the AdminCompletion's sub-task completion - * - * @return The sub-task completion as its enclosing AdminCompletion - **/ -AdminCompletion *adminCompletionFromSubTask(VDOCompletion *completion) - __attribute__((warn_unused_result)); - -/** - * Assert that we are operating on the correct thread for the current phase. - * - * @param adminCompletion The AdminCompletion to check - * @param what The method doing the phase check - * @param phaseNames The names of the phases of the current operation - **/ -void assertAdminPhaseThread(AdminCompletion *adminCompletion, - const char *what, - const char *phaseNames[]); - -/** - * Get the VDO from the sub-task completion of its AdminCompletion. - * - * @param completion the sub-task completion - * @param expected the expected operation type of the AdminCompletion - * - * @return The VDO - **/ -VDO *vdoFromAdminSubTask(VDOCompletion *completion, - AdminOperationType expected) - __attribute__((warn_unused_result)); - -/** - * Initialize an admin completion. - * - * @param vdo The VDO which owns the completion - * @param adminCompletion The AdminCompletion to initialize - * - * @return VDO_SUCCESS or an error - **/ -int initializeAdminCompletion(VDO *vdo, AdminCompletion *adminCompletion) - __attribute__((warn_unused_result)); - -/** - * Clean up an admin completion's resources. - * - * @param adminCompletion The AdminCompletion to uninitialize - **/ -void uninitializeAdminCompletion(AdminCompletion *adminCompletion); - -/** - * Reset an AdminCompletion's sub-task completion. - * - * @param completion The AdminCompletion's sub-task completion - * - * @return The sub-task completion for the convenience of callers - **/ -VDOCompletion *resetAdminSubTask(VDOCompletion *completion); - -/** - * Prepare the sub-task completion of a VDO's AdminCompletion - * - * @param vdo The VDO - * @param callback The callback for the sub-task - * @param errorHandler The error handler for the sub-task - * @param threadID The ID of the thread on which to run the callback - **/ -void prepareAdminSubTaskOnThread(VDO *vdo, - VDOAction *callback, - VDOAction *errorHandler, - ThreadID threadID); - -/** - * Prepare the sub-task completion of a VDO's AdminCompletion to run on the - * same thread as the AdminCompletion's main completion. 
- * - * @param vdo The VDO - * @param callback The callback for the sub-task - * @param errorHandler The error handler for the sub-task - **/ -void prepareAdminSubTask(VDO *vdo, - VDOAction *callback, - VDOAction *errorHandler); - -/** - * Perform an administrative operation (load, suspend, grow logical, or grow - * physical). This method should not be called from base threads unless it is - * certain the calling thread won't be needed to perform the operation. It may - * (and should) be called from non-base threads. - * - * @param vdo The VDO on which to perform the operation - * @param type The type of operation to perform - * @param threadIDGetter A function for getting the ID of the thread on which - * a given phase should be run - * @param action The action which starts the operation - * @param errorHandler The error handler for the operation - * - * @return The result of the operation - **/ -int performAdminOperation(VDO *vdo, - AdminOperationType type, - ThreadIDGetterForPhase *threadIDGetter, - VDOAction *action, - VDOAction *errorHandler) - __attribute__((warn_unused_result)); - -#endif /* ADMIN_COMPLETION_H */ diff --git a/vdo/base/adminState.c b/vdo/base/adminState.c deleted file mode 100644 index 6b30315..0000000 --- a/vdo/base/adminState.c +++ /dev/null @@ -1,370 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/adminState.c#14 $ - */ - -#include "adminState.h" - -#include "logger.h" -#include "permassert.h" - -#include "completion.h" -#include "types.h" - -/**********************************************************************/ -const char *getAdminStateCodeName(AdminStateCode code) -{ - switch (code) { - case ADMIN_STATE_NORMAL_OPERATION: - return "ADMIN_STATE_NORMAL_OPERATION"; - - case ADMIN_STATE_OPERATING: - return "ADMIN_STATE_OPERATING"; - - case ADMIN_STATE_FORMATTING: - return "ADMIN_STATE_FORMATTING"; - - case ADMIN_STATE_LOADING: - return "ADMIN_STATE_LOADING"; - - case ADMIN_STATE_LOADING_FOR_RECOVERY: - return "ADMIN_STATE_LOADING_FOR_RECOVERY"; - - case ADMIN_STATE_LOADING_FOR_REBUILD: - return "ADMIN_STATE_LOADING_FOR_REBUILD"; - - case ADMIN_STATE_NEW: - return "ADMIN_STATE_NEW"; - - case ADMIN_STATE_WAITING_FOR_RECOVERY: - return "ADMIN_STATE_WAITING_FOR_RECOVERY"; - - case ADMIN_STATE_RECOVERING: - return "ADMIN_STATE_RECOVERING"; - - case ADMIN_STATE_REBUILDING: - return "ADMIN_STATE_REBUILDING"; - - case ADMIN_STATE_SAVING: - return "ADMIN_STATE_SAVING"; - - case ADMIN_STATE_SAVED: - return "ADMIN_STATE_SAVED"; - - case ADMIN_STATE_SCRUBBING: - return "ADMIN_STATE_SCRUBBING"; - - case ADMIN_STATE_SAVE_FOR_SCRUBBING: - return "ADMIN_STATE_SAVE_FOR_SCRUBBING"; - - case ADMIN_STATE_SUSPENDING: - return "ADMIN_STATE_SUSPENDING"; - - case ADMIN_STATE_SUSPENDED: - return "ADMIN_STATE_SUSPENDED"; - - case ADMIN_STATE_SUSPENDED_OPERATION: - return "ADMIN_STATE_SUSPENDED_OPERATION"; - - case ADMIN_STATE_RESUMING: - return "ADMIN_STATE_RESUMING"; - - default: - return "INVALID ADMIN_STATE"; - } -} - -/**********************************************************************/ -const char *getAdminStateName(const AdminState *state) -{ - return getAdminStateCodeName(state->state); -} - -/**********************************************************************/ -static AdminStateCode getNextState(AdminStateCode previousState, - AdminStateCode operation) -{ - if (isQuiescingCode(operation)) { - return ((operation & ADMIN_TYPE_MASK) | ADMIN_FLAG_QUIESCENT); - } - - if (operation == ADMIN_STATE_SUSPENDED_OPERATION) { - return previousState; - } - - return ADMIN_STATE_NORMAL_OPERATION; -} - -/** - * Finish an operation if one is in progress. If there is a waiter, it will be - * notified. - * - * @param state The AdminState - * @param result The result of the operation - * - * @return true if an operation was in progress and has been - * finished. - **/ -static bool endOperation(AdminState *state, int result) -{ - if (!isOperating(state)) { - return false; - } - - if (state->starting) { - state->complete = true; - if (state->waiter != NULL) { - setCompletionResult(state->waiter, result); - } - } else { - state->complete = false; - state->state = state->nextState; - releaseCompletionWithResult(&state->waiter, result); - } - - return true; -} - -/** - * Begin an operation if it may be started given the current state. 
- * - * @param state The AdminState - * @param operation The operation to begin - * @param waiter A completion to notify when the operation is complete; may - * be NULL - * @param initiator The AdminInitiator to call if the operation may begin; may - * be NULL - * - * @return VDO_SUCCESS or an error - **/ -__attribute__((warn_unused_result)) -static int beginOperation(AdminState *state, - AdminStateCode operation, - VDOCompletion *waiter, - AdminInitiator *initiator) -{ - int result; - if (isOperating(state) - || (isQuiescent(state) != isQuiescentOperation(operation))) { - result = logErrorWithStringError(VDO_INVALID_ADMIN_STATE, - "Can't start %s from %s", - getAdminStateCodeName(operation), - getAdminStateName(state)); - } else if (state->waiter != NULL) { - result = logErrorWithStringError(VDO_COMPONENT_BUSY, - "Can't start %s with extant waiter", - getAdminStateCodeName(operation)); - } else { - state->waiter = waiter; - state->nextState = getNextState(state->state, operation); - state->state = operation; - if (initiator != NULL) { - state->starting = true; - initiator(state); - state->starting = false; - if (state->complete) { - endOperation(state, VDO_SUCCESS); - } - } - - return VDO_SUCCESS; - } - - if (waiter != NULL) { - finishCompletion(waiter, result); - } - - return result; -} - -/** - * Check the result of a state validation. If the result failed, log an invalid - * state error and, if there is a waiter, notify it. - * - * @param valid true if the code is of an appropriate type - * @param code The code which failed to be of the correct type - * @param what What the code failed to be, for logging - * @param waiter The completion to notify of the error; may be NULL - * - * @return The result of the check - **/ -static bool checkCode(bool valid, - AdminStateCode code, - const char *what, - VDOCompletion *waiter) -{ - if (valid) { - return true; - } - - int result = logErrorWithStringError(VDO_INVALID_ADMIN_STATE, - "%s is not a %s", - getAdminStateCodeName(code), what); - if (waiter != NULL) { - finishCompletion(waiter, result); - } - - return false; -} - -/**********************************************************************/ -bool assertDrainOperation(AdminStateCode operation, VDOCompletion *waiter) -{ - return checkCode(isDrainOperation(operation), operation, "drain operation", - waiter); -} - -/**********************************************************************/ -bool startDraining(AdminState *state, - AdminStateCode operation, - VDOCompletion *waiter, - AdminInitiator *initiator) -{ - return (assertDrainOperation(operation, waiter) - && (beginOperation(state, operation, waiter, initiator) - == VDO_SUCCESS)); -} - -/**********************************************************************/ -bool finishDraining(AdminState *state) -{ - return finishDrainingWithResult(state, VDO_SUCCESS); -} - -/**********************************************************************/ -bool finishDrainingWithResult(AdminState *state, int result) -{ - return (isDraining(state) && endOperation(state, result)); -} - -/**********************************************************************/ -bool assertLoadOperation(AdminStateCode operation, VDOCompletion *waiter) -{ - return checkCode(isLoadOperation(operation), operation, "load operation", - waiter); -} - -/**********************************************************************/ -bool startLoading(AdminState *state, - AdminStateCode operation, - VDOCompletion *waiter, - AdminInitiator *initiator) -{ - return (assertLoadOperation(operation, waiter) - 
&& (beginOperation(state, operation, waiter, initiator) - == VDO_SUCCESS)); -} - -/**********************************************************************/ -bool finishLoading(AdminState *state) -{ - return finishLoadingWithResult(state, VDO_SUCCESS); -} - -/**********************************************************************/ -bool finishLoadingWithResult(AdminState *state, int result) -{ - return (isLoading(state) && endOperation(state, result)); -} - -/**********************************************************************/ -bool assertResumeOperation(AdminStateCode operation, VDOCompletion *waiter) -{ - return checkCode(isResumeOperation(operation), operation, "resume operation", - waiter); -} - -/**********************************************************************/ -bool startResuming(AdminState *state, - AdminStateCode operation, - VDOCompletion *waiter, - AdminInitiator *initiator) -{ - return (assertResumeOperation(operation, waiter) - && (beginOperation(state, operation, waiter, initiator) - == VDO_SUCCESS)); -} - -/**********************************************************************/ -bool finishResuming(AdminState *state) -{ - return finishResumingWithResult(state, VDO_SUCCESS); -} - -/**********************************************************************/ -bool finishResumingWithResult(AdminState *state, int result) -{ - return (isResuming(state) && endOperation(state, result)); -} - -/**********************************************************************/ -int resumeIfQuiescent(AdminState *state) -{ - if (!isQuiescent(state)) { - return VDO_INVALID_ADMIN_STATE; - } - - state->state = ADMIN_STATE_NORMAL_OPERATION; - return VDO_SUCCESS; -} - -/** - * Check whether an AdminStateCode is an operation. - * - * @param code The operation to check - * @param waiter The completion to notify if the code is not an operation; may - * be NULL - * - * @return true if the code is an operation - **/ -static bool assertOperation(AdminStateCode code, VDOCompletion *waiter) -{ - return checkCode(isOperation(code), code, "operation", waiter); -} - -/**********************************************************************/ -int startOperation(AdminState *state, AdminStateCode operation) -{ - return (assertOperation(operation, NULL) - ? beginOperation(state, operation, NULL, NULL) - : VDO_INVALID_ADMIN_STATE); -} - -/**********************************************************************/ -bool startOperationWithWaiter(AdminState *state, - AdminStateCode operation, - VDOCompletion *waiter, - AdminInitiator *initiator) -{ - return (assertOperation(operation, waiter) - && (beginOperation(state, operation, waiter, initiator) - == VDO_SUCCESS)); -} - -/**********************************************************************/ -bool finishOperation(AdminState *state) -{ - return finishOperationWithResult(state, VDO_SUCCESS); -} - -/**********************************************************************/ -bool finishOperationWithResult(AdminState *state, int result) -{ - return endOperation(state, result); -} diff --git a/vdo/base/adminState.h b/vdo/base/adminState.h deleted file mode 100644 index 5ab13cb..0000000 --- a/vdo/base/adminState.h +++ /dev/null @@ -1,666 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/adminState.h#17 $ - */ - -#ifndef ADMIN_STATE_H -#define ADMIN_STATE_H - -#include "completion.h" -#include "types.h" - -/** - * The list of state types. - **/ -typedef enum { - /** Normal operation, DataVIOs may be active */ - ADMIN_TYPE_NORMAL = 0, - /** - * Format: an operation for formatting a new VDO. - **/ - ADMIN_TYPE_FORMAT, - /** - * Recover: a recovery operation. - **/ - ADMIN_TYPE_RECOVER, - /** - * Rebuild: write data necessary for a full rebuild, drain outstanding I/O, - * and return to normal operation. - **/ - ADMIN_TYPE_REBUILD, - /** - * Save: write all dirty metadata thereby restoring the VDO to a clean state, - * drain outstanding I/O, and become quiescent. - **/ - ADMIN_TYPE_SAVE, - /** - * Scrub: load and/or save state necessary to scrub a slab. - **/ - ADMIN_TYPE_SCRUB, - /** - * Suspend: write enough dirty metadata to perform resize transactions, - * drain outstanding I/O, and become quiescent. - **/ - ADMIN_TYPE_SUSPEND, - /** - * Resume: return to normal from a quiescent state - **/ - ADMIN_TYPE_RESUME, - /** The mask for extracting the AdminType from and AdminStateCode */ - ADMIN_TYPE_MASK = 0xff, -} AdminType; - - -/** - * The bit position of flags used to categorize states. - **/ -typedef enum { - ADMIN_FLAG_BIT_START = 8, - /** Flag indicating that I/O is draining */ - ADMIN_FLAG_BIT_DRAINING = ADMIN_FLAG_BIT_START, - /** Flag indicating a load operation */ - ADMIN_FLAG_BIT_LOADING, - /** Flag indicating that the next state will be a quiescent state */ - ADMIN_FLAG_BIT_QUIESCING, - /** Flag indicating that the state is quiescent */ - ADMIN_FLAG_BIT_QUIESCENT, - /** - * Flag indicating that an operation is in progress and so no other - * operation may be started. - **/ - ADMIN_FLAG_BIT_OPERATING, -} AdminFlagBit; - -/** - * The flags themselves. - **/ -typedef enum { - ADMIN_FLAG_DRAINING = (uint32_t) (1 << ADMIN_FLAG_BIT_DRAINING), - ADMIN_FLAG_LOADING = (uint32_t) (1 << ADMIN_FLAG_BIT_LOADING), - ADMIN_FLAG_QUIESCING = (uint32_t) (1 << ADMIN_FLAG_BIT_QUIESCING), - ADMIN_FLAG_QUIESCENT = (uint32_t) (1 << ADMIN_FLAG_BIT_QUIESCENT), - ADMIN_FLAG_OPERATING = (uint32_t) (1 << ADMIN_FLAG_BIT_OPERATING), -} AdminFlag; - -/** - * The state codes. 
- **/ -typedef enum { - ADMIN_STATE_NORMAL_OPERATION = ADMIN_TYPE_NORMAL, - ADMIN_STATE_OPERATING = (ADMIN_TYPE_NORMAL - | ADMIN_FLAG_OPERATING), - ADMIN_STATE_FORMATTING = (ADMIN_TYPE_FORMAT - | ADMIN_FLAG_OPERATING - | ADMIN_FLAG_LOADING), - ADMIN_STATE_LOADING = (ADMIN_TYPE_NORMAL - | ADMIN_FLAG_OPERATING - | ADMIN_FLAG_LOADING), - ADMIN_STATE_LOADING_FOR_RECOVERY = (ADMIN_TYPE_RECOVER - | ADMIN_FLAG_OPERATING - | ADMIN_FLAG_LOADING), - ADMIN_STATE_LOADING_FOR_REBUILD = (ADMIN_TYPE_REBUILD - | ADMIN_FLAG_OPERATING - | ADMIN_FLAG_LOADING), - ADMIN_STATE_WAITING_FOR_RECOVERY = (ADMIN_TYPE_RECOVER - | ADMIN_FLAG_OPERATING), - ADMIN_STATE_NEW = (ADMIN_TYPE_NORMAL - | ADMIN_FLAG_QUIESCENT), - ADMIN_STATE_RECOVERING = (ADMIN_TYPE_RECOVER - | ADMIN_FLAG_OPERATING - | ADMIN_FLAG_DRAINING), - ADMIN_STATE_REBUILDING = (ADMIN_TYPE_REBUILD - | ADMIN_FLAG_OPERATING - | ADMIN_FLAG_DRAINING), - ADMIN_STATE_SAVING = (ADMIN_TYPE_SAVE - | ADMIN_FLAG_OPERATING - | ADMIN_FLAG_DRAINING - | ADMIN_FLAG_QUIESCING), - ADMIN_STATE_SAVED = (ADMIN_TYPE_SAVE - | ADMIN_FLAG_QUIESCENT), - ADMIN_STATE_SCRUBBING = (ADMIN_TYPE_SCRUB - | ADMIN_FLAG_OPERATING - | ADMIN_FLAG_DRAINING - | ADMIN_FLAG_LOADING), - ADMIN_STATE_SAVE_FOR_SCRUBBING = (ADMIN_TYPE_SCRUB - | ADMIN_FLAG_OPERATING - | ADMIN_FLAG_DRAINING), - ADMIN_STATE_SUSPENDING = (ADMIN_TYPE_SUSPEND - | ADMIN_FLAG_OPERATING - | ADMIN_FLAG_DRAINING - | ADMIN_FLAG_QUIESCING), - ADMIN_STATE_SUSPENDED = (ADMIN_TYPE_SUSPEND - | ADMIN_FLAG_QUIESCENT), - ADMIN_STATE_SUSPENDED_OPERATION = (ADMIN_TYPE_SUSPEND - | ADMIN_FLAG_OPERATING - | ADMIN_FLAG_QUIESCENT), - ADMIN_STATE_RESUMING = (ADMIN_TYPE_RESUME - | ADMIN_FLAG_OPERATING - | ADMIN_FLAG_QUIESCENT), -} AdminStateCode; - -typedef struct { - /** The current administrative state */ - AdminStateCode state; - /** The next administrative state (when the current operation finishes */ - AdminStateCode nextState; - /** A completion waiting on a state change */ - VDOCompletion *waiter; - /** Whether an operation is being initiated */ - bool starting; - /** Whether an operation has completed in the initiator */ - bool complete; -} AdminState; - -/** - * A method to be called once an admin operation may be initiated. - **/ -typedef void AdminInitiator(AdminState *state); - -/** - * Get the name of an AdminStateCode for logging purposes. - * - * @param code The AdminStateCode - * - * @return The name of the state's code - **/ -const char *getAdminStateCodeName(AdminStateCode code) - __attribute__((warn_unused_result)); - -/** - * Get the name of an AdminState's code for logging purposes. - * - * @param state The AdminState - * - * @return The name of the state's code - **/ -const char *getAdminStateName(const AdminState *state) - __attribute__((warn_unused_result)); - -/** - * Check whether an AdminState is in normal operation. - * - * @param state The AdminState to query - * - * @return true if the state is normal - **/ -__attribute__((warn_unused_result)) -static inline bool isNormal(AdminState *state) -{ - return ((state->state & ADMIN_TYPE_MASK) == ADMIN_TYPE_NORMAL); -} - -/** - * Check whether an AdminStateCode is an operation. - * - * @param code The code to check - * - * @return true if the code is an operation - **/ -__attribute__((warn_unused_result)) -static inline bool isOperation(AdminStateCode code) -{ - return ((code & ADMIN_FLAG_OPERATING) == ADMIN_FLAG_OPERATING); -} - -/** - * Check whether an AdminState is operating. 
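The state codes above are deliberately compositional: each code is an AdminType in the low byte plus zero or more AdminFlag bits, which is what lets the predicate helpers in this header reduce to simple mask tests. A few illustrative compile-time checks (not part of the original header; they assume the enums above are in scope and a C11 compiler) make the composition concrete:

/* Illustrative only: ADMIN_STATE_SAVING is a SAVE-type code that is still
 * operating, draining, and quiescing, while ADMIN_STATE_SAVED is its
 * quiescent end state. */
_Static_assert((ADMIN_STATE_SAVING & ADMIN_TYPE_MASK) == ADMIN_TYPE_SAVE,
               "saving is a save-type code");
_Static_assert((ADMIN_STATE_SAVING & ADMIN_FLAG_QUIESCING) != 0,
               "saving is quiescing");
_Static_assert((ADMIN_STATE_SAVING & ADMIN_FLAG_QUIESCENT) == 0,
               "saving is not yet quiescent");
_Static_assert((ADMIN_STATE_SAVED & ADMIN_FLAG_QUIESCENT) != 0,
               "saved is quiescent");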
- * - * @param state The AdminState to query - * - * @return true if the state is operating - **/ -__attribute__((warn_unused_result)) -static inline bool isOperating(AdminState *state) -{ - return isOperation(state->state); -} - -/** - * Check whether an AdminState is suspending. - * - * @param state The AdminState to query - * - * @return true if the state is suspending - **/ -__attribute__((warn_unused_result)) -static inline bool isSuspending(AdminState *state) -{ - return (state->state == ADMIN_STATE_SUSPENDING); -} - -/** - * Check whether an AdminState is suspended. - * - * @param state The AdminState to query - * - * @return true if the state is suspended - **/ -__attribute__((warn_unused_result)) -static inline bool isSuspended(AdminState *state) -{ - return (state->state == ADMIN_STATE_SUSPENDED); -} - -/** - * Check whether an AdminState is saving. - * - * @param state The AdminState to query - * - * @return true if the state is saving - **/ -__attribute__((warn_unused_result)) -static inline bool isSaving(AdminState *state) -{ - return (state->state == ADMIN_STATE_SAVING); -} - -/** - * Check whether an AdminState is saved. - * - * @param state The AdminState to query - * - * @return true if the state is saved - **/ -__attribute__((warn_unused_result)) -static inline bool isSaved(AdminState *state) -{ - return (state->state == ADMIN_STATE_SAVED); -} - -/** - * Check whether an AdminStateCode is a drain operation. - * - * @param code The AdminStateCode to check - * - * @return true if the code is for a drain operation - **/ -__attribute__((warn_unused_result)) -static inline bool isDrainOperation(AdminStateCode code) -{ - return ((code & ADMIN_FLAG_DRAINING) == ADMIN_FLAG_DRAINING); -} - -/** - * Check whether an AdminState is draining. - * - * @param state The AdminState to query - * - * @return true if the state is draining - **/ -__attribute__((warn_unused_result)) -static inline bool isDraining(AdminState *state) -{ - return isDrainOperation(state->state); -} - -/** - * Check whether an AdminStateCode is a load operation. - * - * @param code The AdminStateCode to check - * - * @return true if the code is for a load operation - **/ -__attribute__((warn_unused_result)) -static inline bool isLoadOperation(AdminStateCode code) -{ - return ((code & ADMIN_FLAG_LOADING) == ADMIN_FLAG_LOADING); -} - -/** - * Check whether an AdminState is loading. - * - * @param state The AdminState to query - * - * @return true if the state is loading - **/ -__attribute__((warn_unused_result)) -static inline bool isLoading(AdminState *state) -{ - return isLoadOperation(state->state); -} - -/** - * Check whether an AdminStateCode is a resume operation. - * - * @param code The AdminStateCode to check - * - * @return true if the code is for a resume operation - **/ -__attribute__((warn_unused_result)) -static inline bool isResumeOperation(AdminStateCode code) -{ - return ((code & ADMIN_TYPE_MASK) == ADMIN_TYPE_RESUME); -} - -/** - * Check whether an AdminState is resumeing. - * - * @param state The AdminState to query - * - * @return true if the state is resumeing - **/ -__attribute__((warn_unused_result)) -static inline bool isResuming(AdminState *state) -{ - return isResumeOperation(state->state); -} - -/** - * Check whether an AdminState is doing a clean load. 
- *
- * @param state The AdminState to query
- *
- * @return true if the state is a clean load
- **/
-__attribute__((warn_unused_result))
-static inline bool isCleanLoad(AdminState *state)
-{
- return ((state->state == ADMIN_STATE_FORMATTING)
- || (state->state == ADMIN_STATE_LOADING));
-}
-
-/**
- * Check whether an AdminStateCode is quiescing.
- *
- * @param code The AdminStateCode to check
- *
- * @return true if the state is quiescing
- **/
-__attribute__((warn_unused_result))
-static inline bool isQuiescingCode(AdminStateCode code)
-{
- return ((code & ADMIN_FLAG_QUIESCING) == ADMIN_FLAG_QUIESCING);
-}
-
-/**
- * Check whether an AdminState is quiescing.
- *
- * @param state The AdminState to check
- *
- * @return true if the state is quiescing
- **/
-__attribute__((warn_unused_result))
-static inline bool isQuiescing(AdminState *state)
-{
- return isQuiescingCode(state->state);
-}
-
-/**
- * Check whether an AdminStateCode is quiescent.
- *
- * @param code The AdminStateCode to check
- *
- * @return true if the state is quiescent
- **/
-__attribute__((warn_unused_result))
-static inline bool isQuiescentCode(AdminStateCode code)
-{
- return ((code & ADMIN_FLAG_QUIESCENT) == ADMIN_FLAG_QUIESCENT);
-}
-
-/**
- * Check whether an AdminState is quiescent.
- *
- * @param state The AdminState to query
- *
- * @return true if the state is quiescent
- **/
-__attribute__((warn_unused_result))
-static inline bool isQuiescent(AdminState *state)
-{
- return isQuiescentCode(state->state);
-}
-
-/**
- * Check whether an AdminStateCode is a quiescent operation.
- *
- * @param code The code to check
- *
- * @return true if the code is a quiescent operation
- **/
-__attribute__((warn_unused_result))
-static inline bool isQuiescentOperation(AdminStateCode code)
-{
- return (isQuiescentCode(code) && isOperation(code));
-}
-
-/**
- * Check that an operation is a drain.
- *
- * @param operation The operation to check
- * @param waiter The completion to finish with an error if the operation is
- * not a drain
- *
- * @return true if the specified operation is a drain
- **/
-bool assertDrainOperation(AdminStateCode operation, VDOCompletion *waiter)
- __attribute__((warn_unused_result));
-
-/**
- * Initiate a drain operation if the current state permits it.
- *
- * @param state The AdminState
- * @param operation The type of drain to initiate
- * @param waiter The completion to notify when the drain is complete; may
- * be NULL
- * @param initiator The AdminInitiator to call if the operation may begin; may
- * be NULL
- *
- * @return true if the drain was initiated; if not, the waiter
- * will be notified
- **/
-bool startDraining(AdminState *state,
- AdminStateCode operation,
- VDOCompletion *waiter,
- AdminInitiator *initiator);
-
-/**
- * Finish a drain operation if one was in progress.
- *
- * @param state The AdminState to query
- *
- * @return true if the state was draining; will notify the waiter
- * if so
- **/
-bool finishDraining(AdminState *state);
-
-/**
- * Finish a drain operation with a status code.
- *
- * @param state The AdminState to query
- * @param result The result of the drain operation
- *
- * @return true if the state was draining; will notify the
- * waiter if so
- **/
-bool finishDrainingWithResult(AdminState *state, int result);
-
-/**
- * Check that an operation is a load.
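The drain functions declared here are meant to be used as a unit: a component calls startDraining() with an AdminInitiator, the initiator launches whatever asynchronous work empties the component, and the completion path calls finishDrainingWithResult(), which notifies the waiter. A minimal sketch of that shape follows; MyComponent and flushInternalIO() are invented for illustration, and only the adminState calls come from this header.

/* Hypothetical component showing the intended drain lifecycle. */
typedef struct {
  AdminState adminState;   /* first member, so the initiator can recover it */
  /* ... component-specific fields ... */
} MyComponent;

void flushInternalIO(MyComponent *component);   /* asynchronous, invented */

/* AdminInitiator: called by the state machine once the drain may proceed. */
static void initiateDrain(AdminState *state)
{
  MyComponent *component = (MyComponent *) state;   /* adminState is first */
  flushInternalIO(component);
}

/* Entry point for suspending the component. */
void drainMyComponent(MyComponent *component, VDOCompletion *parent)
{
  // ADMIN_STATE_SUSPENDING has ADMIN_FLAG_DRAINING set, so it passes
  // assertDrainOperation(); on any failure the parent is notified for us.
  startDraining(&component->adminState, ADMIN_STATE_SUSPENDING, parent,
                initiateDrain);
}

/* Called from the component's completion path once its I/O has drained. */
void myComponentDrainDone(MyComponent *component, int result)
{
  // Ends the operation and notifies the waiter passed to startDraining().
  finishDrainingWithResult(&component->adminState, result);
}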
- * - * @param operation The operation to check - * @param waiter The completion to finish with an error if the operation is - * not a load - * - * @return true if the specified operation is a load - **/ -bool assertLoadOperation(AdminStateCode operation, VDOCompletion *waiter) - __attribute__((warn_unused_result)); - -/** - * Initiate a load operation if the current state permits it. - * - * @param state The AdminState - * @param operation The type of load to initiate - * @param waiter The completion to notify when the load is complete; may be - * NULL - * @param initiator The AdminInitiator to call if the operation may begin; may - * be NULL - * - * @return true if the load was initiated, if not the waiter - * will be notified - **/ -bool startLoading(AdminState *state, - AdminStateCode operation, - VDOCompletion *waiter, - AdminInitiator *initiator); - -/** - * Finish a load operation if one was in progress. - * - * @param state The AdminState to query - * - * @return true if the state was loading; will notify the waiter - * if so - **/ -bool finishLoading(AdminState *state); - -/** - * Finish a load operation with a status code. - * - * @param state The AdminState to query - * @param result The result of the load operation - * - * @return true if the state was loading; will notify the - * waiter if so - **/ -bool finishLoadingWithResult(AdminState *state, int result); - -/** - * Check whether an AdminStateCode is a resume operation. - * - * @param operation The operation to check - * @param waiter The completion to notify if the operation is not a resume - * operation; may be NULL - * - * @return true if the code is a resume operation - **/ -bool assertResumeOperation(AdminStateCode operation, VDOCompletion *waiter); - -/** - * Initiate a resume operation if the current state permits it. - * - * @param state The AdminState - * @param operation The type of resume to start - * @param waiter The completion to notify when the resume is complete; may - * be NULL - * @param initiator The AdminInitiator to call if the operation may begin; may - * be NULL - * - * @return true if the resume was initiated, if not the waiter - * will be notified - **/ -bool startResuming(AdminState *state, - AdminStateCode operation, - VDOCompletion *waiter, - AdminInitiator *initiator); - -/** - * Finish a resume operation if one was in progress. - * - * @param state The AdminState to query - * - * @return true if the state was resuming; will notify the waiter - * if so - **/ -bool finishResuming(AdminState *state); - -/** - * Finish a resume operation with a status code. - * - * @param state The AdminState to query - * @param result The result of the resume operation - * - * @return true if the state was resuming; will notify the - * waiter if so - **/ -bool finishResumingWithResult(AdminState *state, int result); - -/** - * Change the state to normal operation if the current state is quiescent. - * - * @param state The AdminState to resume - * - * @return VDO_SUCCESS if the state resumed, VDO_INVALID_ADMIN_STATE otherwise - **/ -int resumeIfQuiescent(AdminState *state); - -/** - * Attempt to start an operation. - * - * @param state the AdminState - * @param operation the operation to start - * - * @return VDO_SUCCESS if the operation was started - * VDO_INVALID_ADMIN_STATE if not - **/ -int startOperation(AdminState *state, AdminStateCode operation); - -/** - * Attempt to start an operation. 
- * - * @param state the AdminState - * @param operation the operation to start - * @param waiter the completion to notify when the operation completes or - * fails to start; may be NULL - * @param initiator The AdminInitiator to call if the operation may begin; may - * be NULL - * - * @return true if the operation was started - **/ -bool startOperationWithWaiter(AdminState *state, - AdminStateCode operation, - VDOCompletion *waiter, - AdminInitiator *initiator); - -/** - * Finish the current operation. Will notify the operation waiter if there is - * one. This method should be used for operations started with - * startOperation(). For operations which were started with startDraining(), - * use finishDraining() instead. - * - * @param state The state whose operation is to be finished - * - * @return true if there was an operation to finish - **/ -bool finishOperation(AdminState *state); - -/** - * Finish the current operation with a status code. Will notify the operation - * waiter if there is one. - * - * @param state The state whose operation is to be finished - * @param result The result of the operation - **/ -bool finishOperationWithResult(AdminState *state, int result); - -/** - * Set a result for the current operation. - * - * @param state the AdminState - * @param result the result to set; if there is no waiter, this is a no-op - **/ -static inline void setOperationResult(AdminState *state, int result) -{ - if (state->waiter != NULL) { - setCompletionResult(state->waiter, result); - } -} - -#endif // ADMIN_STATE_H diff --git a/vdo/base/allocatingVIO.c b/vdo/base/allocatingVIO.c deleted file mode 100644 index 4e0ffa8..0000000 --- a/vdo/base/allocatingVIO.c +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/allocatingVIO.c#4 $ - */ - -#include "allocatingVIO.h" - -#include "logger.h" - -#include "allocationSelector.h" -#include "blockAllocator.h" -#include "dataVIO.h" -#include "pbnLock.h" -#include "slabDepot.h" -#include "types.h" -#include "vdoInternal.h" -#include "vioWrite.h" - -/** - * Make a single attempt to acquire a write lock on a newly-allocated PBN. 
- * - * @param allocatingVIO The AllocatingVIO that wants a write lock for its - * newly allocated block - * - * @return VDO_SUCCESS or an error code - **/ -static int attemptPBNWriteLock(AllocatingVIO *allocatingVIO) -{ - assertInPhysicalZone(allocatingVIO); - - ASSERT_LOG_ONLY(allocatingVIO->allocationLock == NULL, - "must not acquire a lock while already referencing one"); - - PBNLock *lock; - int result = attemptPBNLock(allocatingVIO->zone, allocatingVIO->allocation, - allocatingVIO->writeLockType, &lock); - if (result != VDO_SUCCESS) { - return result; - } - - if (lock->holderCount > 0) { - // This block is already locked, which should be impossible. - return logErrorWithStringError(VDO_LOCK_ERROR, - "Newly allocated block %" PRIu64 - " was spuriously locked (holderCount=%u)", - allocatingVIO->allocation, - lock->holderCount); - } - - // We've successfully acquired a new lock, so mark it as ours. - lock->holderCount += 1; - allocatingVIO->allocationLock = lock; - assignProvisionalReference(lock); - return VDO_SUCCESS; -} - -/** - * Attempt to allocate and lock a physical block. If successful, continue - * along the write path. - * - * @param allocatingVIO The AllocatingVIO which needs an allocation - * - * @return VDO_SUCCESS or an error if a block could not be allocated - **/ -static int allocateAndLockBlock(AllocatingVIO *allocatingVIO) -{ - BlockAllocator *allocator = getBlockAllocator(allocatingVIO->zone); - int result = allocateBlock(allocator, &allocatingVIO->allocation); - if (result != VDO_SUCCESS) { - return result; - } - - result = attemptPBNWriteLock(allocatingVIO); - if (result != VDO_SUCCESS) { - return result; - } - - // We got a block! - VIO *vio = allocatingVIOAsVIO(allocatingVIO); - vio->physical = allocatingVIO->allocation; - allocatingVIO->allocationCallback(allocatingVIO); - return VDO_SUCCESS; -} - -static void allocateBlockForWrite(VDOCompletion *completion); - -/** - * Retry allocating a block for write. - * - * @param waiter The AllocatingVIO that was waiting to allocate - * @param context The context (unused) - **/ -static void -retryAllocateBlockForWrite(Waiter *waiter, - void *context __attribute__((unused))) -{ - AllocatingVIO *allocatingVIO = waiterAsAllocatingVIO(waiter); - allocateBlockForWrite(allocatingVIOAsCompletion(allocatingVIO)); -} - -/** - * Attempt to enqueue an AllocatingVIO to wait for a slab to be scrubbed in the - * current allocation zone. - * - * @param allocatingVIO The AllocatingVIO which wants to allocate a block - * - * @return VDO_SUCCESS if the AllocatingVIO was queued, VDO_NO_SPACE if there - * are no slabs to be scrubbed in the current zone, or some other - * error - **/ -static int waitForCleanSlab(AllocatingVIO *allocatingVIO) -{ - Waiter *waiter = allocatingVIOAsWaiter(allocatingVIO); - waiter->callback = retryAllocateBlockForWrite; - - BlockAllocator *allocator = getBlockAllocator(allocatingVIO->zone); - int result = enqueueForCleanSlab(allocator, waiter); - if (result != VDO_SUCCESS) { - return result; - } - - // We've successfully enqueued, when we come back, pretend like we've - // never tried this allocation before. - allocatingVIO->waitForCleanSlab = false; - allocatingVIO->allocationAttempts = 0; - return VDO_SUCCESS; -} - -/** - * Attempt to allocate a block in an AllocatingVIO's current allocation zone. 
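The function that follows, allocateBlockInZone(), ties these pieces into a two-pass policy: try every physical zone for a free block, and only then go around again waiting for slabs to be scrubbed before giving up. The sketch below is a simplified, synchronous model of that policy; the array names and pickZone() are invented, and the real code enqueues the AllocatingVIO and retries asynchronously.

#include <stdbool.h>
#include <stdio.h>

/* Returns the zone that could satisfy the request, or -1 if none can. */
static int pickZone(const bool zoneHasFreeBlock[],
                    const bool zoneHasDirtySlab[],
                    int zoneCount, int startZone)
{
  int zone = startZone;
  // First pass: look for a zone that can allocate immediately.
  for (int attempts = 0; attempts < zoneCount; attempts++) {
    if (zoneHasFreeBlock[zone]) {
      return zone;
    }
    zone = (zone + 1) % zoneCount;      // advance to the next physical zone
  }
  // Second pass: look for a zone with slabs to scrub; the real code enqueues
  // the VIO there and retries the allocation once a slab has been cleaned.
  for (int attempts = 0; attempts < zoneCount; attempts++) {
    if (zoneHasDirtySlab[zone]) {
      return zone;
    }
    zone = (zone + 1) % zoneCount;
  }
  return -1;   // no free blocks and nothing to scrub: proceed unallocated
}

int main(void)
{
  bool freeBlocks[] = { false, false, false };
  bool dirtySlabs[] = { false, true,  false };
  printf("chose zone %d\n", pickZone(freeBlocks, dirtySlabs, 3, 0));  // 1
  return 0;
}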
- * - * @param allocatingVIO The AllocatingVIO - * - * @return VDO_SUCCESS or an error - **/ -static int allocateBlockInZone(AllocatingVIO *allocatingVIO) -{ - allocatingVIO->allocationAttempts++; - int result = allocateAndLockBlock(allocatingVIO); - if (result != VDO_NO_SPACE) { - return result; - } - - if (allocatingVIO->waitForCleanSlab) { - result = waitForCleanSlab(allocatingVIO); - if (result != VDO_NO_SPACE) { - return result; - } - } - - VDO *vdo = getVDOFromAllocatingVIO(allocatingVIO); - const ThreadConfig *threadConfig = getThreadConfig(vdo); - if (allocatingVIO->allocationAttempts >= threadConfig->physicalZoneCount) { - if (allocatingVIO->waitForCleanSlab) { - // There were no free blocks in any zone, and no zone had slabs to - // scrub. - allocatingVIO->allocationCallback(allocatingVIO); - return VDO_SUCCESS; - } - - allocatingVIO->waitForCleanSlab = true; - allocatingVIO->allocationAttempts = 0; - } - - // Try the next zone - ZoneCount zoneNumber = getPhysicalZoneNumber(allocatingVIO->zone) + 1; - if (zoneNumber == threadConfig->physicalZoneCount) { - zoneNumber = 0; - } - allocatingVIO->zone = vdo->physicalZones[zoneNumber]; - launchPhysicalZoneCallback(allocatingVIO, allocateBlockForWrite, - THIS_LOCATION("$F;cb=allocBlockInZone")); - return VDO_SUCCESS; -} - -/** - * Attempt to allocate a block. This callback is registered in - * allocateDataBlock() and allocateBlockInZone(). - * - * @param completion The AllocatingVIO needing an allocation - **/ -static void allocateBlockForWrite(VDOCompletion *completion) -{ - AllocatingVIO *allocatingVIO = asAllocatingVIO(completion); - assertInPhysicalZone(allocatingVIO); - allocatingVIOAddTraceRecord(allocatingVIO, THIS_LOCATION(NULL)); - int result = allocateBlockInZone(allocatingVIO); - if (result != VDO_SUCCESS) { - setCompletionResult(completion, result); - allocatingVIO->allocationCallback(allocatingVIO); - } -} - -/**********************************************************************/ -void allocateDataBlock(AllocatingVIO *allocatingVIO, - AllocationSelector *selector, - PBNLockType writeLockType, - AllocationCallback *callback) -{ - allocatingVIO->writeLockType = writeLockType; - allocatingVIO->allocationCallback = callback; - allocatingVIO->allocationAttempts = 0; - allocatingVIO->allocation = ZERO_BLOCK; - - VIO *vio = allocatingVIOAsVIO(allocatingVIO); - allocatingVIO->zone - = vio->vdo->physicalZones[getNextAllocationZone(selector)]; - - launchPhysicalZoneCallback(allocatingVIO, allocateBlockForWrite, - THIS_LOCATION("$F;cb=allocDataBlock")); -} - -/**********************************************************************/ -void releaseAllocationLock(AllocatingVIO *allocatingVIO) -{ - assertInPhysicalZone(allocatingVIO); - PhysicalBlockNumber lockedPBN = allocatingVIO->allocation; - if (hasProvisionalReference(allocatingVIO->allocationLock)) { - allocatingVIO->allocation = ZERO_BLOCK; - } - - releasePBNLock(allocatingVIO->zone, lockedPBN, - &allocatingVIO->allocationLock); -} - -/**********************************************************************/ -void resetAllocation(AllocatingVIO *allocatingVIO) -{ - ASSERT_LOG_ONLY(allocatingVIO->allocationLock == NULL, - "must not reset allocation while holding a PBN lock"); - - allocatingVIOAsVIO(allocatingVIO)->physical = ZERO_BLOCK; - allocatingVIO->zone = NULL; - allocatingVIO->allocation = ZERO_BLOCK; - allocatingVIO->allocationAttempts = 0; - allocatingVIO->waitForCleanSlab = false; -} diff --git a/vdo/base/allocatingVIO.h b/vdo/base/allocatingVIO.h deleted file mode 100644 
index a2f2b7b..0000000 --- a/vdo/base/allocatingVIO.h +++ /dev/null @@ -1,269 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/allocatingVIO.h#4 $ - */ - -#ifndef ALLOCATING_VIO_H -#define ALLOCATING_VIO_H - -#include "atomic.h" -#include "pbnLock.h" -#include "physicalZone.h" -#include "types.h" -#include "vio.h" -#include "waitQueue.h" - -typedef void AllocationCallback(AllocatingVIO *allocationVIO); - -/** - * A VIO which can receive an allocation from the block allocator. Currently, - * these are used both for servicing external data requests and for compressed - * block writes. - **/ -struct allocatingVIO { - /** The underlying VIO */ - VIO vio; - - /** The WaitQueue entry structure */ - Waiter waiter; - - /** The physical zone in which to allocate a physical block */ - PhysicalZone *zone; - - /** The block allocated to this VIO */ - PhysicalBlockNumber allocation; - - /** - * If non-NULL, the pooled PBN lock held on the allocated block. Must be a - * write lock until the block has been written, after which it will become a - * read lock. - **/ - PBNLock *allocationLock; - - /** The type of write lock to obtain on the allocated block */ - PBNLockType writeLockType; - - /** The number of zones in which this VIO has attempted an allocation */ - ZoneCount allocationAttempts; - - /** Whether this VIO should wait for a clean slab */ - bool waitForCleanSlab; - - /** The function to call once allocation is complete */ - AllocationCallback *allocationCallback; -}; - -/** - * Convert a VIO to an AllocatingVIO. - * - * @param vio The VIO to convert - * - * @return The VIO as an AllocatingVIO - **/ -static inline AllocatingVIO *vioAsAllocatingVIO(VIO *vio) -{ - STATIC_ASSERT(offsetof(AllocatingVIO, vio) == 0); - ASSERT_LOG_ONLY(((vio->type == VIO_TYPE_DATA) - || (vio->type == VIO_TYPE_COMPRESSED_BLOCK)), - "VIO is an AllocatingVIO"); - return (AllocatingVIO *) vio; -} - -/** - * Convert an AllocatingVIO to a VIO. - * - * @param allocatingVIO The AllocatingVIO to convert - * - * @return The AllocatingVIO as a VIO - **/ -static inline VIO *allocatingVIOAsVIO(AllocatingVIO *allocatingVIO) -{ - return &allocatingVIO->vio; -} - -/** - * Convert a generic VDOCompletion to an AllocatingVIO. - * - * @param completion The completion to convert - * - * @return The completion as an AllocatingVIO - **/ -static inline AllocatingVIO *asAllocatingVIO(VDOCompletion *completion) -{ - return vioAsAllocatingVIO(asVIO(completion)); -} - -/** - * Convert an AllocatingVIO to a generic completion. 
- * - * @param allocatingVIO The AllocatingVIO to convert - * - * @return The AllocatingVIO as a completion - **/ -static inline -VDOCompletion *allocatingVIOAsCompletion(AllocatingVIO *allocatingVIO) -{ - return vioAsCompletion(allocatingVIOAsVIO(allocatingVIO)); -} - -/** - * Convert an AllocatingVIO to a generic wait queue entry. - * - * @param allocatingVIO The AllocatingVIO to convert - * - * @return The AllocatingVIO as a wait queue entry - **/ -static inline Waiter *allocatingVIOAsWaiter(AllocatingVIO *allocatingVIO) -{ - return &allocatingVIO->waiter; -} - -/** - * Convert an AllocatingVIO's generic wait queue entry back to the - * AllocatingVIO. - * - * @param waiter The wait queue entry to convert - * - * @return The wait queue entry as an AllocatingVIO - **/ -static inline AllocatingVIO *waiterAsAllocatingVIO(Waiter *waiter) -{ - if (waiter == NULL) { - return NULL; - } - - return - (AllocatingVIO *) ((uintptr_t) waiter - offsetof(AllocatingVIO, waiter)); -} - -/** - * Check whether an AllocatingVIO is a compressed block write. - * - * @param allocatingVIO The AllocatingVIO to check - * - * @return true if the AllocatingVIO is a compressed block write - **/ -static inline bool isCompressedWriteAllocatingVIO(AllocatingVIO *allocatingVIO) -{ - return isCompressedWriteVIO(allocatingVIOAsVIO(allocatingVIO)); -} - -/** - * Add a trace record for the current source location. - * - * @param allocatingVIO The AllocatingVIO structure to be updated - * @param location The source-location descriptor to be recorded - **/ -static inline void allocatingVIOAddTraceRecord(AllocatingVIO *allocatingVIO, - TraceLocation location) -{ - vioAddTraceRecord(allocatingVIOAsVIO(allocatingVIO), location); -} - -/** - * Get the VDO from an AllocatingVIO. - * - * @param allocatingVIO The AllocatingVIO from which to get the VDO - * - * @return The VDO to which an AllocatingVIO belongs - **/ -static inline VDO *getVDOFromAllocatingVIO(AllocatingVIO *allocatingVIO) -{ - return allocatingVIOAsVIO(allocatingVIO)->vdo; -} - -/** - * Check that an AllocatingVIO is running on the physical zone thread in - * which it did its allocation. - * - * @param allocatingVIO The AllocatingVIO in question - **/ -static inline void assertInPhysicalZone(AllocatingVIO *allocatingVIO) -{ - ThreadID expected = getPhysicalZoneThreadID(allocatingVIO->zone); - ThreadID threadID = getCallbackThreadID(); - ASSERT_LOG_ONLY((expected == threadID), - "AllocatingVIO for allocated physical block %" PRIu64 - " on thread %u, should be on thread %u", - allocatingVIO->allocation, threadID, expected); -} - -/** - * Set a callback as a physical block operation in an AllocatingVIO's allocated - * zone. - * - * @param allocatingVIO The AllocatingVIO - * @param callback The callback to set - * @param location The tracing info for the call site - **/ -static inline void setPhysicalZoneCallback(AllocatingVIO *allocatingVIO, - VDOAction *callback, - TraceLocation location) -{ - setCallback(allocatingVIOAsCompletion(allocatingVIO), callback, - getPhysicalZoneThreadID(allocatingVIO->zone)); - allocatingVIOAddTraceRecord(allocatingVIO, location); -} - -/** - * Set a callback as a physical block operation in an AllocatingVIO's allocated - * zone and invoke it immediately. 
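waiterAsAllocatingVIO() above is the usual container-of idiom: given a pointer to a member embedded in a larger structure, subtract the member's offset to recover the containing structure. A self-contained demonstration with invented types:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

typedef struct { int dummy; } Waiter;    /* stand-in for the real Waiter */

typedef struct {
  int    payload;
  Waiter waiter;                          /* embedded queue entry */
} Container;

/* Recover the Container from a pointer to its embedded Waiter. */
static Container *waiterAsContainer(Waiter *waiter)
{
  return (Container *) ((uintptr_t) waiter - offsetof(Container, waiter));
}

int main(void)
{
  Container c = { .payload = 42 };
  assert(waiterAsContainer(&c.waiter) == &c);   /* recovers the container */
  return 0;
}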
- * - * @param allocatingVIO The AllocatingVIO - * @param callback The callback to invoke - * @param location The tracing info for the call site - **/ -static inline void launchPhysicalZoneCallback(AllocatingVIO *allocatingVIO, - VDOAction *callback, - TraceLocation location) -{ - setPhysicalZoneCallback(allocatingVIO, callback, location); - invokeCallback(allocatingVIOAsCompletion(allocatingVIO)); -} - -/** - * Allocate a data block to an AllocatingVIO. - * - * @param allocatingVIO The AllocatingVIO which needs an allocation - * @param selector The allocation selector for deciding which physical - * zone to allocate from - * @param writeLockType The type of write lock to obtain on the block - * @param callback The function to call once the allocation is complete - **/ -void allocateDataBlock(AllocatingVIO *allocatingVIO, - AllocationSelector *selector, - PBNLockType writeLockType, - AllocationCallback *callback); - -/** - * Release the PBN lock on the allocated block. If the reference to the locked - * block is still provisional, it will be released as well. - * - * @param allocatingVIO The lock holder - **/ -void releaseAllocationLock(AllocatingVIO *allocatingVIO); - -/** - * Reset an AllocatingVIO after it has done an allocation. - * - * @param allocatingVIO The AllocatingVIO - **/ -void resetAllocation(AllocatingVIO *allocatingVIO); - -#endif // ALLOCATING_VIO_H diff --git a/vdo/base/allocationSelector.c b/vdo/base/allocationSelector.c deleted file mode 100644 index e703d09..0000000 --- a/vdo/base/allocationSelector.c +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/allocationSelector.c#1 $ - */ - -#include "allocationSelector.h" -#include "allocationSelectorInternals.h" - -#include "memoryAlloc.h" - -#include "types.h" - -enum { - ALLOCATIONS_PER_ZONE = 128, -}; - -/**********************************************************************/ -int makeAllocationSelector(ZoneCount physicalZoneCount, - ThreadID threadID, - AllocationSelector **selectorPtr) -{ - AllocationSelector *selector; - int result = ALLOCATE(1, AllocationSelector, __func__, &selector); - if (result != VDO_SUCCESS) { - return result; - } - - *selector = (AllocationSelector) { - .nextAllocationZone = threadID % physicalZoneCount, - .lastPhysicalZone = physicalZoneCount - 1, - }; - - *selectorPtr = selector; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freeAllocationSelector(AllocationSelector **selectorPtr) -{ - AllocationSelector *selector = *selectorPtr; - if (selector == NULL) { - return; - } - - FREE(selector); - *selectorPtr = NULL; -} - -/**********************************************************************/ -ZoneCount getNextAllocationZone(AllocationSelector *selector) -{ - if (selector->lastPhysicalZone > 0) { - if (selector->allocationCount < ALLOCATIONS_PER_ZONE) { - selector->allocationCount++; - } else { - selector->allocationCount = 1; - if (selector->nextAllocationZone < selector->lastPhysicalZone) { - selector->nextAllocationZone++; - } else { - selector->nextAllocationZone = 0; - } - } - } - - return selector->nextAllocationZone; -} diff --git a/vdo/base/allocationSelector.h b/vdo/base/allocationSelector.h deleted file mode 100644 index 7b922e9..0000000 --- a/vdo/base/allocationSelector.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/allocationSelector.h#1 $ - */ - -#ifndef ALLOCATION_SELECTOR_H -#define ALLOCATION_SELECTOR_H - -#include "completion.h" - -/** - * An AllocationSelector is used by any zone which does data block allocations. - * The selector is used to round-robin allocation requests to different - * physical zones. Currently, 128 allocations will be made to a given physical - * zone before switching to the next. - **/ - -/** - * Make a new allocation selector. - * - * @param [in] physicalZoneCount The number of physical zones - * @param [in] threadID The ID of the thread using this selector - * @param [out] selectorPtr A pointer to receive the new selector - * - * @return VDO_SUCCESS or an error - **/ -int makeAllocationSelector(ZoneCount physicalZoneCount, - ThreadID threadID, - AllocationSelector **selectorPtr) - __attribute__((warn_unused_result)); - -/** - * Free an AllocationSelector and null out the reference to it. 
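The selector's behavior is easiest to see in isolation: it hands out the same zone for ALLOCATIONS_PER_ZONE allocations, then advances, wrapping after the last zone. The standalone sketch below re-creates the selector fields locally (the real structure lives in allocationSelectorInternals.h, later in this patch) and prints the chosen zone around the switch-over points.

#include <stdio.h>

enum { ALLOCATIONS_PER_ZONE = 128 };

/* Local copy of the selector fields, for demonstration only. */
typedef struct {
  unsigned int allocationCount;
  unsigned int nextAllocationZone;
  unsigned int lastPhysicalZone;
} Selector;

/* Mirrors the logic of getNextAllocationZone(). */
static unsigned int nextZone(Selector *s)
{
  if (s->lastPhysicalZone > 0) {
    if (s->allocationCount < ALLOCATIONS_PER_ZONE) {
      s->allocationCount++;
    } else {
      s->allocationCount = 1;
      s->nextAllocationZone = (s->nextAllocationZone == s->lastPhysicalZone)
                              ? 0 : s->nextAllocationZone + 1;
    }
  }
  return s->nextAllocationZone;
}

int main(void)
{
  Selector s = { .nextAllocationZone = 0, .lastPhysicalZone = 2 };
  for (int i = 0; i < 300; i++) {
    unsigned int zone = nextZone(&s);
    if (i == 0 || i == 127 || i == 128 || i == 255 || i == 256) {
      printf("allocation %3d -> zone %u\n", i, zone);  // 0,0,1,1,2
    }
  }
  return 0;
}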
- * - * @param selectorPtr A reference to the selector to free - **/ -void freeAllocationSelector(AllocationSelector **selectorPtr); - -/** - * Get number of the physical zone from which to allocate next. - * - * @param selector The selector to query - * - * @return The number of the physical zone from which to allocate - **/ -ZoneCount getNextAllocationZone(AllocationSelector *selector) - __attribute__((warn_unused_result)); - -#endif /* ALLOCATION_SELECTOR_H */ diff --git a/vdo/base/allocationSelectorInternals.h b/vdo/base/allocationSelectorInternals.h deleted file mode 100644 index 13df50f..0000000 --- a/vdo/base/allocationSelectorInternals.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/allocationSelectorInternals.h#1 $ - */ - -#ifndef ALLOCATION_SELECTOR_INTERNALS_H -#define ALLOCATION_SELECTOR_INTERNALS_H - -#include "types.h" - -/** Structure used to select which physical zone to allocate from */ -struct allocationSelector { - /** The number of allocations done in the current zone */ - BlockCount allocationCount; - /** The physical zone to allocate from next */ - ZoneCount nextAllocationZone; - /** The number of the last physical zone */ - ZoneCount lastPhysicalZone; -}; - -#endif /* ALLOCATION_SELECTOR_INTERNALS_H */ diff --git a/vdo/base/atomic.h b/vdo/base/atomic.h deleted file mode 100644 index 93b7318..0000000 --- a/vdo/base/atomic.h +++ /dev/null @@ -1,375 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/atomic.h#2 $ - */ - -#ifndef ATOMIC_H -#define ATOMIC_H - -#include "atomicDefs.h" -#include "compiler.h" -#include "typeDefs.h" - -#define ATOMIC_INITIALIZER(value) { (value) } - -typedef struct { - atomic_t value; -} __attribute__((aligned(4))) Atomic32; - -typedef struct { - atomic64_t value; -} __attribute__((aligned(8))) Atomic64; - -typedef struct { - Atomic32 value; -} __attribute__((aligned(4))) AtomicBool; - -/** - * Memory load operations that precede this fence will be prevented from - * changing order with any that follow this fence, by either the compiler or - * the CPU. This can be used to ensure that the load operations accessing - * the fields of a structure are not re-ordered so they actually take effect - * before a pointer to the structure is resolved. - **/ -static INLINE void loadFence(void) -{ - smp_rmb(); -} - -/** - * Memory store operations that precede this fence will be prevented from - * changing order with any that follow this fence, by either the compiler or - * the CPU. This can be used to ensure that the store operations initializing - * the fields of a structure are not re-ordered so they actually take effect - * after a pointer to the structure is published. - **/ -static INLINE void storeFence(void) -{ - smp_wmb(); -} - -/** - * Generate a full memory fence for the compiler and CPU. Load and store - * operations issued before the fence will not be re-ordered with operations - * issued after the fence. - **/ -static INLINE void memoryFence(void) -{ - smp_mb(); -} - -/** - * Access the value of a 32-bit atomic variable, ensuring that the load is not - * re-ordered by the compiler or CPU with any subsequent load operations. - * - * @param atom a pointer to the atomic variable to access - * - * @return the value that was in the atom at the moment it was accessed - **/ -static INLINE uint32_t atomicLoad32(const Atomic32 *atom) -{ - uint32_t value = atomic_read(&atom->value); - loadFence(); - return value; -} - -/** - * Access the value of a 64-bit atomic variable, ensuring that the memory load - * is not re-ordered by the compiler or CPU with any subsequent load - * operations. - * - * @param atom a pointer to the atomic variable to access - * - * @return the value that was in the atom at the moment it was accessed - **/ -static INLINE uint64_t atomicLoad64(const Atomic64 *atom) -{ - uint64_t value = atomic64_read(&atom->value); - loadFence(); - return value; -} - -/** - * Access the value of a boolean atomic variable, ensuring that the load is not - * re-ordered by the compiler or CPU with any subsequent load operations. - * - * @param atom a pointer to the atomic variable to access - * - * @return the value that was in the atom at the moment it was accessed - **/ -static INLINE bool atomicLoadBool(const AtomicBool *atom) -{ - return (atomicLoad32(&atom->value) > 0); -} - -/** - * Set the value of a 32-bit atomic variable, ensuring that the memory store - * operation is not re-ordered by the compiler or CPU with any preceding store - * operations. - * - * @param atom a pointer to the atomic variable to modify - * @param newValue the value to assign to the atomic variable - **/ -static INLINE void atomicStore32(Atomic32 *atom, uint32_t newValue) -{ - storeFence(); - atomic_set(&atom->value, newValue); -} - -/** - * Set the value of a 64-bit atomic variable, ensuring that the memory store - * operation is not re-ordered by the compiler or CPU with any preceding store - * operations. 
- * - * @param atom a pointer to the atomic variable to modify - * @param newValue the value to assign to the atomic variable - **/ -static INLINE void atomicStore64(Atomic64 *atom, uint64_t newValue) -{ - storeFence(); - atomic64_set(&atom->value, newValue); -} - -/** - * Set the value of a boolean atomic variable, ensuring that the memory store - * operation is not re-ordered by the compiler or CPU with any preceding store - * operations. - * - * @param atom a pointer to the atomic variable to modify - * @param newValue the value to assign to the atomic variable - **/ -static INLINE void atomicStoreBool(AtomicBool *atom, bool newValue) -{ - atomicStore32(&atom->value, (newValue ? 1 : 0)); -} - -/** - * Add a 32-bit signed delta to a 32-bit atomic variable. - * - * @param atom a pointer to the atomic variable - * @param delta the value to be added (or subtracted) from the variable - * - * @return the new value of the atom after the add operation - **/ -static INLINE uint32_t atomicAdd32(Atomic32 *atom, int32_t delta) -{ - return atomic_add_return(delta, &atom->value); -} - -/** - * Add a 64-bit signed delta to a 64-bit atomic variable. - * - * @param atom a pointer to the atomic variable - * @param delta the value to be added (or subtracted) from the variable - * - * @return the new value of the atom after the add operation - **/ -static INLINE uint64_t atomicAdd64(Atomic64 *atom, int64_t delta) -{ - return atomic64_add_return(delta, &atom->value); -} - -/** - * Atomic 32-bit compare-and-swap. If the atom is identical to a required - * value, atomically replace it with the new value and return true, otherwise - * do nothing and return false. - * - * @param atom a pointer to the atomic variable - * @param requiredValue the value that must be present to perform the swap - * @param newValue the value to be swapped for the required value - * - * @return true if the atom was changed, false otherwise - **/ -static INLINE bool compareAndSwap32(Atomic32 *atom, - uint32_t requiredValue, - uint32_t newValue) -{ - /* - * Our initial implementation, for x86, effectively got a full - * memory barrier because of how "lock cmpxchg" operates. The - * atomic_cmpxchg interface provides for a full barrier *if* the - * exchange is done, but not necessarily if it is not. - * - * Do we need the full barrier always? We need to investigate that, - * as part of (eventually) converting to using that API directly. - * For now, play it safe, and ensure the same behavior on other - * architectures too. - */ -#ifndef __x86_64__ - smp_mb(); -#endif - int oldValue = atomic_cmpxchg(&atom->value, requiredValue, newValue); -#ifndef __x86_64__ - smp_mb(); -#endif - return requiredValue == (uint32_t) oldValue; -} - -/** - * Atomic 64-bit compare-and-swap. If the atom is identical to a required - * value, atomically replace it with the new value and return true, otherwise - * do nothing and return false. - * - * @param atom a pointer to the atomic variable - * @param requiredValue the value that must be present to perform the swap - * @param newValue the value to be swapped for the required value - * - * @return true if the atom was changed, false otherwise - **/ -static INLINE bool compareAndSwap64(Atomic64 *atom, - uint64_t requiredValue, - uint64_t newValue) -{ -#ifndef __x86_64__ - smp_mb(); -#endif - long oldValue = atomic64_cmpxchg(&atom->value, requiredValue, newValue); -#ifndef __x86_64__ - smp_mb(); -#endif - return requiredValue == (uint64_t) oldValue; -} - -/** - * Atomic boolean compare-and-swap. 
If the atom is identical to a required - * value, atomically replace it with the new value and return true, otherwise - * do nothing and return false. - * - * @param atom a pointer to the atomic variable - * @param requiredValue the value that must be present to perform the swap - * @param newValue the value to be swapped for the required value - * - * @return true if the atom was changed, false otherwise - **/ -static INLINE bool compareAndSwapBool(AtomicBool *atom, - bool requiredValue, - bool newValue) -{ - return compareAndSwap32(&atom->value, (requiredValue ? 1 : 0), - (newValue ? 1 : 0)); -} - -/** - * Access the value of a 32-bit atomic variable using relaxed memory order, - * without any compiler or CPU fences. - * - * @param atom a pointer to the atomic variable to access - * - * @return the value that was in the atom at the moment it was accessed - **/ -static INLINE uint32_t relaxedLoad32(const Atomic32 *atom) -{ - return atomic_read(&atom->value); -} - -/** - * Access the value of a 64-bit atomic variable using relaxed memory order, - * without any compiler or CPU fences. - * - * @param atom a pointer to the atomic variable to access - * - * @return the value that was in the atom at the moment it was accessed - **/ -static INLINE uint64_t relaxedLoad64(const Atomic64 *atom) -{ - return atomic64_read(&atom->value); -} - -/** - * Access the value of a boolean atomic variable using relaxed memory order, - * without any compiler or CPU fences. - * - * @param atom a pointer to the atomic variable to access - * - * @return the value that was in the atom at the moment it was accessed - **/ -static INLINE bool relaxedLoadBool(const AtomicBool *atom) -{ - return (relaxedLoad32(&atom->value) > 0); -} - -/** - * Set the value of a 32-bit atomic variable using relaxed memory order, - * without any compiler or CPU fences. - * - * @param atom a pointer to the atomic variable to modify - * @param newValue the value to assign to the atomic variable - **/ -static INLINE void relaxedStore32(Atomic32 *atom, uint32_t newValue) -{ - atomic_set(&atom->value, newValue); -} - -/** - * Set the value of a 64-bit atomic variable using relaxed memory order, - * without any compiler or CPU fences. - * - * @param atom a pointer to the atomic variable to modify - * @param newValue the value to assign to the atomic variable - **/ -static INLINE void relaxedStore64(Atomic64 *atom, uint64_t newValue) -{ - atomic64_set(&atom->value, newValue); -} - -/** - * Set the value of a boolean atomic variable using relaxed memory order, - * without any compiler or CPU fences. - * - * @param atom a pointer to the atomic variable to modify - * @param newValue the value to assign to the atomic variable - **/ -static INLINE void relaxedStoreBool(AtomicBool *atom, bool newValue) -{ - relaxedStore32(&atom->value, (newValue ? 1 : 0)); -} - -/** - * Non-atomically add a 32-bit signed delta to a 32-bit atomic variable, - * without any compiler or CPU fences. - * - * @param atom a pointer to the atomic variable - * @param delta the value to be added (or subtracted) from the variable - * - * @return the new value of the atom after the add operation - **/ -static INLINE uint32_t relaxedAdd32(Atomic32 *atom, int32_t delta) -{ - uint32_t newValue = (relaxedLoad32(atom) + delta); - relaxedStore32(atom, newValue); - return newValue; -} - -/** - * Non-atomically add a 64-bit signed delta to a 64-bit atomic variable, - * without any compiler or CPU fences. 
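These compare-and-swap primitives are normally wrapped in a retry loop: load the current value, compute the desired value, attempt the swap, and start over if another thread won the race. A small sketch built on the Atomic64 API above; updateMaximum() itself is not part of this header.

/* Sketch only: raise *atom to at least newValue using the primitives above. */
static inline void updateMaximum(Atomic64 *atom, uint64_t newValue)
{
  for (;;) {
    uint64_t current = atomicLoad64(atom);
    if (current >= newValue) {
      return;                 // already at least as large; nothing to do
    }
    if (compareAndSwap64(atom, current, newValue)) {
      return;                 // we installed the new maximum
    }
    // Another thread changed the atom between the load and the swap;
    // reload and try again.
  }
}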
- * - * @param atom a pointer to the atomic variable - * @param delta the value to be added (or subtracted) from the variable - * - * @return the new value of the atom after the add operation - **/ -static INLINE uint64_t relaxedAdd64(Atomic64 *atom, int64_t delta) -{ - uint64_t newValue = (relaxedLoad64(atom) + delta); - relaxedStore64(atom, newValue); - return newValue; -} - -#endif /* ATOMIC_H */ diff --git a/vdo/base/blockAllocator.c b/vdo/base/blockAllocator.c deleted file mode 100644 index a1eaae4..0000000 --- a/vdo/base/blockAllocator.c +++ /dev/null @@ -1,952 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockAllocator.c#22 $ - */ - -#include "blockAllocatorInternals.h" - -#include "logger.h" -#include "memoryAlloc.h" - -#include "adminState.h" -#include "heap.h" -#include "numUtils.h" -#include "priorityTable.h" -#include "readOnlyNotifier.h" -#include "refCounts.h" -#include "slab.h" -#include "slabDepotInternals.h" -#include "slabIterator.h" -#include "slabJournalEraser.h" -#include "slabJournalInternals.h" -#include "slabScrubber.h" -#include "slabSummary.h" -#include "vdoRecovery.h" -#include "vio.h" -#include "vioPool.h" - -/** - * Assert that a block allocator function was called from the correct thread. - * - * @param threadID The allocator's thread id - * @param functionName The name of the function - **/ -static inline void assertOnAllocatorThread(ThreadID threadID, - const char *functionName) -{ - ASSERT_LOG_ONLY((getCallbackThreadID() == threadID), - "%s called on correct thread", functionName); -} - -/** - * Get the priority for a slab in the allocator's slab queue. Slabs are - * essentially prioritized by an approximation of the number of free blocks in - * the slab so slabs with lots of free blocks with be opened for allocation - * before slabs that have few free blocks. - * - * @param slab The slab whose queue priority is desired - * - * @return the queue priority of the slab - **/ -static unsigned int calculateSlabPriority(Slab *slab) -{ - BlockCount freeBlocks = getSlabFreeBlockCount(slab); - - // Slabs that are completely full must be the only ones with the lowest - // priority: zero. - if (freeBlocks == 0) { - return 0; - } - - /* - * Slabs that have never been opened (empty, newly initialized, never been - * written to) have lower priority than previously opened slabs that have a - * signficant number of free blocks. This ranking causes VDO to avoid - * writing physical blocks for the first time until there are very few free - * blocks that have been previously written to. That policy makes VDO a - * better client of any underlying storage that is thinly-provisioned - * [VDOSTORY-123]. 
- */ - unsigned int unopenedSlabPriority = slab->allocator->unopenedSlabPriority; - if (isSlabJournalBlank(slab->journal)) { - return unopenedSlabPriority; - } - - /* - * For all other slabs, the priority is derived from the logarithm of the - * number of free blocks. Slabs with the same order of magnitude of free - * blocks have the same priority. With 2^23 blocks, the priority will range - * from 1 to 25. The reserved unopenedSlabPriority divides the range and is - * skipped by the logarithmic mapping. - */ - unsigned int priority = (1 + logBaseTwo(freeBlocks)); - return ((priority < unopenedSlabPriority) ? priority : priority + 1); -} - -/** - * Add a slab to the priority queue of slabs available for allocation. - * - * @param slab The slab to prioritize - **/ -static void prioritizeSlab(Slab *slab) -{ - ASSERT_LOG_ONLY(isRingEmpty(&slab->ringNode), - "a slab must not already be on a ring when prioritizing"); - slab->priority = calculateSlabPriority(slab); - priorityTableEnqueue(slab->allocator->prioritizedSlabs, slab->priority, - &slab->ringNode); -} - -/**********************************************************************/ -void registerSlabWithAllocator(BlockAllocator *allocator, Slab *slab) -{ - allocator->slabCount++; - allocator->lastSlab = slab->slabNumber; -} - -/** - * Get an iterator over all the slabs in the allocator. - * - * @param allocator The allocator - * - * @return An iterator over the allocator's slabs - **/ -static SlabIterator getSlabIterator(const BlockAllocator *allocator) -{ - return iterateSlabs(allocator->depot->slabs, allocator->lastSlab, - allocator->zoneNumber, allocator->depot->zoneCount); -} - -/** - * Notify a block allocator that the VDO has entered read-only mode. - * - * Implements ReadOnlyNotification. - * - * @param listener The block allocator - * @param parent The completion to notify in order to acknowledge the - * notification - **/ -static void notifyBlockAllocatorOfReadOnlyMode(void *listener, - VDOCompletion *parent) -{ - BlockAllocator *allocator = listener; - assertOnAllocatorThread(allocator->threadID, __func__); - SlabIterator iterator = getSlabIterator(allocator); - while (hasNextSlab(&iterator)) { - Slab *slab = nextSlab(&iterator); - abortSlabJournalWaiters(slab->journal); - } - - completeCompletion(parent); -} - -/**********************************************************************/ -int makeAllocatorPoolVIOs(PhysicalLayer *layer, - void *parent, - void *buffer, - VIO **vioPtr) -{ - return createVIO(layer, VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA, parent, - buffer, vioPtr); -} - -/** - * Allocate those component of the block allocator which are needed only at - * load time, not at format time. - * - * @param allocator The allocator - * @param layer The physical layer below this allocator - * @param vioPoolSize The VIO pool size - * - * @return VDO_SUCCESS or an error - **/ -static int allocateComponents(BlockAllocator *allocator, - PhysicalLayer *layer, - BlockCount vioPoolSize) -{ - /* - * If createVIO is NULL, the block allocator is only being used to format - * or audit the VDO. These only require the SuperBlock component, so we can - * just skip allocating all the memory needed for runtime components. 
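The logarithmic mapping in calculateSlabPriority() is easy to check by hand: a slab's priority is one more than the floor of log2 of its free-block count, and any priority at or above the reserved unopened-slab value is shifted up by one so that value stays reserved. A standalone sketch of that arithmetic follows; logBaseTwo() is re-implemented locally, and the reserved priority of 23 is what the formula in allocateComponents() below works out to for a slab with 2^23 data blocks.

#include <stdio.h>

/* Integer floor of log2(), standing in for the real logBaseTwo(). */
static unsigned int logBaseTwo(unsigned long long n)
{
  unsigned int log = 0;
  while (n > 1) {
    n >>= 1;
    log++;
  }
  return log;
}

/* Mirrors the non-full, non-blank branch of calculateSlabPriority(). */
static unsigned int slabPriority(unsigned long long freeBlocks,
                                 unsigned int unopenedSlabPriority)
{
  unsigned int priority = 1 + logBaseTwo(freeBlocks);
  return (priority < unopenedSlabPriority) ? priority : priority + 1;
}

int main(void)
{
  // With 2^23 data blocks the priorities span 1 to 25, with 23 reserved
  // for slabs that have never been opened.
  unsigned long long counts[] = { 1, 100, 4096, 1 << 22, 1 << 23 };
  for (int i = 0; i < 5; i++) {
    printf("%10llu free blocks -> priority %u\n",
           counts[i], slabPriority(counts[i], 23));
  }
  return 0;
}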
- */ - if (layer->createMetadataVIO == NULL) { - return VDO_SUCCESS; - } - - int result = registerReadOnlyListener(allocator->readOnlyNotifier, - allocator, - notifyBlockAllocatorOfReadOnlyMode, - allocator->threadID); - if (result != VDO_SUCCESS) { - return result; - } - - SlabDepot *depot = allocator->depot; - result = initializeEnqueueableCompletion(&allocator->completion, - BLOCK_ALLOCATOR_COMPLETION, layer); - if (result != VDO_SUCCESS) { - return result; - } - - allocator->summary = getSlabSummaryForZone(depot, allocator->zoneNumber); - - result = makeVIOPool(layer, vioPoolSize, allocator->threadID, - makeAllocatorPoolVIOs, NULL, &allocator->vioPool); - if (result != VDO_SUCCESS) { - return result; - } - - BlockCount slabJournalSize = depot->slabConfig.slabJournalBlocks; - result = makeSlabScrubber(layer, slabJournalSize, - allocator->readOnlyNotifier, - &allocator->slabScrubber); - if (result != VDO_SUCCESS) { - return result; - } - - // The number of data blocks is the maximum number of free blocks that could - // be used in calculateSlabPriority(). - BlockCount maxFreeBlocks = depot->slabConfig.dataBlocks; - unsigned int maxPriority = (2 + logBaseTwo(maxFreeBlocks)); - result = makePriorityTable(maxPriority, &allocator->prioritizedSlabs); - if (result != VDO_SUCCESS) { - return result; - } - - /* - * VDOSTORY-123 requires that we try to open slabs that already have - * allocated blocks in preference to slabs that have never been opened. For - * reasons we have not been able to fully understand, performance tests on - * SSD harvards have been very sensitive (50% reduction in test throughput) - * to very slight differences in the timing and locality of block - * allocation. Assigning a low priority to unopened slabs (maxPriority/2, - * say) would be ideal for the story, but anything less than a very high - * threshold (maxPriority - 1) hurts PMI results. - * - * This sets the free block threshold for preferring to open an unopened - * slab to the binary floor of 3/4ths the total number of datablocks in a - * slab, which will generally evaluate to about half the slab size, but - * avoids degenerate behavior in unit tests where the number of data blocks - * is artificially constrained to a power of two. 
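
The arithmetic in the two comments above (priority zero for full slabs, a reserved priority for never-opened slabs derived from three quarters of the data-block count, and 1 + floor(log2(freeBlocks)) for everything else) can be tried out in isolation. The following is a small, self-contained sketch, not taken from the deleted sources: logBaseTwo() is re-implemented as a plain floor(log2) loop and the block counts are arbitrary samples.

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in for logBaseTwo(): floor(log2(n)) for n > 0. */
    static unsigned int log_base_two(uint64_t n)
    {
      unsigned int log = 0;
      while (n > 1) {
        n >>= 1;
        log++;
      }
      return log;
    }

    /* The reserved priority for never-opened slabs:
     * 1 + log2 of three quarters of the slab's data blocks. */
    static unsigned int unopened_priority(uint64_t data_blocks)
    {
      return 1 + log_base_two((data_blocks * 3) / 4);
    }

    /* Per-slab priority: 0 for full slabs, otherwise a log2 bucket that
     * skips the reserved unopened-slab priority. */
    static unsigned int slab_priority(uint64_t free_blocks, unsigned int reserved)
    {
      if (free_blocks == 0) {
        return 0;
      }
      unsigned int priority = 1 + log_base_two(free_blocks);
      return (priority < reserved) ? priority : priority + 1;
    }

    int main(void)
    {
      uint64_t data_blocks = UINT64_C(1) << 23;   /* sample slab size only */
      unsigned int reserved = unopened_priority(data_blocks);
      uint64_t samples[] = { 0, 1, 4096, UINT64_C(1) << 20, UINT64_C(1) << 23 };

      printf("reserved unopened-slab priority: %u\n", reserved);
      for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
        printf("%12llu free blocks -> priority %u\n",
               (unsigned long long) samples[i],
               slab_priority(samples[i], reserved));
      }
      return 0;
    }

With a 2^23-block slab this keeps every priority inside the 0..25 range mentioned in the comment, while the reserved value is never produced for an opened slab.
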
- */ - allocator->unopenedSlabPriority = (1 + logBaseTwo((maxFreeBlocks * 3) / 4)); - - return VDO_SUCCESS; -} - -/**********************************************************************/ -int makeBlockAllocator(SlabDepot *depot, - ZoneCount zoneNumber, - ThreadID threadID, - Nonce nonce, - BlockCount vioPoolSize, - PhysicalLayer *layer, - ReadOnlyNotifier *readOnlyNotifier, - BlockAllocator **allocatorPtr) -{ - - BlockAllocator *allocator; - int result = ALLOCATE(1, BlockAllocator, __func__, &allocator); - if (result != VDO_SUCCESS) { - return result; - } - - allocator->depot = depot; - allocator->zoneNumber = zoneNumber; - allocator->threadID = threadID; - allocator->nonce = nonce; - allocator->readOnlyNotifier = readOnlyNotifier; - initializeRing(&allocator->dirtySlabJournals); - - result = allocateComponents(allocator, layer, vioPoolSize); - if (result != VDO_SUCCESS) { - freeBlockAllocator(&allocator); - return result; - } - - *allocatorPtr = allocator; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freeBlockAllocator(BlockAllocator **blockAllocatorPtr) -{ - BlockAllocator *allocator = *blockAllocatorPtr; - if (allocator == NULL) { - return; - } - - freeSlabScrubber(&allocator->slabScrubber); - freeVIOPool(&allocator->vioPool); - freePriorityTable(&allocator->prioritizedSlabs); - destroyEnqueueable(&allocator->completion); - FREE(allocator); - *blockAllocatorPtr = NULL; -} - -/**********************************************************************/ -int replaceVIOPool(BlockAllocator *allocator, - size_t size, - PhysicalLayer *layer) -{ - freeVIOPool(&allocator->vioPool); - return makeVIOPool(layer, size, allocator->threadID, makeAllocatorPoolVIOs, - NULL, &allocator->vioPool); -} - -/** - * Get the maximum number of data blocks that can be allocated. - * - * @param allocator The block allocator to query - * - * @return The number of data blocks that can be allocated - **/ -__attribute__((warn_unused_result)) -static inline BlockCount getDataBlockCount(const BlockAllocator *allocator) -{ - return (allocator->slabCount * allocator->depot->slabConfig.dataBlocks); -} - -/**********************************************************************/ -BlockCount getAllocatedBlocks(const BlockAllocator *allocator) -{ - return relaxedLoad64(&allocator->statistics.allocatedBlocks); -} - -/**********************************************************************/ -BlockCount getUnrecoveredSlabCount(const BlockAllocator *allocator) -{ - return getScrubberSlabCount(allocator->slabScrubber); -} - -/**********************************************************************/ -void queueSlab(Slab *slab) -{ - ASSERT_LOG_ONLY(isRingEmpty(&slab->ringNode), - "a requeued slab must not already be on a ring"); - BlockAllocator *allocator = slab->allocator; - BlockCount freeBlocks = getSlabFreeBlockCount(slab); - int result = ASSERT((freeBlocks <= allocator->depot->slabConfig.dataBlocks), - "rebuilt slab %u must have a valid free block count" - " (has %llu, expected maximum %llu)", - slab->slabNumber, freeBlocks, - allocator->depot->slabConfig.dataBlocks); - if (result != VDO_SUCCESS) { - enterReadOnlyMode(allocator->readOnlyNotifier, result); - return; - } - - if (isUnrecoveredSlab(slab)) { - registerSlabForScrubbing(allocator->slabScrubber, slab, false); - return; - } - - if (!isSlabResuming(slab)) { - // If the slab is resuming, we've already accounted for it here, so don't - // do it again. 
- relaxedAdd64(&allocator->statistics.allocatedBlocks, -freeBlocks); - if (!isSlabJournalBlank(slab->journal)) { - relaxedAdd64(&allocator->statistics.slabsOpened, 1); - } - } - - // All slabs are kept in a priority queue for allocation. - prioritizeSlab(slab); -} - -/**********************************************************************/ -void adjustFreeBlockCount(Slab *slab, bool increment) -{ - BlockAllocator *allocator = slab->allocator; - // The sense of increment is reversed since allocations are being counted. - relaxedAdd64(&allocator->statistics.allocatedBlocks, (increment ? -1 : 1)); - - // The open slab doesn't need to be reprioritized until it is closed. - if (slab == allocator->openSlab) { - return; - } - - // The slab priority rarely changes; if no change, then don't requeue it. - if (slab->priority == calculateSlabPriority(slab)) { - return; - } - - // Reprioritize the slab to reflect the new free block count by removing it - // from the table and re-enqueuing it with the new priority. - priorityTableRemove(allocator->prioritizedSlabs, &slab->ringNode); - prioritizeSlab(slab); -} - -/** - * Allocate the next free physical block in a slab. - * - * The block allocated will have a provisional reference and the - * reference must be either confirmed with a subsequent call to - * incrementReferenceCount() or vacated with a subsequent call to - * decrementReferenceCount(). - * - * @param [in] slab The slab - * @param [out] blockNumberPtr A pointer to receive the allocated block number - * - * @return UDS_SUCCESS or an error code - **/ -static int allocateSlabBlock(Slab *slab, PhysicalBlockNumber *blockNumberPtr) -{ - PhysicalBlockNumber pbn; - int result = allocateUnreferencedBlock(slab->referenceCounts, &pbn); - if (result != VDO_SUCCESS) { - return result; - } - - adjustFreeBlockCount(slab, false); - - *blockNumberPtr = pbn; - return VDO_SUCCESS; -} - -/**********************************************************************/ -int allocateBlock(BlockAllocator *allocator, - PhysicalBlockNumber *blockNumberPtr) -{ - if (allocator->openSlab != NULL) { - // Try to allocate the next block in the currently open slab. - int result = allocateSlabBlock(allocator->openSlab, blockNumberPtr); - if ((result == VDO_SUCCESS) || (result != VDO_NO_SPACE)) { - return result; - } - - // Put the exhausted open slab back into the priority table. - prioritizeSlab(allocator->openSlab); - } - - // Remove the highest priority slab from the priority table and make it - // the open slab. - allocator->openSlab - = slabFromRingNode(priorityTableDequeue(allocator->prioritizedSlabs)); - - if (isSlabJournalBlank(allocator->openSlab->journal)) { - relaxedAdd64(&allocator->statistics.slabsOpened, 1); - dirtyAllReferenceBlocks(allocator->openSlab->referenceCounts); - } else { - relaxedAdd64(&allocator->statistics.slabsReopened, 1); - } - - // Try allocating again. If we're out of space immediately after opening a - // slab, then every slab must be fully allocated. 
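
A standalone sketch of the accounting convention used in queueSlab() and adjustFreeBlockCount() above may help: the statistic counts allocated blocks, so it starts at the slab's full data-block count, a newly queued slab gives back its free blocks, and each later allocation or free nudges it by one in the opposite sense of the free-block change. The counter below is a plain integer standing in for the Atomic64 statistic, and the sizes are arbitrary samples.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* A plain counter stands in for the allocatedBlocks statistic. */
    static int64_t allocated_blocks;

    /* Mirror of queueSlab()'s adjustment: everything in the slab starts out
     * counted as allocated, then the actually-free blocks are given back. */
    static void queue_slab(uint64_t free_blocks)
    {
      allocated_blocks -= (int64_t) free_blocks;
    }

    /* Mirror of adjustFreeBlockCount(): the sense is reversed because the
     * statistic counts allocations, not free blocks. */
    static void adjust_free_block_count(bool free_count_went_up)
    {
      allocated_blocks += (free_count_went_up ? -1 : 1);
    }

    int main(void)
    {
      const uint64_t data_blocks = 1000;   /* sample slab size only */
      const uint64_t free_blocks = 900;    /* sample free count */

      /* Start from "everything allocated", as prepareSlabsForAllocation() does. */
      allocated_blocks = (int64_t) data_blocks;
      queue_slab(free_blocks);
      printf("after queueing: %lld allocated\n", (long long) allocated_blocks);

      adjust_free_block_count(false);   /* one block allocated */
      adjust_free_block_count(true);    /* one block freed */
      printf("after one allocate and one free: %lld allocated\n",
             (long long) allocated_blocks);
      return 0;
    }

In this sketch the counter always equals the data-block total minus the current free-block count, which is the intent the comments describe.
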
- return allocateSlabBlock(allocator->openSlab, blockNumberPtr); -} - -/**********************************************************************/ -void releaseBlockReference(BlockAllocator *allocator, - PhysicalBlockNumber pbn, - const char *why) -{ - if (pbn == ZERO_BLOCK) { - return; - } - - Slab *slab = getSlab(allocator->depot, pbn); - ReferenceOperation operation = { - .type = DATA_DECREMENT, - .pbn = pbn, - }; - int result = modifySlabReferenceCount(slab, NULL, operation); - if (result != VDO_SUCCESS) { - logErrorWithStringError(result, - "Failed to release reference to %s " - "physical block %llu", - why, pbn); - } -} - -/** - * This is a HeapComparator function that orders SlabStatuses using the - * 'isClean' field as the primary key and the 'emptiness' field as the - * secondary key. - * - * Slabs need to be pushed onto the rings in the same order they are to be - * popped off. Popping should always get the most empty first, so pushing - * should be from most empty to least empty. Thus, the comparator order is - * the usual sense since Heap returns larger elements before smaller ones. - * - * @param item1 The first item to compare - * @param item2 The second item to compare - * - * @return 1 if the first item is cleaner or emptier than the second; - * 0 if the two items are equally clean and empty; - -1 otherwise - **/ -static int compareSlabStatuses(const void *item1, const void *item2) -{ - const SlabStatus *info1 = (const SlabStatus *) item1; - const SlabStatus *info2 = (const SlabStatus *) item2; - - if (info1->isClean != info2->isClean) { - return (info1->isClean ? 1 : -1); - } - if (info1->emptiness != info2->emptiness) { - return ((info1->emptiness > info2->emptiness) ? 1 : -1); - } - return ((info1->slabNumber < info2->slabNumber) ? 1 : -1); -} - -/** - * Swap two SlabStatus structures. Implements HeapSwapper. - **/ -static void swapSlabStatuses(void *item1, void *item2) -{ - SlabStatus *info1 = item1; - SlabStatus *info2 = item2; - SlabStatus temp = *info1; - *info1 = *info2; - *info2 = temp; -} - -/** - * Inform the allocator that a slab action has finished on some slab. This - * callback is registered in applyToSlabs(). - * - * @param completion The allocator completion - **/ -static void slabActionCallback(VDOCompletion *completion) -{ - BlockAllocator *allocator = container_of(completion, BlockAllocator, - completion); - SlabActor *actor = &allocator->slabActor; - if (--actor->slabActionCount == 0) { - actor->callback(completion); - return; - } - - resetCompletion(completion); -} - -/** - * Preserve the error from part of an administrative action and continue. - * - * @param completion The allocator completion - **/ -static void handleOperationError(VDOCompletion *completion) -{ - BlockAllocator *allocator = (BlockAllocator *) completion; - setOperationResult(&allocator->state, completion->result); - completion->callback(completion); -} - -/** - * Perform an administrative action on each of an allocator's slabs in - * parallel. - * - * @param allocator The allocator - * @param callback The method to call when the action is complete on every - * slab - **/ -static void applyToSlabs(BlockAllocator *allocator, VDOAction *callback) -{ - prepareCompletion(&allocator->completion, slabActionCallback, - handleOperationError, allocator->threadID, NULL); - allocator->completion.requeue = false; - - // Since we are going to dequeue all of the slabs, the open slab will become - // invalid, so clear it. 
- allocator->openSlab = NULL; - - // Ensure that we don't finish before we're done starting. - allocator->slabActor = (SlabActor) { - .slabActionCount = 1, - .callback = callback, - }; - - SlabIterator iterator = getSlabIterator(allocator); - while (hasNextSlab(&iterator)) { - Slab *slab = nextSlab(&iterator); - unspliceRingNode(&slab->ringNode); - allocator->slabActor.slabActionCount++; - startSlabAction(slab, allocator->state.state, &allocator->completion); - } - - slabActionCallback(&allocator->completion); -} - -/** - * Inform the allocator that all load I/O has finished. - * - * @param completion The allocator completion - **/ -static void finishLoadingAllocator(VDOCompletion *completion) -{ - BlockAllocator *allocator = (BlockAllocator *) completion; - if (allocator->state.state == ADMIN_STATE_LOADING_FOR_RECOVERY) { - void *context = getCurrentActionContext(allocator->depot->actionManager); - replayIntoSlabJournals(allocator, completion, context); - return; - } - - finishLoading(&allocator->state); -} - -/** - * Initiate a load. - * - * Implements AdminInitiator. - **/ -static void initiateLoad(AdminState *state) -{ - BlockAllocator *allocator = container_of(state, BlockAllocator, state); - if (state->state == ADMIN_STATE_LOADING_FOR_REBUILD) { - prepareCompletion(&allocator->completion, finishLoadingAllocator, - handleOperationError, allocator->threadID, NULL); - eraseSlabJournals(allocator->depot, getSlabIterator(allocator), - &allocator->completion); - return; - } - - applyToSlabs(allocator, finishLoadingAllocator); -} - -/**********************************************************************/ -void loadBlockAllocator(void *context, - ZoneCount zoneNumber, - VDOCompletion *parent) -{ - BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber); - startLoading(&allocator->state, - getCurrentManagerOperation(allocator->depot->actionManager), - parent, initiateLoad); -} - -/**********************************************************************/ -void notifySlabJournalsAreRecovered(BlockAllocator *allocator, int result) -{ - finishLoadingWithResult(&allocator->state, result); -} - -/**********************************************************************/ -int prepareSlabsForAllocation(BlockAllocator *allocator) -{ - relaxedStore64(&allocator->statistics.allocatedBlocks, - getDataBlockCount(allocator)); - - SlabDepot *depot = allocator->depot; - SlabCount slabCount = depot->slabCount; - - SlabStatus *slabStatuses; - int result = ALLOCATE(slabCount, SlabStatus, __func__, &slabStatuses); - if (result != VDO_SUCCESS) { - return result; - } - - getSummarizedSlabStatuses(allocator->summary, slabCount, slabStatuses); - - // Sort the slabs by cleanliness, then by emptiness hint. 
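
Before the heap code that follows, it may help to see the ordering from compareSlabStatuses() in isolation: clean statuses sort ahead of dirty ones, and within each group the emptier status comes first, so popping the maximum repeatedly yields the cleanest, emptiest slabs first. Below is a self-contained sketch with made-up statuses, using qsort() in place of the Heap; it is illustrative only.

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    typedef struct {
      bool         is_clean;
      unsigned int emptiness;
      unsigned int slab_number;
    } StatusSketch;

    /* Same ordering as compareSlabStatuses(): returns 1 when item1 is
     * "larger" (cleaner, or equally clean and emptier). */
    static int compare_statuses(const void *item1, const void *item2)
    {
      const StatusSketch *a = item1;
      const StatusSketch *b = item2;
      if (a->is_clean != b->is_clean) {
        return a->is_clean ? 1 : -1;
      }
      if (a->emptiness != b->emptiness) {
        return (a->emptiness > b->emptiness) ? 1 : -1;
      }
      return (a->slab_number < b->slab_number) ? 1 : -1;
    }

    /* The Heap hands back larger elements first, so sort descending here. */
    static int compare_descending(const void *item1, const void *item2)
    {
      return -compare_statuses(item1, item2);
    }

    int main(void)
    {
      StatusSketch statuses[] = {
        { .is_clean = false, .emptiness = 90, .slab_number = 0 },
        { .is_clean = true,  .emptiness = 10, .slab_number = 1 },
        { .is_clean = true,  .emptiness = 75, .slab_number = 2 },
        { .is_clean = false, .emptiness = 40, .slab_number = 3 },
      };
      size_t count = sizeof(statuses) / sizeof(statuses[0]);

      qsort(statuses, count, sizeof(statuses[0]), compare_descending);
      for (size_t i = 0; i < count; i++) {
        printf("slab %u: %s, emptiness %u\n", statuses[i].slab_number,
               statuses[i].is_clean ? "clean" : "dirty", statuses[i].emptiness);
      }
      return 0;
    }
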
- Heap heap; - initializeHeap(&heap, compareSlabStatuses, swapSlabStatuses, - slabStatuses, slabCount, sizeof(SlabStatus)); - buildHeap(&heap, slabCount); - - SlabStatus currentSlabStatus; - while (popMaxHeapElement(&heap, ¤tSlabStatus)) { - Slab *slab = depot->slabs[currentSlabStatus.slabNumber]; - if (slab->allocator != allocator) { - continue; - } - - if ((depot->loadType == REBUILD_LOAD) - || (!mustLoadRefCounts(allocator->summary, slab->slabNumber) - && currentSlabStatus.isClean)) { - queueSlab(slab); - continue; - } - - markSlabUnrecovered(slab); - bool highPriority - = ((currentSlabStatus.isClean && (depot->loadType == NORMAL_LOAD)) - || requiresScrubbing(slab->journal)); - registerSlabForScrubbing(allocator->slabScrubber, slab, highPriority); - } - FREE(slabStatuses); - - return VDO_SUCCESS; -} - -/**********************************************************************/ -void prepareAllocatorToAllocate(void *context, - ZoneCount zoneNumber, - VDOCompletion *parent) -{ - BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber); - int result = prepareSlabsForAllocation(allocator); - if (result != VDO_SUCCESS) { - finishCompletion(parent, result); - return; - } - - scrubHighPrioritySlabs(allocator->slabScrubber, - isPriorityTableEmpty(allocator->prioritizedSlabs), - parent, finishParentCallback, finishParentCallback); -} - -/**********************************************************************/ -void registerNewSlabsForAllocator(void *context, - ZoneCount zoneNumber, - VDOCompletion *parent) -{ - BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber); - SlabDepot *depot = allocator->depot; - for (SlabCount i = depot->slabCount; i < depot->newSlabCount; i++) { - Slab *slab = depot->newSlabs[i]; - if (slab->allocator == allocator) { - registerSlabWithAllocator(allocator, slab); - } - } - completeCompletion(parent); -} - -/** - * Perform a step in draining the allocator. This method is its own callback. - * - * @param completion The allocator's completion - **/ -static void doDrainStep(VDOCompletion *completion) -{ - BlockAllocator *allocator = (BlockAllocator *) completion; - prepareForRequeue(&allocator->completion, doDrainStep, handleOperationError, - allocator->threadID, NULL); - switch (++allocator->drainStep) { - case DRAIN_ALLOCATOR_STEP_SCRUBBER: - stopScrubbing(allocator->slabScrubber, completion); - return; - - case DRAIN_ALLOCATOR_STEP_SLABS: - applyToSlabs(allocator, doDrainStep); - return; - - case DRAIN_ALLOCATOR_STEP_SUMMARY: - drainSlabSummaryZone(allocator->summary, allocator->state.state, - completion); - return; - - case DRAIN_ALLOCATOR_STEP_FINISHED: - ASSERT_LOG_ONLY(!isVIOPoolBusy(allocator->vioPool), "VIO Pool not busy"); - finishDrainingWithResult(&allocator->state, completion->result); - return; - - default: - finishDrainingWithResult(&allocator->state, UDS_BAD_STATE); - } -} - -/** - * Initiate a drain. - * - * Implements AdminInitiator. 
- **/ -static void initiateDrain(AdminState *state) -{ - BlockAllocator *allocator = container_of(state, BlockAllocator, state); - allocator->drainStep = DRAIN_ALLOCATOR_START; - doDrainStep(&allocator->completion); -} - -/**********************************************************************/ -void drainBlockAllocator(void *context, - ZoneCount zoneNumber, - VDOCompletion *parent) -{ - BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber); - startDraining(&allocator->state, - getCurrentManagerOperation(allocator->depot->actionManager), - parent, initiateDrain); -} - -/** - * Perform a step in resuming a quiescent allocator. This method is its own - * callback. - * - * @param completion The allocator's completion - **/ -static void doResumeStep(VDOCompletion *completion) -{ - BlockAllocator *allocator = (BlockAllocator *) completion; - prepareForRequeue(&allocator->completion, doResumeStep, handleOperationError, - allocator->threadID, NULL); - switch (--allocator->drainStep) { - case DRAIN_ALLOCATOR_STEP_SUMMARY: - resumeSlabSummaryZone(allocator->summary, completion); - return; - - case DRAIN_ALLOCATOR_STEP_SLABS: - applyToSlabs(allocator, doResumeStep); - return; - - case DRAIN_ALLOCATOR_STEP_SCRUBBER: - resumeScrubbing(allocator->slabScrubber, completion); - return; - - case DRAIN_ALLOCATOR_START: - finishResumingWithResult(&allocator->state, completion->result); - return; - - default: - finishResumingWithResult(&allocator->state, UDS_BAD_STATE); - } -} - -/** - * Initiate a resume. - * - * Implements AdminInitiator. - **/ -static void initiateResume(AdminState *state) -{ - BlockAllocator *allocator = container_of(state, BlockAllocator, state); - allocator->drainStep = DRAIN_ALLOCATOR_STEP_FINISHED; - doResumeStep(&allocator->completion); -} - -/**********************************************************************/ -void resumeBlockAllocator(void *context, - ZoneCount zoneNumber, - VDOCompletion *parent) -{ - BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber); - startResuming(&allocator->state, - getCurrentManagerOperation(allocator->depot->actionManager), - parent, initiateResume); -} - -/**********************************************************************/ -void releaseTailBlockLocks(void *context, - ZoneCount zoneNumber, - VDOCompletion *parent) -{ - BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber); - RingNode *ring = &allocator->dirtySlabJournals; - while (!isRingEmpty(ring)) { - if (!releaseRecoveryJournalLock(slabJournalFromDirtyNode(ring->next), - allocator->depot->activeReleaseRequest)) { - break; - } - } - completeCompletion(parent); -} - -/**********************************************************************/ -SlabSummaryZone *getSlabSummaryZone(const BlockAllocator *allocator) -{ - return allocator->summary; -} - -/**********************************************************************/ -int acquireVIO(BlockAllocator *allocator, Waiter *waiter) -{ - return acquireVIOFromPool(allocator->vioPool, waiter); -} - -/**********************************************************************/ -void returnVIO(BlockAllocator *allocator, VIOPoolEntry *entry) -{ - returnVIOToPool(allocator->vioPool, entry); -} - -/**********************************************************************/ -void scrubAllUnrecoveredSlabsInZone(void *context, - ZoneCount zoneNumber, - VDOCompletion *parent) -{ - BlockAllocator *allocator = getBlockAllocatorForZone(context, zoneNumber); - scrubSlabs(allocator->slabScrubber, allocator->depot, - 
notifyZoneFinishedScrubbing, noopCallback); - completeCompletion(parent); -} - -/**********************************************************************/ -int enqueueForCleanSlab(BlockAllocator *allocator, Waiter *waiter) -{ - return enqueueCleanSlabWaiter(allocator->slabScrubber, waiter); -} - -/**********************************************************************/ -void increaseScrubbingPriority(Slab *slab) -{ - registerSlabForScrubbing(slab->allocator->slabScrubber, slab, true); -} - -/**********************************************************************/ -void allocateFromAllocatorLastSlab(BlockAllocator *allocator) -{ - ASSERT_LOG_ONLY(allocator->openSlab == NULL, "mustn't have an open slab"); - Slab *lastSlab = allocator->depot->slabs[allocator->lastSlab]; - priorityTableRemove(allocator->prioritizedSlabs, &lastSlab->ringNode); - allocator->openSlab = lastSlab; -} - -/**********************************************************************/ -BlockAllocatorStatistics -getBlockAllocatorStatistics(const BlockAllocator *allocator) -{ - const AtomicAllocatorStatistics *atoms = &allocator->statistics; - return (BlockAllocatorStatistics) { - .slabCount = allocator->slabCount, - .slabsOpened = relaxedLoad64(&atoms->slabsOpened), - .slabsReopened = relaxedLoad64(&atoms->slabsReopened), - }; -} - -/**********************************************************************/ -SlabJournalStatistics getSlabJournalStatistics(const BlockAllocator *allocator) -{ - const AtomicSlabJournalStatistics *atoms = &allocator->slabJournalStatistics; - return (SlabJournalStatistics) { - .diskFullCount = atomicLoad64(&atoms->diskFullCount), - .flushCount = atomicLoad64(&atoms->flushCount), - .blockedCount = atomicLoad64(&atoms->blockedCount), - .blocksWritten = atomicLoad64(&atoms->blocksWritten), - .tailBusyCount = atomicLoad64(&atoms->tailBusyCount), - }; -} - -/**********************************************************************/ -RefCountsStatistics getRefCountsStatistics(const BlockAllocator *allocator) -{ - const AtomicRefCountStatistics *atoms = &allocator->refCountStatistics; - return (RefCountsStatistics) { - .blocksWritten = atomicLoad64(&atoms->blocksWritten), - }; -} - -/**********************************************************************/ -void dumpBlockAllocator(const BlockAllocator *allocator) -{ - unsigned int pauseCounter = 0; - logInfo("BlockAllocator zone %u", allocator->zoneNumber); - SlabIterator iterator = getSlabIterator(allocator); - while (hasNextSlab(&iterator)) { - dumpSlab(nextSlab(&iterator)); - - // Wait for a while after each batch of 32 slabs dumped, allowing the - // kernel log a chance to be flushed instead of being overrun. - if (pauseCounter++ == 31) { - pauseCounter = 0; - pauseForLogger(); - } - } - - dumpSlabScrubber(allocator->slabScrubber); -} diff --git a/vdo/base/blockAllocator.h b/vdo/base/blockAllocator.h deleted file mode 100644 index cd8eb39..0000000 --- a/vdo/base/blockAllocator.h +++ /dev/null @@ -1,299 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockAllocator.h#12 $ - */ - -#ifndef BLOCK_ALLOCATOR_H -#define BLOCK_ALLOCATOR_H - -#include "completion.h" -#include "fixedLayout.h" -#include "statistics.h" -#include "types.h" -#include "vioPool.h" -#include "waitQueue.h" - -/** - * Create a block allocator. - * - * @param [in] depot The slab depot for this allocator - * @param [in] zoneNumber The physical zone number for this allocator - * @param [in] threadID The thread ID for this allocator's zone - * @param [in] nonce The nonce of the VDO - * @param [in] vioPoolSize The size of the VIO pool - * @param [in] layer The physical layer below this allocator - * @param [in] readOnlyNotifier The context for entering read-only mode - * @param [out] allocatorPtr A pointer to hold the allocator - * - * @return A success or error code - **/ -int makeBlockAllocator(SlabDepot *depot, - ZoneCount zoneNumber, - ThreadID threadID, - Nonce nonce, - BlockCount vioPoolSize, - PhysicalLayer *layer, - ReadOnlyNotifier *readOnlyNotifier, - BlockAllocator **allocatorPtr) - __attribute__((warn_unused_result)); - -/** - * Destroy a block allocator and null out the reference to it. - * - * @param blockAllocatorPtr The reference to the allocator to destroy - **/ -void freeBlockAllocator(BlockAllocator **blockAllocatorPtr); - -/** - * Queue a slab for allocation or scrubbing. - * - * @param slab The slab to queue - **/ -void queueSlab(Slab *slab); - -/** - * Update the block allocator to reflect an increment or decrement of the free - * block count in a slab. This adjusts the allocated block count and - * reprioritizes the slab when appropriate. - * - * @param slab The slab whose free block count changed - * @param increment True if the free block count went up by one, - * false if it went down by one - **/ -void adjustFreeBlockCount(Slab *slab, bool increment); - -/** - * Allocate a physical block. - * - * The block allocated will have a provisional reference and the - * reference must be either confirmed with a subsequent call to - * incrementReferenceCount() or vacated with a subsequent call to - * decrementReferenceCount(). - * - * @param [in] allocator The block allocator - * @param [out] blockNumberPtr A pointer to receive the allocated block number - * - * @return UDS_SUCCESS or an error code - **/ -int allocateBlock(BlockAllocator *allocator, - PhysicalBlockNumber *blockNumberPtr) - __attribute__((warn_unused_result)); - -/** - * Release an unused provisional reference. - * - * @param allocator The block allocator - * @param pbn The block to dereference - * @param why Why the block was referenced (for logging) - **/ -void releaseBlockReference(BlockAllocator *allocator, - PhysicalBlockNumber pbn, - const char *why); - -/** - * Get the number of allocated blocks, which is the total number of - * blocks in all slabs that have a non-zero reference count. - * - * @param allocator The block allocator - * - * @return The number of blocks with a non-zero reference count - **/ -BlockCount getAllocatedBlocks(const BlockAllocator *allocator) - __attribute__((warn_unused_result)); - -/** - * Get the number of unrecovered slabs. 
- * - * @param allocator The block allocator - * - * @return The number of slabs that are unrecovered - **/ -BlockCount getUnrecoveredSlabCount(const BlockAllocator *allocator) - __attribute__((warn_unused_result)); - -/** - * Load the state of an allocator from disk. - * - *
Implements ZoneAction. - **/ -void loadBlockAllocator(void *context, - ZoneCount zoneNumber, - VDOCompletion *parent); - -/** - * Inform a block allocator that its slab journals have been recovered from the - * recovery journal. - * - * @param allocator The allocator to inform - * @param result The result of the recovery operation - **/ -void notifySlabJournalsAreRecovered(BlockAllocator *allocator, int result); - -/** - * Prepare the block allocator to come online and start allocating blocks. - * - *
Implements ZoneAction. - **/ -void prepareAllocatorToAllocate(void *context, - ZoneCount zoneNumber, - VDOCompletion *parent); - -/** - * Register a slab with the allocator, ready for use. - * - * @param allocator The allocator to use - * @param slab The slab in question - **/ -void registerSlabWithAllocator(BlockAllocator *allocator, Slab *slab); - -/** - * Register the new slabs belonging to this allocator. - * - *
Implements ZoneAction. - **/ -void registerNewSlabsForAllocator(void *context, - ZoneCount zoneNumber, - VDOCompletion *parent); - -/** - * Drain all allocator I/O. Depending upon the type of drain, some or all - * dirty metadata may be written to disk. The type of drain will be determined - * from the state of the allocator's depot. - * - *
Implements ZoneAction. - **/ -void drainBlockAllocator(void *context, - ZoneCount zoneNumber, - VDOCompletion *parent); - -/** - * Resume a quiescent allocator. - * - *
Implements ZoneAction. - **/ -void resumeBlockAllocator(void *context, - ZoneCount zoneNumber, - VDOCompletion *parent); - -/** - * Request a commit of all dirty tail blocks which are locking a given recovery - * journal block. - * - *
Implements ZoneAction. - **/ -void releaseTailBlockLocks(void *context, - ZoneCount zoneNumber, - VDOCompletion *parent); - -/** - * Get the slab summary zone for an allocator. - * - * @param allocator The allocator - * - * @return The SlabSummaryZone for that allocator - **/ -SlabSummaryZone *getSlabSummaryZone(const BlockAllocator *allocator) - __attribute__((warn_unused_result)); - -/** - * Acquire a VIO from a block allocator's VIO pool (asynchronous). - * - * @param allocator The allocator from which to get a VIO - * @param waiter The object requesting the VIO - * - * @return VDO_SUCCESS or an error - **/ -int acquireVIO(BlockAllocator *allocator, Waiter *waiter) - __attribute__((warn_unused_result)); - -/** - * Return a VIO to a block allocator's VIO pool - * - * @param allocator The block allocator which owns the VIO - * @param entry The VIO being returned - **/ -void returnVIO(BlockAllocator *allocator, VIOPoolEntry *entry); - -/** - * Initiate scrubbing all unrecovered slabs. - * - *
Implements ZoneAction. - **/ -void scrubAllUnrecoveredSlabsInZone(void *context, - ZoneCount zoneNumber, - VDOCompletion *parent); - -/** - * Queue a waiter for a clean slab. - * - * @param allocator The allocator to wait on - * @param waiter The waiter - * - * @return VDO_SUCCESS if the waiter was queued, VDO_NO_SPACE if there are no - * slabs to scrub, and some other error otherwise - **/ -int enqueueForCleanSlab(BlockAllocator *allocator, Waiter *waiter) - __attribute__((warn_unused_result)); - -/** - * Increase the scrubbing priority of a slab. - * - * @param slab The slab - **/ -void increaseScrubbingPriority(Slab *slab); - -/** - * Get the statistics for this allocator. - * - * @param allocator The allocator to query - * - * @return A copy of the current statistics for the allocator - **/ -BlockAllocatorStatistics -getBlockAllocatorStatistics(const BlockAllocator *allocator) - __attribute__((warn_unused_result)); - -/** - * Get the aggregated slab journal statistics for the slabs in this allocator. - * - * @param allocator The allocator to query - * - * @return A copy of the current statistics for the allocator - **/ -SlabJournalStatistics getSlabJournalStatistics(const BlockAllocator *allocator) - __attribute__((warn_unused_result)); - -/** - * Get the cumulative RefCounts statistics for the slabs in this allocator. - * - * @param allocator The allocator to query - * - * @return A copy of the current statistics for the allocator - **/ -RefCountsStatistics getRefCountsStatistics(const BlockAllocator *allocator) - __attribute__((warn_unused_result)); - -/** - * Dump information about a block allocator to the log for debugging. - * - * @param allocator The allocator to dump - **/ -void dumpBlockAllocator(const BlockAllocator *allocator); - -#endif // BLOCK_ALLOCATOR_H diff --git a/vdo/base/blockAllocatorInternals.h b/vdo/base/blockAllocatorInternals.h deleted file mode 100644 index 83db684..0000000 --- a/vdo/base/blockAllocatorInternals.h +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockAllocatorInternals.h#11 $ - */ - -#ifndef BLOCK_ALLOCATOR_INTERNALS_H -#define BLOCK_ALLOCATOR_INTERNALS_H - -#include "adminState.h" -#include "atomic.h" -#include "blockAllocator.h" -#include "priorityTable.h" -#include "ringNode.h" -#include "slabScrubber.h" -#include "vioPool.h" - -enum { - /* - * The number of VIOs in the VIO pool is proportional to the throughput of - * the VDO. 
- */ - VIO_POOL_SIZE = 128, -}; - -typedef enum { - DRAIN_ALLOCATOR_START = 0, - DRAIN_ALLOCATOR_STEP_SCRUBBER, - DRAIN_ALLOCATOR_STEP_SLABS, - DRAIN_ALLOCATOR_STEP_SUMMARY, - DRAIN_ALLOCATOR_STEP_FINISHED, -} BlockAllocatorDrainStep; - -/** - * A sub-structure for applying actions in parallel to all an allocator's - * slabs. - **/ -typedef struct { - /** The number of slabs performing a slab action */ - SlabCount slabActionCount; - /** The method to call when a slab action has been completed by all slabs */ - VDOAction *callback; -} SlabActor; - -/** - * These fields are only modified by the physical zone thread, but are queried - * by other threads. - **/ -typedef struct atomicAllocatorStatistics { - /** The count of allocated blocks in this zone */ - Atomic64 allocatedBlocks; - /** The number of slabs from which blocks have ever been allocated */ - Atomic64 slabsOpened; - /** The number of times since loading that a slab been re-opened */ - Atomic64 slabsReopened; -} AtomicAllocatorStatistics; - -/** - * The statistics for all the slab journals in the slabs owned by this - * allocator. These fields are all mutated only by the physical zone thread, - * but are read by other threads when gathering statistics for the entire - * depot. - **/ -typedef struct atomicSlabJournalStatistics { - /** Number of times the on-disk journal was full */ - Atomic64 diskFullCount; - /** Number of times an entry was added over the flush threshold */ - Atomic64 flushCount; - /** Number of times an entry was added over the block threshold */ - Atomic64 blockedCount; - /** Number of times the tail block was written */ - Atomic64 blocksWritten; - /** Number of times we had to wait for the tail block commit */ - Atomic64 tailBusyCount; -} AtomicSlabJournalStatistics; - -/** - * The statistics for all the RefCounts in the slabs owned by this - * allocator. These fields are all mutated only by the physical zone thread, - * but are read by other threads when gathering statistics for the entire - * depot. 
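
The single-writer statistics pattern described in these comments (only the physical zone thread modifies the counters, while other threads read them when gathering statistics) can be sketched with C11 atomics. This is an illustrative stand-in for the Atomic64 helpers shown near the top of this patch, not the kernel implementation.

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in for Atomic64 using a C11 atomic integer. */
    typedef atomic_uint_least64_t Counter64;

    /* Relaxed load: readers may see a slightly stale value, which is
     * acceptable for statistics. */
    static uint64_t relaxed_load64(Counter64 *counter)
    {
      return atomic_load_explicit(counter, memory_order_relaxed);
    }

    /* Relaxed store, used only by the single writer thread. */
    static void relaxed_store64(Counter64 *counter, uint64_t value)
    {
      atomic_store_explicit(counter, value, memory_order_relaxed);
    }

    /* Like relaxedAdd64(): a read-modify-write built from a relaxed load
     * and a relaxed store. Safe only because exactly one thread (the
     * physical zone thread in the original code) ever modifies the counter. */
    static uint64_t relaxed_add64(Counter64 *counter, int64_t delta)
    {
      uint64_t new_value = relaxed_load64(counter) + (uint64_t) delta;
      relaxed_store64(counter, new_value);
      return new_value;
    }

    int main(void)
    {
      Counter64 slabs_opened = 0;   /* e.g. the slabsOpened statistic */
      relaxed_add64(&slabs_opened, 1);
      relaxed_add64(&slabs_opened, 1);
      printf("slabsOpened snapshot: %llu\n",
             (unsigned long long) relaxed_load64(&slabs_opened));
      return 0;
    }

Because only one thread ever calls relaxed_add64() on a given counter, the load-plus-store sequence does not race with itself; readers merely tolerate slightly stale snapshots.
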
- **/ -typedef struct atomicRefCountStatistics { - /** Number of blocks written */ - Atomic64 blocksWritten; -} AtomicRefCountStatistics; - -struct blockAllocator { - VDOCompletion completion; - /** The slab depot for this allocator */ - SlabDepot *depot; - /** The slab summary zone for this allocator */ - SlabSummaryZone *summary; - /** The notifier for entering read-only mode */ - ReadOnlyNotifier *readOnlyNotifier; - /** The nonce of the VDO */ - Nonce nonce; - /** The physical zone number of this allocator */ - ZoneCount zoneNumber; - /** The thread ID for this allocator's physical zone */ - ThreadID threadID; - /** The number of slabs in this allocator */ - SlabCount slabCount; - /** The number of the last slab owned by this allocator */ - SlabCount lastSlab; - /** The reduced priority level used to preserve unopened slabs */ - unsigned int unopenedSlabPriority; - /** The state of this allocator */ - AdminState state; - /** The actor for applying an action to all slabs */ - SlabActor slabActor; - - /** The slab from which blocks are currently being allocated */ - Slab *openSlab; - /** A priority queue containing all slabs available for allocation */ - PriorityTable *prioritizedSlabs; - /** The slab scrubber */ - SlabScrubber *slabScrubber; - /** What phase of the close operation the allocator is to perform */ - BlockAllocatorDrainStep drainStep; - /** Statistics for this block allocator */ - AtomicAllocatorStatistics statistics; - /** Cumulative statistics for the slab journals in this zone */ - AtomicSlabJournalStatistics slabJournalStatistics; - /** Cumulative statistics for the RefCounts in this zone */ - AtomicRefCountStatistics refCountStatistics; - - /** - * This is the head of a queue of slab journals which have entries in their - * tail blocks which have not yet started to commit. When the recovery - * journal is under space pressure, slab journals which have uncommitted - * entries holding a lock on the recovery journal head are forced to commit - * their blocks early. This list is kept in order, with the tail containing - * the slab journal holding the most recent recovery journal lock. - **/ - RingNode dirtySlabJournals; - - /** The VIO pool for reading and writing block allocator metadata */ - VIOPool *vioPool; -}; - -/** - * Construct allocator metadata VIOs. Exposed for unit tests. - * - * Implements VIOConstructor - **/ -int makeAllocatorPoolVIOs(PhysicalLayer *layer, - void *parent, - void *buffer, - VIO **vioPtr) - __attribute__((warn_unused_result)); - -/** - * Replace the VIO pool in a block allocator. This method exists for unit - * tests. - * - * @param allocator The block allocator - * @param size The number of entries in the pool - * @param layer The physical layer from which to allocate VIOs - * - * @return VDO_SUCCESS or an error - **/ -int replaceVIOPool(BlockAllocator *allocator, - size_t size, - PhysicalLayer *layer) - __attribute__((warn_unused_result)); - -/** - * Prepare slabs for allocation or scrubbing. This method is exposed for - * testing. - * - * @param allocator The allocator to prepare - * - * @return VDO_SUCCESS or an error code - **/ -int prepareSlabsForAllocation(BlockAllocator *allocator) - __attribute__((warn_unused_result)); - -/** - * Start allocating from the highest numbered slab. 
- * - * @param allocator The allocator - **/ -void allocateFromAllocatorLastSlab(BlockAllocator *allocator); - -#endif // BLOCK_ALLOCATOR_INTERNALS_H diff --git a/vdo/base/blockMap.c b/vdo/base/blockMap.c deleted file mode 100644 index 9a13c30..0000000 --- a/vdo/base/blockMap.c +++ /dev/null @@ -1,861 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMap.c#24 $ - */ - -#include "blockMap.h" - -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" - -#include "actionManager.h" -#include "adminState.h" -#include "blockMapInternals.h" -#include "blockMapPage.h" -#include "blockMapTree.h" -#include "constants.h" -#include "dataVIO.h" -#include "forest.h" -#include "numUtils.h" -#include "recoveryJournal.h" -#include "statusCodes.h" -#include "types.h" -#include "vdoInternal.h" -#include "vdoPageCache.h" - -typedef struct { - PhysicalBlockNumber flatPageOrigin; - BlockCount flatPageCount; - PhysicalBlockNumber rootOrigin; - BlockCount rootCount; -} __attribute__((packed)) BlockMapState2_0; - -static const Header BLOCK_MAP_HEADER_2_0 = { - .id = BLOCK_MAP, - .version = { - .majorVersion = 2, - .minorVersion = 0, - }, - .size = sizeof(BlockMapState2_0), -}; - -/** - * State associated which each block map page while it is in the VDO page - * cache. - **/ -typedef struct { - /** - * The earliest recovery journal block containing uncommitted updates to the - * block map page associated with this context. A reference (lock) is held - * on that block to prevent it from being reaped. When this value changes, - * the reference on the old value must be released and a reference on the - * new value must be acquired. - **/ - SequenceNumber recoveryLock; -} BlockMapPageContext; - -/** - * Implements VDOPageReadFunction. - **/ -static int validatePageOnRead(void *buffer, - PhysicalBlockNumber pbn, - BlockMapZone *zone, - void *pageContext) -{ - BlockMapPage *page = buffer; - BlockMapPageContext *context = pageContext; - Nonce nonce = zone->blockMap->nonce; - - BlockMapPageValidity validity = validateBlockMapPage(page, nonce, pbn); - if (validity == BLOCK_MAP_PAGE_BAD) { - return logErrorWithStringError(VDO_BAD_PAGE, - "Expected page %" PRIu64 - " but got page %llu instead", - pbn, getBlockMapPagePBN(page)); - } - - if (validity == BLOCK_MAP_PAGE_INVALID) { - formatBlockMapPage(page, nonce, pbn, false); - } - - context->recoveryLock = 0; - return VDO_SUCCESS; -} - -/** - * Handle journal updates and torn write protection. - * - * Implements VDOPageWriteFunction. 
- **/ -static bool handlePageWrite(void *rawPage, - BlockMapZone *zone, - void *pageContext) -{ - BlockMapPage *page = rawPage; - BlockMapPageContext *context = pageContext; - - if (markBlockMapPageInitialized(page, true)) { - // Cause the page to be re-written. - return true; - } - - // Release the page's references on the recovery journal. - releaseRecoveryJournalBlockReference(zone->blockMap->journal, - context->recoveryLock, - ZONE_TYPE_LOGICAL, zone->zoneNumber); - context->recoveryLock = 0; - return false; -} - -/**********************************************************************/ -PageCount computeBlockMapPageCount(BlockCount entries) -{ - return computeBucketCount(entries, BLOCK_MAP_ENTRIES_PER_PAGE); -} - -/**********************************************************************/ -int makeBlockMap(BlockCount logicalBlocks, - const ThreadConfig *threadConfig, - BlockCount flatPageCount, - PhysicalBlockNumber rootOrigin, - BlockCount rootCount, - BlockMap **mapPtr) -{ - STATIC_ASSERT(BLOCK_MAP_ENTRIES_PER_PAGE - == ((VDO_BLOCK_SIZE - sizeof(BlockMapPage)) - / sizeof(BlockMapEntry))); - - BlockMap *map; - int result = ALLOCATE_EXTENDED(BlockMap, threadConfig->logicalZoneCount, - BlockMapZone, __func__, &map); - if (result != UDS_SUCCESS) { - return result; - } - - map->flatPageCount = flatPageCount; - map->rootOrigin = rootOrigin; - map->rootCount = rootCount; - map->entryCount = logicalBlocks; - - ZoneCount zoneCount = threadConfig->logicalZoneCount; - for (ZoneCount zone = 0; zone < zoneCount; zone++) { - BlockMapZone *blockMapZone = &map->zones[zone]; - blockMapZone->zoneNumber = zone; - blockMapZone->threadID = getLogicalZoneThread(threadConfig, zone); - blockMapZone->blockMap = map; - map->zoneCount++; - } - - *mapPtr = map; - return VDO_SUCCESS; -} - -/** - * Decode block map component state version 2.0 from a buffer. 
- * - * @param buffer A buffer positioned at the start of the encoding - * @param state The state structure to receive the decoded values - * - * @return UDS_SUCCESS or an error code - **/ -static int decodeBlockMapState_2_0(Buffer *buffer, BlockMapState2_0 *state) -{ - size_t initialLength = contentLength(buffer); - - PhysicalBlockNumber flatPageOrigin; - int result = getUInt64LEFromBuffer(buffer, &flatPageOrigin); - if (result != UDS_SUCCESS) { - return result; - } - - BlockCount flatPageCount; - result = getUInt64LEFromBuffer(buffer, &flatPageCount); - if (result != UDS_SUCCESS) { - return result; - } - - PhysicalBlockNumber rootOrigin; - result = getUInt64LEFromBuffer(buffer, &rootOrigin); - if (result != UDS_SUCCESS) { - return result; - } - - BlockCount rootCount; - result = getUInt64LEFromBuffer(buffer, &rootCount); - if (result != UDS_SUCCESS) { - return result; - } - - *state = (BlockMapState2_0) { - .flatPageOrigin = flatPageOrigin, - .flatPageCount = flatPageCount, - .rootOrigin = rootOrigin, - .rootCount = rootCount, - }; - - size_t decodedSize = initialLength - contentLength(buffer); - return ASSERT(BLOCK_MAP_HEADER_2_0.size == decodedSize, - "decoded block map component size must match header size"); -} - -/**********************************************************************/ -int decodeBlockMap(Buffer *buffer, - BlockCount logicalBlocks, - const ThreadConfig *threadConfig, - BlockMap **mapPtr) -{ - Header header; - int result = decodeHeader(buffer, &header); - if (result != VDO_SUCCESS) { - return result; - } - - result = validateHeader(&BLOCK_MAP_HEADER_2_0, &header, true, __func__); - if (result != VDO_SUCCESS) { - return result; - } - - BlockMapState2_0 state; - result = decodeBlockMapState_2_0(buffer, &state); - if (result != UDS_SUCCESS) { - return result; - } - - result = ASSERT(state.flatPageOrigin == BLOCK_MAP_FLAT_PAGE_ORIGIN, - "Flat page origin must be %u (recorded as %llu)", - BLOCK_MAP_FLAT_PAGE_ORIGIN, state.flatPageOrigin); - if (result != UDS_SUCCESS) { - return result; - } - - BlockMap *map; - result = makeBlockMap(logicalBlocks, threadConfig, - state.flatPageCount, state.rootOrigin, - state.rootCount, &map); - if (result != VDO_SUCCESS) { - return result; - } - - *mapPtr = map; - return VDO_SUCCESS; -} - -/**********************************************************************/ -int decodeSodiumBlockMap(Buffer *buffer, - BlockCount logicalBlocks, - const ThreadConfig *threadConfig, - BlockMap **mapPtr) -{ - // Sodium uses state version 2.0. - return decodeBlockMap(buffer, logicalBlocks, threadConfig, mapPtr); -} - -/** - * Initialize the per-zone portions of the block map. 
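
As decoded just above, the version 2.0 block map component is simply four unsigned 64-bit fields stored little-endian, 32 bytes in all. A standalone round trip of that layout, with getUInt64LEFromBuffer() and putUInt64LEIntoBuffer() replaced by hand-rolled byte shuffling and arbitrary sample values, looks like this; it is a sketch only, not the original buffer code.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* The four fields of the 2.0 block map component, 32 bytes when packed. */
    typedef struct {
      uint64_t flat_page_origin;
      uint64_t flat_page_count;
      uint64_t root_origin;
      uint64_t root_count;
    } BlockMapStateSketch;

    /* Minimal little-endian helpers. */
    static void put_u64_le(uint8_t *out, uint64_t value)
    {
      for (int i = 0; i < 8; i++) {
        out[i] = (uint8_t) (value >> (8 * i));
      }
    }

    static uint64_t get_u64_le(const uint8_t *in)
    {
      uint64_t value = 0;
      for (int i = 0; i < 8; i++) {
        value |= ((uint64_t) in[i]) << (8 * i);
      }
      return value;
    }

    static void encode_state(const BlockMapStateSketch *state, uint8_t buffer[32])
    {
      put_u64_le(buffer + 0,  state->flat_page_origin);
      put_u64_le(buffer + 8,  state->flat_page_count);
      put_u64_le(buffer + 16, state->root_origin);
      put_u64_le(buffer + 24, state->root_count);
    }

    static BlockMapStateSketch decode_state(const uint8_t buffer[32])
    {
      return (BlockMapStateSketch) {
        .flat_page_origin = get_u64_le(buffer + 0),
        .flat_page_count  = get_u64_le(buffer + 8),
        .root_origin      = get_u64_le(buffer + 16),
        .root_count       = get_u64_le(buffer + 24),
      };
    }

    int main(void)
    {
      /* Arbitrary sample values; real origins and counts come from the VDO layout. */
      BlockMapStateSketch original = { 1, 0, 3, 60 };
      uint8_t buffer[32];

      encode_state(&original, buffer);
      BlockMapStateSketch decoded = decode_state(buffer);
      assert(memcmp(&original, &decoded, sizeof(original)) == 0);
      printf("round-trip OK: rootOrigin=%llu rootCount=%llu\n",
             (unsigned long long) decoded.root_origin,
             (unsigned long long) decoded.root_count);
      return 0;
    }
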
- * - * @param zone The zone to initialize - * @param layer The physical layer on which the zone resides - * @param readOnlyNotifier The read-only context for the VDO - * @param cacheSize The size of the page cache for the zone - * @param maximumAge The number of journal blocks before a dirtied page - * is considered old and must be written out - * - * @return VDO_SUCCESS or an error - **/ -__attribute__((warn_unused_result)) -static int initializeBlockMapZone(BlockMapZone *zone, - PhysicalLayer *layer, - ReadOnlyNotifier *readOnlyNotifier, - PageCount cacheSize, - BlockCount maximumAge) -{ - zone->readOnlyNotifier = readOnlyNotifier; - int result = initializeTreeZone(zone, layer, maximumAge); - if (result != VDO_SUCCESS) { - return result; - } - - return makeVDOPageCache(layer, cacheSize, validatePageOnRead, - handlePageWrite, sizeof(BlockMapPageContext), - maximumAge, zone, &zone->pageCache); -} - -/**********************************************************************/ -BlockMapZone *getBlockMapZone(BlockMap *map, ZoneCount zoneNumber) -{ - return &map->zones[zoneNumber]; -} - -/** - * Get the ID of the thread on which a given block map zone operates. - * - *
Implements ZoneThreadGetter. - **/ -static ThreadID getBlockMapZoneThreadID(void *context, ZoneCount zoneNumber) -{ - return getBlockMapZone(context, zoneNumber)->threadID; -} - -/** - * Prepare for an era advance. - * - *
Implements ActionPreamble. - **/ -static void prepareForEraAdvance(void *context, VDOCompletion *parent) -{ - BlockMap *map = context; - map->currentEraPoint = map->pendingEraPoint; - completeCompletion(parent); -} - -/** - * Update the progress of the era in a zone. - * - *
Implements ZoneAction. - **/ -static void advanceBlockMapZoneEra(void *context, - ZoneCount zoneNumber, - VDOCompletion *parent) -{ - BlockMapZone *zone = getBlockMapZone(context, zoneNumber); - advanceVDOPageCachePeriod(zone->pageCache, zone->blockMap->currentEraPoint); - advanceZoneTreePeriod(&zone->treeZone, zone->blockMap->currentEraPoint); - finishCompletion(parent, VDO_SUCCESS); -} - -/** - * Schedule an era advance if necessary. This method should not be called - * directly. Rather, call scheduleDefaultAction() on the block map's action - * manager. - * - *
Implements ActionScheduler. - **/ -static bool scheduleEraAdvance(void *context) -{ - BlockMap *map = context; - if (map->currentEraPoint == map->pendingEraPoint) { - return false; - } - - return scheduleAction(map->actionManager, prepareForEraAdvance, - advanceBlockMapZoneEra, NULL, NULL); -} - -/**********************************************************************/ -int makeBlockMapCaches(BlockMap *map, - PhysicalLayer *layer, - ReadOnlyNotifier *readOnlyNotifier, - RecoveryJournal *journal, - Nonce nonce, - PageCount cacheSize, - BlockCount maximumAge) -{ - int result = ASSERT(cacheSize > 0, "block map cache size is specified"); - if (result != UDS_SUCCESS) { - return result; - } - - map->journal = journal; - map->nonce = nonce; - - result = makeForest(map, map->entryCount); - if (result != VDO_SUCCESS) { - return result; - } - - replaceForest(map); - for (ZoneCount zone = 0; zone < map->zoneCount; zone++) { - result = initializeBlockMapZone(&map->zones[zone], layer, readOnlyNotifier, - cacheSize / map->zoneCount, maximumAge); - if (result != VDO_SUCCESS) { - return result; - } - } - - return makeActionManager(map->zoneCount, getBlockMapZoneThreadID, - getRecoveryJournalThreadID(journal), map, - scheduleEraAdvance, layer, - &map->actionManager); -} - -/** - * Clean up a BlockMapZone. - * - * @param zone The zone to uninitialize - **/ -static void uninitializeBlockMapZone(BlockMapZone *zone) -{ - uninitializeBlockMapTreeZone(&zone->treeZone); - freeVDOPageCache(&zone->pageCache); -} - -/**********************************************************************/ -void freeBlockMap(BlockMap **mapPtr) -{ - BlockMap *map = *mapPtr; - if (map == NULL) { - return; - } - - for (ZoneCount zone = 0; zone < map->zoneCount; zone++) { - uninitializeBlockMapZone(&map->zones[zone]); - } - - abandonBlockMapGrowth(map); - freeForest(&map->forest); - freeActionManager(&map->actionManager); - - FREE(map); - *mapPtr = NULL; -} - -/**********************************************************************/ -size_t getBlockMapEncodedSize(void) -{ - return ENCODED_HEADER_SIZE + sizeof(BlockMapState2_0); -} - -/**********************************************************************/ -int encodeBlockMap(const BlockMap *map, Buffer *buffer) -{ - int result = encodeHeader(&BLOCK_MAP_HEADER_2_0, buffer); - if (result != UDS_SUCCESS) { - return result; - } - - size_t initialLength = contentLength(buffer); - - result = putUInt64LEIntoBuffer(buffer, BLOCK_MAP_FLAT_PAGE_ORIGIN); - if (result != UDS_SUCCESS) { - return result; - } - - result = putUInt64LEIntoBuffer(buffer, map->flatPageCount); - if (result != UDS_SUCCESS) { - return result; - } - - result = putUInt64LEIntoBuffer(buffer, map->rootOrigin); - if (result != UDS_SUCCESS) { - return result; - } - - result = putUInt64LEIntoBuffer(buffer, map->rootCount); - if (result != UDS_SUCCESS) { - return result; - } - - size_t encodedSize = contentLength(buffer) - initialLength; - return ASSERT(BLOCK_MAP_HEADER_2_0.size == encodedSize, - "encoded block map component size must match header size"); -} - -/**********************************************************************/ -void initializeBlockMapFromJournal(BlockMap *map, RecoveryJournal *journal) -{ - map->currentEraPoint = getCurrentJournalSequenceNumber(journal); - map->pendingEraPoint = map->currentEraPoint; - - for (ZoneCount zone = 0; zone < map->zoneCount; zone++) { - setTreeZoneInitialPeriod(&map->zones[zone].treeZone, map->currentEraPoint); - setVDOPageCacheInitialPeriod(map->zones[zone].pageCache, - 
map->currentEraPoint); - } -} - -/**********************************************************************/ -ZoneCount computeLogicalZone(DataVIO *dataVIO) -{ - BlockMap *map = getBlockMap(getVDOFromDataVIO(dataVIO)); - TreeLock *treeLock = &dataVIO->treeLock; - PageNumber pageNumber = computePageNumber(dataVIO->logical.lbn); - treeLock->treeSlots[0].pageIndex = pageNumber; - treeLock->rootIndex = pageNumber % map->rootCount; - return (treeLock->rootIndex % map->zoneCount); -} - -/**********************************************************************/ -void findBlockMapSlotAsync(DataVIO *dataVIO, - VDOAction *callback, - ThreadID threadID) -{ - BlockMap *map = getBlockMap(getVDOFromDataVIO(dataVIO)); - if (dataVIO->logical.lbn >= map->entryCount) { - finishDataVIO(dataVIO, VDO_OUT_OF_RANGE); - return; - } - - TreeLock *treeLock = &dataVIO->treeLock; - BlockMapTreeSlot *slot = &treeLock->treeSlots[0]; - slot->blockMapSlot.slot = computeSlot(dataVIO->logical.lbn); - if (slot->pageIndex < map->flatPageCount) { - slot->blockMapSlot.pbn = slot->pageIndex + BLOCK_MAP_FLAT_PAGE_ORIGIN; - launchCallback(dataVIOAsCompletion(dataVIO), callback, threadID); - return; - } - - treeLock->callback = callback; - treeLock->threadID = threadID; - lookupBlockMapPBN(dataVIO); -} - -/**********************************************************************/ -PageCount getNumberOfFixedBlockMapPages(const BlockMap *map) -{ - return (map->flatPageCount + map->rootCount); -} - -/**********************************************************************/ -BlockCount getNumberOfBlockMapEntries(const BlockMap *map) -{ - return map->entryCount; -} - -/**********************************************************************/ -void advanceBlockMapEra(BlockMap *map, SequenceNumber recoveryBlockNumber) -{ - if (map == NULL) { - return; - } - - map->pendingEraPoint = recoveryBlockNumber; - scheduleDefaultAction(map->actionManager); -} - -/**********************************************************************/ -void checkForDrainComplete(BlockMapZone *zone) -{ - if (isDraining(&zone->state) - && !isTreeZoneActive(&zone->treeZone) - && !isPageCacheActive(zone->pageCache)) { - finishDrainingWithResult(&zone->state, - (isReadOnly(zone->readOnlyNotifier) - ? VDO_READ_ONLY : VDO_SUCCESS)); - } -} - -/** - * Initiate a drain of the trees and page cache of a block map zone. - * - * Implements AdminInitiator - **/ -static void initiateDrain(AdminState *state) -{ - BlockMapZone *zone = container_of(state, BlockMapZone, state); - drainZoneTrees(&zone->treeZone); - drainVDOPageCache(zone->pageCache); - checkForDrainComplete(zone); -} - -/** - * Drain a zone of the block map. - * - *
Implements ZoneAction. - **/ -static void drainZone(void *context, - ZoneCount zoneNumber, - VDOCompletion *parent) -{ - BlockMapZone *zone = getBlockMapZone(context, zoneNumber); - startDraining(&zone->state, - getCurrentManagerOperation(zone->blockMap->actionManager), - parent, initiateDrain); -} - -/**********************************************************************/ -void drainBlockMap(BlockMap *map, - AdminStateCode operation, - VDOCompletion *parent) -{ - scheduleOperation(map->actionManager, operation, NULL, drainZone, NULL, - parent); -} - -/** - * Resume a zone of the block map. - * - *
Implements ZoneAction. - **/ -static void resumeBlockMapZone(void *context, - ZoneCount zoneNumber, - VDOCompletion *parent) -{ - BlockMapZone *zone = getBlockMapZone(context, zoneNumber); - finishCompletion(parent, resumeIfQuiescent(&zone->state)); -} - -/**********************************************************************/ -void resumeBlockMap(BlockMap *map, VDOCompletion *parent) -{ - scheduleOperation(map->actionManager, ADMIN_STATE_RESUMING, NULL, - resumeBlockMapZone, NULL, parent); -} - -/**********************************************************************/ -int prepareToGrowBlockMap(BlockMap *map, BlockCount newLogicalBlocks) -{ - if (map->nextEntryCount == newLogicalBlocks) { - return VDO_SUCCESS; - } - - if (map->nextEntryCount > 0) { - abandonBlockMapGrowth(map); - } - - if (newLogicalBlocks < map->entryCount) { - map->nextEntryCount = map->entryCount; - return VDO_SUCCESS; - } - - return makeForest(map, newLogicalBlocks); -} - -/**********************************************************************/ -BlockCount getNewEntryCount(BlockMap *map) -{ - return map->nextEntryCount; -} - -/** - * Grow the block map by replacing the forest with the one which was prepared. - * - * Implements ActionPreamble - **/ -static void growForest(void *context, VDOCompletion *completion) -{ - replaceForest(context); - completeCompletion(completion); -} - -/**********************************************************************/ -void growBlockMap(BlockMap *map, VDOCompletion *parent) -{ - scheduleOperation(map->actionManager, ADMIN_STATE_SUSPENDED_OPERATION, - growForest, NULL, NULL, parent); -} - -/**********************************************************************/ -void abandonBlockMapGrowth(BlockMap *map) -{ - abandonForest(map); -} - -/** - * Finish processing a block map get or put operation. This function releases - * the page completion and then continues the requester. - * - * @param completion The completion for the page fetch - * @param result The result of the block map operation - **/ -static inline void finishProcessingPage(VDOCompletion *completion, int result) -{ - VDOCompletion *parent = completion->parent; - releaseVDOPageCompletion(completion); - continueCompletion(parent, result); -} - -/** - * Handle an error fetching a page from the cache. This error handler is - * registered in setupMappedBlock(). - * - * @param completion The page completion which got an error - **/ -static void handlePageError(VDOCompletion *completion) -{ - finishProcessingPage(completion, completion->result); -} - -/** - * Get the mapping page for a get/put mapped block operation and dispatch to - * the appropriate handler. - * - * @param dataVIO The dataVIO - * @param modifiable Whether we intend to modify the mapping - * @param action The handler to process the mapping page - **/ -static void setupMappedBlock(DataVIO *dataVIO, - bool modifiable, - VDOAction *action) -{ - BlockMapZone *zone = getBlockMapForZone(dataVIO->logical.zone); - if (isDraining(&zone->state)) { - finishDataVIO(dataVIO, VDO_SHUTTING_DOWN); - return; - } - - initVDOPageCompletion(&dataVIO->pageCompletion, zone->pageCache, - dataVIO->treeLock.treeSlots[0].blockMapSlot.pbn, - modifiable, dataVIOAsCompletion(dataVIO), action, - handlePageError); - getVDOPageAsync(&dataVIO->pageCompletion.completion); -} - -/** - * Decode and validate a block map entry and attempt to use it to set the - * mapped location of a DataVIO. 
- * - * @param dataVIO The DataVIO to update with the map entry - * @param entry The block map entry for the logical block - * - * @return VDO_SUCCESS or VDO_BAD_MAPPING if the map entry is invalid - * or an error code for any other failure - **/ -__attribute__((warn_unused_result)) -static int setMappedEntry(DataVIO *dataVIO, const BlockMapEntry *entry) -{ - // Unpack the PBN for logging purposes even if the entry is invalid. - DataLocation mapped = unpackBlockMapEntry(entry); - - if (isValidLocation(&mapped)) { - int result = setMappedLocation(dataVIO, mapped.pbn, mapped.state); - /* - * Return success and all errors not specifically known to be errors from - * validating the location. Yes, this expression is redundant; it is - * intentional. - */ - if ((result == VDO_SUCCESS) - || ((result != VDO_OUT_OF_RANGE) && (result != VDO_BAD_MAPPING))) { - return result; - } - } - - // Log the corruption even if we wind up ignoring it for write VIOs, - // converting all cases to VDO_BAD_MAPPING. - logErrorWithStringError(VDO_BAD_MAPPING, "PBN %" PRIu64 - " with state %u read from the block map was invalid", - mapped.pbn, mapped.state); - - // A read VIO has no option but to report the bad mapping--reading - // zeros would be hiding known data loss. - if (isReadDataVIO(dataVIO)) { - return VDO_BAD_MAPPING; - } - - // A write VIO only reads this mapping to decref the old block. Treat - // this as an unmapped entry rather than fail the write. - clearMappedLocation(dataVIO); - return VDO_SUCCESS; -} - -/** - * This callback is registered in getMappedBlockAsync(). - **/ -static void getMappingFromFetchedPage(VDOCompletion *completion) -{ - if (completion->result != VDO_SUCCESS) { - finishProcessingPage(completion, completion->result); - return; - } - - const BlockMapPage *page = dereferenceReadableVDOPage(completion); - int result = ASSERT(page != NULL, "page available"); - if (result != VDO_SUCCESS) { - finishProcessingPage(completion, result); - return; - } - - DataVIO *dataVIO = asDataVIO(completion->parent); - BlockMapTreeSlot *treeSlot = &dataVIO->treeLock.treeSlots[0]; - const BlockMapEntry *entry = &page->entries[treeSlot->blockMapSlot.slot]; - - result = setMappedEntry(dataVIO, entry); - finishProcessingPage(completion, result); -} - -/** - * This callback is registered in putMappedBlockAsync(). - **/ -static void putMappingInFetchedPage(VDOCompletion *completion) -{ - if (completion->result != VDO_SUCCESS) { - finishProcessingPage(completion, completion->result); - return; - } - - BlockMapPage *page = dereferenceWritableVDOPage(completion); - int result = ASSERT(page != NULL, "page available"); - if (result != VDO_SUCCESS) { - finishProcessingPage(completion, result); - return; - } - - DataVIO *dataVIO = asDataVIO(completion->parent); - BlockMapPageContext *context = getVDOPageCompletionContext(completion); - SequenceNumber oldLock = context->recoveryLock; - updateBlockMapPage(page, dataVIO, dataVIO->newMapped.pbn, - dataVIO->newMapped.state, &context->recoveryLock); - markCompletedVDOPageDirty(completion, oldLock, context->recoveryLock); - finishProcessingPage(completion, VDO_SUCCESS); -} - -/**********************************************************************/ -void getMappedBlockAsync(DataVIO *dataVIO) -{ - if (dataVIO->treeLock.treeSlots[0].blockMapSlot.pbn == ZERO_BLOCK) { - // We know that the block map page for this LBN has not been allocated, - // so the block must be unmapped. 
- clearMappedLocation(dataVIO); - continueDataVIO(dataVIO, VDO_SUCCESS); - return; - } - - setupMappedBlock(dataVIO, false, getMappingFromFetchedPage); -} - -/**********************************************************************/ -void putMappedBlockAsync(DataVIO *dataVIO) -{ - setupMappedBlock(dataVIO, true, putMappingInFetchedPage); -} - -/**********************************************************************/ -BlockMapStatistics getBlockMapStatistics(BlockMap *map) -{ - BlockMapStatistics stats; - memset(&stats, 0, sizeof(BlockMapStatistics)); - - for (ZoneCount zone = 0; zone < map->zoneCount; zone++) { - const AtomicPageCacheStatistics *atoms - = getVDOPageCacheStatistics(map->zones[zone].pageCache); - stats.dirtyPages += atomicLoad64(&atoms->counts.dirtyPages); - stats.cleanPages += atomicLoad64(&atoms->counts.cleanPages); - stats.freePages += atomicLoad64(&atoms->counts.freePages); - stats.failedPages += atomicLoad64(&atoms->counts.failedPages); - stats.incomingPages += atomicLoad64(&atoms->counts.incomingPages); - stats.outgoingPages += atomicLoad64(&atoms->counts.outgoingPages); - - stats.cachePressure += atomicLoad64(&atoms->cachePressure); - stats.readCount += atomicLoad64(&atoms->readCount); - stats.writeCount += atomicLoad64(&atoms->writeCount); - stats.failedReads += atomicLoad64(&atoms->failedReads); - stats.failedWrites += atomicLoad64(&atoms->failedWrites); - stats.reclaimed += atomicLoad64(&atoms->reclaimed); - stats.readOutgoing += atomicLoad64(&atoms->readOutgoing); - stats.foundInCache += atomicLoad64(&atoms->foundInCache); - stats.discardRequired += atomicLoad64(&atoms->discardRequired); - stats.waitForPage += atomicLoad64(&atoms->waitForPage); - stats.fetchRequired += atomicLoad64(&atoms->fetchRequired); - stats.pagesLoaded += atomicLoad64(&atoms->pagesLoaded); - stats.pagesSaved += atomicLoad64(&atoms->pagesSaved); - stats.flushCount += atomicLoad64(&atoms->flushCount); - } - - return stats; -} diff --git a/vdo/base/blockMap.h b/vdo/base/blockMap.h deleted file mode 100644 index 48073a9..0000000 --- a/vdo/base/blockMap.h +++ /dev/null @@ -1,290 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMap.h#4 $ - */ - -#ifndef BLOCK_MAP_H -#define BLOCK_MAP_H - -#include "adminState.h" -#include "blockMapEntry.h" -#include "completion.h" -#include "fixedLayout.h" -#include "statistics.h" -#include "types.h" - -/** - * Create a block map. 
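The loop in getBlockMapStatistics() above folds each zone's atomic page-cache counters into one snapshot returned by value. As a caller-side sketch (not part of the original sources; the helper name and log format are illustrative assumptions), a monitoring path could report a few of those counters:

    /* Illustrative only: assumes the BlockMapStatistics counters are 64-bit
     * values, as the atomicLoad64() calls above suggest. */
    static void logBlockMapCacheUsage(BlockMap *map)
    {
      BlockMapStatistics stats = getBlockMapStatistics(map);
      logInfo("block map cache: %llu dirty, %llu clean, %llu free pages",
              (unsigned long long) stats.dirtyPages,
              (unsigned long long) stats.cleanPages,
              (unsigned long long) stats.freePages);
    }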
- * - * @param [in] logicalBlocks The number of logical blocks for the VDO - * @param [in] threadConfig The thread configuration of the VDO - * @param [in] flatPageCount The number of flat pages - * @param [in] rootOrigin The absolute PBN of the first root page - * @param [in] rootCount The number of tree roots - * @param [out] mapPtr The pointer to hold the new block map - * - * @return VDO_SUCCESS or an error code - **/ -int makeBlockMap(BlockCount logicalBlocks, - const ThreadConfig *threadConfig, - BlockCount flatPageCount, - PhysicalBlockNumber rootOrigin, - BlockCount rootCount, - BlockMap **mapPtr) - __attribute__((warn_unused_result)); - -/** - * Quiesce all block map I/O, possibly writing out all dirty metadata. - * - * @param map The block map to drain - * @param operation The type of drain to perform - * @param parent The completion to notify when the drain is complete - **/ -void drainBlockMap(BlockMap *map, - AdminStateCode operation, - VDOCompletion *parent); - -/** - * Resume I/O for a quiescent block map. - * - * @param map The block map to resume - * @param parent The completion to notify when the resume is complete - **/ -void resumeBlockMap(BlockMap *map, VDOCompletion *parent); - -/** - * Prepare to grow the block map by allocating an expanded collection of trees. - * - * @param map The block map to grow - * @param newLogicalBlocks The new logical size of the VDO - * - * @return VDO_SUCCESS or an error - **/ -int prepareToGrowBlockMap(BlockMap *map, BlockCount newLogicalBlocks) - __attribute__((warn_unused_result)); - -/** - * Get the logical size to which this block map is prepared to grow. - * - * @param map The block map - * - * @return The new number of entries the block map will be grown to or 0 if - * the block map is not prepared to grow - **/ -BlockCount getNewEntryCount(BlockMap *map) - __attribute__((warn_unused_result)); - -/** - * Grow a block map on which prepareToGrowBlockMap() has already been called. - * - * @param map The block map to grow - * @param parent The object to notify when the growth is complete - **/ -void growBlockMap(BlockMap *map, VDOCompletion *parent); - -/** - * Abandon any preparations which were made to grow this block map. - * - * @param map The map which won't be grown - **/ -void abandonBlockMapGrowth(BlockMap *map); - -/** - * Decode the state of a block map saved in a buffer, without creating page - * caches. - * - * @param [in] buffer A buffer containing the super block state - * @param [in] logicalBlocks The number of logical blocks for the VDO - * @param [in] threadConfig The thread configuration of the VDO - * @param [out] mapPtr The pointer to hold the new block map - * - * @return VDO_SUCCESS or an error code - **/ -int decodeBlockMap(Buffer *buffer, - BlockCount logicalBlocks, - const ThreadConfig *threadConfig, - BlockMap **mapPtr) - __attribute__((warn_unused_result)); - -/** - * Create a block map from the saved state of a Sodium block map, and do any - * necessary upgrade work. - * - * @param [in] buffer A buffer containing the super block state - * @param [in] logicalBlocks The number of logical blocks for the VDO - * @param [in] threadConfig The thread configuration of the VDO - * @param [out] mapPtr The pointer to hold the new block map - * - * @return VDO_SUCCESS or an error code - **/ -int decodeSodiumBlockMap(Buffer *buffer, - BlockCount logicalBlocks, - const ThreadConfig *threadConfig, - BlockMap **mapPtr) - __attribute__((warn_unused_result)); - -/** - * Allocate the page caches for a block map. 
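The declarations above give the full grow lifecycle: prepareToGrowBlockMap() builds the expanded forest, growBlockMap() swaps it in once the VDO is suspended, and abandonBlockMapGrowth() discards an unused preparation. A hedged caller-side sketch (the wrapper name and its error policy are assumptions, not code from this patch):

    /* Hypothetical caller: prepare the larger forest synchronously, then
     * either launch the grow action or abandon the preparation on error. */
    static void growOrAbandon(BlockMap *map,
                              BlockCount newLogicalBlocks,
                              VDOCompletion *parent)
    {
      int result = prepareToGrowBlockMap(map, newLogicalBlocks);
      if (result != VDO_SUCCESS) {
        abandonBlockMapGrowth(map);
        finishCompletion(parent, result);
        return;
      }

      // growBlockMap() notifies 'parent' once the new forest is in place.
      growBlockMap(map, parent);
    }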
- * - * @param map The block map needing caches. - * @param layer The physical layer for the cache - * @param readOnlyNotifier The read only mode context - * @param journal The recovery journal (may be NULL) - * @param nonce The nonce to distinguish initialized pages - * @param cacheSize The block map cache size, in pages - * @param maximumAge The number of journal blocks before a dirtied page - * is considered old and must be written out - * - * @return VDO_SUCCESS or an error code - **/ -int makeBlockMapCaches(BlockMap *map, - PhysicalLayer *layer, - ReadOnlyNotifier *readOnlyNotifier, - RecoveryJournal *journal, - Nonce nonce, - PageCount cacheSize, - BlockCount maximumAge) - __attribute__((warn_unused_result)); - -/** - * Free a block map and null out the reference to it. - * - * @param mapPtr A pointer to the block map to free - **/ -void freeBlockMap(BlockMap **mapPtr); - -/** - * Get the size of the encoded state of a block map. - * - * @return The encoded size of the map's state - **/ -size_t getBlockMapEncodedSize(void) - __attribute__((warn_unused_result)); - -/** - * Encode the state of a block map into a buffer. - * - * @param map The block map to encode - * @param buffer The buffer to encode into - * - * @return UDS_SUCCESS or an error - **/ -int encodeBlockMap(const BlockMap *map, Buffer *buffer) - __attribute__((warn_unused_result)); - -/** - * Obtain any necessary state from the recovery journal that is needed for - * normal block map operation. - * - * @param map The map in question - * @param journal The journal to initialize from - **/ -void initializeBlockMapFromJournal(BlockMap *map, RecoveryJournal *journal); - -/** - * Get the portion of the block map for a given logical zone. - * - * @param map The map - * @param zoneNumber The number of the zone - * - * @return The requested block map zone - **/ -BlockMapZone *getBlockMapZone(BlockMap *map, ZoneCount zoneNumber) - __attribute__((warn_unused_result)); - -/** - * Compute the logical zone on which the entry for a DataVIO - * resides - * - * @param dataVIO The DataVIO - * - * @return The logical zone number for the DataVIO - **/ -ZoneCount computeLogicalZone(DataVIO *dataVIO); - -/** - * Compute the block map slot in which the block map entry for a DataVIO - * resides, and cache that number in the DataVIO. - * - * @param dataVIO The DataVIO - * @param callback The function to call once the slot has been found - * @param threadID The thread on which to run the callback - **/ -void findBlockMapSlotAsync(DataVIO *dataVIO, - VDOAction *callback, - ThreadID threadID); - -/** - * Get number of block map pages at predetermined locations. - * - * @param map The block map - * - * @return The number of fixed pages used by the map - **/ -PageCount getNumberOfFixedBlockMapPages(const BlockMap *map) - __attribute__((warn_unused_result)); - -/** - * Get number of block map entries. - * - * @param map The block map - * - * @return The number of entries stored in the map - **/ -BlockCount getNumberOfBlockMapEntries(const BlockMap *map) - __attribute__((warn_unused_result)); - -/** - * Notify the block map that the recovery journal has finished a new block. - * This method must be called from the journal zone thread. 
- * - * @param map The block map - * @param recoveryBlockNumber The sequence number of the finished recovery - * journal block - **/ -void advanceBlockMapEra(BlockMap *map, SequenceNumber recoveryBlockNumber); - -/** - * Get the block number of the physical block containing the data for the - * specified logical block number. All blocks are mapped to physical block - * zero by default, which is conventionally the zero block. - * - * @param dataVIO The DataVIO of the block to map - **/ -void getMappedBlockAsync(DataVIO *dataVIO); - -/** - * Associate the logical block number for a block represented by a DataVIO - * with the physical block number in its newMapped field. - * - * @param dataVIO The DataVIO of the block to map - **/ -void putMappedBlockAsync(DataVIO *dataVIO); - -/** - * Get the stats for the block map page cache. - * - * @param map The block map containing the cache - * - * @return The block map statistics - **/ -BlockMapStatistics getBlockMapStatistics(BlockMap *map) - __attribute__((warn_unused_result)); - -#endif // BLOCK_MAP_H diff --git a/vdo/base/blockMapEntry.h b/vdo/base/blockMapEntry.h deleted file mode 100644 index 78304e9..0000000 --- a/vdo/base/blockMapEntry.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapEntry.h#4 $ - */ - -#ifndef BLOCK_MAP_ENTRY_H -#define BLOCK_MAP_ENTRY_H - -#include "blockMappingState.h" -#include "constants.h" -#include "numeric.h" -#include "types.h" - -/** - * The entry for each logical block in the block map is encoded into five - * bytes, which saves space in both the on-disk and in-memory layouts. It - * consists of the 36 low-order bits of a PhysicalBlockNumber (addressing 256 - * terabytes with a 4KB block size) and a 4-bit encoding of a - * BlockMappingState. - **/ -typedef union __attribute__((packed)) blockMapEntry { - struct __attribute__((packed)) { - /** - * Bits 7..4: The four highest bits of the 36-bit physical block number - * Bits 3..0: The 4-bit BlockMappingState - **/ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - unsigned mappingState : 4; - unsigned pbnHighNibble : 4; -#else - unsigned pbnHighNibble : 4; - unsigned mappingState : 4; -#endif - - /** 32 low-order bits of the 36-bit PBN, in little-endian byte order */ - byte pbnLowWord[4]; - } fields; - - // A raw view of the packed encoding. - uint8_t raw[5]; - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - // This view is only valid on little-endian machines and is only present for - // ease of directly examining packed entries in GDB. 
- struct __attribute__((packed)) { - unsigned mappingState : 4; - unsigned pbnHighNibble : 4; - uint32_t pbnLowWord; - } littleEndian; -#endif -} BlockMapEntry; - -/** - * Unpack the fields of a BlockMapEntry, returning them as a DataLocation. - * - * @param entry A pointer to the entry to unpack - * - * @return the location of the data mapped by the block map entry - **/ -static inline DataLocation unpackBlockMapEntry(const BlockMapEntry *entry) -{ - PhysicalBlockNumber low32 = getUInt32LE(entry->fields.pbnLowWord); - PhysicalBlockNumber high4 = entry->fields.pbnHighNibble; - return (DataLocation) { - .pbn = ((high4 << 32) | low32), - .state = entry->fields.mappingState, - }; -} - -/**********************************************************************/ -static inline bool isMappedLocation(const DataLocation *location) -{ - return (location->state != MAPPING_STATE_UNMAPPED); -} - -/**********************************************************************/ -static inline bool isValidLocation(const DataLocation *location) -{ - if (location->pbn == ZERO_BLOCK) { - return !isCompressed(location->state); - } else { - return isMappedLocation(location); - } -} - -/** - * Pack a PhysicalBlockNumber into a BlockMapEntry. - * - * @param pbn The physical block number to convert to its - * packed five-byte representation - * @param mappingState The mapping state of the block - * - * @return the packed representation of the block number and mapping state - * - * @note unrepresentable high bits of the unpacked PBN are silently truncated - **/ -static inline BlockMapEntry packPBN(PhysicalBlockNumber pbn, - BlockMappingState mappingState) -{ - BlockMapEntry entry; - entry.fields.mappingState = (mappingState & 0x0F); - entry.fields.pbnHighNibble = ((pbn >> 32) & 0x0F), - storeUInt32LE(entry.fields.pbnLowWord, pbn & UINT_MAX); - return entry; -} - -#endif // BLOCK_MAP_ENTRY_H diff --git a/vdo/base/blockMapInternals.h b/vdo/base/blockMapInternals.h deleted file mode 100644 index 9b2f7a5..0000000 --- a/vdo/base/blockMapInternals.h +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapInternals.h#12 $ - */ - -#ifndef BLOCK_MAP_INTERNALS_H -#define BLOCK_MAP_INTERNALS_H - -#include "adminState.h" -#include "blockMapEntry.h" -#include "blockMapTree.h" -#include "completion.h" -#include "dirtyLists.h" -#include "header.h" -#include "intMap.h" -#include "ringNode.h" -#include "types.h" -#include "vdoPageCache.h" -#include "vioPool.h" - -/** - * The per-zone fields used by the block map tree. 
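The five-byte encoding described above packs the 36 low-order bits of a physical block number together with a four-bit mapping state. A minimal round-trip sketch using only the helpers declared in blockMapEntry.h (a standalone illustration, not code from the patch; it assumes the state value fits in four bits):

    /* Sketch: pack then unpack an entry.  Only the low 36 bits of the PBN
     * are representable, as noted for packPBN() above. */
    static bool entryRoundTrips(PhysicalBlockNumber pbn,
                                BlockMappingState state)
    {
      BlockMapEntry packed   = packPBN(pbn, state);
      DataLocation  unpacked = unpackBlockMapEntry(&packed);
      return ((unpacked.pbn == (pbn & 0xFFFFFFFFFULL))   // 36-bit mask
              && (unpacked.state == state));
    }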
- **/ -struct blockMapTreeZone { - /** The BlockMapZone which owns this tree zone */ - BlockMapZone *mapZone; - /** The lists of dirty tree pages */ - DirtyLists *dirtyLists; - /** The number of tree lookups in progress */ - VIOCount activeLookups; - /** The map of pages currently being loaded */ - IntMap *loadingPages; - /** The pool of VIOs for tree I/O */ - VIOPool *vioPool; - /** The tree page which has issued or will be issuing a flush */ - TreePage *flusher; - /** The queue of pages waiting for a flush so they can be written out */ - WaitQueue flushWaiters; - /** The generation after the most recent flush */ - uint8_t generation; - /** The oldest active generation */ - uint8_t oldestGeneration; - /** The counts of dirty pages in each generation */ - uint32_t dirtyPageCounts[256]; -}; - -/** - * The per-zone fields of the block map. - **/ -struct blockMapZone { - /** The number of the zone this is */ - ZoneCount zoneNumber; - /** The ID of this zone's logical thread */ - ThreadID threadID; - /** The BlockMap which owns this BlockMapZone */ - BlockMap *blockMap; - /** The ReadOnlyNotifier of the VDO */ - ReadOnlyNotifier *readOnlyNotifier; - /** The page cache for this zone */ - VDOPageCache *pageCache; - /** The per-zone portion of the tree for this zone */ - BlockMapTreeZone treeZone; - /** The administrative state of the zone */ - AdminState state; -}; - -struct blockMap { - /** The manager for block map actions */ - ActionManager *actionManager; - /** The count of pages in the linear part of the block map */ - BlockCount flatPageCount; - /** The absolute PBN of the first root of the tree part of the block map */ - PhysicalBlockNumber rootOrigin; - /** The count of root pages of the tree part of the block map */ - BlockCount rootCount; - - /** The era point we are currently distributing to the zones */ - SequenceNumber currentEraPoint; - /** The next era point, not yet distributed to any zone */ - SequenceNumber pendingEraPoint; - - /** The number of entries in block map */ - BlockCount entryCount; - /** The VDO's nonce, for the pages */ - Nonce nonce; - /** The recovery journal for this map */ - RecoveryJournal *journal; - - /** The trees for finding block map pages */ - Forest *forest; - /** The expanded trees awaiting growth */ - Forest *nextForest; - /** The number of entries after growth */ - BlockCount nextEntryCount; - - /** The number of logical zones */ - ZoneCount zoneCount; - /** The per zone block map structure */ - BlockMapZone zones[]; -}; - -/** - * Compute the number of pages required for a block map with the specified - * parameters. - * - * @param entries The number of block map entries - * - * @return The number of pages required - **/ -PageCount computeBlockMapPageCount(BlockCount entries); - -/** - * Compute the number of the block map page on which the entry for a given - * logical block resides. - * - * @param lbn The logical block number whose page is desired - * - * @return The number of the block map page containing the entry for - * the given logical block number - **/ -__attribute__((warn_unused_result)) -static inline PageNumber computePageNumber(LogicalBlockNumber lbn) -{ - return (lbn / BLOCK_MAP_ENTRIES_PER_PAGE); -} - -/** - * Find the block map page slot in which the entry for a given logical - * block resides. 
- * - * @param lbn The logical block number whose slot - * - * @return The slot containing the entry for the given logical block number - **/ -__attribute__((warn_unused_result)) -static inline SlotNumber computeSlot(LogicalBlockNumber lbn) -{ - return (lbn % BLOCK_MAP_ENTRIES_PER_PAGE); -} - -/** - * Check whether a zone of the block map has drained, and if so, send a - * notification thereof. - * - * @param zone The zone to check - **/ -void checkForDrainComplete(BlockMapZone *zone); - - -#endif // BLOCK_MAP_INTERNALS_H diff --git a/vdo/base/blockMapPage.c b/vdo/base/blockMapPage.c deleted file mode 100644 index 8272e12..0000000 --- a/vdo/base/blockMapPage.c +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapPage.c#8 $ - */ - -#include "blockMapPage.h" - -#include "permassert.h" - -#include "blockMap.h" -#include "blockMapInternals.h" -#include "blockMapTree.h" -#include "constants.h" -#include "dataVIO.h" -#include "recoveryJournal.h" -#include "statusCodes.h" -#include "types.h" - -enum { - PAGE_HEADER_4_1_SIZE = 8 + 8 + 8 + 1 + 1 + 1 + 1, -}; - -static const VersionNumber BLOCK_MAP_4_1 = { - .majorVersion = 4, - .minorVersion = 1, -}; - -/**********************************************************************/ -bool isCurrentBlockMapPage(const BlockMapPage *page) -{ - return areSameVersion(BLOCK_MAP_4_1, unpackVersionNumber(page->version)); -} - -/**********************************************************************/ -BlockMapPage *formatBlockMapPage(void *buffer, - Nonce nonce, - PhysicalBlockNumber pbn, - bool initialized) -{ - memset(buffer, 0, VDO_BLOCK_SIZE); - BlockMapPage *page = (BlockMapPage *) buffer; - page->version = packVersionNumber(BLOCK_MAP_4_1); - storeUInt64LE(page->header.fields.nonce, nonce); - storeUInt64LE(page->header.fields.pbn, pbn); - page->header.fields.initialized = initialized; - return page; -} - -/**********************************************************************/ -BlockMapPageValidity validateBlockMapPage(BlockMapPage *page, - Nonce nonce, - PhysicalBlockNumber pbn) -{ - // Make sure the page layout isn't accidentally changed by changing the - // length of the page header. 
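The two helpers above split a logical block number into a block map page and a slot within that page. A short standalone sketch of the invariant they maintain (illustrative only; the concrete value of BLOCK_MAP_ENTRIES_PER_PAGE comes from constants.h and is not assumed here):

    /* Sketch: page/slot decomposition of an LBN.  Recombining the two
     * results always reproduces the original logical block number. */
    static void checkDecomposition(LogicalBlockNumber lbn)
    {
      PageNumber pageNumber = computePageNumber(lbn);
      SlotNumber slot       = computeSlot(lbn);
      ASSERT_LOG_ONLY((lbn == (((LogicalBlockNumber) pageNumber
                                * BLOCK_MAP_ENTRIES_PER_PAGE) + slot)),
                      "page and slot recombine to the original LBN");
    }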
- STATIC_ASSERT_SIZEOF(PageHeader, PAGE_HEADER_4_1_SIZE); - - if (!areSameVersion(BLOCK_MAP_4_1, unpackVersionNumber(page->version)) - || !isBlockMapPageInitialized(page) - || (nonce != getUInt64LE(page->header.fields.nonce))) { - return BLOCK_MAP_PAGE_INVALID; - } - - if (pbn != getBlockMapPagePBN(page)) { - return BLOCK_MAP_PAGE_BAD; - } - - return BLOCK_MAP_PAGE_VALID; -} - -/**********************************************************************/ -void updateBlockMapPage(BlockMapPage *page, - DataVIO *dataVIO, - PhysicalBlockNumber pbn, - BlockMappingState mappingState, - SequenceNumber *recoveryLock) -{ - // Encode the new mapping. - TreeLock *treeLock = &dataVIO->treeLock; - SlotNumber slot = treeLock->treeSlots[treeLock->height].blockMapSlot.slot; - page->entries[slot] = packPBN(pbn, mappingState); - - // Adjust references (locks) on the recovery journal blocks. - BlockMapZone *zone = getBlockMapForZone(dataVIO->logical.zone); - BlockMap *blockMap = zone->blockMap; - RecoveryJournal *journal = blockMap->journal; - SequenceNumber oldLocked = *recoveryLock; - SequenceNumber newLocked = dataVIO->recoverySequenceNumber; - - if ((oldLocked == 0) || (oldLocked > newLocked)) { - // Acquire a lock on the newly referenced journal block. - acquireRecoveryJournalBlockReference(journal, newLocked, ZONE_TYPE_LOGICAL, - zone->zoneNumber); - - // If the block originally held a newer lock, release it. - if (oldLocked > 0) { - releaseRecoveryJournalBlockReference(journal, oldLocked, - ZONE_TYPE_LOGICAL, - zone->zoneNumber); - } - - *recoveryLock = newLocked; - } - - // Release the transferred lock from the DataVIO. - releasePerEntryLockFromOtherZone(journal, newLocked); - dataVIO->recoverySequenceNumber = 0; -} diff --git a/vdo/base/blockMapPage.h b/vdo/base/blockMapPage.h deleted file mode 100644 index ee011b3..0000000 --- a/vdo/base/blockMapPage.h +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapPage.h#8 $ - */ - -#ifndef BLOCK_MAP_PAGE_H -#define BLOCK_MAP_PAGE_H - -#include "numeric.h" - -#include "blockMapEntry.h" -#include "header.h" -#include "types.h" - -/** - * The packed, on-disk representation of a block map page header. - **/ -typedef union __attribute__((packed)) { - struct __attribute__((packed)) { - /** - * The 64-bit nonce of the current VDO, in little-endian byte order. Used - * to determine whether or not a page has been formatted. - **/ - byte nonce[8]; - - /** The 64-bit PBN of this page, in little-endian byte order */ - byte pbn[8]; - - /** Formerly recoverySequenceNumber; may be non-zero on disk */ - byte unusedLongWord[8]; - - /** Whether this page has been initialized on disk (i.e. 
written twice) */ - bool initialized; - - /** Formerly entryOffset; now unused since it should always be zero */ - byte unusedByte1; - - /** Formerly interiorTreePageWriting; may be non-zero on disk */ - byte unusedByte2; - - /** Formerly generation (for dirty tree pages); may be non-zero on disk */ - byte unusedByte3; - } fields; - - // A raw view of the packed encoding. - uint8_t raw[8 + 8 + 8 + 1 + 1 + 1 + 1]; - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - // This view is only valid on little-endian machines and is only present for - // ease of directly examining packed entries in GDB. - struct __attribute__((packed)) { - uint64_t nonce; - PhysicalBlockNumber pbn; - uint64_t unusedLongWord; - bool initialized; - uint8_t unusedByte1; - uint8_t unusedByte2; - uint8_t unusedByte3; - } littleEndian; -#endif -} PageHeader; - -/** - * The format of a block map page. - **/ -typedef struct __attribute__((packed)) { - PackedVersionNumber version; - PageHeader header; - BlockMapEntry entries[]; -} BlockMapPage; - -typedef enum { - // A block map page is correctly initialized - BLOCK_MAP_PAGE_VALID, - // A block map page is uninitialized - BLOCK_MAP_PAGE_INVALID, - // A block map page is intialized, but is the wrong page - BLOCK_MAP_PAGE_BAD, -} BlockMapPageValidity; - -/** - * Check whether a block map page has been initialized. - * - * @param page The page to check - * - * @return true if the page has been initialized - **/ -__attribute__((warn_unused_result)) -static inline bool isBlockMapPageInitialized(const BlockMapPage *page) -{ - return page->header.fields.initialized; -} - -/** - * Mark whether a block map page has been initialized. - * - * @param page The page to mark - * @param initialized The state to set - * - * @return true if the initialized flag was modified - **/ -static inline bool markBlockMapPageInitialized(BlockMapPage *page, - bool initialized) -{ - if (initialized == page->header.fields.initialized) { - return false; - } - - page->header.fields.initialized = initialized; - return true; -} - -/** - * Get the physical block number where a block map page is stored. - * - * @param page The page to query - * - * @return the page's physical block number - **/ -__attribute__((warn_unused_result)) -static inline PhysicalBlockNumber getBlockMapPagePBN(const BlockMapPage *page) -{ - return getUInt64LE(page->header.fields.pbn); -} - -/** - * Check whether a block map page is of the current version. - * - * @param page The page to check - * - * @return true if the page has the current version - **/ -bool isCurrentBlockMapPage(const BlockMapPage *page) - __attribute__((warn_unused_result)); - -/** - * Format a block map page in memory. - * - * @param buffer The buffer which holds the page - * @param nonce The VDO nonce - * @param pbn The absolute PBN of the page - * @param initialized Whether the page should be marked as initialized - * - * @return the buffer pointer, as a block map page (for convenience) - **/ -BlockMapPage *formatBlockMapPage(void *buffer, - Nonce nonce, - PhysicalBlockNumber pbn, - bool initialized); - -/** - * Check whether a newly read page is valid, upgrading its in-memory format if - * possible and necessary. If the page is valid, clear fields which are not - * meaningful on disk. 
- * - * @param page The page to validate - * @param nonce The VDO nonce - * @param pbn The expected absolute PBN of the page - * - * @return The validity of the page - **/ -BlockMapPageValidity validateBlockMapPage(BlockMapPage *page, - Nonce nonce, - PhysicalBlockNumber pbn) - __attribute__((warn_unused_result)); - -/** - * Update an entry on a block map page. - * - * @param [in] page The page to update - * @param [in] dataVIO The DataVIO making the update - * @param [in] pbn The new PBN for the entry - * @param [in] mappingState The new mapping state for the entry - * @param [in,out] recoveryLock A reference to the current recovery sequence - * number lock held by the page. Will be updated - * if the lock changes to protect the new entry - **/ -void updateBlockMapPage(BlockMapPage *page, - DataVIO *dataVIO, - PhysicalBlockNumber pbn, - BlockMappingState mappingState, - SequenceNumber *recoveryLock); - -#endif // BLOCK_MAP_PAGE_H diff --git a/vdo/base/blockMapRecovery.c b/vdo/base/blockMapRecovery.c deleted file mode 100644 index f70be42..0000000 --- a/vdo/base/blockMapRecovery.c +++ /dev/null @@ -1,542 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapRecovery.c#7 $ - */ - -#include "blockMapRecovery.h" - -#include "logger.h" -#include "memoryAlloc.h" - -#include "blockMapInternals.h" -#include "blockMapPage.h" -#include "heap.h" -#include "numUtils.h" -#include "refCounts.h" -#include "slabDepot.h" -#include "types.h" -#include "vdoInternal.h" -#include "vdoPageCache.h" - -/** - * A completion to manage recovering the block map from the recovery journal. - * Note that the page completions kept in this structure are not immediately - * freed, so the corresponding pages will be locked down in the page cache - * until the recovery frees them. - **/ -typedef struct { - /** completion header */ - VDOCompletion completion; - /** the completion for flushing the block map */ - VDOCompletion subTaskCompletion; - /** the thread from which the block map may be flushed */ - ThreadID adminThread; - /** the thread on which all block map operations must be done */ - ThreadID logicalThreadID; - /** the block map */ - BlockMap *blockMap; - /** whether this recovery has been aborted */ - bool aborted; - /** whether we are currently launching the initial round of requests */ - bool launching; - - // Fields for the journal entries. - /** the journal entries to apply */ - NumberedBlockMapping *journalEntries; - /** - * a heap wrapping journalEntries. It re-orders and sorts journal entries in - * ascending LBN order, then original journal order. This permits efficient - * iteration over the journal entries in order. - **/ - Heap replayHeap; - - // Fields tracking progress through the journal entries. 
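blockMapPage.h above pairs formatBlockMapPage(), which lays out a fresh page in a zeroed VDO_BLOCK_SIZE buffer, with validateBlockMapPage(), which checks the version, nonce, initialization flag, and PBN of a page read back from disk. A standalone sketch of that pairing (illustrative only; the wrapper and its use of a caller-supplied buffer are assumptions):

    /* Sketch: a page formatted with a given nonce and PBN validates against
     * the same pair.  A wrong nonce reads back as BLOCK_MAP_PAGE_INVALID and
     * a wrong PBN as BLOCK_MAP_PAGE_BAD, per validateBlockMapPage() above. */
    static bool formattedPageValidates(char *buffer,   // VDO_BLOCK_SIZE bytes
                                       Nonce nonce,
                                       PhysicalBlockNumber pbn)
    {
      BlockMapPage *page = formatBlockMapPage(buffer, nonce, pbn, true);
      return (validateBlockMapPage(page, nonce, pbn) == BLOCK_MAP_PAGE_VALID);
    }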
- /** a pointer to the next journal entry to apply */ - NumberedBlockMapping *currentEntry; - /** the next entry for which the block map page has not been requested */ - NumberedBlockMapping *currentUnfetchedEntry; - - // Fields tracking requested pages. - /** the absolute PBN of the current page being processed */ - PhysicalBlockNumber pbn; - /** number of pending (non-ready) requests */ - PageCount outstanding; - /** number of page completions */ - PageCount pageCount; - /** array of requested, potentially ready page completions */ - VDOPageCompletion pageCompletions[]; -} BlockMapRecoveryCompletion; - -/** - * This is a HeapComparator function that orders NumberedBlockMappings using - * the 'blockMapSlot' field as the primary key and the mapping 'number' field - * as the secondary key. Using the mapping number preserves the journal order - * of entries for the same slot, allowing us to sort by slot while still - * ensuring we replay all entries with the same slot in the exact order as they - * appeared in the journal. - * - *
The comparator order is reversed from the usual sense since Heap is a - * max-heap, returning larger elements before smaller ones, but we want to pop - * entries off the heap in ascending LBN order. - **/ -static int compareMappings(const void *item1, const void *item2) -{ - const NumberedBlockMapping *mapping1 = (const NumberedBlockMapping *) item1; - const NumberedBlockMapping *mapping2 = (const NumberedBlockMapping *) item2; - - if (mapping1->blockMapSlot.pbn != mapping2->blockMapSlot.pbn) { - return - ((mapping1->blockMapSlot.pbn < mapping2->blockMapSlot.pbn) ? 1 : -1); - } - - if (mapping1->blockMapSlot.slot != mapping2->blockMapSlot.slot) { - return - ((mapping1->blockMapSlot.slot < mapping2->blockMapSlot.slot) ? 1 : -1); - } - - if (mapping1->number != mapping2->number) { - return ((mapping1->number < mapping2->number) ? 1 : -1); - } - - return 0; -} - -/** - * Swap two NumberedBlockMapping structures. Implements HeapSwapper. - **/ -static void swapMappings(void *item1, void *item2) -{ - NumberedBlockMapping *mapping1 = item1; - NumberedBlockMapping *mapping2 = item2; - NumberedBlockMapping temp = *mapping1; - *mapping1 = *mapping2; - *mapping2 = temp; -} - -/** - * Convert a VDOCompletion to a BlockMapRecoveryCompletion. - * - * @param completion The completion to convert - * - * @return The completion as a BlockMapRecoveryCompletion - **/ -__attribute__((warn_unused_result)) -static inline BlockMapRecoveryCompletion * -asBlockMapRecoveryCompletion(VDOCompletion *completion) -{ - STATIC_ASSERT(offsetof(BlockMapRecoveryCompletion, completion) == 0); - assertCompletionType(completion->type, BLOCK_MAP_RECOVERY_COMPLETION); - return (BlockMapRecoveryCompletion *) completion; -} - -/** - * Free a BlockMapRecoveryCompletion and null out the reference to it. - * - * @param completionPtr a pointer to the completion to free - **/ -static void freeRecoveryCompletion(VDOCompletion **completionPtr) -{ - VDOCompletion *completion = *completionPtr; - if (completion == NULL) { - return; - } - - BlockMapRecoveryCompletion *recovery - = asBlockMapRecoveryCompletion(*completionPtr); - destroyEnqueueable(completion); - destroyEnqueueable(&recovery->subTaskCompletion); - FREE(recovery); - *completionPtr = NULL; -} - -/** - * Free the BlockMapRecoveryCompletion and notify the parent that the block map - * recovery is done. This callback is registered in makeRecoveryCompletion(). - * - * @param completion The BlockMapRecoveryCompletion - **/ -static void finishBlockMapRecovery(VDOCompletion *completion) -{ - int result = completion->result; - VDOCompletion *parent = completion->parent; - freeRecoveryCompletion(&completion); - finishCompletion(parent, result); -} - -/** - * Make a new block map recovery completion. 
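Because Heap is a max-heap, compareMappings() above deliberately reports the mapping with the smaller page, slot, and journal number as the "larger" element, so entries pop off in ascending LBN order while journal order is preserved within a slot. A small standalone illustration (the literal field values are invented for the example):

    /* Sketch: within the same block map page, the mapping in the lower slot
     * compares as greater, so the max-heap yields it first. */
    static void illustrateComparatorOrder(void)
    {
      NumberedBlockMapping earlier = {
        .blockMapSlot = { .pbn = 10, .slot = 3 },
        .number       = 7,
      };
      NumberedBlockMapping later = {
        .blockMapSlot = { .pbn = 10, .slot = 4 },
        .number       = 2,
      };

      ASSERT_LOG_ONLY((compareMappings(&earlier, &later) > 0),
                      "lower slot compares as the larger heap element");
    }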
- * - * @param [in] vdo The VDO - * @param [in] entryCount The number of journal entries - * @param [in] journalEntries An array of journal entries to process - * @param [in] parent The parent of the recovery completion - * @param [out] recoveryPtr The new block map recovery completion - * - * @return a success or error code - **/ -static int makeRecoveryCompletion(VDO *vdo, - BlockCount entryCount, - NumberedBlockMapping *journalEntries, - VDOCompletion *parent, - BlockMapRecoveryCompletion **recoveryPtr) -{ - BlockMap *blockMap = getBlockMap(vdo); - PageCount pageCount - = minPageCount(getConfiguredCacheSize(vdo) >> 1, - MAXIMUM_SIMULTANEOUS_BLOCK_MAP_RESTORATION_READS); - - BlockMapRecoveryCompletion *recovery; - int result = ALLOCATE_EXTENDED(BlockMapRecoveryCompletion, pageCount, - VDOPageCompletion, __func__, &recovery); - if (result != UDS_SUCCESS) { - return result; - } - - result = initializeEnqueueableCompletion(&recovery->completion, - BLOCK_MAP_RECOVERY_COMPLETION, - vdo->layer); - if (result != VDO_SUCCESS) { - VDOCompletion *completion = &recovery->completion; - freeRecoveryCompletion(&completion); - return result; - } - - result = initializeEnqueueableCompletion(&recovery->subTaskCompletion, - SUB_TASK_COMPLETION, vdo->layer); - if (result != VDO_SUCCESS) { - VDOCompletion *completion = &recovery->completion; - freeRecoveryCompletion(&completion); - return result; - } - - recovery->blockMap = blockMap; - recovery->journalEntries = journalEntries; - recovery->pageCount = pageCount; - recovery->currentEntry = &recovery->journalEntries[entryCount - 1]; - - const ThreadConfig *threadConfig = getThreadConfig(vdo); - recovery->adminThread = getAdminThread(threadConfig); - recovery->logicalThreadID = getLogicalZoneThread(threadConfig, 0); - - // Organize the journal entries into a binary heap so we can iterate over - // them in sorted order incrementally, avoiding an expensive sort call. - initializeHeap(&recovery->replayHeap, compareMappings, swapMappings, - journalEntries, entryCount, sizeof(NumberedBlockMapping)); - buildHeap(&recovery->replayHeap, entryCount); - - ASSERT_LOG_ONLY((getCallbackThreadID() == recovery->logicalThreadID), - "%s must be called on logical thread %u (not %u)", __func__, - recovery->logicalThreadID, getCallbackThreadID()); - prepareCompletion(&recovery->completion, finishBlockMapRecovery, - finishBlockMapRecovery, recovery->logicalThreadID, parent); - - // This message must be recognizable by VDOTest::RebuildBase. - logInfo("Replaying %zu recovery entries into block map", - recovery->replayHeap.count); - - *recoveryPtr = recovery; - return VDO_SUCCESS; -} - -/**********************************************************************/ -static void flushBlockMap(VDOCompletion *completion) -{ - logInfo("Flushing block map changes"); - BlockMapRecoveryCompletion *recovery - = asBlockMapRecoveryCompletion(completion->parent); - ASSERT_LOG_ONLY((completion->callbackThreadID == recovery->adminThread), - "flushBlockMap() called on admin thread"); - - prepareToFinishParent(completion, completion->parent); - drainBlockMap(recovery->blockMap, ADMIN_STATE_RECOVERING, completion); -} - -/** - * Check whether the recovery is done. If so, finish it by either flushing the - * block map (if the recovery was successful), or by cleaning up (if it - * wasn't). 
- * - * @param recovery The recovery completion - * - * @return true if the recovery or recovery is complete - **/ -static bool finishIfDone(BlockMapRecoveryCompletion *recovery) -{ - // Pages are still being launched or there is still work to do - if (recovery->launching || (recovery->outstanding > 0) - || (!recovery->aborted - && (recovery->currentEntry >= recovery->journalEntries))) { - return false; - } - - if (recovery->aborted) { - /* - * We need to be careful here to only free completions that exist. But - * since we know none are outstanding, we just go through the ready ones. - */ - for (size_t i = 0; i < recovery->pageCount; i++) { - VDOPageCompletion *pageCompletion = &recovery->pageCompletions[i]; - if (recovery->pageCompletions[i].ready) { - releaseVDOPageCompletion(&pageCompletion->completion); - } - } - completeCompletion(&recovery->completion); - } else { - launchCallbackWithParent(&recovery->subTaskCompletion, flushBlockMap, - recovery->adminThread, &recovery->completion); - } - - return true; -} - -/** - * Note that there has been an error during the recovery and finish it if there - * is nothing else outstanding. - * - * @param recovery The BlockMapRecoveryCompletion - * @param result The error result to use, if one is not already saved - **/ -static void abortRecovery(BlockMapRecoveryCompletion *recovery, int result) -{ - recovery->aborted = true; - setCompletionResult(&recovery->completion, result); - finishIfDone(recovery); -} - -/** - * Find the first journal entry after a given entry which is not on the same - * block map page. - * - * @param recovery the BlockMapRecoveryCompletion - * @param currentEntry the entry to search from - * @param needsSort Whether sorting is needed to proceed - * - * @return Pointer to the first later journal entry on a different block map - * page, or a pointer to just before the journal entries if no - * subsequent entry is on a different block map page. - **/ -static NumberedBlockMapping * -findEntryStartingNextPage(BlockMapRecoveryCompletion *recovery, - NumberedBlockMapping *currentEntry, - bool needsSort) -{ - // If currentEntry is invalid, return immediately. - if (currentEntry < recovery->journalEntries) { - return currentEntry; - } - size_t currentPage = currentEntry->blockMapSlot.pbn; - - // Decrement currentEntry until it's out of bounds or on a different page. - while ((currentEntry >= recovery->journalEntries) - && (currentEntry->blockMapSlot.pbn == currentPage)) { - if (needsSort) { - NumberedBlockMapping *justSortedEntry - = sortNextHeapElement(&recovery->replayHeap); - ASSERT_LOG_ONLY(justSortedEntry < currentEntry, - "heap is returning elements in an unexpected order"); - } - currentEntry--; - } - return currentEntry; -} - -/** - * Apply a range of journal entries to a block map page. 
- * - * @param page The block map page being modified - * @param startingEntry The first journal entry to apply - * @param endingEntry The entry just past the last journal entry to apply - **/ -static void applyJournalEntriesToPage(BlockMapPage *page, - NumberedBlockMapping *startingEntry, - NumberedBlockMapping *endingEntry) -{ - NumberedBlockMapping *currentEntry = startingEntry; - while (currentEntry != endingEntry) { - page->entries[currentEntry->blockMapSlot.slot] - = currentEntry->blockMapEntry; - currentEntry--; - } -} - -/**********************************************************************/ -static void recoverReadyPages(BlockMapRecoveryCompletion *recovery, - VDOCompletion *completion); - -/** - * Note that a page is now ready and attempt to process pages. This callback is - * registered in fetchPage(). - * - * @param completion The VDOPageCompletion for the fetched page - **/ -static void pageLoaded(VDOCompletion *completion) -{ - BlockMapRecoveryCompletion *recovery - = asBlockMapRecoveryCompletion(completion->parent); - recovery->outstanding--; - if (!recovery->launching) { - recoverReadyPages(recovery, completion); - } -} - -/** - * Handle an error loading a page. - * - * @param completion The VDOPageCompletion - **/ -static void handlePageLoadError(VDOCompletion *completion) -{ - BlockMapRecoveryCompletion *recovery - = asBlockMapRecoveryCompletion(completion->parent); - recovery->outstanding--; - abortRecovery(recovery, completion->result); -} - -/** - * Fetch a page from the block map. - * - * @param recovery the BlockMapRecoveryCompletion - * @param completion the page completion to use - **/ -static void fetchPage(BlockMapRecoveryCompletion *recovery, - VDOCompletion *completion) -{ - if (recovery->currentUnfetchedEntry < recovery->journalEntries) { - // Nothing left to fetch. - return; - } - - // Fetch the next page we haven't yet requested. - PhysicalBlockNumber newPBN - = recovery->currentUnfetchedEntry->blockMapSlot.pbn; - recovery->currentUnfetchedEntry - = findEntryStartingNextPage(recovery, recovery->currentUnfetchedEntry, - true); - initVDOPageCompletion(((VDOPageCompletion *) completion), - recovery->blockMap->zones[0].pageCache, - newPBN, true, &recovery->completion, - pageLoaded, handlePageLoadError); - recovery->outstanding++; - getVDOPageAsync(completion); -} - -/** - * Get the next page completion to process. If it isn't ready, we'll try again - * when it is. - * - * @param recovery The recovery completion - * @param completion The current page completion - * - * @return The next page completion to process - **/ -static VDOPageCompletion * -getNextPageCompletion(BlockMapRecoveryCompletion *recovery, - VDOPageCompletion *completion) -{ - completion++; - if (completion == (&recovery->pageCompletions[recovery->pageCount])) { - completion = &recovery->pageCompletions[0]; - } - return completion; -} - -/** - * Recover from as many pages as possible. 
- * - * @param recovery The recovery completion - * @param completion The first page completion to process - **/ -static void recoverReadyPages(BlockMapRecoveryCompletion *recovery, - VDOCompletion *completion) -{ - if (finishIfDone(recovery)) { - return; - } - - VDOPageCompletion *pageCompletion = (VDOPageCompletion *) completion; - if (recovery->pbn != pageCompletion->pbn) { - return; - } - - while (pageCompletion->ready) { - BlockMapPage *page = dereferenceWritableVDOPage(completion); - int result = ASSERT(page != NULL, "page available"); - if (result != VDO_SUCCESS) { - abortRecovery(recovery, result); - return; - } - - NumberedBlockMapping *startOfNextPage - = findEntryStartingNextPage(recovery, recovery->currentEntry, false); - applyJournalEntriesToPage(page, recovery->currentEntry, startOfNextPage); - recovery->currentEntry = startOfNextPage; - requestVDOPageWrite(completion); - releaseVDOPageCompletion(completion); - - if (finishIfDone(recovery)) { - return; - } - - recovery->pbn = recovery->currentEntry->blockMapSlot.pbn; - fetchPage(recovery, completion); - pageCompletion = getNextPageCompletion(recovery, pageCompletion); - completion = &pageCompletion->completion; - } -} - -/**********************************************************************/ -void recoverBlockMap(VDO *vdo, - BlockCount entryCount, - NumberedBlockMapping *journalEntries, - VDOCompletion *parent) -{ - BlockMapRecoveryCompletion *recovery; - int result = makeRecoveryCompletion(vdo, entryCount, journalEntries, parent, - &recovery); - if (result != VDO_SUCCESS) { - finishCompletion(parent, result); - return; - } - - if (isHeapEmpty(&recovery->replayHeap)) { - finishCompletion(&recovery->completion, VDO_SUCCESS); - return; - } - - NumberedBlockMapping *firstSortedEntry - = sortNextHeapElement(&recovery->replayHeap); - ASSERT_LOG_ONLY(firstSortedEntry == recovery->currentEntry, - "heap is returning elements in an unexpected order"); - - // Prevent any page from being processed until all pages have been launched. - recovery->launching = true; - recovery->pbn = recovery->currentEntry->blockMapSlot.pbn; - recovery->currentUnfetchedEntry = recovery->currentEntry; - for (PageCount i = 0; i < recovery->pageCount; i++) { - if (recovery->currentUnfetchedEntry < recovery->journalEntries) { - break; - } - - fetchPage(recovery, &recovery->pageCompletions[i].completion); - } - recovery->launching = false; - - // Process any ready pages. - recoverReadyPages(recovery, &recovery->pageCompletions[0].completion); -} diff --git a/vdo/base/blockMapRecovery.h b/vdo/base/blockMapRecovery.h deleted file mode 100644 index 9029bf0..0000000 --- a/vdo/base/blockMapRecovery.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapRecovery.h#1 $ - */ - -#ifndef BLOCK_MAP_RECOVERY_H -#define BLOCK_MAP_RECOVERY_H - -#include "blockMap.h" -#include "blockMappingState.h" -#include "types.h" - -/** - * An explicitly numbered block mapping. Numbering the mappings allows them to - * be sorted by logical block number during recovery while still preserving - * the relative order of journal entries with the same logical block number. - **/ -typedef struct { - BlockMapSlot blockMapSlot; // Block map slot to map - BlockMapEntry blockMapEntry; // The encoded block map entry for the LBN - uint32_t number; // The serial number to use during replay -} __attribute__((packed)) NumberedBlockMapping; - -/** - * Recover the block map (normal rebuild). - * - * @param vdo The VDO - * @param entryCount The number of journal entries - * @param journalEntries An array of journal entries to process - * @param parent The completion to notify when the rebuild is complete - **/ -void recoverBlockMap(VDO *vdo, - BlockCount entryCount, - NumberedBlockMapping *journalEntries, - VDOCompletion *parent); - -#endif // BLOCK_MAP_RECOVERY_H diff --git a/vdo/base/blockMapTree.c b/vdo/base/blockMapTree.c deleted file mode 100644 index fb2b4f4..0000000 --- a/vdo/base/blockMapTree.c +++ /dev/null @@ -1,1272 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapTree.c#21 $ - */ - -#include "blockMapTree.h" - -#include "logger.h" - -#include "blockMap.h" -#include "blockMapInternals.h" -#include "blockMapPage.h" -#include "blockMapTreeInternals.h" -#include "constants.h" -#include "dataVIO.h" -#include "dirtyLists.h" -#include "forest.h" -#include "numUtils.h" -#include "recoveryJournal.h" -#include "referenceOperation.h" -#include "slabDepot.h" -#include "slabJournal.h" -#include "types.h" -#include "vdoInternal.h" -#include "vdoPageCache.h" -#include "vioPool.h" - -enum { - BLOCK_MAP_VIO_POOL_SIZE = 64, -}; - -typedef struct __attribute__((packed)) { - RootCount rootIndex; - Height height; - PageNumber pageIndex; - SlotNumber slot; -} PageDescriptor; - -typedef union { - PageDescriptor descriptor; - uint64_t key; -} PageKey; - -typedef struct { - BlockMapTreeZone *zone; - uint8_t generation; -} WriteIfNotDirtiedContext; - -/** - * An invalid PBN used to indicate that the page holding the location of a - * tree root has been "loaded". - **/ -const PhysicalBlockNumber INVALID_PBN = 0xFFFFFFFFFFFFFFFF; - -/** - * Convert a RingNode to a TreePage. 
- * - * @param ringNode The RingNode to convert - * - * @return The TreePage which owns the RingNode - **/ -static inline TreePage *treePageFromRingNode(RingNode *ringNode) -{ - return (TreePage *) ((byte *) ringNode - offsetof(TreePage, node)); -} - -/**********************************************************************/ -static void writeDirtyPagesCallback(RingNode *expired, void *context); - -/** - * Make VIOs for reading, writing, and allocating the arboreal block map. - * - * Implements VIOConstructor. - **/ -__attribute__((warn_unused_result)) -static int makeBlockMapVIOs(PhysicalLayer *layer, - void *parent, - void *buffer, - VIO **vioPtr) -{ - return createVIO(layer, VIO_TYPE_BLOCK_MAP_INTERIOR, VIO_PRIORITY_METADATA, - parent, buffer, vioPtr); -} - -/**********************************************************************/ -int initializeTreeZone(BlockMapZone *zone, - PhysicalLayer *layer, - BlockCount eraLength) -{ - STATIC_ASSERT_SIZEOF(PageDescriptor, sizeof(uint64_t)); - BlockMapTreeZone *treeZone = &zone->treeZone; - treeZone->mapZone = zone; - - int result = makeDirtyLists(eraLength, writeDirtyPagesCallback, treeZone, - &treeZone->dirtyLists); - if (result != VDO_SUCCESS) { - return result; - } - - result = makeIntMap(LOCK_MAP_CAPACITY, 0, &treeZone->loadingPages); - if (result != VDO_SUCCESS) { - return result; - } - - return makeVIOPool(layer, BLOCK_MAP_VIO_POOL_SIZE, zone->threadID, - makeBlockMapVIOs, treeZone, &treeZone->vioPool); -} - -/**********************************************************************/ -int replaceTreeZoneVIOPool(BlockMapTreeZone *zone, - PhysicalLayer *layer, - size_t poolSize) -{ - freeVIOPool(&zone->vioPool); - return makeVIOPool(layer, poolSize, zone->mapZone->threadID, - makeBlockMapVIOs, zone, &zone->vioPool); -} - -/**********************************************************************/ -void uninitializeBlockMapTreeZone(BlockMapTreeZone *treeZone) -{ - freeDirtyLists(&treeZone->dirtyLists); - freeVIOPool(&treeZone->vioPool); - freeIntMap(&treeZone->loadingPages); -} - -/**********************************************************************/ -void setTreeZoneInitialPeriod(BlockMapTreeZone *treeZone, - SequenceNumber period) -{ - setCurrentPeriod(treeZone->dirtyLists, period); -} - -/** - * Get the BlockMapTreeZone in which a DataVIO is operating. - * - * @param dataVIO The DataVIO - * - * @return The BlockMapTreeZone - **/ -__attribute__((warn_unused_result)) -static inline BlockMapTreeZone *getBlockMapTreeZone(DataVIO *dataVIO) -{ - return &(getBlockMapForZone(dataVIO->logical.zone)->treeZone); -} - -/** - * Get the TreePage for a given lock. This will be the page referred to by the - * lock's tree slot for the lock's current height. 
- * - * @param zone The tree zone of the tree - * @param lock The lock describing the page to get - * - * @return The requested page - **/ -static inline TreePage *getTreePage(const BlockMapTreeZone *zone, - const TreeLock *lock) -{ - return getTreePageByIndex(zone->mapZone->blockMap->forest, - lock->rootIndex, - lock->height, - lock->treeSlots[lock->height].pageIndex); -} - -/**********************************************************************/ -bool copyValidPage(char *buffer, - Nonce nonce, - PhysicalBlockNumber pbn, - BlockMapPage *page) -{ - BlockMapPage *loaded = (BlockMapPage *) buffer; - BlockMapPageValidity validity = validateBlockMapPage(loaded, nonce, pbn); - if (validity == BLOCK_MAP_PAGE_VALID) { - memcpy(page, loaded, VDO_BLOCK_SIZE); - return true; - } - - if (validity == BLOCK_MAP_PAGE_BAD) { - logErrorWithStringError(VDO_BAD_PAGE, - "Expected page %" PRIu64 - " but got page %llu instead", - pbn, getBlockMapPagePBN(loaded)); - } - - return false; -} - -/**********************************************************************/ -bool isTreeZoneActive(BlockMapTreeZone *zone) -{ - return ((zone->activeLookups != 0) - || hasWaiters(&zone->flushWaiters) - || isVIOPoolBusy(zone->vioPool)); -} - -/** - * Put the VDO in read-only mode and wake any VIOs waiting for a flush. - * - * @param zone The zone - * @param result The error which is causing read-only mode - **/ -static void enterZoneReadOnlyMode(BlockMapTreeZone *zone, int result) -{ - enterReadOnlyMode(zone->mapZone->readOnlyNotifier, result); - - // We are in read-only mode, so we won't ever write any page out. Just take - // all waiters off the queue so the tree zone can be closed. - while (hasWaiters(&zone->flushWaiters)) { - dequeueNextWaiter(&zone->flushWaiters); - } - - checkForDrainComplete(zone->mapZone); -} - -/** - * Check whether a generation is strictly older than some other generation in - * the context of a zone's current generation range. - * - * @param zone The zone in which to do the comparison - * @param a The generation in question - * @param b The generation to compare to - * - * @return true if generation a is not strictly older than - * generation b in the context of the zone - **/ -__attribute__((warn_unused_result)) -static bool isNotOlder(BlockMapTreeZone *zone, uint8_t a, uint8_t b) -{ - int result = ASSERT((inCyclicRange(zone->oldestGeneration, a, - zone->generation, 1 << 8) - && inCyclicRange(zone->oldestGeneration, b, - zone->generation, 1 << 8)), - "generation(s) %u, %u are out of range [%u, %u]", - a, b, zone->oldestGeneration, zone->generation); - if (result != VDO_SUCCESS) { - enterZoneReadOnlyMode(zone, result); - return true; - } - - return inCyclicRange(b, a, zone->generation, 1 << 8); -} - -/** - * Decrement the count for a generation and roll the oldest generation if there - * are no longer any active pages in it. - * - * @param zone The zone - * @param generation The generation to release - **/ -static void releaseGeneration(BlockMapTreeZone *zone, uint8_t generation) -{ - int result = ASSERT((zone->dirtyPageCounts[generation] > 0), - "dirty page count underflow for generation %u", - generation); - if (result != VDO_SUCCESS) { - enterZoneReadOnlyMode(zone, result); - return; - } - - zone->dirtyPageCounts[generation]--; - while ((zone->dirtyPageCounts[zone->oldestGeneration] == 0) - && (zone->oldestGeneration != zone->generation)) { - zone->oldestGeneration++; - } -} - -/** - * Set the generation of a page and update the dirty page count in the zone. 
- * - * @param zone The zone which owns the page - * @param page The page - * @param newGeneration The generation to set - * @param decrementOld Whether to decrement the count of the page's old - * generation - **/ -static void setGeneration(BlockMapTreeZone *zone, - TreePage *page, - uint8_t newGeneration, - bool decrementOld) -{ - uint8_t oldGeneration = page->generation; - if (decrementOld && (oldGeneration == newGeneration)) { - return; - } - - page->generation = newGeneration; - uint32_t newCount = ++zone->dirtyPageCounts[newGeneration]; - int result = ASSERT((newCount != 0), - "dirty page count overflow for generation %u", - newGeneration); - if (result != VDO_SUCCESS) { - enterZoneReadOnlyMode(zone, result); - return; - } - - if (decrementOld) { - releaseGeneration(zone, oldGeneration); - } -} - -/**********************************************************************/ -static void writePage(TreePage *treePage, VIOPoolEntry *entry); - -/** - * Write out a dirty page if it is still covered by the most recent flush - * or if it is the flusher. - * - *
Implements WaiterCallback - * - * @param waiter The page to write - * @param context The VIOPoolEntry with which to do the write - **/ -static void writePageCallback(Waiter *waiter, void *context) -{ - STATIC_ASSERT(offsetof(TreePage, waiter) == 0); - writePage((TreePage *) waiter, (VIOPoolEntry *) context); -} - -/** - * Acquire a VIO for writing a dirty page. - * - * @param waiter The page which needs a VIO - * @param zone The zone - **/ -static void acquireVIO(Waiter *waiter, BlockMapTreeZone *zone) -{ - waiter->callback = writePageCallback; - int result = acquireVIOFromPool(zone->vioPool, waiter); - if (result != VDO_SUCCESS) { - enterZoneReadOnlyMode(zone, result); - } -} - -/** - * Attempt to increment the generation. - * - * @param zone The zone whose generation is to be incremented - * - * @return true if all possible generations were not already - * active - **/ -static bool attemptIncrement(BlockMapTreeZone *zone) -{ - uint8_t generation = zone->generation + 1; - if (zone->oldestGeneration == generation) { - return false; - } - - zone->generation = generation; - return true; -} - -/** - * Enqueue a page to either launch a flush or wait for the current flush which - * is already in progress. - * - * @param page The page to enqueue - * @param zone The zone - **/ -static void enqueuePage(TreePage *page, BlockMapTreeZone *zone) -{ - if ((zone->flusher == NULL) && attemptIncrement(zone)) { - zone->flusher = page; - acquireVIO(&page->waiter, zone); - return; - } - - int result = enqueueWaiter(&zone->flushWaiters, &page->waiter); - if (result != VDO_SUCCESS) { - enterZoneReadOnlyMode(zone, result); - } -} - -/** - * Write pages which were waiting for a flush and have not been redirtied. - * Requeue those pages which were redirtied. - * - *
Implements WaiterCallback. - * - * @param waiter The dirty page - * @param context The zone and generation - **/ -static void writePageIfNotDirtied(Waiter *waiter, void *context) -{ - STATIC_ASSERT(offsetof(TreePage, waiter) == 0); - TreePage *page = (TreePage *) waiter; - WriteIfNotDirtiedContext *writeContext = context; - if (page->generation == writeContext->generation) { - acquireVIO(waiter, writeContext->zone); - return; - } - - enqueuePage(page, writeContext->zone); -} - -/** - * Return a VIO to the zone's pool. - * - * @param zone The zone which owns the pool - * @param entry The pool entry to return - **/ -static void returnToPool(BlockMapTreeZone *zone, VIOPoolEntry *entry) -{ - returnVIOToPool(zone->vioPool, entry); - checkForDrainComplete(zone->mapZone); -} - -/** - * Handle the successful write of a tree page. This callback is registered in - * writeInitializedPage(). - * - * @param completion The VIO doing the write - **/ -static void finishPageWrite(VDOCompletion *completion) -{ - VIOPoolEntry *entry = completion->parent; - TreePage *page = entry->parent; - BlockMapTreeZone *zone = entry->context; - releaseRecoveryJournalBlockReference(zone->mapZone->blockMap->journal, - page->writingRecoveryLock, - ZONE_TYPE_LOGICAL, - zone->mapZone->zoneNumber); - - bool dirty = (page->writingGeneration != page->generation); - releaseGeneration(zone, page->writingGeneration); - page->writing = false; - - if (zone->flusher == page) { - WriteIfNotDirtiedContext context = { - .zone = zone, - .generation = page->writingGeneration, - }; - notifyAllWaiters(&zone->flushWaiters, writePageIfNotDirtied, &context); - if (dirty && attemptIncrement(zone)) { - writePage(page, entry); - return; - } - - zone->flusher = NULL; - } - - if (dirty) { - enqueuePage(page, zone); - } else if ((zone->flusher == NULL) - && hasWaiters(&zone->flushWaiters) - && attemptIncrement(zone)) { - zone->flusher = (TreePage *) dequeueNextWaiter(&zone->flushWaiters); - writePage(zone->flusher, entry); - return; - } - - returnToPool(zone, entry); -} - -/** - * Handle an error writing a tree page. This error handler is registered in - * writePage() and writeInitializedPage(). - * - * @param completion The VIO doing the write - **/ -static void handleWriteError(VDOCompletion *completion) -{ - int result = completion->result; - VIOPoolEntry *entry = completion->parent; - BlockMapTreeZone *zone = entry->context; - enterZoneReadOnlyMode(zone, result); - returnToPool(zone, entry); -} - -/** - * Write a page which has been written at least once. This callback is - * registered in (or called directly from) writePage(). - * - * @param completion The VIO which will do the write - **/ -static void writeInitializedPage(VDOCompletion *completion) -{ - VIOPoolEntry *entry = completion->parent; - BlockMapTreeZone *zone = (BlockMapTreeZone *) entry->context; - TreePage *treePage = (TreePage *) entry->parent; - - /* - * Set the initialized field of the copy of the page we are writing to true. - * We don't want to set it true on the real page in memory until after this - * write succeeds. - */ - BlockMapPage *page = (BlockMapPage *) entry->buffer; - markBlockMapPageInitialized(page, true); - launchWriteMetadataVIOWithFlush(entry->vio, getBlockMapPagePBN(page), - finishPageWrite, handleWriteError, - (zone->flusher == treePage), false); -} - -/** - * Write a dirty tree page now that we have a VIO with which to write it. 
- * - * @param treePage The page to write - * @param entry The VIOPoolEntry with which to write - **/ -static void writePage(TreePage *treePage, VIOPoolEntry *entry) -{ - BlockMapTreeZone *zone = (BlockMapTreeZone *) entry->context; - if ((zone->flusher != treePage) - && (isNotOlder(zone, treePage->generation, zone->generation))) { - // This page was re-dirtied after the last flush was issued, hence we need - // to do another flush. - enqueuePage(treePage, zone); - returnToPool(zone, entry); - return; - } - - entry->parent = treePage; - memcpy(entry->buffer, treePage->pageBuffer, VDO_BLOCK_SIZE); - - VDOCompletion *completion = vioAsCompletion(entry->vio); - completion->callbackThreadID = zone->mapZone->threadID; - - treePage->writing = true; - treePage->writingGeneration = treePage->generation; - treePage->writingRecoveryLock = treePage->recoveryLock; - - // Clear this now so that we know this page is not on any dirty list. - treePage->recoveryLock = 0; - - BlockMapPage *page = asBlockMapPage(treePage); - if (!markBlockMapPageInitialized(page, true)) { - writeInitializedPage(completion); - return; - } - - launchWriteMetadataVIO(entry->vio, getBlockMapPagePBN(page), - writeInitializedPage, handleWriteError); -} - -/** - * Schedule a batch of dirty pages for writing. - * - *
Implements DirtyListsCallback. - * - * @param expired The pages to write - * @param context The zone - **/ -static void writeDirtyPagesCallback(RingNode *expired, void *context) -{ - BlockMapTreeZone *zone = (BlockMapTreeZone *) context; - uint8_t generation = zone->generation; - while (!isRingEmpty(expired)) { - TreePage *page = treePageFromRingNode(chopRingNode(expired)); - - int result = ASSERT(!isWaiting(&page->waiter), - "Newly expired page not already waiting to write"); - if (result != VDO_SUCCESS) { - enterZoneReadOnlyMode(zone, result); - continue; - } - - setGeneration(zone, page, generation, false); - if (!page->writing) { - enqueuePage(page, zone); - } - } -} - -/**********************************************************************/ -void advanceZoneTreePeriod(BlockMapTreeZone *zone, SequenceNumber period) -{ - advancePeriod(zone->dirtyLists, period); -} - -/**********************************************************************/ -void drainZoneTrees(BlockMapTreeZone *zone) -{ - ASSERT_LOG_ONLY((zone->activeLookups == 0), - "drainZoneTrees() called with no active lookups"); - if (!isSuspending(&zone->mapZone->state)) { - flushDirtyLists(zone->dirtyLists); - } -} - -/** - * Release a lock on a page which was being loaded or allocated. - * - * @param dataVIO The DataVIO releasing the page lock - * @param what What the DataVIO was doing (for logging) - **/ -static void releasePageLock(DataVIO *dataVIO, char *what) -{ - TreeLock *lock = &dataVIO->treeLock; - ASSERT_LOG_ONLY(lock->locked, - "release of unlocked block map page %s for key %" PRIu64 - " in tree %u", - what, lock->key, lock->rootIndex); - BlockMapTreeZone *zone = getBlockMapTreeZone(dataVIO); - TreeLock *lockHolder = intMapRemove(zone->loadingPages, lock->key); - ASSERT_LOG_ONLY((lockHolder == lock), - "block map page %s mismatch for key %llu in tree %u", - what, lock->key, lock->rootIndex); - lock->locked = false; -} - -/** - * Continue a DataVIO now that the lookup is complete. - * - * @param dataVIO The DataVIO - * @param result The result of the lookup - **/ -static void finishLookup(DataVIO *dataVIO, int result) -{ - dataVIO->treeLock.height = 0; - - BlockMapTreeZone *zone = getBlockMapTreeZone(dataVIO); - --zone->activeLookups; - - VDOCompletion *completion = dataVIOAsCompletion(dataVIO); - setCompletionResult(completion, result); - launchCallback(completion, dataVIO->treeLock.callback, - dataVIO->treeLock.threadID); -} - -/** - * Abort a block map PBN lookup due to an error in the load or allocation on - * which we were waiting. - * - * @param waiter The DataVIO which was waiting for a page load or allocation - * @param context The error which caused the abort - **/ -static void abortLookupForWaiter(Waiter *waiter, void *context) -{ - DataVIO *dataVIO = waiterAsDataVIO(waiter); - int result = *((int *) context); - if (isReadDataVIO(dataVIO)) { - if (result == VDO_NO_SPACE) { - result = VDO_SUCCESS; - } - } else if (result != VDO_NO_SPACE) { - result = VDO_READ_ONLY; - } - - finishLookup(dataVIO, result); -} - -/** - * Abort a block map PBN lookup due to an error loading or allocating a page. 
- * - * @param dataVIO The DataVIO which was loading or allocating a page - * @param result The error code - * @param what What the DataVIO was doing (for logging) - **/ -static void abortLookup(DataVIO *dataVIO, int result, char *what) -{ - if (result != VDO_NO_SPACE) { - enterZoneReadOnlyMode(getBlockMapTreeZone(dataVIO), result); - } - - if (dataVIO->treeLock.locked) { - releasePageLock(dataVIO, what); - notifyAllWaiters(&dataVIO->treeLock.waiters, abortLookupForWaiter, - &result); - } - - finishLookup(dataVIO, result); -} - -/** - * Abort a block map PBN lookup due to an error loading a page. - * - * @param dataVIO The DataVIO doing the page load - * @param result The error code - **/ -static void abortLoad(DataVIO *dataVIO, int result) -{ - abortLookup(dataVIO, result, "load"); -} - -/** - * Determine if a location represents a valid mapping for a tree page. - * - * @param vdo The VDO - * @param mapping The DataLocation to check - * @param height The height of the entry in the tree - * - * @return true if the entry represents a invalid page mapping - **/ -__attribute__((warn_unused_result)) -static bool isInvalidTreeEntry(const VDO *vdo, - const DataLocation *mapping, - Height height) -{ - if (!isValidLocation(mapping) - || isCompressed(mapping->state) - || (isMappedLocation(mapping) && (mapping->pbn == ZERO_BLOCK))) { - return true; - } - - // Roots aren't physical data blocks, so we can't check their PBNs. - if (height == BLOCK_MAP_TREE_HEIGHT) { - return false; - } - - return !isPhysicalDataBlock(vdo->depot, mapping->pbn); -} - -/**********************************************************************/ -static void loadBlockMapPage(BlockMapTreeZone *zone, DataVIO *dataVIO); -static void allocateBlockMapPage(BlockMapTreeZone *zone, DataVIO *dataVIO); - -/** - * Continue a block map PBN lookup now that a page has been loaded by - * descending one level in the tree. - * - * @param dataVIO The DataVIO doing the lookup - * @param page The page which was just loaded - **/ -static void continueWithLoadedPage(DataVIO *dataVIO, BlockMapPage *page) -{ - TreeLock *lock = &dataVIO->treeLock; - BlockMapTreeSlot slot = lock->treeSlots[lock->height]; - DataLocation mapping - = unpackBlockMapEntry(&page->entries[slot.blockMapSlot.slot]); - if (isInvalidTreeEntry(getVDOFromDataVIO(dataVIO), &mapping, lock->height)) { - logErrorWithStringError(VDO_BAD_MAPPING, - "Invalid block map tree PBN: %llu with " - "state %u for page index %u at height %u", - mapping.pbn, mapping.state, - lock->treeSlots[lock->height - 1].pageIndex, - lock->height - 1); - abortLoad(dataVIO, VDO_BAD_MAPPING); - return; - } - - if (!isMappedLocation(&mapping)) { - // The page we need is unallocated - allocateBlockMapPage(getBlockMapTreeZone(dataVIO), dataVIO); - return; - } - - lock->treeSlots[lock->height - 1].blockMapSlot.pbn = mapping.pbn; - if (lock->height == 1) { - finishLookup(dataVIO, VDO_SUCCESS); - return; - } - - // We know what page we need to load next - loadBlockMapPage(getBlockMapTreeZone(dataVIO), dataVIO); -} - -/** - * Continue a block map PBN lookup now that the page load we were waiting on - * has finished. - * - * @param waiter The DataVIO waiting for a page to be loaded - * @param context The page which was just loaded - **/ -static void continueLoadForWaiter(Waiter *waiter, void *context) -{ - DataVIO *dataVIO = waiterAsDataVIO(waiter); - dataVIO->treeLock.height--; - continueWithLoadedPage(dataVIO, (BlockMapPage *) context); -} - -/** - * Finish loading a page now that it has been read in from disk. 
This callback - * is registered in loadPage(). - * - * @param completion The VIO doing the page read - **/ -static void finishBlockMapPageLoad(VDOCompletion *completion) -{ - VIOPoolEntry *entry = completion->parent; - DataVIO *dataVIO = entry->parent; - BlockMapTreeZone *zone = (BlockMapTreeZone *) entry->context; - TreeLock *treeLock = &dataVIO->treeLock; - - treeLock->height--; - PhysicalBlockNumber pbn - = treeLock->treeSlots[treeLock->height].blockMapSlot.pbn; - TreePage *treePage = getTreePage(zone, treeLock); - BlockMapPage *page = (BlockMapPage *) treePage->pageBuffer; - Nonce nonce = zone->mapZone->blockMap->nonce; - if (!copyValidPage(entry->buffer, nonce, pbn, page)) { - formatBlockMapPage(page, nonce, pbn, false); - } - returnVIOToPool(zone->vioPool, entry); - - // Release our claim to the load and wake any waiters - releasePageLock(dataVIO, "load"); - notifyAllWaiters(&treeLock->waiters, continueLoadForWaiter, page); - continueWithLoadedPage(dataVIO, page); -} - -/** - * Handle an error loading a tree page. - * - * @param completion The VIO doing the page read - **/ -static void handleIOError(VDOCompletion *completion) -{ - int result = completion->result; - VIOPoolEntry *entry = completion->parent; - DataVIO *dataVIO = entry->parent; - BlockMapTreeZone *zone = (BlockMapTreeZone *) entry->context; - returnVIOToPool(zone->vioPool, entry); - abortLoad(dataVIO, result); -} - -/** - * Read a tree page from disk now that we've gotten a VIO with which to do the - * read. This WaiterCallback is registered in loadBlockMapPage(). - * - * @param waiter The DataVIO which requires a page load - * @param context The VIOPool entry with which to do the read - **/ -static void loadPage(Waiter *waiter, void *context) -{ - VIOPoolEntry *entry = context; - DataVIO *dataVIO = waiterAsDataVIO(waiter); - - entry->parent = dataVIO; - entry->vio->completion.callbackThreadID - = getBlockMapForZone(dataVIO->logical.zone)->threadID; - - TreeLock *lock = &dataVIO->treeLock; - launchReadMetadataVIO(entry->vio, - lock->treeSlots[lock->height - 1].blockMapSlot.pbn, - finishBlockMapPageLoad, handleIOError); -} - -/** - * Attempt to acquire a lock on a page in the block map tree. If the page is - * already locked, queue up to wait for the lock to be released. If the lock is - * acquired, the DataVIO's treeLock.locked field will be set to true. - * - * @param zone The BlockMapTreeZone in which the DataVIO operates - * @param dataVIO The DataVIO which desires a page lock - * - * @return VDO_SUCCESS or an error - **/ -static int attemptPageLock(BlockMapTreeZone *zone, DataVIO *dataVIO) -{ - TreeLock *lock = &dataVIO->treeLock; - Height height = lock->height; - BlockMapTreeSlot treeSlot = lock->treeSlots[height]; - PageKey key; - key.descriptor = (PageDescriptor) { - .rootIndex = lock->rootIndex, - .height = height, - .pageIndex = treeSlot.pageIndex, - .slot = treeSlot.blockMapSlot.slot, - }; - lock->key = key.key; - - TreeLock *lockHolder; - int result = intMapPut(zone->loadingPages, lock->key, lock, false, - (void **) &lockHolder); - if (result != VDO_SUCCESS) { - return result; - } - - if (lockHolder == NULL) { - // We got the lock - dataVIO->treeLock.locked = true; - return VDO_SUCCESS; - } - - // Someone else is loading or allocating the page we need - return enqueueDataVIO(&lockHolder->waiters, dataVIO, - THIS_LOCATION("$F;cb=blockMapTreePage")); -} - -/** - * Load a block map tree page from disk. 
- * - * @param zone The BlockMapTreeZone in which the DataVIO operates - * @param dataVIO The DataVIO which requires a page to be loaded - **/ -static void loadBlockMapPage(BlockMapTreeZone *zone, DataVIO *dataVIO) -{ - int result = attemptPageLock(zone, dataVIO); - if (result != VDO_SUCCESS) { - abortLoad(dataVIO, result); - return; - } - - if (dataVIO->treeLock.locked) { - Waiter *waiter = dataVIOAsWaiter(dataVIO); - waiter->callback = loadPage; - result = acquireVIOFromPool(zone->vioPool, waiter); - if (result != VDO_SUCCESS) { - abortLoad(dataVIO, result); - } - } -} - -/** - * Set the callback of a DataVIO after it has allocated a block map page. - * - * @param dataVIO The DataVIO - **/ -static void setPostAllocationCallback(DataVIO *dataVIO) -{ - setCallback(dataVIOAsCompletion(dataVIO), dataVIO->treeLock.callback, - dataVIO->treeLock.threadID); -} - -/** - * Abort a block map PBN lookup due to an error allocating a page. - * - * @param dataVIO The DataVIO doing the page allocation - * @param result The error code - **/ -static void abortAllocation(DataVIO *dataVIO, int result) -{ - setPostAllocationCallback(dataVIO); - abortLookup(dataVIO, result, "allocation"); -} - -/** - * Callback to handle an error while attempting to allocate a page. This - * callback is used to transfer back to the logical zone along the block map - * page allocation path. - * - * @param completion The DataVIO doing the allocation - **/ -static void allocationFailure(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInLogicalZone(dataVIO); - abortAllocation(dataVIO, completion->result); -} - -/** - * Continue with page allocations now that a parent page has been allocated. - * - * @param waiter The DataVIO which was waiting for a page to be allocated - * @param context The physical block number of the page which was just - * allocated - **/ -static void continueAllocationForWaiter(Waiter *waiter, void *context) -{ - DataVIO *dataVIO = waiterAsDataVIO(waiter); - TreeLock *treeLock = &dataVIO->treeLock; - PhysicalBlockNumber pbn = *((PhysicalBlockNumber *) context); - - treeLock->height--; - dataVIO->treeLock.treeSlots[treeLock->height].blockMapSlot.pbn = pbn; - - if (treeLock->height == 0) { - finishLookup(dataVIO, VDO_SUCCESS); - return; - } - - allocateBlockMapPage(getBlockMapTreeZone(dataVIO), dataVIO); -} - -/** - * Finish the page allocation process by recording the allocation in the tree - * and waking any waiters now that the write lock has been released. This - * callback is registered in releaseBlockMapWriteLock(). - * - * @param completion The DataVIO doing the allocation - **/ -static void finishBlockMapAllocation(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInLogicalZone(dataVIO); - if (completion->result != VDO_SUCCESS) { - allocationFailure(completion); - return; - } - - BlockMapTreeZone *zone = getBlockMapTreeZone(dataVIO); - TreeLock *treeLock = &dataVIO->treeLock; - TreePage *treePage = getTreePage(zone, treeLock); - Height height = treeLock->height; - - PhysicalBlockNumber pbn = treeLock->treeSlots[height - 1].blockMapSlot.pbn; - - // Record the allocation. - BlockMapPage *page = (BlockMapPage *) treePage->pageBuffer; - SequenceNumber oldLock = treePage->recoveryLock; - updateBlockMapPage(page, dataVIO, pbn, MAPPING_STATE_UNCOMPRESSED, - &treePage->recoveryLock); - - if (isWaiting(&treePage->waiter)) { - // This page is waiting to be written out. 
- if (zone->flusher != treePage) { - // The outstanding flush won't cover the update we just made, so mark - // the page as needing another flush. - setGeneration(zone, treePage, zone->generation, true); - } - } else { - // Put the page on a dirty list - if (oldLock == 0) { - initializeRing(&treePage->node); - } - addToDirtyLists(zone->dirtyLists, &treePage->node, oldLock, - treePage->recoveryLock); - } - - treeLock->height--; - if (height > 1) { - // Format the interior node we just allocated (in memory). - treePage = getTreePage(zone, treeLock); - formatBlockMapPage(treePage->pageBuffer, zone->mapZone->blockMap->nonce, - pbn, false); - } - - // Release our claim to the allocation and wake any waiters - releasePageLock(dataVIO, "allocation"); - notifyAllWaiters(&treeLock->waiters, continueAllocationForWaiter, &pbn); - if (treeLock->height == 0) { - finishLookup(dataVIO, VDO_SUCCESS); - return; - } - - allocateBlockMapPage(zone, dataVIO); -} - -/** - * Release the write lock on a newly allocated block map page now that we - * have made its journal entries and reference count updates. This callback - * is registered in setBlockMapPageReferenceCount(). - * - * @param completion The DataVIO doing the allocation - **/ -static void releaseBlockMapWriteLock(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - AllocatingVIO *allocatingVIO = dataVIOAsAllocatingVIO(dataVIO); - assertInAllocatedZone(dataVIO); - if (completion->result != VDO_SUCCESS) { - launchLogicalCallback(dataVIO, allocationFailure, THIS_LOCATION(NULL)); - return; - } - - releaseAllocationLock(allocatingVIO); - resetAllocation(allocatingVIO); - launchLogicalCallback(dataVIO, finishBlockMapAllocation, - THIS_LOCATION("$F;cb=finishBlockMapAllocation")); -} - -/** - * Set the reference count of a newly allocated block map page to - * MAXIMUM_REFERENCES now that we have made a recovery journal entry for it. - * MAXIMUM_REFERENCES is used to prevent deduplication against the block after - * we release the write lock on it, but before we write out the page. - * - * @param completion The DataVIO doing the allocation - **/ -static void setBlockMapPageReferenceCount(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInAllocatedZone(dataVIO); - if (completion->result != VDO_SUCCESS) { - launchLogicalCallback(dataVIO, allocationFailure, THIS_LOCATION(NULL)); - return; - } - - TreeLock *lock = &dataVIO->treeLock; - PhysicalBlockNumber pbn = lock->treeSlots[lock->height - 1].blockMapSlot.pbn; - completion->callback = releaseBlockMapWriteLock; - addSlabJournalEntry(getSlabJournal(getVDOFromDataVIO(dataVIO)->depot, pbn), - dataVIO); -} - -/** - * Make a recovery journal entry for a newly allocated block map page. - * This callback is registered in continueBlockMapPageAllocation(). - * - * @param completion The DataVIO doing the allocation - **/ -static void journalBlockMapAllocation(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInJournalZone(dataVIO); - if (completion->result != VDO_SUCCESS) { - launchLogicalCallback(dataVIO, allocationFailure, THIS_LOCATION(NULL)); - return; - } - - setAllocatedZoneCallback(dataVIO, setBlockMapPageReferenceCount, - THIS_LOCATION(NULL)); - addRecoveryJournalEntry(getVDOFromDataVIO(dataVIO)->recoveryJournal, - dataVIO); -} - -/** - * Continue the process of allocating a block map page now that the - * BlockAllocator has given us a block. 
This method is supplied as the callback - * to allocateDataBlock() by allocateBlockMapPage(). - * - * @param allocatingVIO The DataVIO which is doing the allocation - **/ -static void continueBlockMapPageAllocation(AllocatingVIO *allocatingVIO) -{ - DataVIO *dataVIO = allocatingVIOAsDataVIO(allocatingVIO); - if (!hasAllocation(dataVIO)) { - setLogicalCallback(dataVIO, allocationFailure, THIS_LOCATION(NULL)); - continueDataVIO(dataVIO, VDO_NO_SPACE); - return; - } - - PhysicalBlockNumber pbn = allocatingVIO->allocation; - TreeLock *lock = &dataVIO->treeLock; - lock->treeSlots[lock->height - 1].blockMapSlot.pbn = pbn; - setUpReferenceOperationWithLock(BLOCK_MAP_INCREMENT, pbn, - MAPPING_STATE_UNCOMPRESSED, - allocatingVIO->allocationLock, - &dataVIO->operation); - launchJournalCallback(dataVIO, journalBlockMapAllocation, - THIS_LOCATION("$F;cb=journalBlockMapAllocation")); -} - -/** - * Allocate a block map page. - * - * @param zone The zone in which the DataVIO is operating - * @param dataVIO The DataVIO which needs to allocate a page - **/ -static void allocateBlockMapPage(BlockMapTreeZone *zone, DataVIO *dataVIO) -{ - if (!isWriteDataVIO(dataVIO) || isTrimDataVIO(dataVIO)) { - // This is a pure read, the read phase of a read-modify-write, or a trim, - // so there's nothing left to do here. - finishLookup(dataVIO, VDO_SUCCESS); - return; - } - - int result = attemptPageLock(zone, dataVIO); - if (result != VDO_SUCCESS) { - abortAllocation(dataVIO, result); - return; - } - - if (!dataVIO->treeLock.locked) { - return; - } - - allocateDataBlock(dataVIOAsAllocatingVIO(dataVIO), - getAllocationSelector(dataVIO->logical.zone), - VIO_BLOCK_MAP_WRITE_LOCK, - continueBlockMapPageAllocation); -} - -/**********************************************************************/ -void lookupBlockMapPBN(DataVIO *dataVIO) -{ - BlockMapTreeZone *zone = getBlockMapTreeZone(dataVIO); - zone->activeLookups++; - if (isDraining(&zone->mapZone->state)) { - finishLookup(dataVIO, VDO_SHUTTING_DOWN); - return; - } - - TreeLock *lock = &dataVIO->treeLock; - PageNumber pageIndex - = ((lock->treeSlots[0].pageIndex - zone->mapZone->blockMap->flatPageCount) - / zone->mapZone->blockMap->rootCount); - BlockMapTreeSlot treeSlot = { - .pageIndex = pageIndex / BLOCK_MAP_ENTRIES_PER_PAGE, - .blockMapSlot = { - .pbn = 0, - .slot = pageIndex % BLOCK_MAP_ENTRIES_PER_PAGE, - }, - }; - - BlockMapPage *page = NULL; - for (lock->height = 1; lock->height <= BLOCK_MAP_TREE_HEIGHT; - lock->height++) { - lock->treeSlots[lock->height] = treeSlot; - page = (BlockMapPage *) (getTreePage(zone, lock)->pageBuffer); - PhysicalBlockNumber pbn = getBlockMapPagePBN(page); - if (pbn != ZERO_BLOCK) { - lock->treeSlots[lock->height].blockMapSlot.pbn = pbn; - break; - } - - // Calculate the index and slot for the next level. - treeSlot.blockMapSlot.slot - = treeSlot.pageIndex % BLOCK_MAP_ENTRIES_PER_PAGE; - treeSlot.pageIndex - = treeSlot.pageIndex / BLOCK_MAP_ENTRIES_PER_PAGE; - } - - // The page at this height has been allocated and loaded. 
- DataLocation mapping - = unpackBlockMapEntry(&page->entries[treeSlot.blockMapSlot.slot]); - if (isInvalidTreeEntry(getVDOFromDataVIO(dataVIO), &mapping, lock->height)) { - logErrorWithStringError(VDO_BAD_MAPPING, - "Invalid block map tree PBN: %llu with " - "state %u for page index %u at height %u", - mapping.pbn, mapping.state, - lock->treeSlots[lock->height - 1].pageIndex, - lock->height - 1); - abortLoad(dataVIO, VDO_BAD_MAPPING); - return; - } - - if (!isMappedLocation(&mapping)) { - // The page we want one level down has not been allocated, so allocate it. - allocateBlockMapPage(zone, dataVIO); - return; - } - - lock->treeSlots[lock->height - 1].blockMapSlot.pbn = mapping.pbn; - if (lock->height == 1) { - // This is the ultimate block map page, so we're done - finishLookup(dataVIO, VDO_SUCCESS); - return; - } - - // We know what page we need to load. - loadBlockMapPage(zone, dataVIO); -} - -/**********************************************************************/ -PhysicalBlockNumber findBlockMapPagePBN(BlockMap *map, PageNumber pageNumber) -{ - if (pageNumber < map->flatPageCount) { - return (BLOCK_MAP_FLAT_PAGE_ORIGIN + pageNumber); - } - - RootCount rootIndex = pageNumber % map->rootCount; - PageNumber pageIndex = ((pageNumber - map->flatPageCount) / map->rootCount); - SlotNumber slot = pageIndex % BLOCK_MAP_ENTRIES_PER_PAGE; - pageIndex /= BLOCK_MAP_ENTRIES_PER_PAGE; - - TreePage *treePage - = getTreePageByIndex(map->forest, rootIndex, 1, pageIndex); - BlockMapPage *page = (BlockMapPage *) treePage->pageBuffer; - if (!isBlockMapPageInitialized(page)) { - return ZERO_BLOCK; - } - - DataLocation mapping = unpackBlockMapEntry(&page->entries[slot]); - if (!isValidLocation(&mapping) || isCompressed(mapping.state)) { - return ZERO_BLOCK; - } - return mapping.pbn; -} - -/**********************************************************************/ -void writeTreePage(TreePage *page, BlockMapTreeZone *zone) -{ - bool waiting = isWaiting(&page->waiter); - if (waiting && (zone->flusher == page)) { - return; - } - - setGeneration(zone, page, zone->generation, waiting); - if (waiting || page->writing) { - return; - } - - enqueuePage(page, zone); -} diff --git a/vdo/base/blockMapTree.h b/vdo/base/blockMapTree.h deleted file mode 100644 index c581454..0000000 --- a/vdo/base/blockMapTree.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapTree.h#7 $ - */ - -#ifndef BLOCK_MAP_TREE_H -#define BLOCK_MAP_TREE_H - -#include "constants.h" -#include "types.h" - -typedef struct treePage TreePage; - -/** - * Intialize a BlockMapTreeZone. 
- * - * @param zone The BlockMapZone of the tree zone to intialize - * @param layer The physical layer - * @param maximumAge The number of journal blocks before a dirtied page - * is considered old and may be written out - * - * @return VDO_SUCCESS or an error - **/ -int initializeTreeZone(BlockMapZone *zone, - PhysicalLayer *layer, - BlockCount maximumAge) - __attribute__((warn_unused_result)); - -/** - * Clean up a BlockMapTreeZone. - * - * @param treeZone The zone to clean up - **/ -void uninitializeBlockMapTreeZone(BlockMapTreeZone *treeZone); - -/** - * Set the initial dirty period for a tree zone. - * - * @param treeZone The tree zone - * @param period The initial dirty period to set - **/ -void setTreeZoneInitialPeriod(BlockMapTreeZone *treeZone, - SequenceNumber period); - -/** - * Check whether a tree zone is active (i.e. has any active lookups, - * outstanding I/O, or pending I/O). - * - * @param zone The zone to check - * - * @return true if the zone is active - **/ -bool isTreeZoneActive(BlockMapTreeZone *zone) - __attribute__((warn_unused_result)); - -/** - * Advance the dirty period for a tree zone. - * - * @param zone The BlockMapTreeZone to advance - * @param period The new dirty period - **/ -void advanceZoneTreePeriod(BlockMapTreeZone *zone, SequenceNumber period); - -/** - * Drain the zone trees, i.e. ensure that all I/O is quiesced. If required by - * the drain type, all dirty block map trees will be written to disk. This - * method must not be called when lookups are active. - * - * @param zone The BlockMapTreeZone to drain - **/ -void drainZoneTrees(BlockMapTreeZone *zone); - -/** - * Look up the PBN of the block map page for a DataVIO's LBN in the arboreal - * block map. If necessary, the block map page will be allocated. Also, the - * ancestors of the block map page will be allocated or loaded if necessary. - * - * @param dataVIO The DataVIO requesting the lookup - **/ -void lookupBlockMapPBN(DataVIO *dataVIO); - -/** - * Find the PBN of a leaf block map page. This method may only be used after - * all allocated tree pages have been loaded, otherwise, it may give the wrong - * answer (0). - * - * @param map The block map containing the forest - * @param pageNumber The page number of the desired block map page - * - * @return The PBN of the page - **/ -PhysicalBlockNumber findBlockMapPagePBN(BlockMap *map, PageNumber pageNumber); - -/** - * Write a tree page or indicate that it has been re-dirtied if it is already - * being written. This method is used when correcting errors in the tree during - * read-only rebuild. - * - * @param page The page to write - * @param zone The tree zone managing the page - **/ -void writeTreePage(TreePage *page, BlockMapTreeZone *zone); - -#endif // BLOCK_MAP_TREE_H diff --git a/vdo/base/blockMapTreeInternals.h b/vdo/base/blockMapTreeInternals.h deleted file mode 100644 index 49b69eb..0000000 --- a/vdo/base/blockMapTreeInternals.h +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMapTreeInternals.h#4 $ - */ - -#ifndef BLOCK_MAP_TREE_INTERNALS_H -#define BLOCK_MAP_TREE_INTERNALS_H - -#include "blockMapTree.h" - -#include "blockMapPage.h" -#include "types.h" - -/** A single page of a block map tree */ -struct treePage { - /** Waiter for a VIO to write out this page */ - Waiter waiter; - - /** Dirty list node */ - RingNode node; - - /** - * If this is a dirty tree page, the tree zone flush generation in which it - * was last dirtied. - */ - uint8_t generation; - - /** Whether this page is an interior tree page being written out. */ - bool writing; - - /** - * If this page is being written, the tree zone flush generation of the - * copy of the page being written. - **/ - uint8_t writingGeneration; - - /** The earliest journal block containing uncommitted updates to this page */ - SequenceNumber recoveryLock; - - /** The value of recoveryLock when the this page last started writing */ - SequenceNumber writingRecoveryLock; - - /** The buffer to hold the on-disk representation of this page */ - char pageBuffer[VDO_BLOCK_SIZE]; -}; - -typedef struct { - PageNumber levels[BLOCK_MAP_TREE_HEIGHT]; -} Boundary; - -/** - * An invalid PBN used to indicate that the page holding the location of a - * tree root has been "loaded". - **/ -extern const PhysicalBlockNumber INVALID_PBN; - -/** - * Extract the BlockMapPage from a TreePage. - * - * @param treePage The TreePage - * - * @return The BlockMapPage of the TreePage - **/ -__attribute__((warn_unused_result)) -static inline BlockMapPage *asBlockMapPage(TreePage *treePage) -{ - return (BlockMapPage *) treePage->pageBuffer; -} - -/** - * Replace the VIOPool in a tree zone. This method is used by unit tests. - * - * @param zone The zone whose pool is to be replaced - * @param layer The physical layer from which to make VIOs - * @param poolSize The size of the new pool - * - * @return VDO_SUCCESS or an error - **/ -int replaceTreeZoneVIOPool(BlockMapTreeZone *zone, - PhysicalLayer *layer, - size_t poolSize) - __attribute__((warn_unused_result)); - -/** - * Check whether a buffer contains a valid page. If the page is bad, log an - * error. If the page is valid, copy it to the supplied page. - * - * @param buffer The buffer to validate (and copy) - * @param nonce The VDO nonce - * @param pbn The absolute PBN of the page - * @param page The page to copy into if valid - * - * @return true if the page was copied (valid) - **/ -bool copyValidPage(char *buffer, - Nonce nonce, - PhysicalBlockNumber pbn, - BlockMapPage *page); - -#endif // BLOCK_MAP_TREE_INTERNALS_H diff --git a/vdo/base/blockMappingState.h b/vdo/base/blockMappingState.h deleted file mode 100644 index ad2460a..0000000 --- a/vdo/base/blockMappingState.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/blockMappingState.h#1 $ - */ - -#ifndef BLOCK_MAPPING_STATE_H -#define BLOCK_MAPPING_STATE_H - -#include "common.h" - -/** - * Four bits of each five-byte block map entry contain a mapping state value - * used to distinguish unmapped or trimmed logical blocks (which are treated - * as mapped to the zero block) from entries that have been mapped to a - * physical block, including the zero block. - **/ -typedef enum { - MAPPING_STATE_UNMAPPED = 0, // Must be zero to be the default value - MAPPING_STATE_UNCOMPRESSED = 1, // A normal (uncompressed) block - MAPPING_STATE_COMPRESSED_BASE = 2, // Compressed in slot 0 - MAPPING_STATE_COMPRESSED_MAX = 15, // Compressed in slot 13 -} BlockMappingState; - -/** - * The total number of compressed blocks that can live in a physical block. - **/ -enum { - MAX_COMPRESSION_SLOTS = - MAPPING_STATE_COMPRESSED_MAX - MAPPING_STATE_COMPRESSED_BASE + 1, -}; - -/**********************************************************************/ -static inline BlockMappingState getStateForSlot(byte slotNumber) -{ - return (slotNumber + MAPPING_STATE_COMPRESSED_BASE); -} - -/**********************************************************************/ -static inline byte getSlotFromState(BlockMappingState mappingState) -{ - return (mappingState - MAPPING_STATE_COMPRESSED_BASE); -} - -/**********************************************************************/ -static inline bool isCompressed(const BlockMappingState mappingState) -{ - return (mappingState > MAPPING_STATE_UNCOMPRESSED); -} - -#endif // BLOCK_MAPPING_STATE_H diff --git a/vdo/base/completion.c b/vdo/base/completion.c deleted file mode 100644 index d27fd72..0000000 --- a/vdo/base/completion.c +++ /dev/null @@ -1,237 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/completion.c#10 $ - */ - -#include "completion.h" - -#include "logger.h" -#include "statusCodes.h" - -static const char *VDO_COMPLETION_TYPE_NAMES[] = { - // Keep UNSET_COMPLETION_TYPE at the top. - "UNSET_COMPLETION_TYPE", - - // Keep the rest of these in sorted order. If you add or remove an entry, - // be sure to update the corresponding list in completion.h. 
- "ACTION_COMPLETION", - "ADMIN_COMPLETION", - "ASYNC_ACTION_CONTEXT", - "BLOCK_ALLOCATOR_COMPLETION", - "BLOCK_MAP_RECOVERY_COMPLETION", - "CHECK_IDENTIFIER_COMPLETION", - "EXTERNAL_COMPLETION", - "FLUSH_NOTIFICATION_COMPLETION", - "GENERATION_FLUSHED_COMPLETION", - "HEARTBEAT_COMPLETION", - "LOCK_COUNTER_COMPLETION", - "PARTITION_COPY_COMPLETION", - "READ_ONLY_MODE_COMPLETION", - "READ_ONLY_REBUILD_COMPLETION", - "RECOVERY_COMPLETION", - "REFERENCE_COUNT_REBUILD_COMPLETION", - "SLAB_SCRUBBER_COMPLETION", - "SUB_TASK_COMPLETION", - "TEST_COMPLETION", - "VDO_COMMAND_COMPLETION", - "VDO_COMMAND_SUB_COMPLETION", - "VDO_EXTENT_COMPLETION", - "VDO_PAGE_COMPLETION", - "VIO_COMPLETION", - "WRAPPING_COMPLETION", -}; - -/**********************************************************************/ -void initializeCompletion(VDOCompletion *completion, - VDOCompletionType type, - PhysicalLayer *layer) -{ - memset(completion, 0, sizeof(*completion)); - completion->layer = layer; - completion->type = type; - resetCompletion(completion); -} - -/**********************************************************************/ -int initializeEnqueueableCompletion(VDOCompletion *completion, - VDOCompletionType type, - PhysicalLayer *layer) -{ - initializeCompletion(completion, type, layer); - return ((layer->createEnqueueable == NULL) - ? VDO_SUCCESS : layer->createEnqueueable(completion)); -} - -/**********************************************************************/ -void resetCompletion(VDOCompletion *completion) -{ - completion->result = VDO_SUCCESS; - completion->complete = false; -} - -/** - * Assert that a completion is not complete. - * - * @param completion The completion to check - **/ -static inline void assertIncomplete(VDOCompletion *completion) -{ - ASSERT_LOG_ONLY(!completion->complete, "completion is not complete"); -} - -/**********************************************************************/ -void setCompletionResult(VDOCompletion *completion, int result) -{ - assertIncomplete(completion); - if (completion->result == VDO_SUCCESS) { - completion->result = result; - } -} - -/** - * Check whether a completion's callback must be enqueued, or if it can be run - * on the current thread. Side effect: clears the requeue flag if it is set, - * so the caller MUST requeue if this returns true. 
- * - * @param completion The completion whose callback is to be invoked - * - * @return false if the callback must be run on this thread - * true if the callback must be enqueued - **/ -__attribute__((warn_unused_result)) -static inline bool requiresEnqueue(VDOCompletion *completion) -{ - if (completion->requeue) { - completion->requeue = false; - return true; - } - - ThreadID callbackThread = completion->callbackThreadID; - return (callbackThread != completion->layer->getCurrentThreadID()); -} - -/**********************************************************************/ -void invokeCallback(VDOCompletion *completion) -{ - if (requiresEnqueue(completion)) { - if (completion->enqueueable != NULL) { - completion->layer->enqueue(completion->enqueueable); - return; - } - ASSERT_LOG_ONLY(false, - "non-enqueueable completion (type %s) on correct thread", - getCompletionTypeName(completion->type)); - } - - runCallback(completion); -} - -/**********************************************************************/ -void continueCompletion(VDOCompletion *completion, int result) -{ - setCompletionResult(completion, result); - invokeCallback(completion); -} - -/**********************************************************************/ -void completeCompletion(VDOCompletion *completion) -{ - assertIncomplete(completion); - completion->complete = true; - if (completion->callback != NULL) { - invokeCallback(completion); - } -} - -/**********************************************************************/ -void releaseCompletion(VDOCompletion **completionPtr) -{ - VDOCompletion *completion = *completionPtr; - if (completion == NULL) { - return; - } - - *completionPtr = NULL; - completeCompletion(completion); -} - -/**********************************************************************/ -void releaseCompletionWithResult(VDOCompletion **completionPtr, int result) -{ - if (*completionPtr == NULL) { - return; - } - - setCompletionResult(*completionPtr, result); - releaseCompletion(completionPtr); -} - -/**********************************************************************/ -void finishParentCallback(VDOCompletion *completion) -{ - finishCompletion((VDOCompletion *) completion->parent, completion->result); -} - -/**********************************************************************/ -void preserveErrorAndContinue(VDOCompletion *completion) -{ - if (completion->parent != NULL) { - setCompletionResult(completion->parent, completion->result); - } - - resetCompletion(completion); - invokeCallback(completion); -} - -/**********************************************************************/ -const char *getCompletionTypeName(VDOCompletionType completionType) -{ - // Try to catch failures to update the array when the enum values change. 
- STATIC_ASSERT(COUNT_OF(VDO_COMPLETION_TYPE_NAMES) - == (MAX_COMPLETION_TYPE - UNSET_COMPLETION_TYPE)); - - if (completionType >= MAX_COMPLETION_TYPE) { - static char numeric[100]; - snprintf(numeric, 99, "%d (%#x)", completionType, completionType); - return numeric; - } - - return VDO_COMPLETION_TYPE_NAMES[completionType]; -} - -/**********************************************************************/ -void destroyEnqueueable(VDOCompletion *completion) -{ - if ((completion == NULL) || (completion->layer == NULL) - || (completion->layer->destroyEnqueueable == NULL)) { - return; - } - - completion->layer->destroyEnqueueable(&completion->enqueueable); -} - -/**********************************************************************/ -int assertCompletionType(VDOCompletionType actual, - VDOCompletionType expected) -{ - return ASSERT((expected == actual), - "completion type is %s instead of %s", - getCompletionTypeName(actual), - getCompletionTypeName(expected)); -} diff --git a/vdo/base/completion.h b/vdo/base/completion.h deleted file mode 100644 index d245814..0000000 --- a/vdo/base/completion.h +++ /dev/null @@ -1,396 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/completion.h#11 $ - */ - -#ifndef COMPLETION_H -#define COMPLETION_H - -#include "permassert.h" - -#include "physicalLayer.h" -#include "ringNode.h" -#include "types.h" - -typedef enum __attribute__((packed)) { - // Keep UNSET_COMPLETION_TYPE at the top. - UNSET_COMPLETION_TYPE = 0, - - // Keep the rest of these in sorted order. If you add or remove an entry, - // be sure to update the corresponding list in completion.c. - ACTION_COMPLETION, - ADMIN_COMPLETION, - ASYNC_ACTION_CONTEXT, - BLOCK_ALLOCATOR_COMPLETION, - BLOCK_MAP_RECOVERY_COMPLETION, - CHECK_IDENTIFIER_COMPLETION, - EXTERNAL_COMPLETION, - FLUSH_NOTIFICATION_COMPLETION, - GENERATION_FLUSHED_COMPLETION, - HEARTBEAT_COMPLETION, - LOCK_COUNTER_COMPLETION, - PARTITION_COPY_COMPLETION, - READ_ONLY_MODE_COMPLETION, - READ_ONLY_REBUILD_COMPLETION, - RECOVERY_COMPLETION, - REFERENCE_COUNT_REBUILD_COMPLETION, - SLAB_SCRUBBER_COMPLETION, - SUB_TASK_COMPLETION, - TEST_COMPLETION, // each unit test may define its own - VDO_COMMAND_COMPLETION, - VDO_COMMAND_SUB_COMPLETION, - VDO_EXTENT_COMPLETION, - VDO_PAGE_COMPLETION, - VIO_COMPLETION, - WRAPPING_COMPLETION, - - // Keep MAX_COMPLETION_TYPE at the bottom. - MAX_COMPLETION_TYPE -} VDOCompletionType; - -/** - * An asynchronous VDO operation. - * - * @param completion the completion of the operation - **/ -typedef void VDOAction(VDOCompletion *completion); - -struct vdoCompletion { - /** The type of completion this is */ - VDOCompletionType type; - - /** - * true once the processing of the operation is complete. 
- * This flag should not be used by waiters external to the VDO base as - * it is used to gate calling the callback. - **/ - bool complete; - - /** - * If true, queue this completion on the next callback invocation, even if - * it is already running on the correct thread. - **/ - bool requeue; - - /** The ID of the thread which should run the next callback */ - ThreadID callbackThreadID; - - /** The result of the operation */ - int result; - - /** The physical layer on which this completion operates */ - PhysicalLayer *layer; - - /** The callback which will be called once the operation is complete */ - VDOAction *callback; - - /** The callback which, if set, will be called if an error result is set */ - VDOAction *errorHandler; - - /** The parent object, if any, that spawned this completion */ - void *parent; - - /** The enqueueable for this completion (may be NULL) */ - Enqueueable *enqueueable; -}; - -/** - * Actually run the callback. This function must be called from the correct - * callback thread. - **/ -static inline void runCallback(VDOCompletion *completion) -{ - if ((completion->result != VDO_SUCCESS) - && (completion->errorHandler != NULL)) { - completion->errorHandler(completion); - return; - } - - completion->callback(completion); -} - -/** - * Set the result of a completion. Older errors will not be masked. - * - * @param completion The completion whose result is to be set - * @param result The result to set - **/ -void setCompletionResult(VDOCompletion *completion, int result); - -/** - * Initialize a completion to a clean state, for reused completions. - * - * @param completion The completion to initialize - * @param type The type of the completion - * @param layer The physical layer of the completion - **/ -void initializeCompletion(VDOCompletion *completion, - VDOCompletionType type, - PhysicalLayer *layer); - -/** - * Initialize a completion to a clean state and make an enqueueable for it. - * - * @param completion The completion to initialize - * @param type The type of the completion - * @param layer The physical layer of the completion - * - * @return VDO_SUCCESS or an error - **/ -int initializeEnqueueableCompletion(VDOCompletion *completion, - VDOCompletionType type, - PhysicalLayer *layer) - __attribute__((warn_unused_result)); - -/** - * Reset a completion to a clean state, while keeping - * the type, layer and parent information. - * - * @param completion the completion to reset - **/ -void resetCompletion(VDOCompletion *completion); - -/** - * Invoke the callback of a completion. If called on the correct thread (i.e. - * the one specified in the completion's callbackThreadID field), the - * completion will be run immediately. Otherwise, the completion will be - * enqueued on the correct callback thread. - **/ -void invokeCallback(VDOCompletion *completion); - -/** - * Continue processing a completion by setting the current result and calling - * invokeCallback(). - * - * @param completion The completion to continue - * @param result The current result (will not mask older errors) - **/ -void continueCompletion(VDOCompletion *completion, int result); - -/** - * Complete a completion. - * - * @param completion The completion to complete - **/ -void completeCompletion(VDOCompletion *completion); - -/** - * Finish a completion. 
- * - * @param completion The completion to finish - * @param result The result of the completion (will not mask older errors) - **/ -static inline void finishCompletion(VDOCompletion *completion, int result) -{ - setCompletionResult(completion, result); - completeCompletion(completion); -} - -/** - * Complete a completion and NULL out the reference to it. - * - * @param completionPtr A pointer to the completion to release - **/ -void releaseCompletion(VDOCompletion **completionPtr); - -/** - * Finish a completion and NULL out the reference to it. - * - * @param completionPtr A pointer to the completion to release - * @param result The result of the completion - **/ -void releaseCompletionWithResult(VDOCompletion **completionPtr, int result); - -/** - * A callback to finish the parent of a completion. - * - * @param completion The completion which has finished and whose parent should - * be finished - **/ -void finishParentCallback(VDOCompletion *completion); - -/** - * Error handler which preserves an error in the parent (if there is one), - * and then resets the failing completion and calls its non-error callback. - * - * @param completion The completion which failed - **/ -void preserveErrorAndContinue(VDOCompletion *completion); - -/** - * A callback which does nothing. This callback is intended to be set as an - * error handler in the case where an error should do nothing. - * - * @param completion The completion being called back - **/ -static inline -void noopCallback(VDOCompletion *completion __attribute__((unused))) -{ -} - -/** - * Destroy the enqueueable associated with this completion. - * - * @param completion The completion - **/ -void destroyEnqueueable(VDOCompletion *completion); - -/** - * Assert that a completion is of the correct type - * - * @param actual The actual completion type - * @param expected The expected completion type - * - * @return VDO_SUCCESS or VDO_PARAMETER_MISMATCH - **/ -int assertCompletionType(VDOCompletionType actual, - VDOCompletionType expected); - -/** - * Return the name of a completion type. - * - * @param completionType the completion type - * - * @return a pointer to a static string; if the completionType is unknown - * this is to a static buffer that may be overwritten. - **/ -const char *getCompletionTypeName(VDOCompletionType completionType); - -/** - * Set the callback for a completion. - * - * @param completion The completion - * @param callback The callback to register - * @param threadID The ID of the thread on which the callback should run - **/ -static inline void setCallback(VDOCompletion *completion, - VDOAction *callback, - ThreadID threadID) -{ - completion->callback = callback; - completion->callbackThreadID = threadID; -} - -/** - * Set the callback for a completion and invoke it immediately. - * - * @param completion The completion - * @param callback The callback to register - * @param threadID The ID of the thread on which the callback should run - **/ -static inline void launchCallback(VDOCompletion *completion, - VDOAction *callback, - ThreadID threadID) -{ - setCallback(completion, callback, threadID); - invokeCallback(completion); -} - -/** - * Set the callback and parent for a completion. 
- * - * @param completion The completion - * @param callback The callback to register - * @param threadID The ID of the thread on which the callback should run - * @param parent The new parent of the completion - **/ -static inline void setCallbackWithParent(VDOCompletion *completion, - VDOAction *callback, - ThreadID threadID, - void *parent) -{ - setCallback(completion, callback, threadID); - completion->parent = parent; -} - -/** - * Set the callback and parent for a completion and invoke the callback - * immediately. - * - * @param completion The completion - * @param callback The callback to register - * @param threadID The ID of the thread on which the callback should run - * @param parent The new parent of the completion - **/ -static inline void launchCallbackWithParent(VDOCompletion *completion, - VDOAction *callback, - ThreadID threadID, - void *parent) -{ - setCallbackWithParent(completion, callback, threadID, parent); - invokeCallback(completion); -} - -/** - * Prepare a completion for launch. Reset it, and then set its callback, error - * handler, callback thread, and parent. - * - * @param completion The completion - * @param callback The callback to register - * @param errorHandler The error handler to register - * @param threadID The ID of the thread on which the callback should run - * @param parent The new parent of the completion - **/ -static inline void prepareCompletion(VDOCompletion *completion, - VDOAction *callback, - VDOAction *errorHandler, - ThreadID threadID, - void *parent) -{ - resetCompletion(completion); - setCallbackWithParent(completion, callback, threadID, parent); - completion->errorHandler = errorHandler; -} - -/** - * Prepare a completion for launch ensuring that it will always be requeued. - * Reset it, and then set its callback, error handler, callback thread, and - * parent. - * - * @param completion The completion - * @param callback The callback to register - * @param errorHandler The error handler to register - * @param threadID The ID of the thread on which the callback should run - * @param parent The new parent of the completion - **/ -static inline void prepareForRequeue(VDOCompletion *completion, - VDOAction *callback, - VDOAction *errorHandler, - ThreadID threadID, - void *parent) -{ - prepareCompletion(completion, callback, errorHandler, threadID, parent); - completion->requeue = true; -} - -/** - * Prepare a completion for launch which will complete its parent when - * finished. - * - * @param completion The completion - * @param parent The parent to complete - **/ -static inline void prepareToFinishParent(VDOCompletion *completion, - VDOCompletion *parent) -{ - prepareCompletion(completion, finishParentCallback, finishParentCallback, - parent->callbackThreadID, parent); -} - -#endif // COMPLETION_H diff --git a/vdo/base/compressedBlock.c b/vdo/base/compressedBlock.c deleted file mode 100644 index d9f93e8..0000000 --- a/vdo/base/compressedBlock.c +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
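/*
 * Editor's sketch, not part of the original sources: preparing a sub-task
 * completion that will notify its parent, using prepareCompletion() declared
 * above.  subTask, parent, doSubTask, and handleSubTaskError are hypothetical
 * and assumed to be valid, initialized completions and actions.
 */
static void launchSubTaskSketch(VDOCompletion *subTask,
                                VDOCompletion *parent,
                                VDOAction     *doSubTask,
                                VDOAction     *handleSubTaskError,
                                ThreadID       threadID)
{
  // Reset the sub-task, register its callback, error handler, thread, and
  // parent, then hand it to the correct thread.
  prepareCompletion(subTask, doSubTask, handleSubTaskError, threadID, parent);
  invokeCallback(subTask);
}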
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/compressedBlock.c#3 $ - */ - -#include "compressedBlock.h" - -#include "memoryAlloc.h" -#include "numeric.h" - -static const VersionNumber COMPRESSED_BLOCK_1_0 = { - .majorVersion = 1, - .minorVersion = 0, -}; - -/**********************************************************************/ -void resetCompressedBlockHeader(CompressedBlockHeader *header) -{ - STATIC_ASSERT(sizeof(header->fields) == sizeof(header->raw)); - - header->fields.version = packVersionNumber(COMPRESSED_BLOCK_1_0); - memset(header->fields.sizes, 0, sizeof(header->fields.sizes)); -} - -/**********************************************************************/ -static uint16_t -getCompressedFragmentSize(const CompressedBlockHeader *header, byte slot) -{ - return getUInt16LE(header->fields.sizes[slot]); -} - -/**********************************************************************/ -int getCompressedBlockFragment(BlockMappingState mappingState, - char *buffer, - BlockSize blockSize, - uint16_t *fragmentOffset, - uint16_t *fragmentSize) -{ - if (!isCompressed(mappingState)) { - return VDO_INVALID_FRAGMENT; - } - - CompressedBlockHeader *header = (CompressedBlockHeader *) buffer; - VersionNumber version = unpackVersionNumber(header->fields.version); - if (!areSameVersion(version, COMPRESSED_BLOCK_1_0)) { - return VDO_INVALID_FRAGMENT; - } - - byte slot = getSlotFromState(mappingState); - if (slot >= MAX_COMPRESSION_SLOTS) { - return VDO_INVALID_FRAGMENT; - } - - uint16_t compressedSize = getCompressedFragmentSize(header, slot); - uint16_t offset = sizeof(CompressedBlockHeader); - for (unsigned int i = 0; i < slot; i++) { - offset += getCompressedFragmentSize(header, i); - if (offset >= blockSize) { - return VDO_INVALID_FRAGMENT; - } - } - - if ((offset + compressedSize) > blockSize) { - return VDO_INVALID_FRAGMENT; - } - - *fragmentOffset = offset; - *fragmentSize = compressedSize; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void putCompressedBlockFragment(CompressedBlock *block, - unsigned int fragment, - uint16_t offset, - const char *data, - uint16_t size) -{ - storeUInt16LE(block->header.fields.sizes[fragment], size); - memcpy(&block->data[offset], data, size); -} diff --git a/vdo/base/compressedBlock.h b/vdo/base/compressedBlock.h deleted file mode 100644 index 603841f..0000000 --- a/vdo/base/compressedBlock.h +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/compressedBlock.h#3 $ - */ - -#ifndef COMPRESSED_BLOCK_H -#define COMPRESSED_BLOCK_H - -#include "blockMappingState.h" -#include "header.h" - -/** - * The header of a compressed block. - **/ -typedef union __attribute__((packed)) { - struct __attribute__((packed)) { - /** Unsigned 32-bit major and minor versions, in little-endian byte order */ - PackedVersionNumber version; - - /** List of unsigned 16-bit compressed block sizes, in little-endian order */ - byte sizes[MAX_COMPRESSION_SLOTS][2]; - } fields; - - // A raw view of the packed encoding. - byte raw[4 + 4 + (2 * MAX_COMPRESSION_SLOTS)]; - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - // This view is only valid on little-endian machines and is only present for - // ease of directly examining compressed block headers in GDB. - struct __attribute__((packed)) { - VersionNumber version; - uint16_t sizes[MAX_COMPRESSION_SLOTS]; - } littleEndian; -#endif -} CompressedBlockHeader; - -/** - * The compressed block overlay. - **/ -typedef struct { - CompressedBlockHeader header; - char data[]; -} __attribute__((packed)) CompressedBlock; - -/** - * Initializes/resets a compressed block header. - * - * @param header the header - * - * When done, the version number is set to the current version, and all - * fragments are empty. - **/ -void resetCompressedBlockHeader(CompressedBlockHeader *header); - -/** - * Get a reference to a compressed fragment from a compression block. - * - * @param [in] mappingState the mapping state for the look up - * @param [in] buffer buffer that contains compressed data - * @param [in] blockSize size of a data block - * @param [out] fragmentOffset the offset of the fragment within a - * compressed block - * @param [out] fragmentSize the size of the fragment - * - * @return If a valid compressed fragment is found, VDO_SUCCESS; - * otherwise, VDO_INVALID_FRAGMENT if the fragment is invalid. - **/ -int getCompressedBlockFragment(BlockMappingState mappingState, - char *buffer, - BlockSize blockSize, - uint16_t *fragmentOffset, - uint16_t *fragmentSize); - -/** - * Copy a fragment into the compressed block. - * - * @param block the compressed block - * @param fragment the number of the fragment - * @param offset the byte offset of the fragment in the data area - * @param data a pointer to the compressed data - * @param size the size of the data - * - * @note no bounds checking -- the data better fit without smashing other stuff - **/ -void putCompressedBlockFragment(CompressedBlock *block, - unsigned int fragment, - uint16_t offset, - const char *data, - uint16_t size); - -#endif // COMPRESSED_BLOCK_H diff --git a/vdo/base/compressionState.c b/vdo/base/compressionState.c deleted file mode 100644 index d773756..0000000 --- a/vdo/base/compressionState.c +++ /dev/null @@ -1,192 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
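/*
 * Editor's illustrative sketch, not from the original sources: packing two
 * hypothetical fragments into a CompressedBlock with the helpers above.  It
 * assumes both fragments fit within the block's data area and that at least
 * two compression slots exist.
 */
static void packTwoFragmentsSketch(CompressedBlock *block,
                                   const char *fragment0, uint16_t size0,
                                   const char *fragment1, uint16_t size1)
{
  resetCompressedBlockHeader(&block->header);
  // Fragment 0 starts at offset 0 of the data area; fragment 1 follows at the
  // offset implied by fragment 0's recorded size, mirroring the offset
  // arithmetic in getCompressedBlockFragment().
  putCompressedBlockFragment(block, 0, 0, fragment0, size0);
  putCompressedBlockFragment(block, 1, size0, fragment1, size1);
}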
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/compressionState.c#2 $ - */ - -#include "compressionStateInternals.h" - -#include "dataVIO.h" -#include "packer.h" - -static const uint32_t STATUS_MASK = 0xff; -static const uint32_t MAY_NOT_COMPRESS_MASK = 0x80000000; - -/**********************************************************************/ -VIOCompressionState getCompressionState(DataVIO *dataVIO) -{ - uint32_t packedValue = atomicLoad32(&dataVIO->compression.state); - return (VIOCompressionState) { - .status = packedValue & STATUS_MASK, - .mayNotCompress = ((packedValue & MAY_NOT_COMPRESS_MASK) != 0), - }; -} - -/** - * Convert a VIOCompressionState into a uint32_t which may be stored - * atomically. - * - * @param state The state to convert - * - * @return The compression state packed into a uint32_t - **/ -__attribute__((warn_unused_result)) -static uint32_t packState(VIOCompressionState state) -{ - return state.status | (state.mayNotCompress ? MAY_NOT_COMPRESS_MASK : 0); -} - -/**********************************************************************/ -bool setCompressionState(DataVIO *dataVIO, - VIOCompressionState state, - VIOCompressionState newState) -{ - return compareAndSwap32(&dataVIO->compression.state, packState(state), - packState(newState)); -} - -/** - * Advance to the next compression state along the compression path. - * - * @param dataVIO The DataVIO to advance - * - * @return The new compression status of the DataVIO - **/ -static VIOCompressionStatus advanceStatus(DataVIO *dataVIO) -{ - for (;;) { - VIOCompressionState state = getCompressionState(dataVIO); - if (state.status == VIO_POST_PACKER) { - // We're already in the last state. - return state.status; - } - - VIOCompressionState newState = state; - if (state.mayNotCompress) { - // Compression has been dis-allowed for this VIO, so skip the rest of the - // path and go to the end. - newState.status = VIO_POST_PACKER; - } else { - // Go to the next state. - newState.status++; - } - - if (setCompressionState(dataVIO, state, newState)) { - return newState.status; - } - - // Another thread changed the state out from under us so try again. - } -} - -/**********************************************************************/ -bool mayCompressDataVIO(DataVIO *dataVIO) -{ - if (!hasAllocation(dataVIO) - || ((getWritePolicy(getVDOFromDataVIO(dataVIO)) != WRITE_POLICY_SYNC) - && vioRequiresFlushAfter(dataVIOAsVIO(dataVIO))) - || !getVDOCompressing(getVDOFromDataVIO(dataVIO))) { - /* - * If this VIO didn't get an allocation, the compressed write probably - * won't either, so don't try compressing it. Also, if compression is off, - * don't compress. - */ - setCompressionDone(dataVIO); - return false; - } - - if (dataVIO->hashLock == NULL) { - // DataVIOs without a HashLock (which should be extremely rare) aren't - // able to share the packer's PBN lock, so don't try to compress them. 
- return false; - } - - return (advanceStatus(dataVIO) == VIO_COMPRESSING); -} - -/**********************************************************************/ -bool mayPackDataVIO(DataVIO *dataVIO) -{ - if (!isSufficientlyCompressible(dataVIO) - || !getVDOCompressing(getVDOFromDataVIO(dataVIO)) - || getCompressionState(dataVIO).mayNotCompress) { - // If the data in this VIO doesn't compress, or compression is off, or - // compression for this VIO has been canceled, don't send it to the packer. - setCompressionDone(dataVIO); - return false; - } - - return true; -} - -/**********************************************************************/ -bool mayBlockInPacker(DataVIO *dataVIO) -{ - return (advanceStatus(dataVIO) == VIO_PACKING); -} - -/**********************************************************************/ -bool mayWriteCompressedDataVIO(DataVIO *dataVIO) -{ - advanceStatus(dataVIO); - return !getCompressionState(dataVIO).mayNotCompress; -} - -/**********************************************************************/ -void setCompressionDone(DataVIO *dataVIO) -{ - for (;;) { - VIOCompressionState state = getCompressionState(dataVIO); - if (state.status == VIO_POST_PACKER) { - // The VIO is already done. - return; - } - - // If compression was cancelled on this VIO, preserve that fact. - VIOCompressionState newState = { - .status = VIO_POST_PACKER, - .mayNotCompress = true, - }; - if (setCompressionState(dataVIO, state, newState)) { - return; - } - } -} - -/**********************************************************************/ -bool cancelCompression(DataVIO *dataVIO) -{ - VIOCompressionState state; - for (;;) { - state = getCompressionState(dataVIO); - if (state.mayNotCompress || (state.status == VIO_POST_PACKER)) { - // This DataVIO is already set up to not block in the packer. - break; - } - - VIOCompressionState newState = { - .status = state.status, - .mayNotCompress = true, - }; - if (setCompressionState(dataVIO, state, newState)) { - break; - } - } - - return ((state.status == VIO_PACKING) && !state.mayNotCompress); -} diff --git a/vdo/base/compressionState.h b/vdo/base/compressionState.h deleted file mode 100644 index 19a4143..0000000 --- a/vdo/base/compressionState.h +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/compressionState.h#2 $ - */ - -#ifndef COMPRESSION_STATE_H -#define COMPRESSION_STATE_H - -#include "atomic.h" -#include "types.h" - -/** - * Where a DataVIO is on the compression path; advanceStatus() depends on the - * order of this enum. 
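/*
 * Editor's sketch (hypothetical wrapper, not from the original sources): the
 * order of the gates a write takes along the compression path, using the
 * predicates defined in compressionState.c above.  Each gate may also advance
 * the VIO's compression status.
 */
static bool compressionPathSketch(DataVIO *dataVIO)
{
  if (!mayCompressDataVIO(dataVIO)) {
    return false;   // not eligible; skips the compressor entirely
  }
  // ... compress the data ...
  if (!mayPackDataVIO(dataVIO)) {
    return false;   // did not compress well enough to be worth packing
  }
  // ... hand the DataVIO to the packer ...
  return mayBlockInPacker(dataVIO);
}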
- **/ -typedef enum { - /* A VIO which has not yet entered the compression path */ - VIO_PRE_COMPRESSOR = 0, - /* A VIO which is in the compressor */ - VIO_COMPRESSING, - /* A VIO which is blocked in the packer */ - VIO_PACKING, - /* A VIO which is no longer on the compression path (and never will be) */ - VIO_POST_PACKER, -} VIOCompressionStatus; - -typedef struct { - VIOCompressionStatus status; - bool mayNotCompress; -} VIOCompressionState; - -/** - * Get the compression state of a DataVIO. - * - * @param dataVIO The DataVIO - * - * @return The compression state - **/ -__attribute__((warn_unused_result)) -VIOCompressionState getCompressionState(DataVIO *dataVIO); - -/** - * Check whether a DataVIO may go to the compressor. - * - * @param dataVIO The DataVIO to check - * - * @return true if the DataVIO may be compressed at this time - **/ -bool mayCompressDataVIO(DataVIO *dataVIO) - __attribute__((warn_unused_result)); - -/** - * Check whether a DataVIO may go to the packer. - * - * @param dataVIO The DataVIO to check - * - * @return true if the DataVIO may be packed at this time - **/ -bool mayPackDataVIO(DataVIO *dataVIO) - __attribute__((warn_unused_result)); - -/** - * Check whether a DataVIO which has gone to the packer may block there. Any - * cancelation after this point and before the DataVIO is written out requires - * this DataVIO to be picked up by the canceling DataVIO. - * - * @param dataVIO The DataVIO to check - * - * @return true if the DataVIO may block in the packer - **/ -bool mayBlockInPacker(DataVIO *dataVIO) - __attribute__((warn_unused_result)); - -/** - * Check whether the packer may write out a DataVIO as part of a compressed - * block. - * - * @param dataVIO The DataVIO to check - * - * @return true if the DataVIO may be written as part of a - * compressed block at this time - **/ -bool mayWriteCompressedDataVIO(DataVIO *dataVIO) - __attribute__((warn_unused_result)); - -/** - * Indicate that this DataVIO is leaving the compression path. - * - * @param dataVIO The DataVIO leaving the compression path - **/ -void setCompressionDone(DataVIO *dataVIO); - -/** - * Prevent this DataVIO from being compressed or packed. - * - * @param dataVIO The DataVIO to cancel - * - * @return true if the DataVIO is in the packer and the caller - * was the first caller to cancel it - **/ -bool cancelCompression(DataVIO *dataVIO); - -#endif /* COMPRESSION_STATE_H */ diff --git a/vdo/base/compressionStateInternals.h b/vdo/base/compressionStateInternals.h deleted file mode 100644 index a9b8dec..0000000 --- a/vdo/base/compressionStateInternals.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
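/*
 * Editor's sketch, not part of the original header: how a VIOCompressionState
 * maps onto the packed 32-bit value stored in the DataVIO's atomic field, per
 * STATUS_MASK and MAY_NOT_COMPRESS_MASK in compressionState.c.  The function
 * is purely illustrative.
 */
static uint32_t packedStateSketch(void)
{
  // The status lives in the low byte and the cancellation flag is the top
  // bit, so VIO_PACKING (2) with mayNotCompress set packs to 0x80000002.
  return ((uint32_t) VIO_PACKING) | 0x80000000;
}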
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/compressionStateInternals.h#1 $ - */ - -#ifndef COMPRESSION_STATE_INTERNALS_H -#define COMPRESSION_STATE_INTERNALS_H - -#include "compressionState.h" - -/** - * Set the compression state of a DataVIO (exposed for testing). - * - * @param dataVIO The DataVIO whose compression state is to be set - * @param state The expected current state of the DataVIO - * @param newState The state to set - * - * @return true if the new state was set, false if the DataVIO's - * compression state did not match the expected state, and so was - * left unchanged - **/ -bool setCompressionState(DataVIO *dataVIO, - VIOCompressionState state, - VIOCompressionState newState); - -#endif /* COMPRESSION_STATE_H */ diff --git a/vdo/base/constants.c b/vdo/base/constants.c deleted file mode 100644 index 05d3a42..0000000 --- a/vdo/base/constants.c +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/constants.c#1 $ - */ - -#include "types.h" - -/** The maximum logical space is 4 petabytes, which is 1 terablock. */ -const BlockCount MAXIMUM_LOGICAL_BLOCKS = 1024ULL * 1024 * 1024 * 1024; - -/** The maximum physical space is 256 terabytes, which is 64 gigablocks. */ -const BlockCount MAXIMUM_PHYSICAL_BLOCKS = 1024ULL * 1024 * 1024 * 64; - -// unit test minimum -const BlockCount MINIMUM_SLAB_JOURNAL_BLOCKS = 2; diff --git a/vdo/base/constants.h b/vdo/base/constants.h deleted file mode 100644 index 8b61c5f..0000000 --- a/vdo/base/constants.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/constants.h#2 $ - */ - -#ifndef CONSTANTS_H -#define CONSTANTS_H - -#include "types.h" - -enum { - /** The number of entries on a block map page */ - BLOCK_MAP_ENTRIES_PER_PAGE = 812, - - /** The origin of the flat portion of the block map */ - BLOCK_MAP_FLAT_PAGE_ORIGIN = 1, - - /** - * The height of a block map tree. 
Assuming a root count of 60 and 812 - * entries per page, this is big enough to represent almost 95 PB of logical - * space. - **/ - BLOCK_MAP_TREE_HEIGHT = 5, - - /** The number of trees in the arboreal block map */ - DEFAULT_BLOCK_MAP_TREE_ROOT_COUNT = 60, - - /** The default size of the recovery journal, in blocks */ - DEFAULT_RECOVERY_JOURNAL_SIZE = 32 * 1024, - - /** The default size of each slab journal, in blocks */ - DEFAULT_SLAB_JOURNAL_SIZE = 224, - - /** - * The initial size of lbnOperations and pbnOperations, which is based - * upon the expected maximum number of outstanding VIOs. This value was - * chosen to make it highly unlikely that the maps would need to be resized. - **/ - LOCK_MAP_CAPACITY = 10000, - - /** The maximum number of logical zones */ - MAX_LOGICAL_ZONES = 60, - - /** The maximum number of physical zones */ - MAX_PHYSICAL_ZONES = 16, - - /** The base-2 logarithm of the maximum blocks in one slab */ - MAX_SLAB_BITS = 23, - - /** The maximum number of slabs the slab depot supports */ - MAX_SLABS = 8192, - - /** - * The maximum number of block map pages to load simultaneously during - * recovery or rebuild. - **/ - MAXIMUM_SIMULTANEOUS_BLOCK_MAP_RESTORATION_READS = 1024, - - /** The maximum number of VIOs in the system at once */ - MAXIMUM_USER_VIOS = 2048, - - /** - * The number of in-memory recovery journal blocks is determined by: - * -- 311 journal entries in a 4k block - * -- maximum of 2048 VIOs making entries at once - * so we need at least 2048 / 312 = 7 journal blocks. - **/ - RECOVERY_JOURNAL_TAIL_BUFFER_SIZE = 64, - - /** The number of sectors per block */ - SECTORS_PER_BLOCK = 8, - - /** The only physical block size supported by VDO */ - VDO_BLOCK_SIZE = 4096, - - /** The size of a sector that will not be torn */ - VDO_SECTOR_SIZE = 512, - - /** The physical block number reserved for storing the zero block */ - ZERO_BLOCK = 0, -}; - -/** The maximum logical space is 4 petabytes, which is 1 terablock. */ -extern const BlockCount MAXIMUM_LOGICAL_BLOCKS; - -/** The maximum physical space is 256 terabytes, which is 64 gigablocks. */ - extern const BlockCount MAXIMUM_PHYSICAL_BLOCKS; - -// unit test minimum -extern const BlockCount MINIMUM_SLAB_JOURNAL_BLOCKS; - -#endif // CONSTANTS_H diff --git a/vdo/base/dataVIO.c b/vdo/base/dataVIO.c deleted file mode 100644 index a9778f5..0000000 --- a/vdo/base/dataVIO.c +++ /dev/null @@ -1,362 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
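/*
 * Editor's arithmetic sketch, not part of the original header: why the
 * 64-block tail buffer above is comfortable.  With 311 entries per 4k journal
 * block and at most MAXIMUM_USER_VIOS (2048) entries outstanding at once, at
 * least ceil(2048 / 311) = 7 in-memory journal blocks are required.
 */
enum {
  SKETCH_MINIMUM_TAIL_BUFFER_BLOCKS = (2048 + 311 - 1) / 311,  // == 7
};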
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/dataVIO.c#7 $ - */ - -#include "dataVIO.h" - -#include "logger.h" - -#include "atomic.h" -#include "blockMap.h" -#include "compressionState.h" -#include "extent.h" -#include "logicalZone.h" -#include "threadConfig.h" -#include "vdoInternal.h" -#include "vioRead.h" -#include "vioWrite.h" - -static const char *ASYNC_OPERATION_NAMES[] = { - "launch", - "acknowledgeWrite", - "acquireHashLock", - "acquireLogicalBlockLock", - "acquirePBNReadLock", - "checkForDedupeForRollover", - "checkForDeduplication", - "compressData", - "continueVIOAsync", - "findBlockMapSlot", - "getMappedBlock", - "getMappedBlockForDedupe", - "getMappedBlockForWrite", - "hashData", - "journalDecrementForDedupe", - "journalDecrementForWrite", - "journalIncrementForCompression", - "journalIncrementForDedupe", - "journalIncrementForWrite", - "journalMappingForCompression", - "journalMappingForDedupe", - "journalMappingForWrite", - "journalUnmappingForDedupe", - "journalUnmappingForWrite", - "attemptPacking", - "putMappedBlock", - "putMappedBlockForDedupe", - "readData", - "updateIndex", - "verifyDeduplication", - "writeData", -}; - -/** - * Initialize the LBN lock of a DataVIO. In addition to recording the LBN on - * which the DataVIO will operate, it will also find the logical zone - * associated with the LBN. - * - * @param dataVIO The dataVIO to initialize - * @param lbn The lbn on which the dataVIO will operate - **/ -static void initializeLBNLock(DataVIO *dataVIO, LogicalBlockNumber lbn) -{ - LBNLock *lock = &dataVIO->logical; - lock->lbn = lbn; - lock->locked = false; - initializeWaitQueue(&lock->waiters); - - VDO *vdo = getVDOFromDataVIO(dataVIO); - lock->zone = getLogicalZone(vdo->logicalZones, computeLogicalZone(dataVIO)); -} - -/**********************************************************************/ -void prepareDataVIO(DataVIO *dataVIO, - LogicalBlockNumber lbn, - VIOOperation operation, - bool isTrim, - VDOAction *callback) -{ - // Clearing the tree lock must happen before initializing the LBN lock, - // which also adds information to the tree lock. - memset(&dataVIO->treeLock, 0, sizeof(dataVIO->treeLock)); - initializeLBNLock(dataVIO, lbn); - initializeRing(&dataVIO->hashLockNode); - initializeRing(&dataVIO->writeNode); - - resetAllocation(dataVIOAsAllocatingVIO(dataVIO)); - - dataVIO->isDuplicate = false; - - memset(&dataVIO->chunkName, 0, sizeof(dataVIO->chunkName)); - memset(&dataVIO->duplicate, 0, sizeof(dataVIO->duplicate)); - - VIO *vio = dataVIOAsVIO(dataVIO); - vio->operation = operation; - vio->callback = callback; - dataVIO->pageCompletion.completion.enqueueable - = vioAsCompletion(vio)->enqueueable; - - dataVIO->mapped.state = MAPPING_STATE_UNCOMPRESSED; - dataVIO->newMapped.state - = (isTrim ? 
MAPPING_STATE_UNMAPPED : MAPPING_STATE_UNCOMPRESSED); - resetCompletion(vioAsCompletion(vio)); - setLogicalCallback(dataVIO, attemptLogicalBlockLock, - THIS_LOCATION("$F;cb=acquireLogicalBlockLock")); -} - -/**********************************************************************/ -void completeDataVIO(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - if (completion->result != VDO_SUCCESS) { - VIO *vio = dataVIOAsVIO(dataVIO); - updateVIOErrorStats(vio, - "Completing %s VIO for LBN %" PRIu64 - " with error after %s", - getVIOReadWriteFlavor(vio), dataVIO->logical.lbn, - getOperationName(dataVIO)); - } - - dataVIOAddTraceRecord(dataVIO, THIS_LOCATION("$F($io)")); - if (isReadDataVIO(dataVIO)) { - cleanupReadDataVIO(dataVIO); - } else { - cleanupWriteDataVIO(dataVIO); - } -} - -/**********************************************************************/ -void finishDataVIO(DataVIO *dataVIO, int result) -{ - VDOCompletion *completion = dataVIOAsCompletion(dataVIO); - setCompletionResult(completion, result); - completeDataVIO(completion); -} - -/**********************************************************************/ -const char *getOperationName(DataVIO *dataVIO) -{ - STATIC_ASSERT((MAX_ASYNC_OPERATION_NUMBER - MIN_ASYNC_OPERATION_NUMBER) - == COUNT_OF(ASYNC_OPERATION_NAMES)); - - return ((dataVIO->lastAsyncOperation < MAX_ASYNC_OPERATION_NUMBER) - ? ASYNC_OPERATION_NAMES[dataVIO->lastAsyncOperation] - : "unknown async operation"); -} - -/**********************************************************************/ -void receiveDedupeAdvice(DataVIO *dataVIO, const DataLocation *advice) -{ - /* - * NOTE: this is called on non-base-code threads. Be very careful to not do - * anything here that needs a base code thread-local variable, such as - * trying to get the current thread ID, or that does a lot of work. - */ - - VDO *vdo = getVDOFromDataVIO(dataVIO); - ZonedPBN duplicate = validateDedupeAdvice(vdo, advice, dataVIO->logical.lbn); - setDuplicateLocation(dataVIO, duplicate); -} - -/**********************************************************************/ -void setDuplicateLocation(DataVIO *dataVIO, const ZonedPBN source) -{ - dataVIO->isDuplicate = (source.pbn != ZERO_BLOCK); - dataVIO->duplicate = source; -} - -/**********************************************************************/ -void clearMappedLocation(DataVIO *dataVIO) -{ - dataVIO->mapped = (ZonedPBN) { .state = MAPPING_STATE_UNMAPPED }; -} - -/**********************************************************************/ -int setMappedLocation(DataVIO *dataVIO, - PhysicalBlockNumber pbn, - BlockMappingState state) -{ - PhysicalZone *zone; - int result = getPhysicalZone(getVDOFromDataVIO(dataVIO), pbn, &zone); - if (result != VDO_SUCCESS) { - return result; - } - - dataVIO->mapped = (ZonedPBN) { - .pbn = pbn, - .state = state, - .zone = zone, - }; - return VDO_SUCCESS; -} - -/** - * Launch a request which has acquired an LBN lock. 
- * - * @param dataVIO The DataVIO which has just acquired a lock - **/ -static void launchLockedRequest(DataVIO *dataVIO) -{ - dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); - dataVIO->logical.locked = true; - - if (isWriteDataVIO(dataVIO)) { - launchWriteDataVIO(dataVIO); - } else { - launchReadDataVIO(dataVIO); - } -} - -/**********************************************************************/ -void attemptLogicalBlockLock(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInLogicalZone(dataVIO); - - if (dataVIO->logical.lbn - >= getVDOFromDataVIO(dataVIO)->config.logicalBlocks) { - finishDataVIO(dataVIO, VDO_OUT_OF_RANGE); - return; - } - - DataVIO *lockHolder; - LBNLock *lock = &dataVIO->logical; - int result = intMapPut(getLBNLockMap(lock->zone), lock->lbn, dataVIO, false, - (void **) &lockHolder); - if (result != VDO_SUCCESS) { - finishDataVIO(dataVIO, result); - return; - } - - if (lockHolder == NULL) { - // We got the lock - launchLockedRequest(dataVIO); - return; - } - - result = ASSERT(lockHolder->logical.locked, "logical block lock held"); - if (result != VDO_SUCCESS) { - finishDataVIO(dataVIO, result); - return; - } - - /* - * If the new request is a pure read request (not read-modify-write) and - * the lockHolder is writing and has received an allocation (VDO-2683), - * service the read request immediately by copying data from the lockHolder - * to avoid having to flush the write out of the packer just to prevent the - * read from waiting indefinitely. If the lockHolder does not yet have an - * allocation, prevent it from blocking in the packer and wait on it. - */ - if (isReadDataVIO(dataVIO) && atomicLoadBool(&lockHolder->hasAllocation)) { - dataVIOAsCompletion(dataVIO)->layer->copyData(lockHolder, dataVIO); - finishDataVIO(dataVIO, VDO_SUCCESS); - return; - } - - dataVIO->lastAsyncOperation = ACQUIRE_LOGICAL_BLOCK_LOCK; - result = enqueueDataVIO(&lockHolder->logical.waiters, dataVIO, - THIS_LOCATION("$F;cb=logicalBlockLock")); - if (result != VDO_SUCCESS) { - finishDataVIO(dataVIO, result); - return; - } - - // Prevent writes and read-modify-writes from blocking indefinitely on - // lock holders in the packer. - if (!isReadDataVIO(lockHolder) && cancelCompression(lockHolder)) { - dataVIO->compression.lockHolder = lockHolder; - launchPackerCallback(dataVIO, removeLockHolderFromPacker, - THIS_LOCATION("$F;cb=removeLockHolderFromPacker")); - } -} - -/** - * Release an uncontended LBN lock. - * - * @param dataVIO The DataVIO holding the lock - **/ -static void releaseLock(DataVIO *dataVIO) -{ - LBNLock *lock = &dataVIO->logical; - IntMap *lockMap = getLBNLockMap(lock->zone); - if (!lock->locked) { - // The lock is not locked, so it had better not be registered in the lock - // map. - DataVIO *lockHolder = intMapGet(lockMap, lock->lbn); - ASSERT_LOG_ONLY((dataVIO != lockHolder), - "no logical block lock held for block %llu", - lock->lbn); - return; - } - - // Remove the lock from the logical block lock map, releasing the lock. 
- DataVIO *lockHolder = intMapRemove(lockMap, lock->lbn); - ASSERT_LOG_ONLY((dataVIO == lockHolder), - "logical block lock mismatch for block %llu", lock->lbn); - lock->locked = false; - return; -} - -/**********************************************************************/ -void releaseLogicalBlockLock(DataVIO *dataVIO) -{ - assertInLogicalZone(dataVIO); - if (!hasWaiters(&dataVIO->logical.waiters)) { - releaseLock(dataVIO); - return; - } - - LBNLock *lock = &dataVIO->logical; - ASSERT_LOG_ONLY(lock->locked, "LBNLock with waiters is not locked"); - - // Another DataVIO is waiting for the lock, so just transfer it in a single - // lock map operation - DataVIO *nextLockHolder = waiterAsDataVIO(dequeueNextWaiter(&lock->waiters)); - - // Transfer the remaining lock waiters to the next lock holder. - transferAllWaiters(&lock->waiters, &nextLockHolder->logical.waiters); - - DataVIO *lockHolder; - int result = intMapPut(getLBNLockMap(lock->zone), lock->lbn, nextLockHolder, - true, (void **) &lockHolder); - if (result != VDO_SUCCESS) { - finishDataVIO(nextLockHolder, result); - return; - } - - ASSERT_LOG_ONLY((lockHolder == dataVIO), - "logical block lock mismatch for block %llu", lock->lbn); - lock->locked = false; - - /* - * If there are still waiters, other DataVIOs must be trying to get the lock - * we just transferred. We must ensure that the new lock holder doesn't block - * in the packer. - */ - if (hasWaiters(&nextLockHolder->logical.waiters)) { - cancelCompression(nextLockHolder); - } - - // Avoid stack overflow on lock transfer. - // XXX: this is only an issue in the 1 thread config. - dataVIOAsCompletion(nextLockHolder)->requeue = true; - launchLockedRequest(nextLockHolder); -} diff --git a/vdo/base/dataVIO.h b/vdo/base/dataVIO.h deleted file mode 100644 index ec6e9f6..0000000 --- a/vdo/base/dataVIO.h +++ /dev/null @@ -1,945 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/dataVIO.h#4 $ - */ - -#ifndef DATA_VIO_H -#define DATA_VIO_H - -#include "allocatingVIO.h" -#include "atomic.h" -#include "blockMapEntry.h" -#include "blockMappingState.h" -#include "constants.h" -#include "hashZone.h" -#include "journalPoint.h" -#include "logicalZone.h" -#include "referenceOperation.h" -#include "ringNode.h" -#include "threadConfig.h" -#include "trace.h" -#include "types.h" -#include "vdoPageCache.h" -#include "vio.h" -#include "waitQueue.h" - -/** - * Codes for describing the last asynchronous operation performed on a VIO. 
- **/ -typedef enum __attribute__((packed)) { - MIN_ASYNC_OPERATION_NUMBER = 0, - LAUNCH = MIN_ASYNC_OPERATION_NUMBER, - ACKNOWLEDGE_WRITE, - ACQUIRE_HASH_LOCK, - ACQUIRE_LOGICAL_BLOCK_LOCK, - ACQUIRE_PBN_READ_LOCK, - CHECK_FOR_DEDUPE_FOR_ROLLOVER, - CHECK_FOR_DEDUPLICATION, - COMPRESS_DATA, - CONTINUE_VIO_ASYNC, - FIND_BLOCK_MAP_SLOT, - GET_MAPPED_BLOCK, - GET_MAPPED_BLOCK_FOR_DEDUPE, - GET_MAPPED_BLOCK_FOR_WRITE, - HASH_DATA, - JOURNAL_DECREMENT_FOR_DEDUPE, - JOURNAL_DECREMENT_FOR_WRITE, - JOURNAL_INCREMENT_FOR_COMPRESSION, - JOURNAL_INCREMENT_FOR_DEDUPE, - JOURNAL_INCREMENT_FOR_WRITE, - JOURNAL_MAPPING_FOR_COMPRESSION, - JOURNAL_MAPPING_FOR_DEDUPE, - JOURNAL_MAPPING_FOR_WRITE, - JOURNAL_UNMAPPING_FOR_DEDUPE, - JOURNAL_UNMAPPING_FOR_WRITE, - PACK_COMPRESSED_BLOCK, - PUT_MAPPED_BLOCK, - PUT_MAPPED_BLOCK_FOR_DEDUPE, - READ_DATA, - UPDATE_INDEX, - VERIFY_DEDUPLICATION, - WRITE_DATA, - MAX_ASYNC_OPERATION_NUMBER, -} AsyncOperationNumber; - -/* - * An LBN lock. - */ -struct lbnLock { - /* The LBN being locked */ - LogicalBlockNumber lbn; - /* Whether the lock is locked */ - bool locked; - /* The queue of waiters for the lock */ - WaitQueue waiters; - /* The logical zone of the LBN */ - LogicalZone *zone; -}; - -/* - * Fields for using the arboreal block map. - */ -typedef struct { - /* The current height at which this DataVIO is operating */ - Height height; - /* The block map tree for this LBN */ - RootCount rootIndex; - /* Whether we hold a page lock */ - bool locked; - /* The thread on which to run the callback */ - ThreadID threadID; - /* The function to call after looking up a block map slot */ - VDOAction *callback; - /* The key for the lock map */ - uint64_t key; - /* The queue of waiters for the page this VIO is allocating or loading */ - WaitQueue waiters; - /* The block map tree slots for this LBN */ - BlockMapTreeSlot treeSlots[BLOCK_MAP_TREE_HEIGHT + 1]; -} TreeLock; - -typedef struct { - /* - * The current compression state of this VIO. This field contains a value - * which consists of a VIOCompressionState possibly ORed with a flag - * indicating that a request has been made to cancel (or prevent) compression - * for this VIO. - * - * This field should be accessed through the getCompressionState() and - * setCompressionState() methods. It should not be accessed directly. - */ - Atomic32 state; - - /* The compressed size of this block */ - uint16_t size; - - /* The packer input or output bin slot which holds the enclosing DataVIO */ - SlotNumber slot; - - /* The packer input bin to which the enclosing DataVIO has been assigned */ - InputBin *bin; - - /* A pointer to the compressed form of this block */ - char *data; - - /* - * A VIO which is blocked in the packer while holding a lock this VIO needs. - */ - DataVIO *lockHolder; - -} CompressionState; - -/** - * A VIO for processing user data requests. 
- **/ -struct dataVIO { - /* The underlying AllocatingVIO */ - AllocatingVIO allocatingVIO; - - /* The logical block of this request */ - LBNLock logical; - - /* The state for traversing the block map tree */ - TreeLock treeLock; - - /* The current partition address of this block */ - ZonedPBN mapped; - - /** The hash of this VIO (if not zero) */ - UdsChunkName chunkName; - - /* Used for logging and debugging */ - AsyncOperationNumber lastAsyncOperation; - - /* The operation to record in the recovery and slab journals */ - ReferenceOperation operation; - - /* Whether this VIO is a read-and-write VIO */ - bool isPartialWrite; - - /* Whether this VIO contains all zeros */ - bool isZeroBlock; - - /* Whether this VIO write is a duplicate */ - bool isDuplicate; - - /* - * Whether this VIO has received an allocation (needs to be atomic so it can - * be examined from threads not in the allocation zone). - */ - AtomicBool hasAllocation; - - /* The new partition address of this block after the VIO write completes */ - ZonedPBN newMapped; - - /* The hash zone responsible for the chunk name (NULL if isZeroBlock) */ - HashZone *hashZone; - - /* The lock this VIO holds or shares with other VIOs with the same data */ - HashLock *hashLock; - - /* All DataVIOs sharing a hash lock are kept in a ring linking these nodes */ - RingNode hashLockNode; - - /* The block number in the partition of the albireo deduplication advice */ - ZonedPBN duplicate; - - /* - * The sequence number of the recovery journal block containing the increment - * entry for this VIO. - */ - SequenceNumber recoverySequenceNumber; - - /* The point in the recovery journal where this write last made an entry */ - JournalPoint recoveryJournalPoint; - - /* The RingNode of VIOs in user initiated write requests */ - RingNode writeNode; - - /* A flag indicating that a data write VIO has a flush generation lock */ - bool hasFlushGenerationLock; - - /* The generation number of the VDO that this VIO belongs to */ - SequenceNumber flushGeneration; - - /* The completion to use for fetching block map pages for this vio */ - VDOPageCompletion pageCompletion; - - /* All of the fields necessary for the compression path */ - CompressionState compression; -}; - -/** - * Convert an AllocatingVIO to a DataVIO. - * - * @param allocatingVIO The AllocatingVIO to convert - * - * @return The AllocatingVIO as a DataVIO - **/ -static inline DataVIO *allocatingVIOAsDataVIO(AllocatingVIO *allocatingVIO) -{ - STATIC_ASSERT(offsetof(DataVIO, allocatingVIO) == 0); - ASSERT_LOG_ONLY((allocatingVIOAsVIO(allocatingVIO)->type == VIO_TYPE_DATA), - "AllocatingVIO is a DataVIO"); - return (DataVIO *) allocatingVIO; -} - -/** - * Convert a VIO to a DataVIO. - * - * @param vio The VIO to convert - * - * @return The VIO as a DataVIO - **/ -static inline DataVIO *vioAsDataVIO(VIO *vio) -{ - STATIC_ASSERT(offsetof(DataVIO, allocatingVIO) == 0); - STATIC_ASSERT(offsetof(AllocatingVIO, vio) == 0); - ASSERT_LOG_ONLY((vio->type == VIO_TYPE_DATA), "VIO is a DataVIO"); - return (DataVIO *) vio; -} - -/** - * Convert a DataVIO to an AllocatingVIO. - * - * @param dataVIO The DataVIO to convert - * - * @return The DataVIO as an AllocatingVIO - **/ -static inline AllocatingVIO *dataVIOAsAllocatingVIO(DataVIO *dataVIO) -{ - return &dataVIO->allocatingVIO; -} - -/** - * Convert a DataVIO to a VIO. 
- * - * @param dataVIO The DataVIO to convert - * - * @return The DataVIO as a VIO - **/ -static inline VIO *dataVIOAsVIO(DataVIO *dataVIO) -{ - return allocatingVIOAsVIO(dataVIOAsAllocatingVIO(dataVIO)); -} - -/** - * Convert a generic VDOCompletion to a DataVIO. - * - * @param completion The completion to convert - * - * @return The completion as a DataVIO - **/ -static inline DataVIO *asDataVIO(VDOCompletion *completion) -{ - return vioAsDataVIO(asVIO(completion)); -} - -/** - * Convert a DataVIO to a generic completion. - * - * @param dataVIO The DataVIO to convert - * - * @return The DataVIO as a completion - **/ -static inline VDOCompletion *dataVIOAsCompletion(DataVIO *dataVIO) -{ - return allocatingVIOAsCompletion(dataVIOAsAllocatingVIO(dataVIO)); -} - -/** - * Convert a DataVIO to a generic wait queue entry. - * - * @param dataVIO The DataVIO to convert - * - * @return The DataVIO as a wait queue entry - **/ -static inline Waiter *dataVIOAsWaiter(DataVIO *dataVIO) -{ - return allocatingVIOAsWaiter(dataVIOAsAllocatingVIO(dataVIO)); -} - -/** - * Convert a DataVIO's generic wait queue entry back to the DataVIO. - * - * @param waiter The wait queue entry to convert - * - * @return The wait queue entry as a DataVIO - **/ -static inline DataVIO *waiterAsDataVIO(Waiter *waiter) -{ - if (waiter == NULL) { - return NULL; - } - - return allocatingVIOAsDataVIO(waiterAsAllocatingVIO(waiter)); -} - -/** - * Check whether a DataVIO is a read. - * - * @param dataVIO The DataVIO to check - **/ -static inline bool isReadDataVIO(DataVIO *dataVIO) -{ - return isReadVIO(dataVIOAsVIO(dataVIO)); -} - -/** - * Check whether a DataVIO is a write. - * - * @param dataVIO The DataVIO to check - **/ -static inline bool isWriteDataVIO(DataVIO *dataVIO) -{ - return isWriteVIO(dataVIOAsVIO(dataVIO)); -} - -/** - * Check whether a DataVIO is a compressed block write. - * - * @param dataVIO The DataVIO to check - * - * @return true if the DataVIO is a compressed block write - **/ -static inline bool isCompressedWriteDataVIO(DataVIO *dataVIO) -{ - return isCompressedWriteVIO(dataVIOAsVIO(dataVIO)); -} - -/** - * Check whether a DataVIO is a trim. - * - * @param dataVIO The DataVIO to check - * - * @return true if the DataVIO is a trim - **/ -static inline bool isTrimDataVIO(DataVIO *dataVIO) -{ - return (dataVIO->newMapped.state == MAPPING_STATE_UNMAPPED); -} - -/** - * Get the location that should passed Albireo as the new advice for where to - * find the data written by this DataVIO. - * - * @param dataVIO The write DataVIO that is ready to update Albireo - * - * @return a DataLocation containing the advice to store in Albireo - **/ -static inline DataLocation getDataVIONewAdvice(const DataVIO *dataVIO) -{ - return (DataLocation) { - .pbn = dataVIO->newMapped.pbn, - .state = dataVIO->newMapped.state, - }; -} - -/** - * Get the VDO from a DataVIO. - * - * @param dataVIO The DataVIO from which to get the VDO - * - * @return The VDO to which a DataVIO belongs - **/ -static inline VDO *getVDOFromDataVIO(DataVIO *dataVIO) -{ - return dataVIOAsVIO(dataVIO)->vdo; -} - -/** - * Get the ThreadConfig from a DataVIO. - * - * @param dataVIO The DataVIO from which to get the ThreadConfig - * - * @return The ThreadConfig of the VDO to which a DataVIO belongs - **/ -static inline const ThreadConfig *getThreadConfigFromDataVIO(DataVIO *dataVIO) -{ - return getThreadConfig(getVDOFromDataVIO(dataVIO)); -} - -/** - * Get the allocation of a DataVIO. 
- * - * @param dataVIO The DataVIO - * - * @return The allocation of the DataVIO - **/ -static inline PhysicalBlockNumber getDataVIOAllocation(DataVIO *dataVIO) -{ - return dataVIOAsAllocatingVIO(dataVIO)->allocation; -} - -/** - * Check whether a DataVIO has an allocation. - * - * @param dataVIO The DataVIO to check - * - * @return true if the DataVIO has an allocated block - **/ -static inline bool hasAllocation(DataVIO *dataVIO) -{ - return (getDataVIOAllocation(dataVIO) != ZERO_BLOCK); -} - -/** - * (Re)initialize a DataVIO to have a new logical block number, keeping the - * same parent and other state. This method must be called before using a - * DataVIO. - * - * @param dataVIO The DataVIO to initialize - * @param lbn The logical block number of the DataVIO - * @param operation The operation this DataVIO will perform - * @param isTrim true if this DataVIO is for a trim request - * @param callback The function to call once the VIO has completed its - * operation - **/ -void prepareDataVIO(DataVIO *dataVIO, - LogicalBlockNumber lbn, - VIOOperation operation, - bool isTrim, - VDOAction *callback); - -/** - * Complete the processing of a DataVIO. - * - * @param completion The completion of the VIO to complete - **/ -void completeDataVIO(VDOCompletion *completion); - -/** - * Finish processing a DataVIO, possibly due to an error. This function will - * set any error, and then initiate DataVIO clean up. - * - * @param dataVIO The DataVIO to abort - * @param result The result of processing the DataVIO - **/ -void finishDataVIO(DataVIO *dataVIO, int result); - -/** - * Continue processing a DataVIO that has been waiting for an event, setting - * the result from the event and calling the current callback. - * - * @param dataVIO The DataVIO to continue - * @param result The current result (will not mask older errors) - **/ -static inline void continueDataVIO(DataVIO *dataVIO, int result) -{ - continueCompletion(dataVIOAsCompletion(dataVIO), result); -} - -/** - * Get the name of the last asynchronous operation performed on a DataVIO. - * - * @param dataVIO The DataVIO in question - * - * @return The name of the last operation performed on the DataVIO - **/ -const char *getOperationName(DataVIO *dataVIO) - __attribute__((warn_unused_result)); - -/** - * Add a trace record for the current source location. - * - * @param dataVIO The DataVIO structure to be updated - * @param location The source-location descriptor to be recorded - **/ -static inline void dataVIOAddTraceRecord(DataVIO *dataVIO, - TraceLocation location) -{ - vioAddTraceRecord(dataVIOAsVIO(dataVIO), location); -} - -/** - * Add a DataVIO to the tail end of a wait queue. The DataVIO must not already - * be waiting in a queue. A trace record is also generated for the DataVIO. - * - * @param queue The queue to which to add the waiter - * @param waiter The DataVIO to add to the queue - * @param location The source-location descriptor to be traced in the DataVIO - * - * @return VDO_SUCCESS or an error code - **/ -__attribute__((warn_unused_result)) -static inline int enqueueDataVIO(WaitQueue *queue, - DataVIO *waiter, - TraceLocation location) -{ - dataVIOAddTraceRecord(waiter, location); - return enqueueWaiter(queue, dataVIOAsWaiter(waiter)); -} - -/** - * Check that a DataVIO is running on the correct thread for its hash zone. 
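/*
 * Editor's sketch, not from the original sources: the common pattern for
 * parking a DataVIO on a wait queue with enqueueDataVIO() defined above.
 * `queue` is an assumed WaitQueue owned by some lock or resource.
 */
static void waitOnQueueSketch(WaitQueue *queue, DataVIO *dataVIO)
{
  int result = enqueueDataVIO(queue, dataVIO, THIS_LOCATION("$F"));
  if (result != VDO_SUCCESS) {
    // Failure to enqueue terminates the request with the error.
    finishDataVIO(dataVIO, result);
  }
}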
- * - * @param dataVIO The DataVIO in question - **/ -static inline void assertInHashZone(DataVIO *dataVIO) -{ - ThreadID expected = getHashZoneThreadID(dataVIO->hashZone); - ThreadID threadID = getCallbackThreadID(); - // It's odd to use the LBN, but converting the chunk name to hex is a bit - // clunky for an inline, and the LBN better than nothing as an identifier. - ASSERT_LOG_ONLY((expected == threadID), - "DataVIO for logical block %" PRIu64 - " on thread %u, should be on hash zone thread %u", - dataVIO->logical.lbn, threadID, expected); -} - -/** - * Set a callback as a hash zone operation. This function presumes that the - * hashZone field of the DataVIO has already been set. - * - * @param dataVIO The DataVIO with which to set the callback - * @param callback The callback to set - * @param location The tracing info for the call site - **/ -static inline void setHashZoneCallback(DataVIO *dataVIO, - VDOAction *callback, - TraceLocation location) -{ - setCallback(dataVIOAsCompletion(dataVIO), callback, - getHashZoneThreadID(dataVIO->hashZone)); - dataVIOAddTraceRecord(dataVIO, location); -} - -/** - * Set a callback as a hash zone operation and invoke it immediately. - * - * @param dataVIO The DataVIO with which to set the callback - * @param callback The callback to set - * @param location The tracing info for the call site - **/ -static inline void launchHashZoneCallback(DataVIO *dataVIO, - VDOAction *callback, - TraceLocation location) -{ - setHashZoneCallback(dataVIO, callback, location); - invokeCallback(dataVIOAsCompletion(dataVIO)); -} - -/** - * Check that a DataVIO is running on the correct thread for its logical zone. - * - * @param dataVIO The DataVIO in question - **/ -static inline void assertInLogicalZone(DataVIO *dataVIO) -{ - ThreadID expected = getLogicalZoneThreadID(dataVIO->logical.zone); - ThreadID threadID = getCallbackThreadID(); - ASSERT_LOG_ONLY((expected == threadID), - "DataVIO for logical block %" PRIu64 - " on thread %u, should be on thread %u", - dataVIO->logical.lbn, threadID, expected); -} - -/** - * Set a callback as a logical block operation. This function presumes that the - * logicalZone field of the DataVIO has already been set. - * - * @param dataVIO The DataVIO with which to set the callback - * @param callback The callback to set - * @param location The tracing info for the call site - **/ -static inline void setLogicalCallback(DataVIO *dataVIO, - VDOAction *callback, - TraceLocation location) -{ - setCallback(dataVIOAsCompletion(dataVIO), callback, - getLogicalZoneThreadID(dataVIO->logical.zone)); - dataVIOAddTraceRecord(dataVIO, location); -} - -/** - * Set a callback as a logical block operation and invoke it immediately. - * - * @param dataVIO The DataVIO with which to set the callback - * @param callback The callback to set - * @param location The tracing info for the call site - **/ -static inline void launchLogicalCallback(DataVIO *dataVIO, - VDOAction *callback, - TraceLocation location) -{ - setLogicalCallback(dataVIO, callback, location); - invokeCallback(dataVIOAsCompletion(dataVIO)); -} - -/** - * Check that a DataVIO is running on the correct thread for its allocated - * zone. - * - * @param dataVIO The DataVIO in question - **/ -static inline void assertInAllocatedZone(DataVIO *dataVIO) -{ - assertInPhysicalZone(dataVIOAsAllocatingVIO(dataVIO)); -} - -/** - * Set a callback as a physical block operation in a DataVIO's allocated zone. 
- * - * @param dataVIO The DataVIO - * @param callback The callback to set - * @param location The tracing info for the call site - **/ -static inline void setAllocatedZoneCallback(DataVIO *dataVIO, - VDOAction *callback, - TraceLocation location) -{ - setPhysicalZoneCallback(dataVIOAsAllocatingVIO(dataVIO), callback, - location); -} - -/** - * Set a callback as a physical block operation in a DataVIO's allocated zone - * and queue the DataVIO and invoke it immediately. - * - * @param dataVIO The DataVIO - * @param callback The callback to invoke - * @param location The tracing info for the call site - **/ -static inline void launchAllocatedZoneCallback(DataVIO *dataVIO, - VDOAction *callback, - TraceLocation location) -{ - launchPhysicalZoneCallback(dataVIOAsAllocatingVIO(dataVIO), callback, - location); -} - -/** - * Check that a DataVIO is running on the correct thread for its duplicate - * zone. - * - * @param dataVIO The DataVIO in question - **/ -static inline void assertInDuplicateZone(DataVIO *dataVIO) -{ - ThreadID expected = getPhysicalZoneThreadID(dataVIO->duplicate.zone); - ThreadID threadID = getCallbackThreadID(); - ASSERT_LOG_ONLY((expected == threadID), - "DataVIO for duplicate physical block %" PRIu64 - " on thread %u, should be on thread %u", - dataVIO->duplicate.pbn, threadID, expected); -} - -/** - * Set a callback as a physical block operation in a DataVIO's duplicate zone. - * - * @param dataVIO The DataVIO - * @param callback The callback to set - * @param location The tracing info for the call site - **/ -static inline void setDuplicateZoneCallback(DataVIO *dataVIO, - VDOAction *callback, - TraceLocation location) -{ - setCallback(dataVIOAsCompletion(dataVIO), callback, - getPhysicalZoneThreadID(dataVIO->duplicate.zone)); - dataVIOAddTraceRecord(dataVIO, location); -} - -/** - * Set a callback as a physical block operation in a DataVIO's duplicate zone - * and queue the DataVIO and invoke it immediately. - * - * @param dataVIO The DataVIO - * @param callback The callback to invoke - * @param location The tracing info for the call site - **/ -static inline void launchDuplicateZoneCallback(DataVIO *dataVIO, - VDOAction *callback, - TraceLocation location) -{ - setDuplicateZoneCallback(dataVIO, callback, location); - invokeCallback(dataVIOAsCompletion(dataVIO)); -} - -/** - * Check that a DataVIO is running on the correct thread for its mapped zone. - * - * @param dataVIO The DataVIO in question - **/ -static inline void assertInMappedZone(DataVIO *dataVIO) -{ - ThreadID expected = getPhysicalZoneThreadID(dataVIO->mapped.zone); - ThreadID threadID = getCallbackThreadID(); - ASSERT_LOG_ONLY((expected == threadID), - "DataVIO for mapped physical block %" PRIu64 - " on thread %u, should be on thread %u", - dataVIO->mapped.pbn, threadID, expected); -} - -/** - * Set a callback as a physical block operation in a DataVIO's mapped zone. - * - * @param dataVIO The DataVIO - * @param callback The callback to set - * @param location The tracing info for the call site - **/ -static inline void setMappedZoneCallback(DataVIO *dataVIO, - VDOAction *callback, - TraceLocation location) -{ - setCallback(dataVIOAsCompletion(dataVIO), callback, - getPhysicalZoneThreadID(dataVIO->mapped.zone)); - dataVIOAddTraceRecord(dataVIO, location); -} - -/** - * Check that a DataVIO is running on the correct thread for its newMapped - * zone. 
- * - * @param dataVIO The DataVIO in question - **/ -static inline void assertInNewMappedZone(DataVIO *dataVIO) -{ - ThreadID expected = getPhysicalZoneThreadID(dataVIO->newMapped.zone); - ThreadID threadID = getCallbackThreadID(); - ASSERT_LOG_ONLY((expected == threadID), - "DataVIO for newMapped physical block %" PRIu64 - " on thread %u, should be on thread %u", - dataVIO->newMapped.pbn, threadID, expected); -} - -/** - * Set a callback as a physical block operation in a DataVIO's newMapped zone. - * - * @param dataVIO The DataVIO - * @param callback The callback to set - * @param location The tracing info for the call site - **/ -static inline void setNewMappedZoneCallback(DataVIO *dataVIO, - VDOAction *callback, - TraceLocation location) -{ - setCallback(dataVIOAsCompletion(dataVIO), callback, - getPhysicalZoneThreadID(dataVIO->newMapped.zone)); - dataVIOAddTraceRecord(dataVIO, location); -} - -/** - * Set a callback as a physical block operation in a DataVIO's newMapped zone - * and queue the DataVIO and invoke it immediately. - * - * @param dataVIO The DataVIO - * @param callback The callback to invoke - * @param location The tracing info for the call site - **/ -static inline void launchNewMappedZoneCallback(DataVIO *dataVIO, - VDOAction *callback, - TraceLocation location) -{ - setNewMappedZoneCallback(dataVIO, callback, location); - invokeCallback(dataVIOAsCompletion(dataVIO)); -} - -/** - * Check that a DataVIO is running on the journal thread. - * - * @param dataVIO The DataVIO in question - **/ -static inline void assertInJournalZone(DataVIO *dataVIO) -{ - ThreadID expected - = getJournalZoneThread(getThreadConfigFromDataVIO(dataVIO)); - ThreadID threadID = getCallbackThreadID(); - ASSERT_LOG_ONLY((expected == threadID), - "DataVIO for logical block %" PRIu64 - " on thread %u, should be on journal thread %u", - dataVIO->logical.lbn, threadID, expected); -} - -/** - * Set a callback as a journal operation. - * - * @param dataVIO The DataVIO with which to set the callback - * @param callback The callback to set - * @param location The tracing info for the call site - **/ -static inline void setJournalCallback(DataVIO *dataVIO, - VDOAction *callback, - TraceLocation location) -{ - setCallback(dataVIOAsCompletion(dataVIO), callback, - getJournalZoneThread(getThreadConfigFromDataVIO(dataVIO))); - dataVIOAddTraceRecord(dataVIO, location); -} - -/** - * Set a callback as a journal operation and invoke it immediately. - * - * @param dataVIO The DataVIO with which to set the callback - * @param callback The callback to set - * @param location The tracing info for the call site - **/ -static inline void launchJournalCallback(DataVIO *dataVIO, - VDOAction *callback, - TraceLocation location) -{ - setJournalCallback(dataVIO, callback, location); - invokeCallback(dataVIOAsCompletion(dataVIO)); -} - -/** - * Check that a DataVIO is running on the packer thread - * - * @param dataVIO The DataVIO in question - **/ -static inline void assertInPackerZone(DataVIO *dataVIO) -{ - ThreadID expected = getPackerZoneThread(getThreadConfigFromDataVIO(dataVIO)); - ThreadID threadID = getCallbackThreadID(); - ASSERT_LOG_ONLY((expected == threadID), - "DataVIO for logical block %" PRIu64 - " on thread %u, should be on packer thread %u", - dataVIO->logical.lbn, threadID, expected); -} - -/** - * Set a callback as a packer operation. 
- * - * @param dataVIO The DataVIO with which to set the callback - * @param callback The callback to set - * @param location The tracing info for the call site - **/ -static inline void setPackerCallback(DataVIO *dataVIO, - VDOAction *callback, - TraceLocation location) -{ - setCallback(dataVIOAsCompletion(dataVIO), callback, - getPackerZoneThread(getThreadConfigFromDataVIO(dataVIO))); - dataVIOAddTraceRecord(dataVIO, location); -} - -/** - * Set a callback as a packer operation and invoke it immediately. - * - * @param dataVIO The DataVIO with which to set the callback - * @param callback The callback to set - * @param location The tracing info for the call site - **/ -static inline void launchPackerCallback(DataVIO *dataVIO, - VDOAction *callback, - TraceLocation location) -{ - setPackerCallback(dataVIO, callback, location); - invokeCallback(dataVIOAsCompletion(dataVIO)); -} - -/** - * Check whether the advice received from Albireo is a valid data location, - * and if it is, accept it as the location of a potential duplicate of the - * DataVIO. - * - * @param dataVIO The DataVIO that queried Albireo - * @param advice A potential location of the data, or NULL for no advice - **/ -void receiveDedupeAdvice(DataVIO *dataVIO, const DataLocation *advice); - -/** - * Set the location of the duplicate block for a DataVIO, updating the - * isDuplicate and duplicate fields from a ZonedPBN. - * - * @param dataVIO The DataVIO to modify - * @param source The location of the duplicate - **/ -void setDuplicateLocation(DataVIO *dataVIO, const ZonedPBN source); - -/** - * Clear a DataVIO's mapped block location, setting it to be unmapped. This - * indicates the block map entry for the logical block is either unmapped or - * corrupted. - * - * @param dataVIO The DataVIO whose mapped block location is to be reset - **/ -void clearMappedLocation(DataVIO *dataVIO); - -/** - * Set a DataVIO's mapped field to the physical location recorded in the block - * map for the logical block in the VIO. - * - * @param dataVIO The DataVIO whose field is to be set - * @param pbn The physical block number to set - * @param state The mapping state to set - * - * @return VDO_SUCCESS or an error code if the mapping is unusable - **/ -int setMappedLocation(DataVIO *dataVIO, - PhysicalBlockNumber pbn, - BlockMappingState state) - __attribute__((warn_unused_result)); - -/** - * Attempt to acquire the lock on a logical block. This is the start of the - * path for all external requests. It is registered in prepareDataVIO(). - * - * @param completion The DataVIO for an external data request as a completion - **/ -void attemptLogicalBlockLock(VDOCompletion *completion); - -/** - * Release the lock on the logical block, if any, that a DataVIO has acquired. - * - * @param dataVIO The DataVIO releasing its logical block lock - **/ -void releaseLogicalBlockLock(DataVIO *dataVIO); - -#endif // DATA_VIO_H diff --git a/vdo/base/dirtyLists.c b/vdo/base/dirtyLists.c deleted file mode 100644 index d16b790..0000000 --- a/vdo/base/dirtyLists.c +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/dirtyLists.c#1 $ - */ - -#include "dirtyLists.h" -#include "dirtyListsInternals.h" - -#include "logger.h" -#include "memoryAlloc.h" - -#include "types.h" - -struct dirtyLists { - /** The number of periods after which an element will be expired */ - BlockCount maximumAge; - /** The oldest period which has unexpired elements */ - SequenceNumber oldestPeriod; - /** One more than the current period */ - SequenceNumber nextPeriod; - /** The function to call on expired elements */ - DirtyCallback *callback; - /** The callback context */ - void *context; - /** The offset in the array of lists of the oldest period */ - BlockCount offset; - /** The list of elements which are being expired */ - RingNode expired; - /** The lists of dirty elements */ - RingNode lists[]; -}; - -/**********************************************************************/ -int makeDirtyLists(BlockCount maximumAge, - DirtyCallback *callback, - void *context, - DirtyLists **dirtyListsPtr) -{ - DirtyLists *dirtyLists; - int result = ALLOCATE_EXTENDED(DirtyLists, maximumAge, RingNode, __func__, - &dirtyLists); - if (result != VDO_SUCCESS) { - return result; - } - - dirtyLists->maximumAge = maximumAge; - dirtyLists->callback = callback; - dirtyLists->context = context; - - initializeRing(&dirtyLists->expired); - for (BlockCount i = 0; i < maximumAge; i++) { - initializeRing(&dirtyLists->lists[i]); - } - - *dirtyListsPtr = dirtyLists; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freeDirtyLists(DirtyLists **dirtyListsPtr) -{ - DirtyLists *lists = *dirtyListsPtr; - if (lists == NULL) { - return; - } - - FREE(lists); - *dirtyListsPtr = NULL; -} - -/**********************************************************************/ -void setCurrentPeriod(DirtyLists *dirtyLists, SequenceNumber period) -{ - ASSERT_LOG_ONLY(dirtyLists->nextPeriod == 0, "current period not set"); - dirtyLists->oldestPeriod = period; - dirtyLists->nextPeriod = period + 1; - dirtyLists->offset = period % dirtyLists->maximumAge; -} - -/** - * Expire the oldest list. - * - * @param dirtyLists The DirtyLists to expire - **/ -static void expireOldestList(DirtyLists *dirtyLists) -{ - dirtyLists->oldestPeriod++; - RingNode *ring = &(dirtyLists->lists[dirtyLists->offset++]); - if (!isRingEmpty(ring)) { - spliceRingChainBefore(ring->next, ring->prev, &dirtyLists->expired); - } - - if (dirtyLists->offset == dirtyLists->maximumAge) { - dirtyLists->offset = 0; - } -} - -/** - * Update the period if necessary. - * - * @param dirtyLists The DirtyLists - * @param period The new period - **/ -static void updatePeriod(DirtyLists *dirtyLists, SequenceNumber period) -{ - while (dirtyLists->nextPeriod <= period) { - if ((dirtyLists->nextPeriod - dirtyLists->oldestPeriod) - == dirtyLists->maximumAge) { - expireOldestList(dirtyLists); - } - dirtyLists->nextPeriod++; - } -} - -/** - * Write out the expired list. 
- *
- * @param dirtyLists  The dirtyLists
- **/
-static void writeExpiredElements(DirtyLists *dirtyLists)
-{
-  if (isRingEmpty(&dirtyLists->expired)) {
-    return;
-  }
-
-  dirtyLists->callback(&dirtyLists->expired, dirtyLists->context);
-  ASSERT_LOG_ONLY(isRingEmpty(&dirtyLists->expired),
-                  "no expired elements remain");
-}
-
-/**********************************************************************/
-void addToDirtyLists(DirtyLists *dirtyLists,
-                     RingNode *node,
-                     SequenceNumber oldPeriod,
-                     SequenceNumber newPeriod)
-{
-  if ((oldPeriod == newPeriod)
-      || ((oldPeriod != 0) && (oldPeriod < newPeriod))) {
-    return;
-  }
-
-  if (newPeriod < dirtyLists->oldestPeriod) {
-    pushRingNode(&dirtyLists->expired, node);
-  } else {
-    updatePeriod(dirtyLists, newPeriod);
-    pushRingNode(&dirtyLists->lists[newPeriod % dirtyLists->maximumAge], node);
-  }
-
-  writeExpiredElements(dirtyLists);
-}
-
-/**********************************************************************/
-void advancePeriod(DirtyLists *dirtyLists, SequenceNumber period)
-{
-  updatePeriod(dirtyLists, period);
-  writeExpiredElements(dirtyLists);
-}
-
-/**********************************************************************/
-void flushDirtyLists(DirtyLists *dirtyLists)
-{
-  while (dirtyLists->oldestPeriod < dirtyLists->nextPeriod) {
-    expireOldestList(dirtyLists);
-  }
-  writeExpiredElements(dirtyLists);
-}
-
-/**********************************************************************/
-SequenceNumber getDirtyListsNextPeriod(DirtyLists *dirtyLists)
-{
-  return dirtyLists->nextPeriod;
-}
diff --git a/vdo/base/dirtyLists.h b/vdo/base/dirtyLists.h
deleted file mode 100644
index f3d27f7..0000000
--- a/vdo/base/dirtyLists.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2020 Red Hat, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- *
- * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/dirtyLists.h#1 $
- */
-
-#ifndef DIRTY_LISTS_H
-#define DIRTY_LISTS_H
-
-#include "ringNode.h"
-#include "types.h"
-
-/**
- * A collection of lists of dirty elements ordered by age. An element is always
- * placed on the oldest list in which it was dirtied (moving between lists or
- * removing altogether is cheap). Whenever the current period is advanced, any
- * elements older than the maximum age are expired. If an element is to be added
- * with a dirty age older than the maximum age, it is expired immediately.
- **/
-typedef struct dirtyLists DirtyLists;
-
-/**
- * A function which will be called with a ring of dirty elements which have
- * been expired. All of the expired elements must be removed from the ring
- * before this function returns.
- *
- * @param expired  The list of expired elements
- * @param context  The context for the callback
- **/
-typedef void DirtyCallback(RingNode *expired, void *context);
-
-/**
- * Construct a new set of dirty lists.
- * - * @param [in] maximumAge The age at which an element will be expired - * @param [in] callback The function to call when a set of elements have - * expired - * @param [in] context The context for the callback - * @param [out] dirtyListsPtr A pointer to hold the new DirtyLists - * - * @return VDO_SUCCESS or an error - **/ -int makeDirtyLists(BlockCount maximumAge, - DirtyCallback *callback, - void *context, - DirtyLists **dirtyListsPtr) - __attribute__((warn_unused_result)); - -/** - * Free a set of dirty lists and null out the pointer to them. - * - * @param dirtyListsPtr A pointer to the dirty lists to be freed - **/ -void freeDirtyLists(DirtyLists **dirtyListsPtr); - -/** - * Set the current period. This function should only be called once. - * - * @param dirtyLists The dirtyLists - * @param period The current period - **/ -void setCurrentPeriod(DirtyLists *dirtyLists, SequenceNumber period); - -/** - * Add an element to the dirty lists. - * - * @param dirtyLists The DirtyLists receiving the element - * @param node The RingNode of the element to add - * @param oldPeriod The period in which the element was previous dirtied, - * or 0 if it was not dirty - * @param newPeriod The period in which the element has now been dirtied, - * or 0 if it does not hold a lock - **/ -void addToDirtyLists(DirtyLists *dirtyLists, - RingNode *node, - SequenceNumber oldPeriod, - SequenceNumber newPeriod); - -/** - * Advance the current period. If the current period is greater than the number - * of lists, expire the oldest lists. - * - * @param dirtyLists The DirtyLists to advance - * @param period The new current period - **/ -void advancePeriod(DirtyLists *dirtyLists, SequenceNumber period); - -/** - * Flush all dirty lists. This will cause the period to be advanced past the - * current period. - * - * @param dirtyLists The dirtyLists to flush - **/ -void flushDirtyLists(DirtyLists *dirtyLists); - -#endif // DIRTY_LISTS_H diff --git a/vdo/base/dirtyListsInternals.h b/vdo/base/dirtyListsInternals.h deleted file mode 100644 index d5876d0..0000000 --- a/vdo/base/dirtyListsInternals.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/dirtyListsInternals.h#1 $ - */ - -#ifndef DIRTY_LISTS_INTERNALS_H -#define DIRTY_LISTS_INTERNALS_H - -#include "dirtyLists.h" -#include "types.h" - -/** - * Get the next period from a DirtyLists. This method is used by unit tests. 
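
Taken together, the dirtyLists.h interface above expects the caller to own both the period numbering and the expiry callback. A rough usage sketch (the caller, its Element type with an embedded RingNode, and the writeOutAndRemove() helper are assumptions, not part of the original sources):

static void writeOutExpired(RingNode *expired, void *context)
{
  // The contract requires emptying the ring before returning.
  while (!isRingEmpty(expired)) {
    writeOutAndRemove(expired->next, context);   // hypothetical; must unlink the node
  }
}

static int trackOneElement(SequenceNumber head, Element *element, void *layer)
{
  DirtyLists *dirtyLists;
  int result = makeDirtyLists(16, writeOutExpired, layer, &dirtyLists);
  if (result != VDO_SUCCESS) {
    return result;
  }

  setCurrentPeriod(dirtyLists, head);                          // called exactly once
  addToDirtyLists(dirtyLists, &element->ringNode, 0, head);    // newly dirtied
  advancePeriod(dirtyLists, head + 16);   // element is now 16 periods old: expired
  flushDirtyLists(dirtyLists);            // expire anything still outstanding
  freeDirtyLists(&dirtyLists);
  return VDO_SUCCESS;
}
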
- * - * @param dirtyLists The DirtyLists to examine - **/ -SequenceNumber getDirtyListsNextPeriod(DirtyLists *dirtyLists) - __attribute__((warn_unused_result)); - -#endif // DIRTY_LISTS_INTERNALS_H diff --git a/vdo/base/extent.c b/vdo/base/extent.c deleted file mode 100644 index 5983615..0000000 --- a/vdo/base/extent.c +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/extent.c#3 $ - */ - -#include "extent.h" - -#include "memoryAlloc.h" - -#include "completion.h" -#include "constants.h" -#include "logger.h" -#include "physicalLayer.h" -#include "types.h" -#include "vdo.h" -#include "vioRead.h" -#include "vioWrite.h" - -/**********************************************************************/ -int createExtent(PhysicalLayer *layer, - VIOType vioType, - VIOPriority priority, - BlockCount blockCount, - char *data, - VDOExtent **extentPtr) -{ - int result = ASSERT(isMetadataVIOType(vioType), - "createExtent() called for metadata"); - if (result != VDO_SUCCESS) { - return result; - } - - VDOExtent *extent; - result = ALLOCATE_EXTENDED(VDOExtent, blockCount, VIO *, __func__, &extent); - if (result != VDO_SUCCESS) { - return result; - } - - result = initializeEnqueueableCompletion(&extent->completion, - VDO_EXTENT_COMPLETION, layer); - if (result != VDO_SUCCESS) { - FREE(extent); - return result; - } - - for (; extent->count < blockCount; extent->count++) { - result = layer->createMetadataVIO(layer, vioType, priority, extent, data, - &extent->vios[extent->count]); - if (result != VDO_SUCCESS) { - freeExtent(&extent); - return result; - } - - data += VDO_BLOCK_SIZE; - } - - *extentPtr = extent; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freeExtent(VDOExtent **extentPtr) -{ - VDOExtent *extent = *extentPtr; - if (extent == NULL) { - return; - } - - for (BlockCount i = 0; i < extent->count; i++) { - freeVIO(&extent->vios[i]); - } - - destroyEnqueueable(&extent->completion); - FREE(extent); - *extentPtr = NULL; -} - -/** - * Launch a metadata extent. 
- * - * @param extent The extent - * @param startBlock The absolute physical block at which the extent should - * begin its I/O - * @param count The number of blocks to write - * @param operation The operation to perform on the extent - **/ -static void launchMetadataExtent(VDOExtent *extent, - PhysicalBlockNumber startBlock, - BlockCount count, - VIOOperation operation) -{ - resetCompletion(&extent->completion); - if (count > extent->count) { - finishCompletion(&extent->completion, VDO_OUT_OF_RANGE); - return; - } - - extent->completeCount = extent->count - count; - for (BlockCount i = 0; i < count; i++) { - VIO *vio = extent->vios[i]; - vio->completion.callbackThreadID = extent->completion.callbackThreadID; - launchMetadataVIO(vio, startBlock++, handleVIOCompletion, - handleVIOCompletion, operation); - } -} - -/**********************************************************************/ -void readPartialMetadataExtent(VDOExtent *extent, - PhysicalBlockNumber startBlock, - BlockCount count) -{ - launchMetadataExtent(extent, startBlock, count, VIO_READ); -} - -/**********************************************************************/ -void writePartialMetadataExtent(VDOExtent *extent, - PhysicalBlockNumber startBlock, - BlockCount count) -{ - launchMetadataExtent(extent, startBlock, count, VIO_WRITE); -} - -/**********************************************************************/ -void handleVIOCompletion(VDOCompletion *completion) -{ - VDOExtent *extent = asVDOExtent(completion->parent); - if (++extent->completeCount != extent->count) { - setCompletionResult(extentAsCompletion(extent), completion->result); - return; - } - - finishCompletion(extentAsCompletion(extent), completion->result); -} diff --git a/vdo/base/extent.h b/vdo/base/extent.h deleted file mode 100644 index b023c06..0000000 --- a/vdo/base/extent.h +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/extent.h#2 $ - */ - -#ifndef EXTENT_H -#define EXTENT_H - -#include "permassert.h" - -#include "completion.h" -#include "types.h" -#include "vio.h" - -/** - * A chain of VIOs which are part of the same request. An extent contains - * a chain of at least 'count' VIOs. The 'next' pointer of the last VIO - * in the extent (as indicated by the count) may not be NULL, but it is not - * part of the extent. A VIO may belong to a single extent. - **/ -struct vdoExtent { - // The completion for asynchronous extent processing - VDOCompletion completion; - // The number of VIOs in the extent - BlockCount count; - // The number of completed VIOs in the extent - BlockCount completeCount; - // The VIOs in the extent - VIO *vios[]; -}; - -/** - * Convert a generic VDOCompletion to a VDOExtent. 
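
Before the declarations that follow, it may help to see the intended call pattern: create an extent over a block-aligned buffer, point its completion at whoever is waiting, and launch it; handleVIOCompletion() then finishes the extent when its last VIO completes. A sketch under assumptions (VIO_TYPE_BLOCK_MAP, VIO_PRIORITY_METADATA, and prepareCompletion() are taken to exist elsewhere in the code base and may differ):

static void finishReadExtent(VDOCompletion *completion)
{
  // Free the extent and pass its result on to the waiting parent.
  VDOExtent     *extent = asVDOExtent(completion);
  VDOCompletion *parent = completion->parent;
  int            result = completion->result;
  freeExtent(&extent);
  finishCompletion(parent, result);
}

static int launchBlockMapRead(PhysicalLayer       *layer,
                              char                *buffer,
                              BlockCount           count,
                              PhysicalBlockNumber  startBlock,
                              VDOCompletion       *parent)
{
  VDOExtent *extent;
  int result = createExtent(layer, VIO_TYPE_BLOCK_MAP, VIO_PRIORITY_METADATA,
                            count, buffer, &extent);
  if (result != VDO_SUCCESS) {
    return result;
  }

  // Finish the parent (on its own thread) once the whole extent is read.
  prepareCompletion(&extent->completion, finishReadExtent, finishReadExtent,
                    parent->callbackThreadID, parent);
  readMetadataExtent(extent, startBlock);
  return VDO_SUCCESS;
}
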
- *
- * @param completion  The completion to convert
- *
- * @return The completion as an extent
- **/
-static inline VDOExtent *asVDOExtent(VDOCompletion *completion)
-{
-  STATIC_ASSERT(offsetof(VDOExtent, completion) == 0);
-  assertCompletionType(completion->type, VDO_EXTENT_COMPLETION);
-  return (VDOExtent *) completion;
-}
-
-/**
- * Convert a VDOExtent to VDOCompletion.
- *
- * @param extent  The extent to convert
- *
- * @return The extent as a VDOCompletion
- **/
-static inline VDOCompletion *extentAsCompletion(VDOExtent *extent)
-{
-  return &extent->completion;
-}
-
-/**
- * Create a VDOExtent.
- *
- * @param [in]  layer       The layer
- * @param [in]  vioType     The usage type to assign to the VIOs in the extent
- *                          (data / block map / journal)
- * @param [in]  priority    The relative priority to assign to the VIOs
- * @param [in]  blockCount  The number of blocks in the buffer
- * @param [in]  data        The buffer
- * @param [out] extentPtr   A pointer to hold the new extent
- *
- * @return VDO_SUCCESS or an error
- **/
-int createExtent(PhysicalLayer *layer,
-                 VIOType vioType,
-                 VIOPriority priority,
-                 BlockCount blockCount,
-                 char *data,
-                 VDOExtent **extentPtr)
-  __attribute__((warn_unused_result));
-
-/**
- * Free an extent and null out the reference to it.
- *
- * @param [in,out] extentPtr  The reference to the extent to free
- **/
-void freeExtent(VDOExtent **extentPtr);
-
-/**
- * Read metadata from the underlying storage.
- *
- * @param extent      The extent to read
- * @param startBlock  The physical block number of the first block
- *                    in the extent
- * @param count       The number of blocks to read (must be less than or
- *                    equal to the length of the extent)
- **/
-void readPartialMetadataExtent(VDOExtent *extent,
-                               PhysicalBlockNumber startBlock,
-                               BlockCount count);
-
-/**
- * Read metadata from the underlying storage.
- *
- * @param extent      The extent to read
- * @param startBlock  The physical block number of the first block
- *                    in the extent
- **/
-static inline void readMetadataExtent(VDOExtent *extent,
-                                      PhysicalBlockNumber startBlock)
-{
-  readPartialMetadataExtent(extent, startBlock, extent->count);
-}
-
-/**
- * Write metadata to the underlying storage.
- *
- * @param extent      The extent to write
- * @param startBlock  The physical block number of the first block in the
- *                    extent
- * @param count       The number of blocks to write (must be less than or
- *                    equal to the length of the extent)
- **/
-void writePartialMetadataExtent(VDOExtent *extent,
-                                PhysicalBlockNumber startBlock,
-                                BlockCount count);
-
-/**
- * Write metadata to the underlying storage.
- *
- * @param extent      The extent to write
- * @param startBlock  The physical block number of the first block in the
- *                    extent
- **/
-static inline void writeMetadataExtent(VDOExtent *extent,
-                                       PhysicalBlockNumber startBlock)
-{
-  writePartialMetadataExtent(extent, startBlock, extent->count);
-}
-
-/**
- * Notify an extent that one of its VIOs has completed. If the signaling VIO
- * is the last of the extent's VIOs to complete, the extent will finish. This
- * function is set as the VIO callback in completeVIO().
- *
- * @param completion  The completion of the VIO which has just finished
- **/
-void handleVIOCompletion(VDOCompletion *completion);
-
-#endif /* EXTENT_H */
diff --git a/vdo/base/fixedLayout.c b/vdo/base/fixedLayout.c
deleted file mode 100644
index 4ea048a..0000000
--- a/vdo/base/fixedLayout.c
+++ /dev/null
@@ -1,534 +0,0 @@
-/*
- * Copyright (c) 2020 Red Hat, Inc.
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/fixedLayout.c#3 $ - */ - -#include "fixedLayout.h" - -#include "buffer.h" -#include "logger.h" -#include "memoryAlloc.h" - -#include "header.h" -#include "statusCodes.h" - -const BlockCount ALL_FREE_BLOCKS = (uint64_t) -1; - -struct fixedLayout { - PhysicalBlockNumber firstFree; - PhysicalBlockNumber lastFree; - size_t numPartitions; - Partition *head; -}; - -struct partition { - PartitionID id; // The id of this partition - FixedLayout *layout; // The layout to which this partition belongs - PhysicalBlockNumber offset; // The offset into the layout of this partition - PhysicalBlockNumber base; // The untranslated number of the first block - BlockCount count; // The number of blocks in the partition - Partition *next; // A pointer to the next partition in the layout -}; - -typedef struct { - PhysicalBlockNumber firstFree; - PhysicalBlockNumber lastFree; - byte partitionCount; -} __attribute__((packed)) Layout3_0; - -typedef struct { - PartitionID id; - PhysicalBlockNumber offset; - PhysicalBlockNumber base; - BlockCount count; -} __attribute__((packed)) Partition3_0; - -static const Header LAYOUT_HEADER_3_0 = { - .id = FIXED_LAYOUT, - .version = { - .majorVersion = 3, - .minorVersion = 0, - }, - .size = sizeof(Layout3_0), // Minimum size (contains no partitions) -}; - -/**********************************************************************/ -int makeFixedLayout(BlockCount totalBlocks, - PhysicalBlockNumber startOffset, - FixedLayout **layoutPtr) -{ - FixedLayout *layout; - int result = ALLOCATE(1, FixedLayout, "fixed layout", &layout); - if (result != UDS_SUCCESS) { - return result; - } - - layout->firstFree = startOffset; - layout->lastFree = startOffset + totalBlocks; - layout->numPartitions = 0; - layout->head = NULL; - - *layoutPtr = layout; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freeFixedLayout(FixedLayout **layoutPtr) -{ - FixedLayout *layout = *layoutPtr; - if (layout == NULL) { - return; - } - - while (layout->head != NULL) { - Partition *part = layout->head; - layout->head = part->next; - FREE(part); - } - - FREE(layout); - *layoutPtr = NULL; -} - -/**********************************************************************/ -BlockCount getTotalFixedLayoutSize(const FixedLayout *layout) -{ - BlockCount size = getFixedLayoutBlocksAvailable(layout); - for (Partition *partition = layout->head; partition != NULL; - partition = partition->next) { - size += partition->count; - } - - return size; -} - -/**********************************************************************/ -int getPartition(FixedLayout *layout, PartitionID id, Partition **partitionPtr) -{ - for (Partition *partition = layout->head; partition != NULL; - partition = 
partition->next) { - if (partition->id == id) { - if (partitionPtr != NULL) { - *partitionPtr = partition; - } - return VDO_SUCCESS; - } - } - - return VDO_UNKNOWN_PARTITION; -} - -/**********************************************************************/ -int translateToPBN(const Partition *partition, - PhysicalBlockNumber partitionBlockNumber, - PhysicalBlockNumber *layerBlockNumber) -{ - if (partition == NULL) { - *layerBlockNumber = partitionBlockNumber; - return VDO_SUCCESS; - } - - if (partitionBlockNumber < partition->base) { - return VDO_OUT_OF_RANGE; - } - - PhysicalBlockNumber offsetFromBase = partitionBlockNumber - partition->base; - if (offsetFromBase >= partition->count) { - return VDO_OUT_OF_RANGE; - } - - *layerBlockNumber = partition->offset + offsetFromBase; - return VDO_SUCCESS; -} - -/**********************************************************************/ -int translateFromPBN(const Partition *partition, - PhysicalBlockNumber layerBlockNumber, - PhysicalBlockNumber *partitionBlockNumberPtr) -{ - if (partition == NULL) { - *partitionBlockNumberPtr = layerBlockNumber; - return VDO_SUCCESS; - } - - if (layerBlockNumber < partition->offset) { - return VDO_OUT_OF_RANGE; - } - - PhysicalBlockNumber partitionBlockNumber - = layerBlockNumber - partition->offset; - if (partitionBlockNumber >= partition->count) { - return VDO_OUT_OF_RANGE; - } - - *partitionBlockNumberPtr = partitionBlockNumber + partition->base; - return VDO_SUCCESS; -} - -/**********************************************************************/ -BlockCount getFixedLayoutBlocksAvailable(const FixedLayout *layout) -{ - return layout->lastFree - layout->firstFree; -} - -/** - * Allocate a partition. The partition will be attached to the partition - * list in the layout. - * - * @param layout The layout containing the partition - * @param id The id of the partition - * @param offset The offset into the layout at which the partition begins - * @param base The number of the first block for users of the partition - * @param blockCount The number of blocks in the partition - * - * @return VDO_SUCCESS or an error - **/ -static int allocatePartition(FixedLayout *layout, - byte id, - PhysicalBlockNumber offset, - PhysicalBlockNumber base, - BlockCount blockCount) -{ - Partition *partition; - int result = ALLOCATE(1, Partition, "fixed layout partition", &partition); - if (result != UDS_SUCCESS) { - return result; - } - - partition->id = id; - partition->layout = layout; - partition->offset = offset; - partition->base = base; - partition->count = blockCount; - partition->next = layout->head; - layout->head = partition; - - return VDO_SUCCESS; -} - -/**********************************************************************/ -int makeFixedLayoutPartition(FixedLayout *layout, - PartitionID id, - BlockCount blockCount, - PartitionDirection direction, - PhysicalBlockNumber base) -{ - BlockCount freeBlocks = layout->lastFree - layout->firstFree; - if (blockCount == ALL_FREE_BLOCKS) { - if (freeBlocks == 0) { - return VDO_NO_SPACE; - } else { - blockCount = freeBlocks; - } - } else if (blockCount > freeBlocks) { - return VDO_NO_SPACE; - } - - int result = getPartition(layout, id, NULL); - if (result != VDO_UNKNOWN_PARTITION) { - return VDO_PARTITION_EXISTS; - } - - PhysicalBlockNumber offset = ((direction == FROM_END) - ? 
(layout->lastFree - blockCount) - : layout->firstFree); - result = allocatePartition(layout, id, offset, base, blockCount); - if (result != VDO_SUCCESS) { - return result; - } - - layout->numPartitions++; - if (direction == FROM_END) { - layout->lastFree = layout->lastFree - blockCount; - } else { - layout->firstFree += blockCount; - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -BlockCount getFixedLayoutPartitionSize(const Partition *partition) -{ - return partition->count; -} - -/**********************************************************************/ -PhysicalBlockNumber getFixedLayoutPartitionOffset(const Partition *partition) -{ - return partition->offset; -} - -/**********************************************************************/ -PhysicalBlockNumber getFixedLayoutPartitionBase(const Partition *partition) -{ - return partition->base; -} - -/**********************************************************************/ -static inline size_t getEncodedSize(const FixedLayout *layout) -{ - return sizeof(Layout3_0) + (sizeof(Partition3_0) * layout->numPartitions); -} - -/**********************************************************************/ -size_t getFixedLayoutEncodedSize(const FixedLayout *layout) -{ - return ENCODED_HEADER_SIZE + getEncodedSize(layout); -} - -/** - * Encode a null-terminated list of fixed layout partitions into a buffer - * using partition format 3.0. - * - * @param layout The layout containing the list of partitions to encode - * @param buffer A buffer positioned at the start of the encoding - * - * @return UDS_SUCCESS or an error code - **/ -static int encodePartitions_3_0(const FixedLayout *layout, Buffer *buffer) -{ - for (const Partition *partition = layout->head; - partition != NULL; - partition = partition->next) { - STATIC_ASSERT_SIZEOF(PartitionID, sizeof(byte)); - int result = putByte(buffer, partition->id); - if (result != UDS_SUCCESS) { - return result; - } - - result = putUInt64LEIntoBuffer(buffer, partition->offset); - if (result != UDS_SUCCESS) { - return result; - } - - result = putUInt64LEIntoBuffer(buffer, partition->base); - if (result != UDS_SUCCESS) { - return result; - } - - result = putUInt64LEIntoBuffer(buffer, partition->count); - if (result != UDS_SUCCESS) { - return result; - } - } - - return UDS_SUCCESS; -} - -/** - * Encode the header fields of a fixed layout into a buffer using layout - * format 3.0. 
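
For a sense of scale, the packed structures above make the encoding size easy to compute by hand (an illustrative calculation, not from the original sources):

/*
 * PhysicalBlockNumber and BlockCount are 64-bit values and PartitionID is one
 * byte, so with the packed layouts above:
 *
 *   sizeof(Layout3_0)    == 8 + 8 + 1     == 17 bytes
 *   sizeof(Partition3_0) == 1 + 8 + 8 + 8 == 25 bytes
 *
 * A layout with four partitions therefore encodes to
 * ENCODED_HEADER_SIZE + 17 + (4 * 25) bytes.
 */
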
- * - * @param layout The layout to encode - * @param buffer A buffer positioned at the start of the encoding - * - * @return UDS_SUCCESS or an error code - **/ -static int encodeLayout_3_0(const FixedLayout *layout, Buffer *buffer) -{ - int result = ASSERT(layout->numPartitions <= UINT8_MAX, - "fixed layout partition count must fit in a byte"); - if (result != UDS_SUCCESS) { - return result; - } - - result = putUInt64LEIntoBuffer(buffer, layout->firstFree); - if (result != UDS_SUCCESS) { - return result; - } - - result = putUInt64LEIntoBuffer(buffer, layout->lastFree); - if (result != UDS_SUCCESS) { - return result; - } - - return putByte(buffer, layout->numPartitions); -} - -/**********************************************************************/ -int encodeFixedLayout(const FixedLayout *layout, Buffer *buffer) -{ - if (!ensureAvailableSpace(buffer, getFixedLayoutEncodedSize(layout))) { - return UDS_BUFFER_ERROR; - } - - Header header = LAYOUT_HEADER_3_0; - header.size = getEncodedSize(layout); - int result = encodeHeader(&header, buffer); - if (result != UDS_SUCCESS) { - return result; - } - - size_t initialLength = contentLength(buffer); - - result = encodeLayout_3_0(layout, buffer); - if (result != UDS_SUCCESS) { - return result; - } - - size_t encodedSize = contentLength(buffer) - initialLength; - result = ASSERT(encodedSize == sizeof(Layout3_0), - "encoded size of fixed layout header must match structure"); - if (result != UDS_SUCCESS) { - return result; - } - - result = encodePartitions_3_0(layout, buffer); - if (result != UDS_SUCCESS) { - return result; - } - - encodedSize = contentLength(buffer) - initialLength; - return ASSERT(encodedSize == header.size, - "encoded size of fixed layout must match header size"); -} - -/** - * Decode a sequence of fixed layout partitions from a buffer - * using partition format 3.0. - * - * @param buffer A buffer positioned at the start of the encoding - * @param layout The layout in which to allocate the decoded partitions - * - * @return UDS_SUCCESS or an error code - **/ -static int decodePartitions_3_0(Buffer *buffer, FixedLayout *layout) -{ - for (size_t i = 0; i < layout->numPartitions; i++) { - byte id; - int result = getByte(buffer, &id); - if (result != UDS_SUCCESS) { - return result; - } - - uint64_t offset; - result = getUInt64LEFromBuffer(buffer, &offset); - if (result != UDS_SUCCESS) { - return result; - } - - uint64_t base; - result = getUInt64LEFromBuffer(buffer, &base); - if (result != UDS_SUCCESS) { - return result; - } - - uint64_t count; - result = getUInt64LEFromBuffer(buffer, &count); - if (result != UDS_SUCCESS) { - return result; - } - - result = allocatePartition(layout, id, offset, base, count); - if (result != VDO_SUCCESS) { - return result; - } - } - - return UDS_SUCCESS; -} - -/** - * Decode the header fields of a fixed layout from a buffer using layout - * format 3.0. 
- * - * @param buffer A buffer positioned at the start of the encoding - * @param layout The structure to receive the decoded fields - * - * @return UDS_SUCCESS or an error code - **/ -static int decodeLayout_3_0(Buffer *buffer, Layout3_0 *layout) -{ - size_t initialLength = contentLength(buffer); - - PhysicalBlockNumber firstFree; - int result = getUInt64LEFromBuffer(buffer, &firstFree); - if (result != UDS_SUCCESS) { - return result; - } - - PhysicalBlockNumber lastFree; - result = getUInt64LEFromBuffer(buffer, &lastFree); - if (result != UDS_SUCCESS) { - return result; - } - - byte partitionCount; - result = getByte(buffer, &partitionCount); - if (result != UDS_SUCCESS) { - return result; - } - - *layout = (Layout3_0) { - .firstFree = firstFree, - .lastFree = lastFree, - .partitionCount = partitionCount, - }; - - size_t decodedSize = initialLength - contentLength(buffer); - return ASSERT(decodedSize == sizeof(Layout3_0), - "decoded size of fixed layout header must match structure"); -} - -/**********************************************************************/ -int decodeFixedLayout(Buffer *buffer, FixedLayout **layoutPtr) -{ - Header header; - int result = decodeHeader(buffer, &header); - if (result != UDS_SUCCESS) { - return result; - } - - // Layout is variable size, so only do a minimum size check here. - result = validateHeader(&LAYOUT_HEADER_3_0, &header, false, __func__); - if (result != VDO_SUCCESS) { - return result; - } - - Layout3_0 layoutHeader; - result = decodeLayout_3_0(buffer, &layoutHeader); - if (result != UDS_SUCCESS) { - return result; - } - - if (contentLength(buffer) - < (sizeof(Partition3_0) * layoutHeader.partitionCount)) { - return VDO_UNSUPPORTED_VERSION; - } - - FixedLayout *layout; - result = ALLOCATE(1, FixedLayout, "fixed layout", &layout); - if (result != UDS_SUCCESS) { - return result; - } - - layout->firstFree = layoutHeader.firstFree; - layout->lastFree = layoutHeader.lastFree; - layout->numPartitions = layoutHeader.partitionCount; - - result = decodePartitions_3_0(buffer, layout); - if (result != VDO_SUCCESS) { - freeFixedLayout(&layout); - return result; - } - - *layoutPtr = layout; - return VDO_SUCCESS; -} diff --git a/vdo/base/fixedLayout.h b/vdo/base/fixedLayout.h deleted file mode 100644 index 0907299..0000000 --- a/vdo/base/fixedLayout.h +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/fixedLayout.h#1 $ - */ - -#ifndef FIXED_LAYOUT_H -#define FIXED_LAYOUT_H - -#include "buffer.h" - -#include "types.h" - -typedef enum { - FROM_BEGINNING, - FROM_END, -} PartitionDirection; - -extern const BlockCount ALL_FREE_BLOCKS; - -/** - * A fixed layout is like a traditional disk partitioning scheme. 
In the - * beginning there is one large unused area, of which parts are carved off. - * Each carved off section has its own internal offset and size. - **/ -typedef struct fixedLayout FixedLayout; -typedef struct partition Partition; - -/** - * Make an unpartitioned fixed layout. - * - * @param [in] totalBlocks The total size of the layout, in blocks - * @param [in] startOffset The block offset in the underlying layer at which - * the fixed layout begins - * @param [out] layoutPtr The pointer to hold the resulting layout - * - * @return a success or error code - **/ -int makeFixedLayout(BlockCount totalBlocks, - PhysicalBlockNumber startOffset, - FixedLayout **layoutPtr) - __attribute__((warn_unused_result)); - -/** - * Free the fixed layout and null out the reference to it. - * - * @param layoutPtr The reference to the layout to free - * - * @note all partitions created by this layout become invalid pointers - **/ -void freeFixedLayout(FixedLayout **layoutPtr); - -/** - * Get the total size of the layout in blocks. - * - * @param layout The layout - * - * @return The size of the layout - **/ -BlockCount getTotalFixedLayoutSize(const FixedLayout *layout) - __attribute__((warn_unused_result)); - -/** - * Get a partition by id. - * - * @param layout The layout from which to get a partition - * @param id The id of the partition - * @param partitionPtr A pointer to hold the partition - * - * @return VDO_SUCCESS or an error - **/ -int getPartition(FixedLayout *layout, PartitionID id, Partition **partitionPtr) - __attribute__((warn_unused_result)); - -/** - * Translate a block number from the partition's view to the layer's - * - * @param partition The partition to use for translation - * @param partitionBlockNumber The block number relative to the partition - * @param layerBlockNumber The block number relative to the layer - * - * @return VDO_SUCCESS or an error code - **/ -int translateToPBN(const Partition *partition, - PhysicalBlockNumber partitionBlockNumber, - PhysicalBlockNumber *layerBlockNumber) - __attribute__((warn_unused_result)); - -/** - * Translate a block number from the layer's view to the partition's. - * This is the inverse of translateToPBN(). - * - * @param partition The partition to use for translation - * @param layerBlockNumber The block number relative to the layer - * @param partitionBlockNumber The block number relative to the partition - * - * @return VDO_SUCCESS or an error code - **/ -int translateFromPBN(const Partition *partition, - PhysicalBlockNumber layerBlockNumber, - PhysicalBlockNumber *partitionBlockNumber) - __attribute__((warn_unused_result)); - -/** - * Return the number of unallocated blocks available. - * - * @param layout the fixed layout - * - * @return the number of blocks yet unallocated to partitions - **/ -BlockCount getFixedLayoutBlocksAvailable(const FixedLayout *layout) - __attribute__((warn_unused_result)); - -/** - * Create a new partition from the beginning or end of the unused space - * within a fixed layout. 
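
Together with makeFixedLayoutPartition(), documented just below, a typical construction and translation sequence looks roughly like this (a sketch; the partition ids 1 and 2 and the block counts are arbitrary choices for illustration):

static int layoutExample(void)
{
  // Carve a 1000-block region starting at PBN 10 into a 100-block partition
  // at the front, giving everything left over to a second partition at the end.
  FixedLayout *layout;
  int result = makeFixedLayout(1000, 10, &layout);
  if (result != VDO_SUCCESS) {
    return result;
  }

  result = makeFixedLayoutPartition(layout, 1, 100, FROM_BEGINNING, 0);
  if (result == VDO_SUCCESS) {
    result = makeFixedLayoutPartition(layout, 2, ALL_FREE_BLOCKS, FROM_END, 0);
  }

  if (result == VDO_SUCCESS) {
    // Block 5 of partition 1 (base 0) lives at absolute PBN 10 + 5 = 15.
    Partition *partition;
    PhysicalBlockNumber pbn;
    result = getPartition(layout, 1, &partition);
    if (result == VDO_SUCCESS) {
      result = translateToPBN(partition, 5, &pbn);
    }
  }

  freeFixedLayout(&layout);
  return result;
}
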
- * - * @param layout the fixed layout - * @param id the id of the partition to make - * @param blockCount the number of blocks to carve out, if set - * to ALL_FREE_BLOCKS, all remaining blocks will - * be used - * @param direction whether to carve out from beginning or end - * @param base the number of the first block in the partition - * from the point of view of its users - * - * @return a success or error code, particularly - * VDO_NO_SPACE if there are less than blockCount blocks remaining - **/ -int makeFixedLayoutPartition(FixedLayout *layout, - PartitionID id, - BlockCount blockCount, - PartitionDirection direction, - PhysicalBlockNumber base) - __attribute__((warn_unused_result)); - -/** - * Return the size in blocks of a partition. - * - * @param partition a partition of the fixedLayout - * - * @return the size of the partition in blocks - **/ -BlockCount getFixedLayoutPartitionSize(const Partition *partition) - __attribute__((warn_unused_result)); - -/** - * Get the first block of the partition in the layout. - * - * @param partition a partition of the fixedLayout - * - * @return the partition's offset in blocks - **/ -PhysicalBlockNumber getFixedLayoutPartitionOffset(const Partition *partition) - __attribute__((warn_unused_result)); - -/** - * Get the number of the first block in the partition from the partition users - * point of view. - * - * @param partition a partition of the fixedLayout - * - * @return the number of the first block in the partition - **/ -PhysicalBlockNumber getFixedLayoutPartitionBase(const Partition *partition) - __attribute__((warn_unused_result)); - -/** - * Get the size of an encoded layout - * - * @param layout The layout - * - * @return The encoded size of the layout - **/ -size_t getFixedLayoutEncodedSize(const FixedLayout *layout) - __attribute__((warn_unused_result)); - -/** - * Encode a layout into a buffer. - * - * @param layout The layout to encode - * @param buffer The buffer to encode into - * - * @return UDS_SUCCESS or an error - **/ -int encodeFixedLayout(const FixedLayout *layout, Buffer *buffer) - __attribute__((warn_unused_result)); - -/** - * Decode a fixed layout from a buffer. - * - * @param [in] buffer The buffer from which to decode - * @param [out] layoutPtr A pointer to hold the layout - * - * @return VDO_SUCCESS or an error - **/ -int decodeFixedLayout(Buffer *buffer, FixedLayout **layoutPtr) - __attribute__((warn_unused_result)); - -#endif // FIXED_LAYOUT_H diff --git a/vdo/base/flush.c b/vdo/base/flush.c deleted file mode 100644 index 4c6b94c..0000000 --- a/vdo/base/flush.c +++ /dev/null @@ -1,265 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/flush.c#3 $ - */ - -#include "flush.h" - -#include "logger.h" -#include "memoryAlloc.h" - -#include "blockAllocator.h" -#include "completion.h" -#include "logicalZone.h" -#include "numUtils.h" -#include "readOnlyNotifier.h" -#include "slabDepot.h" -#include "vdoInternal.h" - -struct flusher { - VDOCompletion completion; - /** The VDO to which this flusher belongs */ - VDO *vdo; - /** The current flush generation of the VDO */ - SequenceNumber flushGeneration; - /** The first unacknowledged flush generation */ - SequenceNumber firstUnacknowledgedGeneration; - /** The queue of flush requests waiting to notify other threads */ - WaitQueue notifiers; - /** The queue of flush requests waiting for VIOs to complete */ - WaitQueue pendingFlushes; - /** The flush generation for which notifications are being sent */ - SequenceNumber notifyGeneration; - /** The logical zone to notify next */ - LogicalZone *logicalZoneToNotify; - /** The ID of the thread on which flush requests should be made */ - ThreadID threadID; -}; - -/** - * Convert a generic VDOCompletion to a Flusher. - * - * @param completion The completion to convert - * - * @return The completion as a Flusher - **/ -static Flusher *asFlusher(VDOCompletion *completion) -{ - STATIC_ASSERT(offsetof(Flusher, completion) == 0); - assertCompletionType(completion->type, FLUSH_NOTIFICATION_COMPLETION); - return (Flusher *) completion; -} - -/** - * Convert a VDOFlush's generic wait queue entry back to the VDOFlush. - * - * @param waiter The wait queue entry to convert - * - * @return The wait queue entry as a VDOFlush - **/ -static VDOFlush *waiterAsFlush(Waiter *waiter) -{ - STATIC_ASSERT(offsetof(VDOFlush, waiter) == 0); - return (VDOFlush *) waiter; -} - -/**********************************************************************/ -int makeFlusher(VDO *vdo) -{ - int result = ALLOCATE(1, Flusher, __func__, &vdo->flusher); - if (result != VDO_SUCCESS) { - return result; - } - - vdo->flusher->vdo = vdo; - vdo->flusher->threadID = getPackerZoneThread(getThreadConfig(vdo)); - return initializeEnqueueableCompletion(&vdo->flusher->completion, - FLUSH_NOTIFICATION_COMPLETION, - vdo->layer); -} - -/**********************************************************************/ -void freeFlusher(Flusher **flusherPtr) -{ - if (*flusherPtr == NULL) { - return; - } - - Flusher *flusher = *flusherPtr; - destroyEnqueueable(&flusher->completion); - FREE(flusher); - *flusherPtr = NULL; -} - -/**********************************************************************/ -ThreadID getFlusherThreadID(Flusher *flusher) -{ - return flusher->threadID; -} - -/**********************************************************************/ -static void notifyFlush(Flusher *flusher); - -/** - * Finish the notification process by checking if any flushes have completed - * and then starting the notification of the next flush request if one came in - * while the current notification was in progress. This callback is registered - * in flushPackerCallback(). 
- *
- * @param completion  The flusher completion
- **/
-static void finishNotification(VDOCompletion *completion)
-{
-  Flusher *flusher = asFlusher(completion);
-  ASSERT_LOG_ONLY((getCallbackThreadID() == flusher->threadID),
-                  "finishNotification() called from flusher thread");
-
-  Waiter *waiter = dequeueNextWaiter(&flusher->notifiers);
-  int result = enqueueWaiter(&flusher->pendingFlushes, waiter);
-  if (result != VDO_SUCCESS) {
-    enterReadOnlyMode(flusher->vdo->readOnlyNotifier, result);
-    VDOFlush *flush = waiterAsFlush(waiter);
-    completion->layer->completeFlush(&flush);
-    return;
-  }
-
-  completeFlushes(flusher);
-  if (hasWaiters(&flusher->notifiers)) {
-    notifyFlush(flusher);
-  }
-}
-
-/**
- * Flush the packer now that all of the logical and physical zones have been
- * notified of the new flush request. This callback is registered in
- * incrementGeneration().
- *
- * @param completion  The flusher completion
- **/
-static void flushPackerCallback(VDOCompletion *completion)
-{
-  Flusher *flusher = asFlusher(completion);
-  incrementPackerFlushGeneration(flusher->vdo->packer);
-  launchCallback(completion, finishNotification, flusher->threadID);
-}
-
-/**
- * Increment the flush generation in a logical zone. If there are more logical
- * zones, go on to the next one, otherwise, prepare the physical zones. This
- * callback is registered both in notifyFlush() and in itself.
- *
- * @param completion  The flusher as a completion
- **/
-static void incrementGeneration(VDOCompletion *completion)
-{
-  Flusher *flusher = asFlusher(completion);
-  incrementFlushGeneration(flusher->logicalZoneToNotify,
-                           flusher->notifyGeneration);
-  flusher->logicalZoneToNotify
-    = getNextLogicalZone(flusher->logicalZoneToNotify);
-  if (flusher->logicalZoneToNotify == NULL) {
-    launchCallback(completion, flushPackerCallback, flusher->threadID);
-    return;
-  }
-
-  launchCallback(completion, incrementGeneration,
-                 getLogicalZoneThreadID(flusher->logicalZoneToNotify));
-}
-
-/**
- * Launch a flush notification.
- * - * @param flusher The flusher doing the notification - **/ -static void notifyFlush(Flusher *flusher) -{ - VDOFlush *flush = waiterAsFlush(getFirstWaiter(&flusher->notifiers)); - flusher->notifyGeneration = flush->flushGeneration; - flusher->logicalZoneToNotify = getLogicalZone(flusher->vdo->logicalZones, 0); - flusher->completion.requeue = true; - launchCallback(&flusher->completion, incrementGeneration, - getLogicalZoneThreadID(flusher->logicalZoneToNotify)); -} - -/**********************************************************************/ -void flush(VDO *vdo, VDOFlush *flush) -{ - Flusher *flusher = vdo->flusher; - ASSERT_LOG_ONLY((getCallbackThreadID() == flusher->threadID), - "flush() called from flusher thread"); - - flush->flushGeneration = flusher->flushGeneration++; - bool mayNotify = !hasWaiters(&flusher->notifiers); - - int result = enqueueWaiter(&flusher->notifiers, &flush->waiter); - if (result != VDO_SUCCESS) { - enterReadOnlyMode(vdo->readOnlyNotifier, result); - flusher->completion.layer->completeFlush(&flush); - return; - } - - if (mayNotify) { - notifyFlush(flusher); - } -} - -/**********************************************************************/ -void completeFlushes(Flusher *flusher) -{ - ASSERT_LOG_ONLY((getCallbackThreadID() == flusher->threadID), - "completeFlushes() called from flusher thread"); - - SequenceNumber oldestActiveGeneration = UINT64_MAX; - for (LogicalZone *zone = getLogicalZone(flusher->vdo->logicalZones, 0); - zone != NULL; - zone = getNextLogicalZone(zone)) { - SequenceNumber oldestInZone = getOldestLockedGeneration(zone); - oldestActiveGeneration = minSequenceNumber(oldestActiveGeneration, - oldestInZone); - } - - while (hasWaiters(&flusher->pendingFlushes)) { - VDOFlush *flush = waiterAsFlush(getFirstWaiter(&flusher->pendingFlushes)); - if (flush->flushGeneration >= oldestActiveGeneration) { - return; - } - - ASSERT_LOG_ONLY((flush->flushGeneration - == flusher->firstUnacknowledgedGeneration), - "acknowledged next expected flush, %" PRIu64 - ", was: %llu", - flusher->firstUnacknowledgedGeneration, - flush->flushGeneration); - dequeueNextWaiter(&flusher->pendingFlushes); - flusher->completion.layer->completeFlush(&flush); - flusher->firstUnacknowledgedGeneration++; - } -} - -/**********************************************************************/ -void dumpFlusher(const Flusher *flusher) -{ - logInfo("Flusher"); - logInfo(" flushGeneration=%" PRIu64 - " firstUnacknowledgedGeneration=%llu", - flusher->flushGeneration, flusher->firstUnacknowledgedGeneration); - logInfo(" notifiers queue is %s; pendingFlushes queue is %s", - (hasWaiters(&flusher->notifiers) ? "not empty" : "empty"), - (hasWaiters(&flusher->pendingFlushes) ? "not empty" : "empty")); -} diff --git a/vdo/base/flush.h b/vdo/base/flush.h deleted file mode 100644 index da7c8bc..0000000 --- a/vdo/base/flush.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
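
completeFlushes() above acknowledges flush requests strictly in generation order, gated by the minimum "oldest locked generation" across all logical zones. A worked example (the numbers are invented for illustration):

    zone oldest locked generations:  9, 8, 10   ->  oldestActiveGeneration = 8
    pendingFlushes:                  generation 7, generation 8
    generation 7 <  8  ->  dequeued and acknowledged via the layer's completeFlush()
    generation 8 >= 8  ->  left queued until every zone has moved past generation 8
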
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/flush.h#1 $ - */ - -#ifndef FLUSH_H -#define FLUSH_H - -#include "types.h" -#include "waitQueue.h" - -/** - * A marker for tracking which journal entries are affected by a flush request. - **/ -struct vdoFlush { - /** The wait queue entry for this flush */ - Waiter waiter; - /** Which flush this struct represents */ - SequenceNumber flushGeneration; -}; - -/** - * Make a flusher for a VDO. - * - * @param vdo The VDO which owns the flusher - * - * @return VDO_SUCCESS or an error - **/ -int makeFlusher(VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Free a flusher and null out the reference to it. - * - * @param flusherPtr A pointer to the flusher to free - **/ -void freeFlusher(Flusher **flusherPtr); - -/** - * Get the ID of the thread on which flusher functions should be called. - * - * @param flusher The flusher to query - * - * @return The ID of the thread which handles the flusher - **/ -ThreadID getFlusherThreadID(Flusher *flusher) - __attribute__((warn_unused_result)); - -/** - * Handle empty flush requests. - * - * @param vdo The VDO - * @param vdoFlush The opaque flush request - **/ -void flush(VDO *vdo, VDOFlush *vdoFlush); - -/** - * Attempt to complete any flushes which might have finished. - * - * @param flusher The flusher - **/ -void completeFlushes(Flusher *flusher); - -/** - * Dump the flusher, in a thread-unsafe fashion. - * - * @param flusher The flusher - **/ -void dumpFlusher(const Flusher *flusher); - -#endif /* FLUSH_H */ diff --git a/vdo/base/forest.c b/vdo/base/forest.c deleted file mode 100644 index eabd6c3..0000000 --- a/vdo/base/forest.c +++ /dev/null @@ -1,565 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/forest.c#8 $ - */ - -#include "forest.h" - -#include "logger.h" -#include "memoryAlloc.h" - -#include "blockMap.h" -#include "blockMapInternals.h" -#include "blockMapPage.h" -#include "blockMapTree.h" -#include "blockMapTreeInternals.h" -#include "constants.h" -#include "dirtyLists.h" -#include "forest.h" -#include "numUtils.h" -#include "recoveryJournal.h" -#include "slabDepot.h" -#include "slabJournal.h" -#include "types.h" -#include "vdoInternal.h" -#include "vio.h" -#include "vioPool.h" - -enum { - BLOCK_MAP_VIO_POOL_SIZE = 64, -}; - -typedef struct { - TreePage *levels[BLOCK_MAP_TREE_HEIGHT]; -} BlockMapTreeSegment; - -typedef struct blockMapTree { - BlockMapTreeSegment *segments; -} BlockMapTree; - -struct forest { - BlockMap *map; - size_t segments; - Boundary *boundaries; - TreePage **pages; - BlockMapTree trees[]; -}; - -typedef struct { - PageNumber pageIndex; - SlotNumber slot; -} CursorLevel; - -typedef struct cursors Cursors; - -typedef struct { - Waiter waiter; - BlockMapTree *tree; - Height height; - Cursors *parent; - Boundary boundary; - CursorLevel levels[BLOCK_MAP_TREE_HEIGHT]; - VIOPoolEntry *vioPoolEntry; -} Cursor; - -struct cursors { - BlockMap *map; - BlockMapTreeZone *zone; - VIOPool *pool; - EntryCallback *entryCallback; - VDOCompletion *parent; - RootCount activeRoots; - Cursor cursors[]; -}; - -/**********************************************************************/ -TreePage *getTreePageByIndex(Forest *forest, - RootCount rootIndex, - Height height, - PageNumber pageIndex) -{ - PageNumber offset = 0; - for (size_t segment = 0; segment < forest->segments; segment++) { - PageNumber border = forest->boundaries[segment].levels[height - 1]; - if (pageIndex < border) { - BlockMapTree *tree = &forest->trees[rootIndex]; - return &(tree->segments[segment].levels[height - 1][pageIndex - offset]); - } - offset = border; - } - - return NULL; -} - -/** - * Compute the number of pages which must be allocated at each level in order - * to grow the forest to a new number of entries. - * - * @param [in] rootCount The number of roots - * @param [in] flatPageCount The number of flat block map pages - * @param [in] oldSizes The current size of the forest at each level - * @param [in] entries The new number of entries the block map must - * address - * @param [out] newSizes The new size of the forest at each level - * - * @return The total number of non-leaf pages required - **/ -static BlockCount computeNewPages(RootCount rootCount, - BlockCount flatPageCount, - Boundary *oldSizes, - BlockCount entries, - Boundary *newSizes) -{ - PageCount leafPages - = maxPageCount(computeBlockMapPageCount(entries) - flatPageCount, 1); - PageCount levelSize = computeBucketCount(leafPages, rootCount); - BlockCount totalPages = 0; - for (Height height = 0; height < BLOCK_MAP_TREE_HEIGHT; height++) { - levelSize = computeBucketCount(levelSize, BLOCK_MAP_ENTRIES_PER_PAGE); - newSizes->levels[height] = levelSize; - BlockCount newPages = levelSize; - if (oldSizes != NULL) { - newPages -= oldSizes->levels[height]; - } - totalPages += (newPages * rootCount); - } - - return totalPages; -} - -/**********************************************************************/ -static int makeSegment(Forest *oldForest, - BlockCount newPages, - Boundary *newBoundary, - Forest *forest) -{ - size_t index = (oldForest == NULL) ? 
0 : oldForest->segments; - forest->segments = index + 1; - - int result = ALLOCATE(forest->segments, Boundary, "forest boundary array", - &forest->boundaries); - if (result != VDO_SUCCESS) { - return result; - } - - result = ALLOCATE(forest->segments, TreePage *, "forest page pointers", - &forest->pages); - if (result != VDO_SUCCESS) { - return result; - } - - result = ALLOCATE(newPages, TreePage, "new forest pages", - &forest->pages[index]); - if (result != VDO_SUCCESS) { - return result; - } - - if (index > 0) { - memcpy(forest->boundaries, oldForest->boundaries, - index * sizeof(Boundary)); - memcpy(forest->pages, oldForest->pages, index * sizeof(TreePage *)); - } - - memcpy(&(forest->boundaries[index]), newBoundary, sizeof(Boundary)); - - PageCount segmentSizes[BLOCK_MAP_TREE_HEIGHT]; - for (Height height = 0; height < BLOCK_MAP_TREE_HEIGHT; height++) { - segmentSizes[height] = newBoundary->levels[height]; - if (index > 0) { - segmentSizes[height] -= oldForest->boundaries[index - 1].levels[height]; - } - } - - TreePage *pagePtr = forest->pages[index]; - for (RootCount root = 0; root < forest->map->rootCount; root++) { - BlockMapTree *tree = &(forest->trees[root]); - int result = ALLOCATE(forest->segments, BlockMapTreeSegment, - "tree root segments", &tree->segments); - if (result != VDO_SUCCESS) { - return result; - } - - if (index > 0) { - memcpy(tree->segments, oldForest->trees[root].segments, - index * sizeof(BlockMapTreeSegment)); - } - - BlockMapTreeSegment *segment = &(tree->segments[index]); - for (Height height = 0; height < BLOCK_MAP_TREE_HEIGHT; height++) { - if (segmentSizes[height] == 0) { - continue; - } - - segment->levels[height] = pagePtr; - if (height == (BLOCK_MAP_TREE_HEIGHT - 1)) { - // Record the root. - BlockMapPage *page = formatBlockMapPage(pagePtr->pageBuffer, - forest->map->nonce, - INVALID_PBN, true); - page->entries[0] = packPBN(forest->map->rootOrigin + root, - MAPPING_STATE_UNCOMPRESSED); - } - pagePtr += segmentSizes[height]; - } - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -static void deforest(Forest *forest, size_t firstPageSegment) -{ - if (forest->pages != NULL) { - for (size_t segment = firstPageSegment; segment < forest->segments; - segment++) { - FREE(forest->pages[segment]); - } - FREE(forest->pages); - } - - for (RootCount root = 0; root < forest->map->rootCount; root++) { - BlockMapTree *tree = &(forest->trees[root]); - FREE(tree->segments); - } - - FREE(forest->boundaries); - FREE(forest); -} - -/**********************************************************************/ -int makeForest(BlockMap *map, BlockCount entries) -{ - STATIC_ASSERT(offsetof(TreePage, waiter) == 0); - - Forest *oldForest = map->forest; - Boundary *oldBoundary = NULL; - if (oldForest != NULL) { - oldBoundary = &(oldForest->boundaries[oldForest->segments - 1]); - } - - Boundary newBoundary; - BlockCount newPages = computeNewPages(map->rootCount, map->flatPageCount, - oldBoundary, entries, &newBoundary); - if (newPages == 0) { - map->nextEntryCount = entries; - return VDO_SUCCESS; - } - - Forest *forest; - int result = ALLOCATE_EXTENDED(Forest, map->rootCount, BlockMapTree, - __func__, &forest); - if (result != VDO_SUCCESS) { - return result; - } - - forest->map = map; - result = makeSegment(oldForest, newPages, &newBoundary, forest); - if (result != VDO_SUCCESS) { - deforest(forest, forest->segments - 1); - return result; - } - - map->nextForest = forest; - map->nextEntryCount = entries; - return VDO_SUCCESS; -} - 
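To make the level-sizing arithmetic used by computeNewPages() above concrete, here is a small standalone sketch. It is an illustration only, not part of forest.c: ENTRIES_PER_PAGE and TREE_HEIGHT are assumed stand-ins for BLOCK_MAP_ENTRIES_PER_PAGE and BLOCK_MAP_TREE_HEIGHT, and ceilDiv() plays the role computeBucketCount() plays in the real code. It walks one root's worth of leaf pages up the tree:

#include <stdio.h>

/* Assumed illustrative values; the real constants live in the VDO headers. */
#define ENTRIES_PER_PAGE 812
#define TREE_HEIGHT      5

/* Ceiling division, standing in for computeBucketCount(). */
static unsigned long ceilDiv(unsigned long n, unsigned long d)
{
  return (n + d - 1) / d;
}

int main(void)
{
  /* Suppose one root must address one million leaf pages. */
  unsigned long levelSize    = 1000000;
  unsigned long nonLeafPages = 0;
  for (int height = 0; height < TREE_HEIGHT; height++) {
    /* Each interior level needs one page per ENTRIES_PER_PAGE children. */
    levelSize = ceilDiv(levelSize, ENTRIES_PER_PAGE);
    nonLeafPages += levelSize;
    printf("height %d: %lu pages\n", height, levelSize);
  }
  printf("non-leaf pages for this root: %lu\n", nonLeafPages);
  return 0;
}

With a million leaf pages this prints 1232, 2, 1, 1, 1 and a total of 1237, illustrating how the upper levels quickly collapse to a single page per root.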
-/**********************************************************************/ -void freeForest(Forest **forestPtr) -{ - Forest *forest = *forestPtr; - if (forest == NULL) { - return; - } - - deforest(forest, 0); - *forestPtr = NULL; -} - -/**********************************************************************/ -void abandonForest(BlockMap *map) -{ - Forest *forest = map->nextForest; - map->nextForest = NULL; - if (forest != NULL) { - deforest(forest, forest->segments - 1); - } - - map->nextEntryCount = 0; -} - -/**********************************************************************/ -void replaceForest(BlockMap *map) -{ - if (map->nextForest != NULL) { - if (map->forest != NULL) { - deforest(map->forest, map->forest->segments); - } - map->forest = map->nextForest; - map->nextForest = NULL; - } - - map->entryCount = map->nextEntryCount; - map->nextEntryCount = 0; -} - -/** - * Finish the traversal of a single tree. If it was the last cursor, finish - * the traversal. - * - * @param cursor The cursor doing the traversal - **/ -static void finishCursor(Cursor *cursor) -{ - Cursors *cursors = cursor->parent; - returnVIOToPool(cursors->pool, cursor->vioPoolEntry); - if (--cursors->activeRoots > 0) { - return; - } - - VDOCompletion *parent = cursors->parent; - FREE(cursors); - - finishCompletion(parent, VDO_SUCCESS); -} - -/**********************************************************************/ -static void traverse(Cursor *cursor); - -/** - * Continue traversing a block map tree. - * - * @param completion The VIO doing a read or write - **/ -static void continueTraversal(VDOCompletion *completion) -{ - VIOPoolEntry *poolEntry = completion->parent; - Cursor *cursor = poolEntry->parent; - traverse(cursor); -} - -/** - * Continue traversing a block map tree now that a page has been loaded. - * - * @param completion The VIO doing the read - **/ -static void finishTraversalLoad(VDOCompletion *completion) -{ - VIOPoolEntry *entry = completion->parent; - Cursor *cursor = entry->parent; - Height height = cursor->height; - CursorLevel *level = &cursor->levels[height]; - - TreePage *treePage - = &(cursor->tree->segments[0].levels[height][level->pageIndex]); - BlockMapPage *page = (BlockMapPage *) treePage->pageBuffer; - copyValidPage(entry->buffer, cursor->parent->map->nonce, - entry->vio->physical, page); - traverse(cursor); -} - -/** - * Traverse a single block map tree. This is the recursive heart of the - * traversal process. - * - * @param cursor The cursor doing the traversal - **/ -static void traverse(Cursor *cursor) -{ - for (; cursor->height < BLOCK_MAP_TREE_HEIGHT; cursor->height++) { - Height height = cursor->height; - CursorLevel *level = &cursor->levels[height]; - TreePage *treePage - = &(cursor->tree->segments[0].levels[height][level->pageIndex]); - BlockMapPage *page = (BlockMapPage *) treePage->pageBuffer; - if (!isBlockMapPageInitialized(page)) { - continue; - } - - for (; level->slot < BLOCK_MAP_ENTRIES_PER_PAGE; level->slot++) { - DataLocation location = unpackBlockMapEntry(&page->entries[level->slot]); - if (!isValidLocation(&location)) { - // This entry is invalid, so remove it from the page. - page->entries[level->slot] - = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); - writeTreePage(treePage, cursor->parent->zone); - continue; - } - - if (!isMappedLocation(&location)) { - continue; - } - - PageNumber entryIndex - = (BLOCK_MAP_ENTRIES_PER_PAGE * level->pageIndex) + level->slot; - - // Erase mapped entries past the end of the logical space. 
- if (entryIndex >= cursor->boundary.levels[height]) { - page->entries[level->slot] - = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); - writeTreePage(treePage, cursor->parent->zone); - continue; - } - - if (cursor->height < BLOCK_MAP_TREE_HEIGHT - 1) { - int result = cursor->parent->entryCallback(location.pbn, - cursor->parent->parent); - if (result != VDO_SUCCESS) { - page->entries[level->slot] - = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); - writeTreePage(treePage, cursor->parent->zone); - continue; - } - } - - if (cursor->height == 0) { - continue; - } - - cursor->height--; - CursorLevel *nextLevel = &cursor->levels[cursor->height]; - nextLevel->pageIndex = entryIndex; - nextLevel->slot = 0; - level->slot++; - launchReadMetadataVIO(cursor->vioPoolEntry->vio, location.pbn, - finishTraversalLoad, continueTraversal); - return; - } - } - - finishCursor(cursor); -} - -/** - * Start traversing a single block map tree now that the Cursor has a VIO with - * which to load pages. - * - *

Implements WaiterCallback. - * - * @param waiter The Cursor - * @param context The VIOPoolEntry just acquired - **/ -static void launchCursor(Waiter *waiter, void *context) -{ - STATIC_ASSERT(offsetof(Cursor, waiter) == 0); - Cursor *cursor = (Cursor *) waiter; - cursor->vioPoolEntry = (VIOPoolEntry *) context; - cursor->vioPoolEntry->parent = cursor; - vioAsCompletion(cursor->vioPoolEntry->vio)->callbackThreadID - = cursor->parent->zone->mapZone->threadID; - traverse(cursor); -} - -/** - * Compute the number of pages used at each level of the given root's tree. - * - * @param map The block map - * @param rootIndex The index of the root to measure - * - * @return The list of page counts as a Boundary - **/ -static Boundary computeBoundary(BlockMap *map, RootCount rootIndex) -{ - PageCount leafPages = computeBlockMapPageCount(map->entryCount); - PageCount treeLeafPages = leafPages - map->flatPageCount; - - /* - * Compute the leaf pages for this root. If the number of leaf pages does - * not distribute evenly, we must determine if this root gets an extra page. - * Extra pages are assigned to roots starting at firstTreeRoot and going up. - */ - PageCount firstTreeRoot = map->flatPageCount % map->rootCount; - PageCount lastTreeRoot = (leafPages - 1) % map->rootCount; - - PageCount levelPages = treeLeafPages / map->rootCount; - if (inCyclicRange(firstTreeRoot, rootIndex, lastTreeRoot, map->rootCount)) { - levelPages++; - } - - Boundary boundary; - for (Height height = 0; height < BLOCK_MAP_TREE_HEIGHT - 1; height++) { - boundary.levels[height] = levelPages; - levelPages = computeBucketCount(levelPages, BLOCK_MAP_ENTRIES_PER_PAGE); - } - - // The root node always exists, even if the root is otherwise unused. - boundary.levels[BLOCK_MAP_TREE_HEIGHT - 1] = 1; - - return boundary; -} - -/**********************************************************************/ -void traverseForest(BlockMap *map, - EntryCallback *entryCallback, - VDOCompletion *parent) -{ - if (computeBlockMapPageCount(map->entryCount) <= map->flatPageCount) { - // There are no tree pages, so there's nothing to do. - finishCompletion(parent, VDO_SUCCESS); - return; - } - - Cursors *cursors; - int result = ALLOCATE_EXTENDED(Cursors, map->rootCount, Cursor, __func__, - &cursors); - if (result != VDO_SUCCESS) { - finishCompletion(parent, result); - return; - } - - cursors->map = map; - cursors->zone = &(getBlockMapZone(map, 0)->treeZone); - cursors->pool = cursors->zone->vioPool; - cursors->entryCallback = entryCallback; - cursors->parent = parent; - cursors->activeRoots = map->rootCount; - for (RootCount root = 0; root < map->rootCount; root++) { - Cursor *cursor = &cursors->cursors[root]; - *cursor = (Cursor) { - .tree = &map->forest->trees[root], - .height = BLOCK_MAP_TREE_HEIGHT - 1, - .parent = cursors, - .boundary = computeBoundary(map, root), - }; - - cursor->waiter.callback = launchCursor; - acquireVIOFromPool(cursors->pool, &cursor->waiter); - }; -} - -/**********************************************************************/ -BlockCount computeForestSize(BlockCount logicalBlocks, RootCount rootCount) -{ - Boundary newSizes; - BlockCount approximateNonLeaves - = computeNewPages(rootCount, 0, NULL, logicalBlocks, &newSizes); - - // Exclude the tree roots since those aren't allocated from slabs, - // and also exclude the super-roots, which only exist in memory. 
- approximateNonLeaves - -= rootCount * (newSizes.levels[BLOCK_MAP_TREE_HEIGHT - 2] - + newSizes.levels[BLOCK_MAP_TREE_HEIGHT - 1]); - - BlockCount approximateLeaves - = computeBlockMapPageCount(logicalBlocks - approximateNonLeaves); - - // This can be a slight over-estimate since the tree will never have to - // address these blocks, so it might be a tiny bit smaller. - return (approximateNonLeaves + approximateLeaves); -} diff --git a/vdo/base/forest.h b/vdo/base/forest.h deleted file mode 100644 index 9a5a7cf..0000000 --- a/vdo/base/forest.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/forest.h#2 $ - */ - -#ifndef FOREST_H -#define FOREST_H - -#include "blockMapTree.h" -#include "types.h" - -/** - * A function to be called for each allocated PBN when traversing the forest. - * - * @param pbn A PBN of a tree node - * @param completion The parent completion of the traversal - * - * @return VDO_SUCCESS or an error - **/ -typedef int EntryCallback(PhysicalBlockNumber pbn, VDOCompletion *completion); - -/** - * Get the tree page for a given height and page index. - * - * @param forest The forest which holds the page - * @param rootIndex The index of the tree that holds the page - * @param height The height of the desired page - * @param pageIndex The index of the desired page - * - * @return The requested page - **/ -TreePage *getTreePageByIndex(Forest *forest, - RootCount rootIndex, - Height height, - PageNumber pageIndex) - __attribute__((warn_unused_result)); - -/** - * Make a collection of trees for a BlockMap, expanding the existing forest if - * there is one. - * - * @param map The block map - * @param entries The number of entries the block map will hold - * - * @return VDO_SUCCESS or an error - **/ -int makeForest(BlockMap *map, BlockCount entries) - __attribute__((warn_unused_result)); - -/** - * Free a forest and all of the segments it contains and NULL out the reference - * to it. - * - * @param forestPtr A pointer to the forest to free - **/ -void freeForest(Forest **forestPtr); - -/** - * Abandon the unused next forest from a BlockMap. - * - * @param map The block map - **/ -void abandonForest(BlockMap *map); - -/** - * Replace a BlockMap's Forest with the already-prepared larger forest. - * - * @param map The block map - **/ -void replaceForest(BlockMap *map); - -/** - * Walk the entire forest of a block map. 
- * - * @param map The block map to traverse - * @param entryCallback A function to call with the pbn of each allocated node - * in the forest - * @param parent The completion to notify on each traversed PBN, and - * when the traversal is complete - **/ -void traverseForest(BlockMap *map, - EntryCallback *entryCallback, - VDOCompletion *parent); - -/** - * Compute the approximate number of pages which the forest will allocate in - * order to map the specified number of logical blocks. This method assumes - * that the block map is entirely arboreal. - * - * @param logicalBlocks The number of blocks to map - * @param rootCount The number of trees in the forest - * - * @return A (slight) over-estimate of the total number of possible forest - * pages including the leaves - **/ -BlockCount computeForestSize(BlockCount logicalBlocks, RootCount rootCount) - __attribute__((warn_unused_result)); -#endif // FOREST_H diff --git a/vdo/base/hashLock.c b/vdo/base/hashLock.c deleted file mode 100644 index 8494f1d..0000000 --- a/vdo/base/hashLock.c +++ /dev/null @@ -1,1605 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/hashLock.c#5 $ - */ - -/** - * HashLock controls and coordinates writing, index access, and dedupe among - * groups of DataVIOs concurrently writing identical blocks, allowing them to - * deduplicate not only against advice but also against each other. This save - * on index queries and allows those DataVIOs to concurrently deduplicate - * against a single block instead of being serialized through a PBN read lock. - * Only one index query is needed for each HashLock, instead of one for every - * DataVIO. - * - * A HashLock acts like a state machine perhaps more than as a lock. Other - * than the starting and ending states INITIALIZING and DESTROYING, every - * state represents and is held for the duration of an asynchronous operation. - * All state transitions are performed on the thread of the HashZone - * containing the lock. An asynchronous operation is almost always performed - * upon entering a state, and the callback from that operation triggers - * exiting the state and entering a new state. - * - * In all states except DEDUPING, there is a single DataVIO, called the lock - * agent, performing the asynchronous operations on behalf of the lock. The - * agent will change during the lifetime of the lock if the lock is shared by - * more than one DataVIO. DataVIOs waiting to deduplicate are kept on a wait - * queue. Viewed a different way, the agent holds the lock exclusively until - * the lock enters the DEDUPING state, at which point it becomes a shared lock - * that all the waiters (and any new DataVIOs that arrive) use to share a PBN - * lock. In state DEDUPING, there is no agent. 
When the last DataVIO in the - * lock calls back in DEDUPING, it becomes the agent and the lock becomes - * exclusive again. New DataVIOs that arrive in the lock will also go on the - * wait queue. - * - * The existence of lock waiters is a key factor controlling which state the - * lock transitions to next. When the lock is new or has waiters, it will - * always try to reach DEDUPING, and when it doesn't, it will try to clean up - * and exit. - * - * Deduping requires holding a PBN lock on a block that is known to contain - * data identical to the DataVIOs in the lock, so the lock will send the - * agent to the duplicate zone to acquire the PBN lock (LOCKING), to the - * kernel I/O threads to read and verify the data (VERIFYING), or to write a - * new copy of the data to a full data block or a slot in a compressed block - * (WRITING). - * - * Cleaning up consists of updating the index when the data location is - * different from the initial index query (UPDATING, triggered by stale - * advice, compression, and rollover), releasing the PBN lock on the duplicate - * block (UNLOCKING), and releasing the HashLock itself back to the hash zone - * (DESTROYING). - * - * The shortest sequence of states is for non-concurrent writes of new data: - * INITIALIZING -> QUERYING -> WRITING -> DESTROYING - * This sequence is short because no PBN read lock or index update is needed. - * - * Non-concurrent, finding valid advice looks like this (endpoints elided): - * -> QUERYING -> LOCKING -> VERIFYING -> DEDUPING -> UNLOCKING -> - * Or with stale advice (endpoints elided): - * -> QUERYING -> LOCKING -> VERIFYING -> UNLOCKING -> WRITING -> UPDATING -> - * - * When there are not enough available reference count increments available on - * a PBN for a DataVIO to deduplicate, a new lock is forked and the excess - * waiters roll over to the new lock (which goes directly to WRITING). The new - * lock takes the place of the old lock in the lock map so new DataVIOs will - * be directed to it. The two locks will proceed independently, but only the - * new lock will have the right to update the index (unless it also forks). - * - * Since rollover happens in a lock instance, once a valid data location has - * been selected, it will not change. QUERYING and WRITING are only performed - * once per lock lifetime. All other non-endpoint states can be re-entered. - * - * XXX still need doc on BYPASSING - * - * The function names in this module follow a convention referencing the - * states and transitions in the state machine diagram for VDOSTORY-190. - * [XXX link or repository path to it?] - * For example, for the LOCKING state, there are startLocking() and - * finishLocking() functions. startLocking() is invoked by the finish function - * of the state (or states) that transition to LOCKING. It performs the actual - * lock state change and must be invoked on the hash zone thread. - * finishLocking() is called by (or continued via callback from) the code - * actually obtaining the lock. It does any bookkeeping or decision-making - * required and invokes the appropriate start function of the state being - * transitioned to after LOCKING. 
- **/ - -#include "hashLock.h" -#include "hashLockInternals.h" - -#include "logger.h" -#include "permassert.h" - -#include "compressionState.h" -#include "constants.h" -#include "dataVIO.h" -#include "hashZone.h" -#include "packer.h" -#include "pbnLock.h" -#include "physicalZone.h" -#include "ringNode.h" -#include "slab.h" -#include "slabDepot.h" -#include "trace.h" -#include "types.h" -#include "vdoInternal.h" -#include "vioWrite.h" -#include "waitQueue.h" - -static const char *LOCK_STATE_NAMES[] = { - [HASH_LOCK_BYPASSING] = "BYPASSING", - [HASH_LOCK_DEDUPING] = "DEDUPING", - [HASH_LOCK_DESTROYING] = "DESTROYING", - [HASH_LOCK_INITIALIZING] = "INITIALIZING", - [HASH_LOCK_LOCKING] = "LOCKING", - [HASH_LOCK_QUERYING] = "QUERYING", - [HASH_LOCK_UNLOCKING] = "UNLOCKING", - [HASH_LOCK_UPDATING] = "UPDATING", - [HASH_LOCK_VERIFYING] = "VERIFYING", - [HASH_LOCK_WRITING] = "WRITING", -}; - -// There are loops in the state diagram, so some forward decl's are needed. -static void startDeduping(HashLock *lock, DataVIO *agent, bool agentIsDone); -static void startLocking(HashLock *lock, DataVIO *agent); -static void startWriting(HashLock *lock, DataVIO *agent); -static void unlockDuplicatePBN(VDOCompletion *completion); -static void transferAllocationLock(DataVIO *dataVIO); - -/**********************************************************************/ -PBNLock *getDuplicateLock(DataVIO *dataVIO) -{ - if (dataVIO->hashLock == NULL) { - return NULL; - } - return dataVIO->hashLock->duplicateLock; -} - -/**********************************************************************/ -const char *getHashLockStateName(HashLockState state) -{ - // Catch if a state has been added without updating the name array. - STATIC_ASSERT((HASH_LOCK_DESTROYING + 1) == COUNT_OF(LOCK_STATE_NAMES)); - return (state < COUNT_OF(LOCK_STATE_NAMES)) ? LOCK_STATE_NAMES[state] : NULL; -} - -/** - * Set the current state of a hash lock. - * - * @param lock The lock to update - * @param newState The new state - **/ -static void setHashLockState(HashLock *lock, HashLockState newState) -{ - if (false) { - logWarning("XXX %" PRIptr " %s -> %s", (void *) lock, - getHashLockStateName(lock->state), - getHashLockStateName(newState)); - } - lock->state = newState; -} - -/** - * Assert that a DataVIO is the agent of its hash lock, and that this is being - * called in the hash zone. - * - * @param dataVIO The DataVIO expected to be the lock agent - * @param where A string describing the function making the assertion - **/ -static void assertHashLockAgent(DataVIO *dataVIO, const char *where) -{ - // Not safe to access the agent field except from the hash zone. - assertInHashZone(dataVIO); - ASSERT_LOG_ONLY(dataVIO == dataVIO->hashLock->agent, - "%s must be for the hash lock agent", where); -} - -/** - * Set or clear the lock agent. - * - * @param lock The hash lock to update - * @param newAgent The new lock agent (may be NULL to clear the agent) - **/ -static void setAgent(HashLock *lock, DataVIO *newAgent) -{ - lock->agent = newAgent; -} - -/** - * Set the duplicate lock held by a hash lock. May only be called in the - * physical zone of the PBN lock. 
- *
- * @param hashLock  The hash lock to update
- * @param pbnLock   The PBN read lock to use as the duplicate lock
- **/
-static void setDuplicateLock(HashLock *hashLock, PBNLock *pbnLock)
-{
-  ASSERT_LOG_ONLY((hashLock->duplicateLock == NULL),
-                  "hash lock must not already hold a duplicate lock");
-
-  pbnLock->holderCount += 1;
-  hashLock->duplicateLock = pbnLock;
-}
-
-/**
- * Convert a pointer to the hashLockNode field in a DataVIO to the enclosing
- * DataVIO.
- *
- * @param lockNode  The RingNode to convert
- *
- * @return A pointer to the DataVIO containing the RingNode
- **/
-static inline DataVIO *dataVIOFromLockNode(RingNode *lockNode)
-{
-  return (DataVIO *) ((byte *) lockNode - offsetof(DataVIO, hashLockNode));
-}
-
-/**
- * Remove the first DataVIO from the lock's wait queue and return it.
- *
- * @param lock  The lock containing the wait queue
- *
- * @return The first (oldest) waiter in the queue, or NULL if
- *         the queue is empty
- **/
-static inline DataVIO *dequeueLockWaiter(HashLock *lock)
-{
-  return waiterAsDataVIO(dequeueNextWaiter(&lock->waiters));
-}
-
-/**
- * Continue processing a DataVIO that has been waiting for an event, setting
- * the result from the event, and continuing in a specified callback function.
- *
- * @param dataVIO   The DataVIO to continue
- * @param result    The current result (will not mask older errors)
- * @param callback  The function in which to continue processing
- **/
-static void continueDataVIOIn(DataVIO   *dataVIO,
-                              int        result,
-                              VDOAction *callback)
-{
-  dataVIOAsCompletion(dataVIO)->callback = callback;
-  continueDataVIO(dataVIO, result);
-}
-
-/**
- * Set, change, or clear the hash lock a DataVIO is using. Updates the hash
- * lock (or locks) to reflect the change in membership.
- *
- * @param dataVIO  The DataVIO to update
- * @param newLock  The hash lock the DataVIO is joining
- **/
-static void setHashLock(DataVIO *dataVIO, HashLock *newLock)
-{
-  HashLock *oldLock = dataVIO->hashLock;
-  if (oldLock != NULL) {
-    ASSERT_LOG_ONLY(dataVIO->hashZone != NULL,
-                    "must have a hash zone when holding a hash lock");
-    ASSERT_LOG_ONLY(!isRingEmpty(&dataVIO->hashLockNode),
-                    "must be on a hash lock ring when holding a hash lock");
-    ASSERT_LOG_ONLY(oldLock->referenceCount > 0,
-                    "hash lock reference must be counted");
-
-    if ((oldLock->state != HASH_LOCK_BYPASSING)
-        && (oldLock->state != HASH_LOCK_UNLOCKING)) {
-      // If the reference count goes to zero in a non-terminal state, we're
-      // most likely leaking this lock.
-      ASSERT_LOG_ONLY(oldLock->referenceCount > 1,
-                      "hash locks should only become unreferenced in"
-                      " a terminal state, not state %s",
-                      getHashLockStateName(oldLock->state));
-    }
-
-    unspliceRingNode(&dataVIO->hashLockNode);
-    oldLock->referenceCount -= 1;
-
-    dataVIO->hashLock = NULL;
-  }
-
-  if (newLock != NULL) {
-    // Keep all DataVIOs sharing the lock on a ring since they can complete in
-    // any order and we'll always need a pointer to one to compare data.
-    pushRingNode(&newLock->duplicateRing, &dataVIO->hashLockNode);
-    newLock->referenceCount += 1;
-
-    // XXX Not needed for VDOSTORY-190, but useful for checking whether a test
-    // is getting concurrent dedupe, and how much.
-    if (newLock->maxReferences < newLock->referenceCount) {
-      newLock->maxReferences = newLock->referenceCount;
-    }
-
-    dataVIO->hashLock = newLock;
-  }
-}
-
-/**
- * Bottleneck for DataVIOs that have written or deduplicated and that no
- * longer need to act as the agent for the hash lock.
- * - * @param dataVIO The DataVIO to complete and send to be cleaned up - **/ -static void exitHashLock(DataVIO *dataVIO) -{ - // XXX trace record? - - // Release the hash lock now, saving a thread transition in cleanup. - releaseHashLock(dataVIO); - - // Complete the DataVIO and start the clean-up path in vioWrite to release - // any locks it still holds. - finishDataVIO(dataVIO, VDO_SUCCESS); -} - -/** - * Retire the active lock agent, replacing it with the first lock waiter, and - * make the retired agent exit the hash lock. - * - * @param lock The hash lock to update - * - * @return The new lock agent (which will be NULL if there was no waiter) - **/ -static DataVIO *retireLockAgent(HashLock *lock) -{ - DataVIO *oldAgent = lock->agent; - DataVIO *newAgent = dequeueLockWaiter(lock); - setAgent(lock, newAgent); - exitHashLock(oldAgent); - if (newAgent != NULL) { - setDuplicateLocation(newAgent, lock->duplicate); - } - return newAgent; -} - -/** - * Callback to call compressData(), putting a DataVIO back on the write path. - * - * @param completion The DataVIO - **/ -static void compressDataCallback(VDOCompletion *completion) -{ - // XXX VDOSTORY-190 need an error check since compressData doesn't have one. - compressData(asDataVIO(completion)); -} - -/** - * Add a DataVIO to the lock's queue of waiters. - * - * @param lock The hash lock on which to wait - * @param dataVIO The DataVIO to add to the queue - **/ -static void waitOnHashLock(HashLock *lock, DataVIO *dataVIO) -{ - int result = enqueueDataVIO(&lock->waiters, dataVIO, THIS_LOCATION(NULL)); - if (result != VDO_SUCCESS) { - // This should be impossible, but if it somehow happens, give up on trying - // to dedupe the data. - setHashLock(dataVIO, NULL); - continueDataVIOIn(dataVIO, result, compressDataCallback); - return; - } - - // Make sure the agent doesn't block indefinitely in the packer since it now - // has at least one other DataVIO waiting on it. - if ((lock->state == HASH_LOCK_WRITING) && cancelCompression(lock->agent)) { - /* - * Even though we're waiting, we also have to send ourselves as a one-way - * message to the packer to ensure the agent continues executing. This is - * safe because cancelCompression() guarantees the agent won't continue - * executing until this message arrives in the packer, and because the - * wait queue link isn't used for sending the message. - */ - dataVIO->compression.lockHolder = lock->agent; - launchPackerCallback(dataVIO, removeLockHolderFromPacker, - THIS_LOCATION("$F;cb=removeLockHolderFromPacker")); - } -} - -/** - * WaiterCallback function that calls compressData on the DataVIO waiter. - * - * @param waiter The DataVIO's waiter link - * @param context Not used - **/ -static void compressWaiter(Waiter *waiter, - void *context __attribute__((unused))) -{ - DataVIO *dataVIO = waiterAsDataVIO(waiter); - dataVIO->isDuplicate = false; - compressData(dataVIO); -} - -/** - * Handle the result of the agent for the lock releasing a read lock on - * duplicate candidate due to aborting the hash lock. This continuation is - * registered in unlockDuplicatePBN(). 
- * - * @param completion The completion of the DataVIO acting as the lock's agent - **/ -static void finishBypassing(VDOCompletion *completion) -{ - DataVIO *agent = asDataVIO(completion); - assertHashLockAgent(agent, __func__); - HashLock *lock = agent->hashLock; - - ASSERT_LOG_ONLY(lock->duplicateLock == NULL, - "must have released the duplicate lock for the hash lock"); - exitHashLock(agent); -} - -/** - * Stop using the hash lock, resuming the old write path for the lock agent - * and any DataVIOs waiting on it, and put it in a state where DataVIOs - * entering the lock will use the old dedupe path instead of waiting. - * - * @param lock The hash lock - * @param agent The DataVIO acting as the agent for the lock - **/ -static void startBypassing(HashLock *lock, DataVIO *agent) -{ - setHashLockState(lock, HASH_LOCK_BYPASSING); - - // Ensure we don't attempt to update advice when cleaning up. - lock->updateAdvice = false; - - ASSERT_LOG_ONLY(((agent != NULL) || !hasWaiters(&lock->waiters)), - "should not have waiters without an agent"); - notifyAllWaiters(&lock->waiters, compressWaiter, NULL); - - if (lock->duplicateLock != NULL) { - if (agent != NULL) { - // The agent must reference the duplicate zone to launch it. - agent->duplicate = lock->duplicate; - launchDuplicateZoneCallback(agent, unlockDuplicatePBN, - THIS_LOCATION(NULL)); - return; - } - ASSERT_LOG_ONLY(false, "hash lock holding a PBN lock must have an agent"); - } - - if (agent == NULL) { - return; - } - - setAgent(lock, NULL); - agent->isDuplicate = false; - compressData(agent); -} - -/** - * Abort processing on this hash lock when noticing an error. Currently, this - * moves the hash lock to the BYPASSING state, to release all pending DataVIOs. - * - * @param lock The HashLock - * @param dataVIO The DataVIO with the error - **/ -static void abortHashLock(HashLock *lock, DataVIO *dataVIO) -{ - // If we've already aborted the lock, don't try to re-abort it; just exit. - if (lock->state == HASH_LOCK_BYPASSING) { - exitHashLock(dataVIO); - return; - } - - if (dataVIO != lock->agent) { - if ((lock->agent != NULL) || (lock->referenceCount > 1)) { - // Other DataVIOs are still sharing the lock (which should be DEDUPING), - // so just kick this one out of the lock to report its error. - ASSERT_LOG_ONLY(lock->agent == NULL, - "only active agent should call abortHashLock"); - exitHashLock(dataVIO); - return; - } - // Make the lone DataVIO the lock agent so it can abort and clean up. - setAgent(lock, dataVIO); - } - - startBypassing(lock, dataVIO); -} - -/** - * Handle the result of the agent for the lock releasing a read lock on - * duplicate candidate. This continuation is registered in - * unlockDuplicatePBN(). - * - * @param completion The completion of the DataVIO acting as the lock's agent - **/ -static void finishUnlocking(VDOCompletion *completion) -{ - DataVIO *agent = asDataVIO(completion); - assertHashLockAgent(agent, __func__); - HashLock *lock = agent->hashLock; - - ASSERT_LOG_ONLY(lock->duplicateLock == NULL, - "must have released the duplicate lock for the hash lock"); - - if (completion->result != VDO_SUCCESS) { - abortHashLock(lock, agent); - return; - } - - if (!lock->verified) { - /* - * UNLOCKING -> WRITING transition: The lock we released was on an - * unverified block, so it must have been a lock on advice we were - * verifying, not on a location that was used for deduplication. Go write - * (or compress) the block to get a location to dedupe against. 
- */ - startWriting(lock, agent); - return; - } - - // With the lock released, the verified duplicate block may already have - // changed and will need to be re-verified if a waiter arrived. - lock->verified = false; - - if (hasWaiters(&lock->waiters)) { - /* - * UNLOCKING -> LOCKING transition: A new DataVIO entered the hash lock - * while the agent was releasing the PBN lock. The current agent exits and - * the waiter has to re-lock and re-verify the duplicate location. - */ - // XXX VDOSTORY-190 If we used the current agent to re-acquire the PBN - // lock we wouldn't need to re-verify. - agent = retireLockAgent(lock); - startLocking(lock, agent); - return; - } - - /* - * UNLOCKING -> DESTROYING transition: The agent is done with the lock - * and no other DataVIOs reference it, so remove it from the lock map - * and return it to the pool. - */ - exitHashLock(agent); -} - -/** - * Release a read lock on the PBN of the block that may or may not have - * contained duplicate data. This continuation is launched by - * startUnlocking(), and calls back to finishUnlocking() on the hash zone - * thread. - * - * @param completion The completion of the DataVIO acting as the lock's agent - **/ -static void unlockDuplicatePBN(VDOCompletion *completion) -{ - DataVIO *agent = asDataVIO(completion); - assertInDuplicateZone(agent); - HashLock *lock = agent->hashLock; - - ASSERT_LOG_ONLY(lock->duplicateLock != NULL, - "must have a duplicate lock to release"); - - releasePBNLock(agent->duplicate.zone, agent->duplicate.pbn, - &lock->duplicateLock); - - if (lock->state == HASH_LOCK_BYPASSING) { - launchHashZoneCallback(agent, finishBypassing, THIS_LOCATION(NULL)); - } else { - launchHashZoneCallback(agent, finishUnlocking, THIS_LOCATION(NULL)); - } -} - -/** - * Release a read lock on the PBN of the block that may or may not have - * contained duplicate data. - * - * @param lock The hash lock - * @param agent The DataVIO currently acting as the agent for the lock - **/ -static void startUnlocking(HashLock *lock, DataVIO *agent) -{ - setHashLockState(lock, HASH_LOCK_UNLOCKING); - - /* - * XXX If we arrange to continue on the duplicate zone thread when - * verification fails, and don't explicitly change lock states (or use an - * agent-local state, or an atomic), we can avoid a thread transition here. - */ - launchDuplicateZoneCallback(agent, unlockDuplicatePBN, THIS_LOCATION(NULL)); -} - -/** - * Process the result of a UDS update performed by the agent for the lock. - * This continuation is registered in startQuerying(). - * - * @param completion The completion of the DataVIO that performed the update - **/ -static void finishUpdating(VDOCompletion *completion) -{ - DataVIO *agent = asDataVIO(completion); - assertHashLockAgent(agent, __func__); - HashLock *lock = agent->hashLock; - - if (completion->result != VDO_SUCCESS) { - abortHashLock(lock, agent); - return; - } - - // UDS was updated successfully, so don't update again unless the - // duplicate location changes due to rollover. - lock->updateAdvice = false; - - if (hasWaiters(&lock->waiters)) { - /* - * UPDATING -> DEDUPING transition: A new DataVIO arrived during the UDS - * update. Send it on the verified dedupe path. The agent is done with the - * lock, but the lock may still need to use it to clean up after rollover. - */ - startDeduping(lock, agent, true); - return; - } - - if (lock->duplicateLock != NULL) { - /* - * UPDATING -> UNLOCKING transition: No one is waiting to dedupe, but we - * hold a duplicate PBN lock, so go release it. 
- */ - startUnlocking(lock, agent); - } else { - /* - * UPDATING -> DESTROYING transition: No one is waiting to dedupe and - * there's no lock to release. - */ - // XXX startDestroying(lock, agent); - startBypassing(lock, NULL); - exitHashLock(agent); - } -} - -/** - * Continue deduplication with the last step, updating UDS with the location - * of the duplicate that should be returned as advice in the future. - * - * @param lock The hash lock - * @param agent The DataVIO currently acting as the agent for the lock - **/ -static void startUpdating(HashLock *lock, DataVIO *agent) -{ - setHashLockState(lock, HASH_LOCK_UPDATING); - - ASSERT_LOG_ONLY(lock->verified, "new advice should have been verified"); - ASSERT_LOG_ONLY(lock->updateAdvice, "should only update advice if needed"); - - agent->lastAsyncOperation = UPDATE_INDEX; - setHashZoneCallback(agent, finishUpdating, THIS_LOCATION(NULL)); - dataVIOAsCompletion(agent)->layer->updateAlbireo(agent); -} - -/** - * Handle a DataVIO that has finished deduplicating against the block locked - * by the hash lock. If there are other DataVIOs still sharing the lock, this - * will just release the DataVIO's share of the lock and finish processing the - * DataVIO. If this is the last DataVIO holding the lock, this makes the - * DataVIO the lock agent and uses it to advance the state of the lock so it - * can eventually be released. - * - * @param lock The hash lock - * @param dataVIO The lock holder that has finished deduplicating - **/ -static void finishDeduping(HashLock *lock, DataVIO *dataVIO) -{ - ASSERT_LOG_ONLY(lock->agent == NULL, "shouldn't have an agent in DEDUPING"); - ASSERT_LOG_ONLY(!hasWaiters(&lock->waiters), - "shouldn't have any lock waiters in DEDUPING"); - - // Just release the lock reference if other DataVIOs are still deduping. - if (lock->referenceCount > 1) { - exitHashLock(dataVIO); - return; - } - - // The hash lock must have an agent for all other lock states. - DataVIO *agent = dataVIO; - setAgent(lock, agent); - - if (lock->updateAdvice) { - /* - * DEDUPING -> UPDATING transition: The location of the duplicate block - * changed since the initial UDS query because of compression, rollover, - * or because the query agent didn't have an allocation. The UDS update - * was delayed in case there was another change in location, but with only - * this DataVIO using the hash lock, it's time to update the advice. - */ - startUpdating(lock, agent); - } else { - /* - * DEDUPING -> UNLOCKING transition: Release the PBN read lock on the - * duplicate location so the hash lock itself can be released (contingent - * on no new DataVIOs arriving in the lock before the agent returns). - */ - startUnlocking(lock, agent); - } -} - -/** - * Implements WaiterCallback. Binds the DataVIO that was waiting to a new hash - * lock and waits on that lock. - **/ -static void enterForkedLock(Waiter *waiter, void *context) -{ - DataVIO *dataVIO = waiterAsDataVIO(waiter); - HashLock *newLock = (HashLock *) context; - - setHashLock(dataVIO, newLock); - waitOnHashLock(newLock, dataVIO); -} - -/** - * Fork a hash lock because it has run out of increments on the duplicate PBN. - * Transfers the new agent and any lock waiters to a new hash lock instance - * which takes the place of the old lock in the lock map. The old lock remains - * active, but will not update advice. 
- * - * @param oldLock The hash lock to fork - * @param newAgent The DataVIO that will be the agent for the new lock - **/ -static void forkHashLock(HashLock *oldLock, DataVIO *newAgent) -{ - HashLock *newLock; - int result = acquireHashLockFromZone(newAgent->hashZone, - &newAgent->chunkName, - oldLock, &newLock); - if (result != VDO_SUCCESS) { - abortHashLock(oldLock, newAgent); - return; - } - - // Only one of the two locks should update UDS. The old lock is out of - // references, so it would be poor dedupe advice in the short term. - oldLock->updateAdvice = false; - newLock->updateAdvice = true; - - setHashLock(newAgent, newLock); - setAgent(newLock, newAgent); - - notifyAllWaiters(&oldLock->waiters, enterForkedLock, newLock); - - newAgent->isDuplicate = false; - startWriting(newLock, newAgent); -} - -/** - * Reserve a reference count increment for a DataVIO and launch it on the - * dedupe path. If no increments are available, this will roll over to a new - * hash lock and launch the DataVIO as the writing agent for that lock. - * - * @param lock The hash lock - * @param dataVIO The DataVIO to deduplicate using the hash lock - * @param hasClaim true if the dataVIO already has claimed - * an increment from the duplicate lock - **/ -static void launchDedupe(HashLock *lock, DataVIO *dataVIO, bool hasClaim) -{ - if (!hasClaim && !claimPBNLockIncrement(lock->duplicateLock)) { - // Out of increments, so must roll over to a new lock. - forkHashLock(lock, dataVIO); - return; - } - - // Deduplicate against the lock's verified location. - setDuplicateLocation(dataVIO, lock->duplicate); - launchDuplicateZoneCallback(dataVIO, shareBlock, - THIS_LOCATION("$F;cb=shareBlock")); -} - -/** - * Enter the hash lock state where DataVIOs deduplicate in parallel against a - * true copy of their data on disk. If the agent itself needs to deduplicate, - * an increment for it must already have been claimed from the duplicate lock, - * ensuring the hash lock will still have a DataVIO holding it. - * - * @param lock The hash lock - * @param agent The DataVIO acting as the agent for the lock - * @param agentIsDone true only if the agent has already written - * or deduplicated against its data - **/ -static void startDeduping(HashLock *lock, DataVIO *agent, bool agentIsDone) -{ - setHashLockState(lock, HASH_LOCK_DEDUPING); - - // We don't take the downgraded allocation lock from the agent unless we - // actually need to deduplicate against it. - if (lock->duplicateLock == NULL) { - ASSERT_LOG_ONLY(!isCompressed(agent->newMapped.state), - "compression must have shared a lock"); - ASSERT_LOG_ONLY(agentIsDone, "agent must have written the new duplicate"); - transferAllocationLock(agent); - } - - ASSERT_LOG_ONLY(isPBNReadLock(lock->duplicateLock), - "duplicateLock must be a PBN read lock"); - - /* - * This state is not like any of the other states. There is no designated - * agent--the agent transitioning to this state and all the waiters will be - * launched to deduplicate in parallel. - */ - setAgent(lock, NULL); - - /* - * Launch the agent (if not already deduplicated) and as many lock waiters - * as we have available increments for on the dedupe path. If we run out of - * increments, rollover will be triggered and the remaining waiters will be - * transferred to the new lock. 
-   */
-  if (!agentIsDone) {
-    launchDedupe(lock, agent, true);
-    agent = NULL;
-  }
-  while (hasWaiters(&lock->waiters)) {
-    launchDedupe(lock, dequeueLockWaiter(lock), false);
-  }
-
-  if (agentIsDone) {
-    /*
-     * In the degenerate case where all the waiters rolled over to a new lock,
-     * this will continue to use the old agent to clean up this lock, and
-     * otherwise it just lets the agent exit the lock.
-     */
-    finishDeduping(lock, agent);
-  }
-}
-
-/**
- * Handle the result of the agent for the lock comparing its data to the
- * duplicate candidate. This continuation is registered in startVerifying().
- *
- * @param completion  The completion of the DataVIO used to verify dedupe
- **/
-static void finishVerifying(VDOCompletion *completion)
-{
-  DataVIO *agent = asDataVIO(completion);
-  assertHashLockAgent(agent, __func__);
-  HashLock *lock = agent->hashLock;
-
-  if (completion->result != VDO_SUCCESS) {
-    // XXX VDOSTORY-190 should convert verify IO errors to verification failure
-    abortHashLock(lock, agent);
-    return;
-  }
-
-  lock->verified = agent->isDuplicate;
-
-  // Only count the result of the initial verification of the advice as valid
-  // or stale, and not any re-verifications due to PBN lock releases.
-  if (!lock->verifyCounted) {
-    lock->verifyCounted = true;
-    if (lock->verified) {
-      bumpHashZoneValidAdviceCount(agent->hashZone);
-    } else {
-      bumpHashZoneStaleAdviceCount(agent->hashZone);
-    }
-  }
-
-  // Even if the block is a verified duplicate, we can't start to deduplicate
-  // unless we can claim a reference count increment for the agent.
-  if (lock->verified && !claimPBNLockIncrement(lock->duplicateLock)) {
-    agent->isDuplicate = false;
-    lock->verified = false;
-  }
-
-  if (lock->verified) {
-    /*
-     * VERIFYING -> DEDUPING transition: The advice is for a true duplicate,
-     * so start deduplicating against it, if references are available.
-     */
-    startDeduping(lock, agent, false);
-  } else {
-    /*
-     * VERIFYING -> UNLOCKING transition: Either the verify failed or we'd try
-     * to dedupe and roll over immediately, which would fail because it would
-     * leave the lock without an agent to release the PBN lock. In both cases,
-     * the data will have to be written or compressed, but first the advice
-     * PBN must be unlocked by the VERIFYING agent.
-     */
-    lock->updateAdvice = true;
-    startUnlocking(lock, agent);
-  }
-}
-
-/**
- * Continue the deduplication path for a hash lock by using the agent to read
- * (and possibly decompress) the data at the candidate duplicate location,
- * comparing it to the data in the agent to verify that the candidate is
- * identical to all the DataVIOs sharing the hash. If so, it can be
- * deduplicated against; otherwise a DataVIO allocation will have to be
- * written to and used for dedupe.
- *
- * @param lock   The hash lock (must be LOCKING)
- * @param agent  The DataVIO to use to read and compare candidate data
- **/
-static void startVerifying(HashLock *lock, DataVIO *agent)
-{
-  setHashLockState(lock, HASH_LOCK_VERIFYING);
-  ASSERT_LOG_ONLY(!lock->verified, "hash lock only verifies advice once");
-
-  /*
-   * XXX VDOSTORY-190 Optimization: This is one of those places where the zone
-   * and continuation we want to use depends on the outcome of the comparison.
-   * If we could choose which path in the layer thread before continuing, we
-   * could save a thread transition in one of the two cases (assuming we're
-   * willing to delay visibility of the hash lock state change).
- */ - VDOCompletion *completion = dataVIOAsCompletion(agent); - agent->lastAsyncOperation = VERIFY_DEDUPLICATION; - setHashZoneCallback(agent, finishVerifying, THIS_LOCATION(NULL)); - completion->layer->verifyDuplication(agent); -} - -/** - * Handle the result of the agent for the lock attempting to obtain a PBN read - * lock on the candidate duplicate block. this continuation is registered in - * lockDuplicatePBN(). - * - * @param completion The completion of the DataVIO that attempted to get - * the read lock - **/ -static void finishLocking(VDOCompletion *completion) -{ - DataVIO *agent = asDataVIO(completion); - assertHashLockAgent(agent, __func__); - HashLock *lock = agent->hashLock; - - if (completion->result != VDO_SUCCESS) { - // XXX clearDuplicateLocation()? - agent->isDuplicate = false; - abortHashLock(lock, agent); - return; - } - - if (!agent->isDuplicate) { - ASSERT_LOG_ONLY(lock->duplicateLock == NULL, - "must not hold duplicateLock if not flagged as a duplicate"); - /* - * LOCKING -> WRITING transition: The advice block is being modified or - * has no available references, so try to write or compress the data, - * remembering to update UDS later with the new advice. - */ - bumpHashZoneStaleAdviceCount(agent->hashZone); - lock->updateAdvice = true; - startWriting(lock, agent); - return; - } - - ASSERT_LOG_ONLY(lock->duplicateLock != NULL, - "must hold duplicateLock if flagged as a duplicate"); - - if (!lock->verified) { - /* - * LOCKING -> VERIFYING transition: Continue on the unverified dedupe path, - * reading the candidate duplicate and comparing it to the agent's data to - * decide whether it is a true duplicate or stale advice. - */ - startVerifying(lock, agent); - return; - } - - if (!claimPBNLockIncrement(lock->duplicateLock)) { - /* - * LOCKING -> UNLOCKING transition: The verified block was re-locked, but - * has no available increments left. Must first release the useless PBN - * read lock before rolling over to a new copy of the block. - */ - agent->isDuplicate = false; - lock->verified = false; - lock->updateAdvice = true; - startUnlocking(lock, agent); - return; - } - - /* - * LOCKING -> DEDUPING transition: Continue on the verified dedupe path, - * deduplicating against a location that was previously verified or - * written to. - */ - startDeduping(lock, agent, false); -} - -/** - * Acquire a read lock on the PBN of the block containing candidate duplicate - * data (compressed or uncompressed). If the PBN is already locked for - * writing, the lock attempt is abandoned and isDuplicate will be cleared - * before calling back. this continuation is launched from startLocking(), and - * calls back to finishLocking() on the hash zone thread. - * - * @param completion The completion of the DataVIO attempting to acquire the - * physical block lock on behalf of its hash lock - **/ -static void lockDuplicatePBN(VDOCompletion *completion) -{ - DataVIO *agent = asDataVIO(completion); - PhysicalZone *zone = agent->duplicate.zone; - assertInDuplicateZone(agent); - - setHashZoneCallback(agent, finishLocking, THIS_LOCATION(NULL)); - - // While in the zone that owns it, find out how many additional references - // can be made to the block if it turns out to truly be a duplicate. 
- SlabDepot *depot = getSlabDepot(getVDOFromDataVIO(agent)); - unsigned int incrementLimit = getIncrementLimit(depot, agent->duplicate.pbn); - if (incrementLimit == 0) { - // We could deduplicate against it later if a reference happened to be - // released during verification, but it's probably better to bail out now. - // XXX clearDuplicateLocation()? - agent->isDuplicate = false; - continueDataVIO(agent, VDO_SUCCESS); - return; - } - - PBNLock *lock; - int result = attemptPBNLock(zone, agent->duplicate.pbn, VIO_READ_LOCK, - &lock); - if (result != VDO_SUCCESS) { - continueDataVIO(agent, result); - return; - } - - if (!isPBNReadLock(lock)) { - /* - * There are three cases of write locks: uncompressed data block writes, - * compressed (packed) block writes, and block map page writes. In all - * three cases, we give up on trying to verify the advice and don't bother - * to try deduplicate against the data in the write lock holder. - * - * 1) We don't ever want to try to deduplicate against a block map page. - * - * 2a) It's very unlikely we'd deduplicate against an entire packed block, - * both because of the chance of matching it, and because we don't record - * advice for it, but for the uncompressed representation of all the - * fragments it contains. The only way we'd be getting lock contention is - * if we've written the same representation coincidentally before, had it - * become unreferenced, and it just happened to be packed together from - * compressed writes when we go to verify the lucky advice. Giving up is a - * miniscule loss of potential dedupe. - * - * 2b) If the advice is for a slot of a compressed block, it's about to - * get smashed, and the write smashing it cannot contain our data--it - * would have to be writing on behalf of our hash lock, but that's - * impossible since we're the lock agent. - * - * 3a) If the lock is held by a DataVIO with different data, the advice is - * already stale or is about to become stale. - * - * 3b) If the lock is held by a DataVIO that matches us, we may as well - * either write it ourselves (or reference the copy we already wrote) - * instead of potentially having many duplicates wait for the lock holder - * to write, journal, hash, and finally arrive in the hash lock. All we - * lose is a chance to avoid a UDS update in the very rare case of advice - * for a free block that just happened to be allocated to a DataVIO with - * the same hash. In async mode, there's also a chance to save on a block - * write, at the cost of a block verify. Saving on a full block compare in - * all stale advice cases almost certainly outweighs saving a UDS update - * in a lucky case where advice would have been saved from becoming stale. - */ - // XXX clearDuplicateLocation()? - agent->isDuplicate = false; - continueDataVIO(agent, VDO_SUCCESS); - return; - } - - if (lock->holderCount == 0) { - // Ensure that the newly-locked block is referenced. - Slab *slab = getSlab(depot, agent->duplicate.pbn); - result = acquireProvisionalReference(slab, agent->duplicate.pbn, lock); - if (result != VDO_SUCCESS) { - logWarningWithStringError(result, - "Error acquiring provisional reference for " - "dedupe candidate; aborting dedupe"); - agent->isDuplicate = false; - releasePBNLock(zone, agent->duplicate.pbn, &lock); - continueDataVIO(agent, result); - return; - } - - /* - * The increment limit we grabbed earlier is still valid. The lock now - * holds the rights to acquire all those references. Those rights will be - * claimed by hash locks sharing this read lock. 
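The increment limit mentioned here comes from the slab depot via getIncrementLimit(), whose derivation is not shown in this file. A plausible reading, with the per-block cap as an illustrative assumption rather than the actual VDO constant, is that the limit is simply how far the block's current reference count sits below that cap:

/* Sketch only: the number of additional references a block could accept,
 * given its current reference count. The cap is an illustrative value. */
enum { MAX_BLOCK_REFERENCES_SKETCH = 254 };

static unsigned int incrementLimitSketch(unsigned int currentReferenceCount)
{
  if (currentReferenceCount >= MAX_BLOCK_REFERENCES_SKETCH) {
    return 0; /* fully referenced: nothing left to promise to sharers */
  }
  return MAX_BLOCK_REFERENCES_SKETCH - currentReferenceCount;
}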
- */ - lock->incrementLimit = incrementLimit; - } - - // We've successfully acquired a read lock on behalf of the hash lock, - // so mark it as such. - setDuplicateLock(agent->hashLock, lock); - - /* - * XXX VDOSTORY-190 Optimization: Same as startLocking() lazily changing - * state to save on having to switch back to the hash zone thread. Here we - * could directly launch the block verify, then switch to a hash thread. - */ - continueDataVIO(agent, VDO_SUCCESS); -} - -/** - * Continue deduplication for a hash lock that has obtained valid advice - * of a potential duplicate through its agent. - * - * @param lock The hash lock (currently must be QUERYING) - * @param agent The DataVIO bearing the dedupe advice - **/ -static void startLocking(HashLock *lock, DataVIO *agent) -{ - ASSERT_LOG_ONLY(lock->duplicateLock == NULL, - "must not acquire a duplicate lock when already holding it"); - - setHashLockState(lock, HASH_LOCK_LOCKING); - - /* - * XXX VDOSTORY-190 Optimization: If we arrange to continue on the duplicate - * zone thread when accepting the advice, and don't explicitly change lock - * states (or use an agent-local state, or an atomic), we can avoid a thread - * transition here. - */ - agent->lastAsyncOperation = ACQUIRE_PBN_READ_LOCK; - launchDuplicateZoneCallback(agent, lockDuplicatePBN, THIS_LOCATION(NULL)); -} - -/** - * Re-entry point for the lock agent after it has finished writing or - * compressing its copy of the data block. The agent will never need to dedupe - * against anything, so it's done with the lock, but the lock may not be - * finished with it, as a UDS update might still be needed. - * - * If there are other lock holders, the agent will hand the job to one of them - * and exit, leaving the lock to deduplicate against the just-written block. - * If there are no other lock holders, the agent either exits (and later tears - * down the hash lock), or it remains the agent and updates UDS. - * - * @param lock The hash lock, which must be in state WRITING - * @param agent The DataVIO that wrote its data for the lock - **/ -static void finishWriting(HashLock *lock, DataVIO *agent) -{ - // Dedupe against the data block or compressed block slot the agent wrote. - // Since we know the write succeeded, there's no need to verify it. - lock->duplicate = agent->newMapped; - lock->verified = true; - - if (isCompressed(lock->duplicate.state) && lock->registered) { - // Compression means the location we gave in the UDS query is not the - // location we're using to deduplicate. - lock->updateAdvice = true; - } - - // If there are any waiters, we need to start deduping them. - if (hasWaiters(&lock->waiters)) { - /* - * WRITING -> DEDUPING transition: an asynchronously-written block - * failed to compress, so the PBN lock on the written copy was already - * transferred. The agent is done with the lock, but the lock may - * still need to use it to clean up after rollover. - */ - startDeduping(lock, agent, true); - return; - } - - // There are no waiters and the agent has successfully written, so take a - // step towards being able to release the hash lock (or just release it). - if (lock->updateAdvice) { - /* - * WRITING -> UPDATING transition: There's no waiter and a UDS update is - * needed, so retain the WRITING agent and use it to launch the update. - * The happens on compression, rollover, or the QUERYING agent not having - * an allocation. 
- */ - startUpdating(lock, agent); - } else if (lock->duplicateLock != NULL) { - /* - * WRITING -> UNLOCKING transition: There's no waiter and no update - * needed, but the compressed write gave us a shared duplicate lock that - * we must release. - */ - setDuplicateLocation(agent, lock->duplicate); - startUnlocking(lock, agent); - } else { - /* - * WRITING -> DESTROYING transition: There's no waiter, no update needed, - * and no duplicate lock held, so both the agent and lock have no more - * work to do. The agent will release its allocation lock in cleanup. - */ - // XXX startDestroying(lock, agent); - startBypassing(lock, NULL); - exitHashLock(agent); - } -} - -/** - * Search through the lock waiters for a DataVIO that has an allocation. If - * one is found, swap agents, put the old agent at the head of the wait queue, - * then return the new agent. Otherwise, just return the current agent. - * - * @param lock The hash lock to modify - **/ -static DataVIO *selectWritingAgent(HashLock *lock) -{ - // This should-be-impossible condition is the only cause for - // enqueueDataVIO() to fail later on, where it would be a pain to handle. - int result = ASSERT(!isWaiting(dataVIOAsWaiter(lock->agent)), - "agent must not be waiting"); - if (result != VDO_SUCCESS) { - return lock->agent; - } - - WaitQueue tempQueue; - initializeWaitQueue(&tempQueue); - - // Move waiters to the temp queue one-by-one until we find an allocation. - // Not ideal to search, but it only happens when nearly out of space. - DataVIO *dataVIO; - while (((dataVIO = dequeueLockWaiter(lock)) != NULL) - && !hasAllocation(dataVIO)) { - // Use the lower-level enqueue since we're just moving waiters around. - int result = enqueueWaiter(&tempQueue, dataVIOAsWaiter(dataVIO)); - // The only error is the DataVIO already being on a wait queue, and since - // we just dequeued it, that could only happen due to a memory smash or - // concurrent use of that DataVIO. - ASSERT_LOG_ONLY(result == VDO_SUCCESS, "impossible enqueueWaiter error"); - } - - if (dataVIO != NULL) { - // Move the rest of the waiters over to the temp queue, preserving the - // order they arrived at the lock. - transferAllWaiters(&lock->waiters, &tempQueue); - - // The current agent is being replaced and will have to wait to dedupe; - // make it the first waiter since it was the first to reach the lock. - int result = enqueueDataVIO(&lock->waiters, lock->agent, - THIS_LOCATION(NULL)); - ASSERT_LOG_ONLY(result == VDO_SUCCESS, - "impossible enqueueDataVIO error after isWaiting checked"); - setAgent(lock, dataVIO); - } else { - // No one has an allocation, so keep the current agent. - dataVIO = lock->agent; - } - - // Swap all the waiters back onto the lock's queue. - transferAllWaiters(&tempQueue, &lock->waiters); - return dataVIO; -} - -/** - * Begin the non-duplicate write path for a hash lock that had no advice, - * selecting a DataVIO with an allocation as a new agent, if necessary, - * then resuming the agent on the DataVIO write path. - * - * @param lock The hash lock (currently must be QUERYING) - * @param agent The DataVIO currently acting as the agent for the lock - **/ -static void startWriting(HashLock *lock, DataVIO *agent) -{ - setHashLockState(lock, HASH_LOCK_WRITING); - - // The agent might not have received an allocation and so can't be used for - // writing, but it's entirely possible that one of the waiters did. - if (!hasAllocation(agent)) { - agent = selectWritingAgent(lock); - // If none of the waiters had an allocation, the writes all have to fail. 
- if (!hasAllocation(agent)) { - /* - * XXX VDOSTORY-190 Should we keep a variant of BYPASSING that causes - * new arrivals to fail immediately if they don't have an allocation? It - * might be possible that on some path there would be non-waiters still - * referencing the lock, so it would remain in the map as everything is - * currently spelled, even if the agent and all the waiters release. - */ - startBypassing(lock, agent); - return; - } - } - - // If the agent compresses, it might wait indefinitely in the packer, - // which would be bad if there are any other DataVIOs waiting. - if (hasWaiters(&lock->waiters)) { - // XXX in sync mode, transition directly to LOCKING to start dedupe? - cancelCompression(agent); - } - - /* - * Send the agent to the compress/pack/async-write path in vioWrite. If it - * succeeds, it will return to the hash lock via continueHashLock() and call - * finishWriting(). - */ - compressData(agent); -} - -/** - * Process the result of a UDS query performed by the agent for the lock. This - * continuation is registered in startQuerying(). - * - * @param completion The completion of the DataVIO that performed the query - **/ -static void finishQuerying(VDOCompletion *completion) -{ - DataVIO *agent = asDataVIO(completion); - assertHashLockAgent(agent, __func__); - HashLock *lock = agent->hashLock; - - if (completion->result != VDO_SUCCESS) { - abortHashLock(lock, agent); - return; - } - - if (agent->isDuplicate) { - lock->duplicate = agent->duplicate; - /* - * QUERYING -> LOCKING transition: Valid advice was obtained from UDS. - * Use the QUERYING agent to start the hash lock on the unverified dedupe - * path, verifying that the advice can be used. - */ - startLocking(lock, agent); - } else { - // The agent will be used as the duplicate if has an allocation; if it - // does, that location was posted to UDS, so no update will be needed. - lock->updateAdvice = !hasAllocation(agent); - /* - * QUERYING -> WRITING transition: There was no advice or the advice - * wasn't valid, so try to write or compress the data. - */ - startWriting(lock, agent); - } -} - -/** - * Start deduplication for a hash lock that has finished initializing by - * making the DataVIO that requested it the agent, entering the QUERYING - * state, and using the agent to perform the UDS query on behalf of the lock. - * - * @param lock The initialized hash lock - * @param dataVIO The DataVIO that has just obtained the new lock - **/ -static void startQuerying(HashLock *lock, DataVIO *dataVIO) -{ - setAgent(lock, dataVIO); - setHashLockState(lock, HASH_LOCK_QUERYING); - - VDOCompletion *completion = dataVIOAsCompletion(dataVIO); - dataVIO->lastAsyncOperation = CHECK_FOR_DEDUPLICATION; - setHashZoneCallback(dataVIO, finishQuerying, THIS_LOCATION(NULL)); - completion->layer->checkForDuplication(dataVIO); -} - -/** - * Complain that a DataVIO has entered a HashLock that is in an unimplemented - * or unusable state and continue the DataVIO with an error. 
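selectWritingAgent() above searches the wait queue for a DataVIO that has an allocation while preserving the arrival order of everyone else. The same temporary-queue technique, reduced to a self-contained sketch with a toy FIFO list standing in for the VDO WaitQueue:

#include <stdbool.h>
#include <stddef.h>

/* Toy FIFO queue standing in for the VDO WaitQueue (illustration only). */
typedef struct WaiterSketch {
  struct WaiterSketch *next;
  bool hasAllocation;
} WaiterSketch;

typedef struct {
  WaiterSketch *head;
  WaiterSketch *tail;
} QueueSketch;

static void enqueueSketch(QueueSketch *queue, WaiterSketch *waiter)
{
  waiter->next = NULL;
  if (queue->tail == NULL) {
    queue->head = waiter;
  } else {
    queue->tail->next = waiter;
  }
  queue->tail = waiter;
}

static WaiterSketch *dequeueSketch(QueueSketch *queue)
{
  WaiterSketch *waiter = queue->head;
  if (waiter != NULL) {
    queue->head = waiter->next;
    if (queue->head == NULL) {
      queue->tail = NULL;
    }
    waiter->next = NULL;
  }
  return waiter;
}

/* Move waiters to a temporary queue until one with an allocation is found,
 * then splice the skipped and remaining waiters back in their original
 * relative order, mirroring the shape of selectWritingAgent(). */
static WaiterSketch *findFirstWithAllocationSketch(QueueSketch *waiters)
{
  QueueSketch temp = { NULL, NULL };
  WaiterSketch *found = NULL;
  WaiterSketch *waiter;
  while ((waiter = dequeueSketch(waiters)) != NULL) {
    if (waiter->hasAllocation) {
      found = waiter;
      break;
    }
    enqueueSketch(&temp, waiter);
  }
  while ((waiter = dequeueSketch(waiters)) != NULL) {
    enqueueSketch(&temp, waiter);
  }
  *waiters = temp;
  return found;
}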
- * - * @param lock The hash lock - * @param dataVIO The DataVIO attempting to enter the lock - **/ -static void reportBogusLockState(HashLock *lock, DataVIO *dataVIO) -{ - int result = ASSERT_FALSE("hash lock must not be in unimplemented state %s", - getHashLockStateName(lock->state)); - continueDataVIOIn(dataVIO, result, compressDataCallback); -} - -/**********************************************************************/ -void enterHashLock(DataVIO *dataVIO) -{ - HashLock *lock = dataVIO->hashLock; - switch (lock->state) { - case HASH_LOCK_INITIALIZING: - startQuerying(lock, dataVIO); - break; - - case HASH_LOCK_QUERYING: - case HASH_LOCK_WRITING: - case HASH_LOCK_UPDATING: - case HASH_LOCK_LOCKING: - case HASH_LOCK_VERIFYING: - case HASH_LOCK_UNLOCKING: - // The lock is busy, and can't be shared yet. - waitOnHashLock(lock, dataVIO); - break; - - case HASH_LOCK_BYPASSING: - // Bypass dedupe entirely. - compressData(dataVIO); - break; - - case HASH_LOCK_DEDUPING: - launchDedupe(lock, dataVIO, false); - break; - - case HASH_LOCK_DESTROYING: - // A lock in this state should not be acquired by new VIOs. - reportBogusLockState(lock, dataVIO); - break; - - default: - reportBogusLockState(lock, dataVIO); - } -} - -/**********************************************************************/ -void continueHashLock(DataVIO *dataVIO) -{ - HashLock *lock = dataVIO->hashLock; - // XXX VDOSTORY-190 Eventually we may be able to fold the error handling - // in at this point instead of using a separate entry point for it. - - switch (lock->state) { - case HASH_LOCK_WRITING: - ASSERT_LOG_ONLY(dataVIO == lock->agent, - "only the lock agent may continue the lock"); - finishWriting(lock, dataVIO); - break; - - case HASH_LOCK_DEDUPING: - finishDeduping(lock, dataVIO); - break; - - case HASH_LOCK_BYPASSING: - // This DataVIO has finished the write path and the lock doesn't need it. - // XXX This isn't going to be correct if DEDUPING ever uses BYPASSING. - finishDataVIO(dataVIO, VDO_SUCCESS); - break; - - case HASH_LOCK_INITIALIZING: - case HASH_LOCK_QUERYING: - case HASH_LOCK_UPDATING: - case HASH_LOCK_LOCKING: - case HASH_LOCK_VERIFYING: - case HASH_LOCK_UNLOCKING: - case HASH_LOCK_DESTROYING: - // A lock in this state should never be re-entered. - reportBogusLockState(lock, dataVIO); - break; - - default: - reportBogusLockState(lock, dataVIO); - } -} - -/**********************************************************************/ -void continueHashLockOnError(DataVIO *dataVIO) -{ - // XXX We could simply use continueHashLock() and check for errors in that. - abortHashLock(dataVIO->hashLock, dataVIO); -} - -/** - * Check whether the data in DataVIOs sharing a lock is different than in a - * DataVIO seeking to share the lock, which should only be possible in the - * extremely unlikely case of a hash collision. 
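The comparison itself is delegated to the layer's compareDataVIOs() callback, which is not shown here. For a plain uncompressed block it plausibly reduces to a byte-wise comparison of the two data buffers; the block size below is an assumption for illustration only:

#include <stdbool.h>
#include <string.h>

enum { BLOCK_SIZE_SKETCH = 4096 }; /* assumed block size, for illustration */

/* Two DataVIOs with the same chunk name either carry identical data (a true
 * match) or have collided; only a full-content comparison can tell. */
static bool blockContentsMatchSketch(const char *holderData,
                                     const char *candidateData)
{
  return (memcmp(holderData, candidateData, BLOCK_SIZE_SKETCH) == 0);
}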
- * - * @param lock The lock to check - * @param candidate The DataVIO seeking to share the lock - * - * @return true if the given DataVIO must not share the lock - * because it doesn't have the same data as the lock holders - **/ -static bool isHashCollision(HashLock *lock, DataVIO *candidate) -{ - if (isRingEmpty(&lock->duplicateRing)) { - return false; - } - - DataVIO *lockHolder = dataVIOFromLockNode(lock->duplicateRing.next); - PhysicalLayer *layer = dataVIOAsCompletion(candidate)->layer; - bool collides = !layer->compareDataVIOs(lockHolder, candidate); - - if (collides) { - bumpHashZoneCollisionCount(candidate->hashZone); - } else { - bumpHashZoneDataMatchCount(candidate->hashZone); - } - - return collides; -} - -/**********************************************************************/ -static inline int assertHashLockPreconditions(const DataVIO *dataVIO) -{ - int result = ASSERT(dataVIO->hashLock == NULL, - "must not already hold a hash lock"); - if (result != VDO_SUCCESS) { - return result; - } - result = ASSERT(isRingEmpty(&dataVIO->hashLockNode), - "must not already be a member of a hash lock ring"); - if (result != VDO_SUCCESS) { - return result; - } - return ASSERT(dataVIO->recoverySequenceNumber == 0, - "must not hold a recovery lock when getting a hash lock"); -} - -/**********************************************************************/ -int acquireHashLock(DataVIO *dataVIO) -{ - int result = assertHashLockPreconditions(dataVIO); - if (result != VDO_SUCCESS) { - return result; - } - - HashLock *lock; - result = acquireHashLockFromZone(dataVIO->hashZone, &dataVIO->chunkName, - NULL, &lock); - if (result != VDO_SUCCESS) { - return result; - } - - if (isHashCollision(lock, dataVIO)) { - // Hash collisions are extremely unlikely, but the bogus dedupe would be a - // data corruption. Bypass dedupe entirely by leaving hashLock unset. - // XXX clear hashZone too? - return VDO_SUCCESS; - } - - setHashLock(dataVIO, lock); - return VDO_SUCCESS; -} - -/**********************************************************************/ -void releaseHashLock(DataVIO *dataVIO) -{ - HashLock *lock = dataVIO->hashLock; - if (lock == NULL) { - return; - } - - setHashLock(dataVIO, NULL); - - if (lock->referenceCount > 0) { - // The lock is still in use by other DataVIOs. - return; - } - - setHashLockState(lock, HASH_LOCK_DESTROYING); - returnHashLockToZone(dataVIO->hashZone, &lock); -} - -/** - * Transfer a DataVIO's downgraded allocation PBN lock to the DataVIO's hash - * lock, converting it to a duplicate PBN lock. - * - * @param dataVIO The DataVIO holding the allocation lock to transfer - **/ -static void transferAllocationLock(DataVIO *dataVIO) -{ - ASSERT_LOG_ONLY(dataVIO->newMapped.pbn == getDataVIOAllocation(dataVIO), - "transferred lock must be for the block written"); - - AllocatingVIO *allocatingVIO = dataVIOAsAllocatingVIO(dataVIO); - PBNLock *pbnLock = allocatingVIO->allocationLock; - allocatingVIO->allocationLock = NULL; - allocatingVIO->allocation = ZERO_BLOCK; - - ASSERT_LOG_ONLY(isPBNReadLock(pbnLock), - "must have downgraded the allocation lock before transfer"); - - HashLock *hashLock = dataVIO->hashLock; - hashLock->duplicate = dataVIO->newMapped; - dataVIO->duplicate = dataVIO->newMapped; - - // Since the lock is being transferred, the holder count doesn't change (and - // isn't even safe to examine on this thread). 
- hashLock->duplicateLock = pbnLock; -} - -/**********************************************************************/ -void shareCompressedWriteLock(DataVIO *dataVIO, PBNLock *pbnLock) -{ - ASSERT_LOG_ONLY(getDuplicateLock(dataVIO) == NULL, - "a duplicate PBN lock should not exist when writing"); - ASSERT_LOG_ONLY(isCompressed(dataVIO->newMapped.state), - "lock transfer must be for a compressed write"); - assertInNewMappedZone(dataVIO); - - // First sharer downgrades the lock. - if (!isPBNReadLock(pbnLock)) { - downgradePBNWriteLock(pbnLock); - } - - // Get a share of the PBN lock, ensuring it cannot be released until - // after this DataVIO has had a chance to journal a reference. - dataVIO->duplicate = dataVIO->newMapped; - dataVIO->hashLock->duplicate = dataVIO->newMapped; - setDuplicateLock(dataVIO->hashLock, pbnLock); - - // Claim a reference for this DataVIO, which is necessary since another - // HashLock might start deduplicating against it before our incRef. - bool claimed = claimPBNLockIncrement(pbnLock); - ASSERT_LOG_ONLY(claimed, "impossible to fail to claim an initial increment"); -} diff --git a/vdo/base/hashLock.h b/vdo/base/hashLock.h deleted file mode 100644 index b21e465..0000000 --- a/vdo/base/hashLock.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/hashLock.h#3 $ - */ - -#ifndef HASH_LOCK_H -#define HASH_LOCK_H - -#include "types.h" - -/** - * Get the PBN lock on the duplicate data location for a DataVIO from the - * HashLock the DataVIO holds (if there is one). - * - * @param dataVIO The DataVIO to query - * - * @return The PBN lock on the DataVIO's duplicate location - **/ -PBNLock *getDuplicateLock(DataVIO *dataVIO) - __attribute__((warn_unused_result)); - -/** - * Acquire or share a lock on the hash (chunk name) of the data in a DataVIO, - * updating the DataVIO to reference the lock. This must only be called in the - * correct thread for the zone. In the unlikely case of a hash collision, this - * function will succeed, but the DataVIO will not get a lock reference. - * - * @param dataVIO The DataVIO acquiring a lock on its chunk name - **/ -int acquireHashLock(DataVIO *dataVIO) - __attribute__((warn_unused_result)); - -/** - * Asynchronously process a DataVIO that has just acquired its reference to a - * hash lock. This may place the DataVIO on a wait queue, or it may use the - * DataVIO to perform operations on the lock's behalf. 
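Taken together, the declarations in this header describe one lifecycle per DataVIO. Below is a linear sketch of a hypothetical caller, assuming the surrounding VDO types; the real write path drives these steps asynchronously through callbacks rather than straight-line code:

/* Hypothetical caller; error handling and thread transitions are elided. */
static void hashLockLifecycleSketch(DataVIO *dataVIO)
{
  int result = acquireHashLock(dataVIO);
  if (result != VDO_SUCCESS) {
    return; /* error handling elided in this sketch */
  }

  if (dataVIO->hashLock == NULL) {
    return; /* hash collision: dedupe is bypassed for this DataVIO */
  }

  /* Join the lock; depending on its state this may wait, query UDS,
   * verify advice, write, or deduplicate. */
  enterHashLock(dataVIO);

  /* ... later, after the DataVIO finishes writing, compressing, or
   * deduplicating, it re-enters the lock ... */
  continueHashLock(dataVIO);

  /* During cleanup, drop this DataVIO's share of the lock. */
  releaseHashLock(dataVIO);
}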
- * - * @param dataVIO The DataVIO that has just acquired a lock on its chunk name - **/ -void enterHashLock(DataVIO *dataVIO); - -/** - * Asynchronously continue processing a DataVIO in its hash lock after it has - * finished writing, compressing, or deduplicating, so it can share the result - * with any DataVIOs waiting in the hash lock, or update Albireo, or simply - * release its share of the lock. This must only be called in the correct - * thread for the hash zone. - * - * @param dataVIO The DataVIO to continue processing in its hash lock - **/ -void continueHashLock(DataVIO *dataVIO); - -/** - * Re-enter the hash lock after encountering an error, to clean up the hash - * lock. - * - * @param dataVIO The DataVIO with an error - **/ -void continueHashLockOnError(DataVIO *dataVIO); - -/** - * Release a DataVIO's share of a hash lock, if held, and null out the - * DataVIO's reference to it. This must only be called in the correct thread - * for the hash zone. - * - * If the DataVIO is the only one holding the lock, this also releases any - * resources or locks used by the hash lock (such as a PBN read lock on a - * block containing data with the same hash) and returns the lock to the hash - * zone's lock pool. - * - * @param dataVIO The DataVIO releasing its hash lock - **/ -void releaseHashLock(DataVIO *dataVIO); - -/** - * Make a DataVIO's hash lock a shared holder of the PBN lock on the - * compressed block to which its data was just written. If the lock is still a - * write lock (as it will be for the first share), it will be converted to a - * read lock. This also reserves a reference count increment for the DataVIO. - * - * @param dataVIO The DataVIO which was just compressed - * @param pbnLock The PBN lock on the compressed block - **/ -void shareCompressedWriteLock(DataVIO *dataVIO, PBNLock *pbnLock); - -#endif // HASH_LOCK_H diff --git a/vdo/base/hashLockInternals.h b/vdo/base/hashLockInternals.h deleted file mode 100644 index 67b5634..0000000 --- a/vdo/base/hashLockInternals.h +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/hashLockInternals.h#2 $ - */ - -#ifndef HASH_LOCK_INTERNALS_H -#define HASH_LOCK_INTERNALS_H - -#include "completion.h" -#include "ringNode.h" -#include "types.h" -#include "uds.h" -#include "waitQueue.h" - -typedef enum { - /** State for locks that are not in use or are being initialized. */ - HASH_LOCK_INITIALIZING = 0, - - // This is the sequence of states typically used on the non-dedupe path. - HASH_LOCK_QUERYING, - HASH_LOCK_WRITING, - HASH_LOCK_UPDATING, - - // The remaining states are typically used on the dedupe path in this order. 
- HASH_LOCK_LOCKING, - HASH_LOCK_VERIFYING, - HASH_LOCK_DEDUPING, - HASH_LOCK_UNLOCKING, - - // XXX This is a temporary state denoting a lock which is sending VIOs back - // to the old dedupe and vioWrite pathways. It won't be in the final version - // of VDOSTORY-190. - HASH_LOCK_BYPASSING, - - /** - * Terminal state for locks returning to the pool. Must be last both because - * it's the final state, and also because it's used to count the states. - **/ - HASH_LOCK_DESTROYING, -} HashLockState; - -struct hashLock { - /** When the lock is unused, this RingNode allows the lock to be pooled */ - RingNode poolNode; - - /** The block hash covered by this lock */ - UdsChunkName hash; - - /** - * A ring containing the DataVIOs sharing this lock, all having the same - * chunk name and data block contents, linked by their hashLockNode fields. - **/ - RingNode duplicateRing; - - /** The number of DataVIOs sharing this lock instance */ - VIOCount referenceCount; - - /** The maximum value of referenceCount in the lifetime of this lock */ - VIOCount maxReferences; - - /** The current state of this lock */ - HashLockState state; - - /** True if the UDS index should be updated with new advice */ - bool updateAdvice; - - /** True if the advice has been verified to be a true duplicate */ - bool verified; - - /** True if the lock has already accounted for an initial verification */ - bool verifyCounted; - - /** True if this lock is registered in the lock map (cleared on rollover) */ - bool registered; - - /** - * If verified is false, this is the location of a possible duplicate. - * If verified is true, is is the verified location of a true duplicate. - **/ - ZonedPBN duplicate; - - /** The PBN lock on the block containing the duplicate data */ - PBNLock *duplicateLock; - - /** The DataVIO designated to act on behalf of the lock */ - DataVIO *agent; - - /** - * Other DataVIOs with data identical to the agent who are currently waiting - * for the agent to get the information they all need to deduplicate--either - * against each other, or against an existing duplicate on disk. - **/ - WaitQueue waiters; -}; - -/** - * Initialize a HashLock instance which has been newly allocated. - * - * @param lock The lock to initialize - **/ -static inline void initializeHashLock(HashLock *lock) -{ - initializeRing(&lock->poolNode); - initializeRing(&lock->duplicateRing); - initializeWaitQueue(&lock->waiters); -} - -/** - * Get the string representation of a hash lock state. - * - * @param state The hash lock state - * - * @return The short string representing the state - **/ -const char *getHashLockStateName(HashLockState state) - __attribute__((warn_unused_result)); - -#endif // HASH_LOCK_INTERNALS_H diff --git a/vdo/base/hashZone.c b/vdo/base/hashZone.c deleted file mode 100644 index 61345a7..0000000 --- a/vdo/base/hashZone.c +++ /dev/null @@ -1,351 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/hashZone.c#3 $ - */ - -#include "hashZone.h" - -#include "logger.h" -#include "memoryAlloc.h" -#include "numeric.h" -#include "permassert.h" - -#include "constants.h" -#include "dataVIO.h" -#include "hashLock.h" -#include "hashLockInternals.h" -#include "pointerMap.h" -#include "ringNode.h" -#include "statistics.h" -#include "threadConfig.h" -#include "types.h" -#include "vdoInternal.h" - -enum { - LOCK_POOL_CAPACITY = MAXIMUM_USER_VIOS, -}; - -/** - * These fields are only modified by the locks sharing the hash zone thread, - * but are queried by other threads. - **/ -typedef struct atomicHashLockStatistics { - /** Number of times the UDS advice proved correct */ - Atomic64 dedupeAdviceValid; - - /** Number of times the UDS advice proved incorrect */ - Atomic64 dedupeAdviceStale; - - /** Number of writes with the same data as another in-flight write */ - Atomic64 concurrentDataMatches; - - /** Number of writes whose hash collided with an in-flight write */ - Atomic64 concurrentHashCollisions; -} AtomicHashLockStatistics; - -struct hashZone { - /** Which hash zone this is */ - ZoneCount zoneNumber; - - /** The thread ID for this zone */ - ThreadID threadID; - - /** Mapping from chunkName fields to HashLocks */ - PointerMap *hashLockMap; - - /** Ring containing all unused HashLocks */ - RingNode lockPool; - - /** Statistics shared by all hash locks in this zone */ - AtomicHashLockStatistics statistics; - - /** Array of all HashLocks */ - HashLock *lockArray; -}; - -/** - * Implements PointerKeyComparator. - **/ -static bool compareKeys(const void *thisKey, const void *thatKey) -{ - // Null keys are not supported. - return (memcmp(thisKey, thatKey, sizeof(UdsChunkName)) == 0); -} - -/** - * Implements PointerKeyComparator. - **/ -static uint32_t hashKey(const void *key) -{ - const UdsChunkName *name = key; - /* - * Use a fragment of the chunk name as a hash code. It must not overlap with - * fragments used elsewhere to ensure uniform distributions. 
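The hash code below is extracted with getUInt32LE(), which is defined elsewhere in the tree. For reference, a conventional little-endian 32-bit load of four bytes looks like the following, though the actual helper may differ in detail:

#include <stdint.h>

/* Assemble four bytes, least-significant first, into a 32-bit value. */
static inline uint32_t loadLittleEndian32Sketch(const uint8_t *bytes)
{
  return ((uint32_t) bytes[0])
         | ((uint32_t) bytes[1] << 8)
         | ((uint32_t) bytes[2] << 16)
         | ((uint32_t) bytes[3] << 24);
}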
- */ - // XXX pick an offset in the chunk name that isn't used elsewhere - return getUInt32LE(&name->name[4]); -} - -/**********************************************************************/ -static inline HashLock *asHashLock(RingNode *poolNode) -{ - STATIC_ASSERT(offsetof(HashLock, poolNode) == 0); - return (HashLock *) poolNode; -} - -/**********************************************************************/ -int makeHashZone(VDO *vdo, ZoneCount zoneNumber, HashZone **zonePtr) -{ - HashZone *zone; - int result = ALLOCATE(1, HashZone, __func__, &zone); - if (result != VDO_SUCCESS) { - return result; - } - - result = makePointerMap(LOCK_MAP_CAPACITY, 0, compareKeys, hashKey, - &zone->hashLockMap); - if (result != VDO_SUCCESS) { - freeHashZone(&zone); - return result; - } - - zone->zoneNumber = zoneNumber; - zone->threadID = getHashZoneThread(getThreadConfig(vdo), zoneNumber); - initializeRing(&zone->lockPool); - - result = ALLOCATE(LOCK_POOL_CAPACITY, HashLock, "HashLock array", - &zone->lockArray); - if (result != VDO_SUCCESS) { - freeHashZone(&zone); - return result; - } - - for (VIOCount i = 0; i < LOCK_POOL_CAPACITY; i++) { - HashLock *lock = &zone->lockArray[i]; - initializeHashLock(lock); - pushRingNode(&zone->lockPool, &lock->poolNode); - } - - *zonePtr = zone; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freeHashZone(HashZone **zonePtr) -{ - if (*zonePtr == NULL) { - return; - } - - HashZone *zone = *zonePtr; - freePointerMap(&zone->hashLockMap); - FREE(zone->lockArray); - FREE(zone); - *zonePtr = NULL; -} - -/**********************************************************************/ -ZoneCount getHashZoneNumber(const HashZone *zone) -{ - return zone->zoneNumber; -} - -/**********************************************************************/ -ThreadID getHashZoneThreadID(const HashZone *zone) -{ - return zone->threadID; -} - -/**********************************************************************/ -HashLockStatistics getHashZoneStatistics(const HashZone *zone) -{ - const AtomicHashLockStatistics *atoms = &zone->statistics; - return (HashLockStatistics) { - .dedupeAdviceValid = relaxedLoad64(&atoms->dedupeAdviceValid), - .dedupeAdviceStale = relaxedLoad64(&atoms->dedupeAdviceStale), - .concurrentDataMatches = relaxedLoad64(&atoms->concurrentDataMatches), - .concurrentHashCollisions - = relaxedLoad64(&atoms->concurrentHashCollisions), - }; -} - -/** - * Return a hash lock to the zone's pool and null out the reference to it. - * - * @param [in] zone The zone from which the lock was borrowed - * @param [in,out] lockPtr The last reference to the lock being returned - **/ -static void returnHashLockToPool(HashZone *zone, HashLock **lockPtr) -{ - HashLock *lock = *lockPtr; - *lockPtr = NULL; - - memset(lock, 0, sizeof(*lock)); - initializeHashLock(lock); - pushRingNode(&zone->lockPool, &lock->poolNode); -} - -/**********************************************************************/ -int acquireHashLockFromZone(HashZone *zone, - const UdsChunkName *hash, - HashLock *replaceLock, - HashLock **lockPtr) -{ - // Borrow and prepare a lock from the pool so we don't have to do two - // PointerMap accesses in the common case of no lock contention. 
- HashLock *newLock = asHashLock(popRingNode(&zone->lockPool)); - int result = ASSERT(newLock != NULL, - "never need to wait for a free hash lock"); - if (result != VDO_SUCCESS) { - return result; - } - - // Fill in the hash of the new lock so we can map it, since we have to use - // the hash as the map key. - newLock->hash = *hash; - - HashLock *lock; - result = pointerMapPut(zone->hashLockMap, &newLock->hash, newLock, - (replaceLock != NULL), (void **) &lock); - if (result != VDO_SUCCESS) { - returnHashLockToPool(zone, &newLock); - return result; - } - - if (replaceLock != NULL) { - // XXX on mismatch put the old lock back and return a severe error - ASSERT_LOG_ONLY(lock == replaceLock, - "old lock must have been in the lock map"); - // XXX check earlier and bail out? - ASSERT_LOG_ONLY(replaceLock->registered, - "old lock must have been marked registered"); - replaceLock->registered = false; - } - - if (lock == replaceLock) { - lock = newLock; - lock->registered = true; - } else { - // There's already a lock for the hash, so we don't need the borrowed lock. - returnHashLockToPool(zone, &newLock); - } - - *lockPtr = lock; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void returnHashLockToZone(HashZone *zone, HashLock **lockPtr) -{ - HashLock *lock = *lockPtr; - *lockPtr = NULL; - - if (lock->registered) { - HashLock *removed = pointerMapRemove(zone->hashLockMap, &lock->hash); - ASSERT_LOG_ONLY(lock == removed, - "hash lock being released must have been mapped"); - } else { - ASSERT_LOG_ONLY(lock != pointerMapGet(zone->hashLockMap, &lock->hash), - "unregistered hash lock must not be in the lock map"); - } - - ASSERT_LOG_ONLY(!hasWaiters(&lock->waiters), - "hash lock returned to zone must have no waiters"); - ASSERT_LOG_ONLY((lock->duplicateLock == NULL), - "hash lock returned to zone must not reference a PBN lock"); - ASSERT_LOG_ONLY((lock->state == HASH_LOCK_DESTROYING), - "returned hash lock must not be in use with state %s", - getHashLockStateName(lock->state)); - ASSERT_LOG_ONLY(isRingEmpty(&lock->poolNode), - "hash lock returned to zone must not be in a pool ring"); - ASSERT_LOG_ONLY(isRingEmpty(&lock->duplicateRing), - "hash lock returned to zone must not reference DataVIOs"); - - returnHashLockToPool(zone, &lock); -} - -/** - * Dump a compact description of HashLock to the log if the lock is not on the - * free list. - * - * @param lock The hash lock to dump - **/ -static void dumpHashLock(const HashLock *lock) -{ - if (!isRingEmpty(&lock->poolNode)) { - // This lock is on the free list. - return; - } - - // Necessarily cryptic since we can log a lot of these. First three chars of - // state is unambiguous. 'U' indicates a lock not registered in the map. - const char *state = getHashLockStateName(lock->state); - logInfo(" hl %" PRIptr ": %3.3s %c%llu/%u rc=%u wc=%zu agt=%" PRIptr, - (const void *) lock, - state, - (lock->registered ? 'D' : 'U'), - lock->duplicate.pbn, - lock->duplicate.state, - lock->referenceCount, - countWaiters(&lock->waiters), - (void *) lock->agent); -} - -/**********************************************************************/ -void bumpHashZoneValidAdviceCount(HashZone *zone) -{ - // Must only be mutated on the hash zone thread. - relaxedAdd64(&zone->statistics.dedupeAdviceValid, 1); -} - -/**********************************************************************/ -void bumpHashZoneStaleAdviceCount(HashZone *zone) -{ - // Must only be mutated on the hash zone thread. 
- relaxedAdd64(&zone->statistics.dedupeAdviceStale, 1); -} - -/**********************************************************************/ -void bumpHashZoneDataMatchCount(HashZone *zone) -{ - // Must only be mutated on the hash zone thread. - relaxedAdd64(&zone->statistics.concurrentDataMatches, 1); -} - -/**********************************************************************/ -void bumpHashZoneCollisionCount(HashZone *zone) -{ - // Must only be mutated on the hash zone thread. - relaxedAdd64(&zone->statistics.concurrentHashCollisions, 1); -} - -/**********************************************************************/ -void dumpHashZone(const HashZone *zone) -{ - if (zone->hashLockMap == NULL) { - logInfo("HashZone %u: NULL map", zone->zoneNumber); - return; - } - - logInfo("HashZone %u: mapSize=%zu", - zone->zoneNumber, pointerMapSize(zone->hashLockMap)); - for (VIOCount i = 0; i < LOCK_POOL_CAPACITY; i++) { - dumpHashLock(&zone->lockArray[i]); - } -} diff --git a/vdo/base/hashZone.h b/vdo/base/hashZone.h deleted file mode 100644 index ac1b695..0000000 --- a/vdo/base/hashZone.h +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/hashZone.h#1 $ - */ - -#ifndef HASH_ZONE_H -#define HASH_ZONE_H - -#include "uds.h" - -#include "statistics.h" -#include "types.h" - -/** - * Create a hash zone. - * - * @param [in] vdo The VDO to which the zone will belong - * @param [in] zoneNumber The number of the zone to create - * @param [out] zonePtr A pointer to hold the new HashZone - * - * @return VDO_SUCCESS or an error code - **/ -int makeHashZone(VDO *vdo, ZoneCount zoneNumber, HashZone **zonePtr) - __attribute__((warn_unused_result)); - -/** - * Free a hash zone and null out the reference to it. - * - * @param zonePtr A pointer to the zone to free - **/ -void freeHashZone(HashZone **zonePtr); - -/** - * Get the zone number of a hash zone. - * - * @param zone The zone - * - * @return The number of the zone - **/ -ZoneCount getHashZoneNumber(const HashZone *zone) - __attribute__((warn_unused_result)); - -/** - * Get the ID of a hash zone's thread. - * - * @param zone The zone - * - * @return The zone's thread ID - **/ -ThreadID getHashZoneThreadID(const HashZone *zone) - __attribute__((warn_unused_result)); - -/** - * Get the statistics for this hash zone. - * - * @param zone The hash zone to query - * - * @return A copy of the current statistics for the hash zone - **/ -HashLockStatistics getHashZoneStatistics(const HashZone *zone) - __attribute__((warn_unused_result)); - -/** - * Get the lock for the hash (chunk name) of the data in a DataVIO, or if one - * does not exist (or if we are explicitly rolling over), initialize a new - * lock for the hash and register it in the zone. 
This must only be called in - * the correct thread for the zone. - * - * @param [in] zone The zone responsible for the hash - * @param [in] hash The hash to lock - * @param [in] replaceLock If non-NULL, the lock already registered for the - * hash which should be replaced by the new lock - * @param [out] lockPtr A pointer to receive the hash lock - * - * @return VDO_SUCCESS or an error code - **/ -int acquireHashLockFromZone(HashZone *zone, - const UdsChunkName *hash, - HashLock *replaceLock, - HashLock **lockPtr) - __attribute__((warn_unused_result)); - -/** - * Return a hash lock to the zone it was borrowed from, remove it from the - * zone's lock map, returning it to the pool, and nulling out the reference to - * it. This must only be called when the lock has been completely released, - * and only in the correct thread for the zone. - * - * @param [in] zone The zone from which the lock was borrowed - * @param [in,out] lockPtr The lock that is no longer in use - **/ -void returnHashLockToZone(HashZone *zone, HashLock **lockPtr); - -/** - * Increment the valid advice count in the hash zone statistics. - * Must only be called from the hash zone thread. - * - * @param zone The hash zone of the lock that received valid advice - **/ -void bumpHashZoneValidAdviceCount(HashZone *zone); - -/** - * Increment the stale advice count in the hash zone statistics. - * Must only be called from the hash zone thread. - * - * @param zone The hash zone of the lock that received stale advice - **/ -void bumpHashZoneStaleAdviceCount(HashZone *zone); - -/** - * Increment the concurrent dedupe count in the hash zone statistics. - * Must only be called from the hash zone thread. - * - * @param zone The hash zone of the lock that matched a new DataVIO - **/ -void bumpHashZoneDataMatchCount(HashZone *zone); - -/** - * Increment the concurrent hash collision count in the hash zone statistics. - * Must only be called from the hash zone thread. - * - * @param zone The hash zone of the lock that rejected a colliding DataVIO - **/ -void bumpHashZoneCollisionCount(HashZone *zone); - -/** - * Dump information about a hash zone to the log for debugging. - * - * @param zone The zone to dump - **/ -void dumpHashZone(const HashZone *zone); - -#endif // HASH_ZONE_H diff --git a/vdo/base/header.c b/vdo/base/header.c deleted file mode 100644 index 8f0582b..0000000 --- a/vdo/base/header.c +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/header.c#5 $ - */ - -#include "header.h" - -#include "logger.h" -#include "permassert.h" -#include "statusCodes.h" - -/**********************************************************************/ -int validateVersion(VersionNumber expectedVersion, - VersionNumber actualVersion, - const char *componentName) -{ - if (!areSameVersion(expectedVersion, actualVersion)) { - return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, - "%s version mismatch," - " expected %d.%d, got %d.%d", - componentName, - expectedVersion.majorVersion, - expectedVersion.minorVersion, - actualVersion.majorVersion, - actualVersion.minorVersion); - } - return VDO_SUCCESS; -} - -/**********************************************************************/ -int validateHeader(const Header *expectedHeader, - const Header *actualHeader, - bool exactSize, - const char *componentName) -{ - if (expectedHeader->id != actualHeader->id) { - return logErrorWithStringError(VDO_INCORRECT_COMPONENT, - "%s ID mismatch, expected %d, got %d", - componentName, - expectedHeader->id, - actualHeader->id); - } - - int result = validateVersion(expectedHeader->version, - actualHeader->version, - componentName); - if (result != VDO_SUCCESS) { - return result; - } - - if ((expectedHeader->size > actualHeader->size) - || (exactSize && (expectedHeader->size < actualHeader->size))) { - return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, - "%s size mismatch, expected %zu, got %zu", - componentName, - expectedHeader->size, - actualHeader->size); - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -int encodeHeader(const Header *header, Buffer *buffer) -{ - if (!ensureAvailableSpace(buffer, ENCODED_HEADER_SIZE)) { - return UDS_BUFFER_ERROR; - } - - int result = putUInt32LEIntoBuffer(buffer, header->id); - if (result != UDS_SUCCESS) { - return result; - } - - result = encodeVersionNumber(header->version, buffer); - if (result != UDS_SUCCESS) { - return result; - } - - return putUInt64LEIntoBuffer(buffer, header->size); -} - -/**********************************************************************/ -int encodeVersionNumber(VersionNumber version, Buffer *buffer) -{ - PackedVersionNumber packed = packVersionNumber(version); - return putBytes(buffer, sizeof(packed), &packed); -} - -/**********************************************************************/ -int decodeHeader(Buffer *buffer, Header *header) -{ - ComponentID id; - int result = getUInt32LEFromBuffer(buffer, &id); - if (result != UDS_SUCCESS) { - return result; - } - - VersionNumber version; - result = decodeVersionNumber(buffer, &version); - if (result != UDS_SUCCESS) { - return result; - } - - uint64_t size; - result = getUInt64LEFromBuffer(buffer, &size); - if (result != UDS_SUCCESS) { - return result; - } - - *header = (Header) { - .id = id, - .version = version, - .size = size, - }; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int decodeVersionNumber(Buffer *buffer, VersionNumber *version) -{ - PackedVersionNumber packed; - int result = getBytesFromBuffer(buffer, sizeof(packed), &packed); - if (result != UDS_SUCCESS) { - return result; - } - - *version = unpackVersionNumber(packed); - return UDS_SUCCESS; -} diff --git a/vdo/base/header.h b/vdo/base/header.h deleted file mode 100644 index d5b4f0e..0000000 --- a/vdo/base/header.h +++ /dev/null @@ -1,226 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/header.h#4 $ - */ - -#ifndef HEADER_H -#define HEADER_H - -#include "buffer.h" -#include "numeric.h" - -#include "types.h" - -/** - * An in-memory representation of a version number for versioned structures on - * disk. - * - * A version number consists of two portions, a major version and a - * minor version. Any format change which does not require an explicit - * upgrade step from the previous version should increment the minor - * version. Any format change which either requires an explicit - * upgrade step, or is wholly incompatible (i.e. can not be upgraded - * to), should increment the major version, and set the minor version - * to 0. - **/ -typedef struct { - uint32_t majorVersion; - uint32_t minorVersion; -} __attribute__((packed)) VersionNumber; - -/** - * A packed, machine-independent, on-disk representation of a VersionNumber. - * Both fields are stored in little-endian byte order. - **/ -typedef struct { - byte majorVersion[4]; - byte minorVersion[4]; -} __attribute__((packed)) PackedVersionNumber; - -/** - * The registry of component ids for use in headers - **/ -typedef enum { - SUPER_BLOCK = 0, - FIXED_LAYOUT = 1, - RECOVERY_JOURNAL = 2, - SLAB_DEPOT = 3, - BLOCK_MAP = 4, - GEOMETRY_BLOCK = 5, -} ComponentID; - -/** - * The header for versioned data stored on disk. - **/ -typedef struct { - ComponentID id; // The component this is a header for - VersionNumber version; // The version of the data format - size_t size; // The size of the data following this header -} __attribute__((packed)) Header; - -enum { - ENCODED_HEADER_SIZE = sizeof(Header), -}; - -/** - * Check whether two version numbers are the same. - * - * @param versionA The first version - * @param versionB The second version - * - * @return true if the two versions are the same - **/ -static inline bool areSameVersion(VersionNumber versionA, - VersionNumber versionB) -{ - return ((versionA.majorVersion == versionB.majorVersion) - && (versionA.minorVersion == versionB.minorVersion)); -} - -/** - * Check whether an actual version is upgradable to an expected version. - * An actual version is upgradable if its major number is expected but - * its minor number differs, and the expected version's minor number - * is greater than the actual version's minor number. - * - * @param expectedVersion The expected version - * @param actualVersion The version being validated - * - * @return true if the actual version is upgradable - **/ -static inline bool isUpgradableVersion(VersionNumber expectedVersion, - VersionNumber actualVersion) -{ - return ((expectedVersion.majorVersion == actualVersion.majorVersion) - && (expectedVersion.minorVersion > actualVersion.minorVersion)); -} - -/** - * Check whether a version matches an expected version. 
Logs an error - * describing a mismatch. - * - * @param expectedVersion The expected version - * @param actualVersion The version being validated - * @param componentName The name of the component or the calling function - * (for error logging) - * - * @return VDO_SUCCESS if the versions are the same - * VDO_UNSUPPORTED_VERSION if the versions don't match - **/ -int validateVersion(VersionNumber expectedVersion, - VersionNumber actualVersion, - const char *componentName) - __attribute__((warn_unused_result)); - -/** - * Check whether a header matches expectations. Logs an error describing the - * first mismatch found. - * - * @param expectedHeader The expected header - * @param actualHeader The header being validated - * @param exactSize If true, the size fields of the two headers must be - * the same, otherwise actualSize >= expectedSize is OK - * @param componentName The name of the component or the calling function - * (for error logging) - * - * @return VDO_SUCCESS if the header meets expectations - * VDO_INCORRECT_COMPONENT if the component ids don't match - * VDO_UNSUPPORTED_VERSION if the versions or sizes don't match - **/ -int validateHeader(const Header *expectedHeader, - const Header *actualHeader, - bool exactSize, - const char *componentName) - __attribute__((warn_unused_result)); - -/** - * Encode a header into a buffer. - * - * @param header The header to encode - * @param buffer The buffer in which to encode the header - * - * @return UDS_SUCCESS or an error - **/ -int encodeHeader(const Header *header, Buffer *buffer) - __attribute__((warn_unused_result)); - -/** - * Encode a version number into a buffer. - * - * @param version The version to encode - * @param buffer The buffer in which to encode the version - * - * @return UDS_SUCCESS or an error - **/ -int encodeVersionNumber(VersionNumber version, Buffer *buffer) - __attribute__((warn_unused_result)); - -/** - * Decode a header from a buffer. - * - * @param [in] buffer The buffer from which to decode the header - * @param [out] header The header to decode - * - * @return UDS_SUCCESS or an error - **/ -int decodeHeader(Buffer *buffer, Header *header) - __attribute__((warn_unused_result)); - -/** - * Decode a version number from a buffer. - * - * @param buffer The buffer from which to decode the version - * @param version The version structure to decode into - * - * @return UDS_SUCCESS or an error - **/ -int decodeVersionNumber(Buffer *buffer, VersionNumber *version) - __attribute__((warn_unused_result)); - -/** - * Convert a VersionNumber to its packed on-disk representation. - * - * @param version The version number to convert - * - * @return the platform-independent representation of the version - **/ -static inline PackedVersionNumber packVersionNumber(VersionNumber version) -{ - PackedVersionNumber packed; - storeUInt32LE(packed.majorVersion, version.majorVersion); - storeUInt32LE(packed.minorVersion, version.minorVersion); - return packed; -} - -/** - * Convert a PackedVersionNumber to its native in-memory representation. 
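A minimal round-trip sketch using the helpers in this header, together with unpackVersionNumber() defined just below; it assumes a hosted environment with assert() and is not part of the original interface:

#include <assert.h>

static void versionRoundTripSketch(void)
{
  VersionNumber current = { .majorVersion = 2, .minorVersion = 1 };

  /* Pack to the little-endian on-disk form and recover it unchanged. */
  PackedVersionNumber onDisk = packVersionNumber(current);
  VersionNumber loaded = unpackVersionNumber(onDisk);
  assert(areSameVersion(current, loaded));

  /* A 2.1 structure is upgradable to 2.3, but not to 3.0. */
  VersionNumber newerMinor = { .majorVersion = 2, .minorVersion = 3 };
  VersionNumber newerMajor = { .majorVersion = 3, .minorVersion = 0 };
  assert(isUpgradableVersion(newerMinor, loaded));
  assert(!isUpgradableVersion(newerMajor, loaded));
}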
- * - * @param version The version number to convert - * - * @return the platform-independent representation of the version - **/ -static inline VersionNumber unpackVersionNumber(PackedVersionNumber version) -{ - return (VersionNumber) { - .majorVersion = getUInt32LE(version.majorVersion), - .minorVersion = getUInt32LE(version.minorVersion), - }; -} - -#endif // HEADER_H diff --git a/vdo/base/heap.c b/vdo/base/heap.c deleted file mode 100644 index 0928023..0000000 --- a/vdo/base/heap.c +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/heap.c#2 $ - */ - -#include "heap.h" - -#include "errors.h" -#include "logger.h" -#include "numeric.h" - -#include "statusCodes.h" - -/**********************************************************************/ -void initializeHeap(Heap *heap, - HeapComparator *comparator, - HeapSwapper *swapper, - void *array, - size_t capacity, - size_t elementSize) -{ - *heap = (Heap) { - .comparator = comparator, - .swapper = swapper, - .capacity = capacity, - .elementSize = elementSize, - }; - if (array != NULL) { - // Calculating child indexes is simplified by pretending the element array - // is 1-based. - heap->array = ((byte *) array - elementSize); - } -} - -/**********************************************************************/ -static void siftHeapDown(Heap *heap, size_t topNode, size_t lastNode) -{ - // Keep sifting until the sub-heap rooted at topNode has no children. - size_t leftChild; - while ((leftChild = (2 * topNode)) <= lastNode) { - // If there are two children, select the largest child to swap with. - size_t swapNode = leftChild; - if (leftChild < lastNode) { - size_t rightChild = leftChild + heap->elementSize; - if (heap->comparator(&heap->array[leftChild], - &heap->array[rightChild]) < 0) { - swapNode = rightChild; - } - } - - // Stop sifting if topNode is at least as large as its largest child, - // which means the heap invariant was restored by the previous swap. - if (heap->comparator(&heap->array[topNode], &heap->array[swapNode]) >= 0) { - return; - } - - // Swap the element we've been sifting down with the larger child. - heap->swapper(&heap->array[topNode], &heap->array[swapNode]); - - // Descend into the sub-heap rooted at that child, going around the loop - // again in place of a tail-recursive call to siftHeapDown(). - topNode = swapNode; - } - - // We sifted the element all the way to a leaf node of the heap, so the heap - // invariant has now been restored. 
-} - -/**********************************************************************/ -void buildHeap(Heap *heap, size_t count) -{ - heap->count = minSizeT(count, heap->capacity); - - if ((heap->count < 2) || (heap->elementSize == 0)) { - return; - } - - /* - * All the leaf nodes are trivially valid sub-heaps. Starting with the parent - * of the right-most leaf node, restore the heap invariant in that sub-heap - * by sifting the top node of the sub-heap down into one of its children's - * valid sub-heaps (or not, if the top node is already larger than its - * children). Continue iterating through all the interior nodes in the heap, - * in sort of a reverse breadth-first traversal, restoring the heap - * invariant for each (increasingly larger) sub-heap until we reach the root - * of the heap. Once we sift the root node down into one of its two valid - * children, the entire heap must be valid, by induction. - * - * Even though we operate on every node and potentially perform an O(log N) - * traversal for each node, the combined probabilities of actually needing - * to do a swap and the heights of the sub-heaps sum to a constant, so - * restoring a heap from the bottom-up like this has only O(N) complexity. - */ - size_t size = heap->elementSize; - size_t lastParent = size * (heap->count / 2); - size_t lastNode = size * heap->count; - for (size_t topNode = lastParent; topNode > 0; topNode -= size) { - siftHeapDown(heap, topNode, lastNode); - } -} - -/**********************************************************************/ -bool popMaxHeapElement(Heap *heap, void *elementPtr) -{ - if (heap->count == 0) { - return false; - } - - size_t rootNode = (heap->elementSize * 1); - size_t lastNode = (heap->elementSize * heap->count); - - // Return the maximum element (the root of the heap) if the caller wanted it. - if (elementPtr != NULL) { - memcpy(elementPtr, &heap->array[rootNode], heap->elementSize); - } - - // Move the right-most leaf node to the vacated root node, reducing the - // number of elements by one and violating the heap invariant. - if (rootNode != lastNode) { - memcpy(&heap->array[rootNode], &heap->array[lastNode], heap->elementSize); - } - heap->count -= 1; - lastNode -= heap->elementSize; - - // Restore the heap invariant by sifting the root back down into the heap. - siftHeapDown(heap, rootNode, lastNode); - return true; -} - -/**********************************************************************/ -static inline size_t siftAndSort(Heap *heap, size_t rootNode, size_t lastNode) -{ - /* - * We have a valid heap, so the largest unsorted element is now at the top - * of the heap. That element belongs at the start of the partially-sorted - * array, preceding all the larger elements that we've already removed - * from the heap. Swap that largest unsorted element with the the - * right-most leaf node in the heap, moving it to its sorted position in - * the array. - */ - heap->swapper(&heap->array[rootNode], &heap->array[lastNode]); - // The sorted list is now one element larger and valid. The heap is - // one element smaller, and invalid. - lastNode -= heap->elementSize; - // Restore the heap invariant by sifting the swapped element back down - // into the heap. - siftHeapDown(heap, rootNode, lastNode); - return lastNode; -} - -/**********************************************************************/ -size_t sortHeap(Heap *heap) -{ - // All zero-length records are identical and therefore already sorted, as - // are empty or singleton arrays. 
-  if ((heap->count < 2) || (heap->elementSize == 0)) {
-    return heap->count;
-  }
-
-  // Get the byte array offset of the root node, and the right-most leaf node
-  // in the 1-based array of records that will form the heap.
-  size_t rootNode = (heap->elementSize * 1);
-  size_t lastNode = (heap->elementSize * heap->count);
-
-  while (lastNode > rootNode) {
-    lastNode = siftAndSort(heap, rootNode, lastNode);
-  }
-
-  size_t count = heap->count;
-  heap->count = 0;
-  return count;
-}
-
-/**********************************************************************/
-void *sortNextHeapElement(Heap *heap)
-{
-  if ((heap->count == 0) || (heap->elementSize == 0)) {
-    return NULL;
-  }
-
-  // Get the byte array offset of the root node, and the right-most leaf node
-  // in the 1-based array of records that will form the heap.
-  size_t rootNode = (heap->elementSize * 1);
-  size_t lastNode = (heap->elementSize * heap->count);
-  if (heap->count > 1) {
-    siftAndSort(heap, rootNode, lastNode);
-  }
-  heap->count--;
-
-  return &heap->array[lastNode];
-}
diff --git a/vdo/base/heap.h b/vdo/base/heap.h
deleted file mode 100644
index 916f017..0000000
--- a/vdo/base/heap.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2020 Red Hat, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- *
- * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/heap.h#2 $
- */
-
-#ifndef HEAP_H
-#define HEAP_H
-
-#include "common.h"
-
-/**
- * Prototype for functions which compare two array elements. All the time
- * complexity claims in this module assume this operation has O(1) time
- * complexity.
- *
- * @param item1 The first element to compare
- * @param item2 The second element to compare
- *
- * @return An integer which is less than, equal to, or greater than 0
- *         depending on whether item1 is less than, equal to, or greater
- *         than item2, respectively
- **/
-typedef int HeapComparator(const void *item1, const void *item2);
-
-/**
- * Prototype for functions which swap two array elements.
- *
- * @param item1 The first element to swap
- * @param item2 The second element to swap
- **/
-typedef void HeapSwapper(void *item1, void *item2);
-
-/**
- * A heap array can be any array of fixed-length elements in which the heap
- * invariant can be established. In a max-heap, every node must be at least
- * as large as its children. Once that invariant is established in an array
- * by calling buildHeap(), all the other heap operations may be used on that
- * array.
- **/ -typedef struct heap { - /** the 1-based array of heap elements (nodes) */ - byte *array; - /** the function to use to compare two elements */ - HeapComparator *comparator; - /** the function to use to swap two elements */ - HeapSwapper *swapper; - /** the maximum number of elements that can be stored */ - size_t capacity; - /** the size of every element (in bytes) */ - size_t elementSize; - /** the current number of elements in the heap */ - size_t count; -} Heap; - -/** - * Initialize an binary heap by wrapping it around an array of elements. - * - * The heap will not own the array it wraps. Use buildHeap() subsequently to - * arrange any elements contained in the array into a valid heap. - * - * @param heap The heap to initialize - * @param comparator The function to use to compare two heap elements - * @param swapper The function to use to swap two heap elements - * @param array The array of elements (not modified by this call) - * @param capacity The maximum number of elements which fit in the array - * @param elementSize The size of every array element, in bytes - **/ -void initializeHeap(Heap *heap, - HeapComparator *comparator, - HeapSwapper *swapper, - void *array, - size_t capacity, - size_t elementSize); - -/** - * Build a max-heap in place in an array (heapify it) by re-ordering the - * elements to establish the heap invariant. Before calling this function, - * first copy the elements to be arranged into a heap into the array that was - * passed to initializeHeap(). This operation has O(N) time complexity in the - * number of elements in the array. - * - * @param heap The heap to build - * @param count The number of elements in the array to build into a heap - **/ -void buildHeap(Heap *heap, size_t count); - -/** - * Check whether the heap is currently empty. - * - * @param heap The heap to query - * - * @return true if there are no elements in the heap - **/ -static inline bool isHeapEmpty(const Heap *heap) -{ - return (heap->count == 0); -} - -/** - * Remove the largest element from the top of the heap and restore the heap - * invariant on the remaining elements. This operation has O(log2(N)) time - * complexity. - * - * @param [in] heap The heap to modify - * @param [out] elementPtr A pointer to receive the largest element (may be - * NULL if the caller just wishes to discard it) - * - * @return false if the heap was empty, so no element was removed - **/ -bool popMaxHeapElement(Heap *heap, void *elementPtr); - -/** - * Sort the elements contained in a heap. - * - * This function re-orders the elements contained in the heap to a sorted - * array in-place by repeatedly popping the maximum element off the heap and - * moving it to the spot vacated at the end of the heap array. When the - * function returns, the heap will be empty and the array will contain the - * elements in sorted order, from heap minimum to heap maximum. The sort is - * unstable--relative ordering of equal keys is not preserved. This operation - * has O(N*log2(N)) time complexity. - * - * @param heap The heap containing the elements to sort - * - * @return the number of elements that were sorted - **/ -size_t sortHeap(Heap *heap); - -/** - * Gets the next sorted heap element and returns a pointer to it, in O(log2(N)) - * time. - * - * @param heap The heap to sort one more step - * - * @return a pointer to the element sorted, or NULL if already fully sorted. 
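/*
 * Editor's aside, not part of the original header: a minimal sketch of the
 * intended calling sequence for the interface declared above, sorting a small
 * array of ints in place. The comparator, swapper, and function names here are
 * hypothetical helpers written only for this illustration.
 */
static int compareInts(const void *item1, const void *item2)
{
  int a = *((const int *) item1);
  int b = *((const int *) item2);
  return ((a < b) ? -1 : ((a > b) ? 1 : 0));
}

static void swapInts(void *item1, void *item2)
{
  int temp = *((int *) item1);
  *((int *) item1) = *((int *) item2);
  *((int *) item2) = temp;
}

static size_t sortIntsExample(void)
{
  int values[] = { 3, 1, 4, 1, 5 };
  Heap heap;
  initializeHeap(&heap, compareInts, swapInts, values, 5, sizeof(int));
  buildHeap(&heap, 5);      // establish the max-heap invariant in O(N)
  return sortHeap(&heap);   // values[] is now { 1, 1, 3, 4, 5 }; returns 5
}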
- **/ -void *sortNextHeapElement(Heap *heap); - -#endif /* HEAP_H */ diff --git a/vdo/base/intMap.c b/vdo/base/intMap.c deleted file mode 100644 index 2c690a6..0000000 --- a/vdo/base/intMap.c +++ /dev/null @@ -1,661 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/intMap.c#1 $ - */ - -/** - * Hash table implementation of a map from integers to pointers, implemented - * using the Hopscotch Hashing algorithm by Herlihy, Shavit, and Tzafrir (see - * http://en.wikipedia.org/wiki/Hopscotch_hashing). This implementation does - * not contain any of the locking/concurrency features of the algorithm, just - * the collision resolution scheme. - * - * Hopscotch Hashing is based on hashing with open addressing and linear - * probing. All the entries are stored in a fixed array of buckets, with no - * dynamic allocation for collisions. Unlike linear probing, all the entries - * that hash to a given bucket are stored within a fixed neighborhood starting - * at that bucket. Chaining is effectively represented as a bit vector - * relative to each bucket instead of as pointers or explicit offsets. - * - * When an empty bucket cannot be found within a given neighborhood, - * subsequent neighborhoods are searched, and one or more entries will "hop" - * into those neighborhoods. When this process works, an empty bucket will - * move into the desired neighborhood, allowing the entry to be added. When - * that process fails (typically when the buckets are around 90% full), the - * table must be resized and the all entries rehashed and added to the - * expanded table. - * - * Unlike linear probing, the number of buckets that must be searched in the - * worst case has a fixed upper bound (the size of the neighborhood). Those - * entries occupy a small number of memory cache lines, leading to improved - * use of the cache (fewer misses on both successful and unsuccessful - * searches). Hopscotch hashing outperforms linear probing at much higher load - * factors, so even with the increased memory burden for maintaining the hop - * vectors, less memory is needed to achieve that performance. Hopscotch is - * also immune to "contamination" from deleting entries since entries are - * genuinely removed instead of being replaced by a placeholder. - * - * The published description of the algorithm used a bit vector, but the paper - * alludes to an offset scheme which is used by this implementation. Since the - * entries in the neighborhood are within N entries of the hash bucket at the - * start of the neighborhood, a pair of small offset fields each log2(N) bits - * wide is all that's needed to maintain the hops as a linked list. In order - * to encode "no next hop" (i.e. 
NULL) as the natural initial value of zero, - * the offsets are biased by one (i.e. 0 => NULL, 1 => offset=0, 2 => - * offset=1, etc.) We can represent neighborhoods of up to 255 entries with - * just 8+8=16 bits per entry. The hop list is sorted by hop offset so the - * first entry in the list is always the bucket closest to the start of the - * neighborhood. - * - * While individual accesses tend to be very fast, the table resize operations - * are very very expensive. If an upper bound on the latency of adding an - * entry to the table is needed, we either need to ensure the table is - * pre-sized to be large enough so no resize is ever needed, or we'll need to - * develop an approach to incrementally resize the table. - **/ - -#include "intMap.h" - -#include "errors.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "numeric.h" -#include "permassert.h" - -enum { - DEFAULT_CAPACITY = 16, // the number of neighborhoods in a new table - NEIGHBORHOOD = 255, // the number of buckets in each neighborhood - MAX_PROBES = 1024, // limit on the number of probes for a free bucket - NULL_HOP_OFFSET = 0, // the hop offset value terminating the hop list - DEFAULT_LOAD = 75 // a compromise between memory use and performance -}; - -/** - * Buckets are packed together to reduce memory usage and improve cache - * efficiency. It would be tempting to encode the hop offsets separately and - * maintain alignment of key/value pairs, but it's crucial to keep the hop - * fields near the buckets that they use them so they'll tend to share cache - * lines. - **/ -typedef struct __attribute__((packed)) bucket { - uint8_t firstHop; // the biased offset of the first entry in the hop list - // of the neighborhood that hashes to this bucket - uint8_t nextHop; // the biased offset of the next bucket in the hop list - - uint64_t key; // the key stored in this bucket - void *value; // the value stored in this bucket (NULL if empty) -} Bucket; - -/** - * The concrete definition of the opaque IntMap type. To avoid having to wrap - * the neighborhoods of the last entries back around to the start of the - * bucket array, we allocate a few more buckets at the end of the array - * instead, which is why capacity and bucketCount are different. - **/ -struct intMap { - size_t size; // the number of entries stored in the map - size_t capacity; // the number of neighborhoods in the map - size_t bucketCount; // the number of buckets in the bucket array - Bucket *buckets; // the array of hash buckets -}; - -/** - * This is the Google CityHash 16-byte hash mixing function. - * - * @param input1 the first input value - * @param input2 the second input value - * - * @return a hash of the two inputs - **/ -static uint64_t mix(uint64_t input1, uint64_t input2) -{ - static const uint64_t CITY_MULTIPLIER = 0x9ddfea08eb382d69ULL; - - uint64_t hash = (input1 ^ input2); - hash *= CITY_MULTIPLIER; - hash ^= (hash >> 47); - hash ^= input2; - hash *= CITY_MULTIPLIER; - hash ^= (hash >> 47); - hash *= CITY_MULTIPLIER; - return hash; -} - -/** - * Calculate a 64-bit non-cryptographic hash value for the provided 64-bit - * integer key. The implementation is based on Google's CityHash, only - * handling the specific case of an 8-byte input. - * - * @param key the mapping key - * - * @return the hash of the mapping key - **/ -static uint64_t hashKey(uint64_t key) -{ - // Aliasing restrictions forbid us from casting pointer types, so use a - // union to convert a single uint64_t to two uint32_t values. 
- union { - uint64_t u64; - uint32_t u32[2]; - } pun = { .u64 = key }; - return mix(sizeof(key) + (((uint64_t) pun.u32[0]) << 3), pun.u32[1]); -} - -/** - * Initialize an IntMap. - * - * @param map the map to initialize - * @param capacity the initial capacity of the map - * - * @return UDS_SUCCESS or an error code - **/ -static int allocateBuckets(IntMap *map, size_t capacity) -{ - map->size = 0; - map->capacity = capacity; - - // Allocate NEIGHBORHOOD - 1 extra buckets so the last bucket can have a - // full neighborhood without have to wrap back around to element zero. - map->bucketCount = capacity + (NEIGHBORHOOD - 1); - return ALLOCATE(map->bucketCount, Bucket, "IntMap buckets", &map->buckets); -} - -/**********************************************************************/ -int makeIntMap(size_t initialCapacity, - unsigned int initialLoad, - IntMap **mapPtr) -{ - // Use the default initial load if the caller did not specify one. - if (initialLoad == 0) { - initialLoad = DEFAULT_LOAD; - } - if (initialLoad > 100) { - return UDS_INVALID_ARGUMENT; - } - - IntMap *map; - int result = ALLOCATE(1, IntMap, "IntMap", &map); - if (result != UDS_SUCCESS) { - return result; - } - - // Use the default capacity if the caller did not specify one. - size_t capacity = (initialCapacity > 0) ? initialCapacity : DEFAULT_CAPACITY; - - // Scale up the capacity by the specified initial load factor. - // (i.e to hold 1000 entries at 80% load we need a capacity of 1250) - capacity = capacity * 100 / initialLoad; - - result = allocateBuckets(map, capacity); - if (result != UDS_SUCCESS) { - freeIntMap(&map); - return result; - } - - *mapPtr = map; - return UDS_SUCCESS; -} - -/** - * Free the bucket array for the map. - * - * @param map the map whose bucket array is to be freed - **/ -static void freeBuckets(IntMap *map) -{ - FREE(map->buckets); - map->buckets = NULL; -} - -/**********************************************************************/ -void freeIntMap(IntMap **mapPtr) -{ - if (*mapPtr != NULL) { - freeBuckets(*mapPtr); - FREE(*mapPtr); - *mapPtr = NULL; - } -} - -/**********************************************************************/ -size_t intMapSize(const IntMap *map) -{ - return map->size; -} - -/** - * Convert a biased hop offset within a neighborhood to a pointer to the - * bucket it references. - * - * @param neighborhood the first bucket in the neighborhood - * @param hopOffset the biased hop offset to the desired bucket - * - * @return NULL if hopOffset is zero, otherwise a pointer to - * the bucket in the neighborhood at hopOffset - 1 - **/ -static Bucket *dereferenceHop(Bucket *neighborhood, unsigned int hopOffset) -{ - if (hopOffset == NULL_HOP_OFFSET) { - return NULL; - } - - STATIC_ASSERT(NULL_HOP_OFFSET == 0); - return &neighborhood[hopOffset - 1]; -} - -/** - * Add a bucket into the hop list for the neighborhood, inserting it into the - * list so the hop list remains sorted by hop offset. - * - * @param neighborhood the first bucket in the neighborhood - * @param newBucket the bucket to add to the hop list - **/ -static void insertInHopList(Bucket *neighborhood, Bucket *newBucket) -{ - // Zero indicates a NULL hop offset, so bias the hop offset by one. - int hopOffset = 1 + (newBucket - neighborhood); - - // Handle the special case of adding a bucket at the start of the list. 
- int nextHop = neighborhood->firstHop; - if ((nextHop == NULL_HOP_OFFSET) || (nextHop > hopOffset)) { - newBucket->nextHop = nextHop; - neighborhood->firstHop = hopOffset; - return; - } - - // Search the hop list for the insertion point that maintains the sort - // order. - for (;;) { - Bucket *bucket = dereferenceHop(neighborhood, nextHop); - nextHop = bucket->nextHop; - - if ((nextHop == NULL_HOP_OFFSET) || (nextHop > hopOffset)) { - newBucket->nextHop = nextHop; - bucket->nextHop = hopOffset; - return; - } - } -} - -/** - * Select and return the hash bucket for a given search key. - * - * @param map the map to search - * @param key the mapping key - **/ -static Bucket *selectBucket(const IntMap *map, uint64_t key) -{ - // Calculate a good hash value for the provided key. We want exactly 32 - // bits, so mask the result. - uint64_t hash = hashKey(key) & 0xFFFFFFFF; - - /* - * Scale the 32-bit hash to a bucket index by treating it as a binary - * fraction and multiplying that by the capacity. If the hash is uniformly - * distributed over [0 .. 2^32-1], then (hash * capacity / 2^32) should be - * uniformly distributed over [0 .. capacity-1]. The multiply and shift is - * much faster than a divide (modulus) on X86 CPUs. - */ - return &map->buckets[(hash * map->capacity) >> 32]; -} - -/** - * Search the hop list associated with given hash bucket for a given search - * key. If the key is found, returns a pointer to the entry (bucket or - * collision), otherwise returns NULL. - * - * @param [in] map the map being searched - * @param [in] bucket the map bucket to search for the key - * @param [in] key the mapping key - * @param [out] previousPtr if not NULL, a pointer in which to - * store the bucket in the list preceding the one - * that had the matching key - * - * @return an entry that matches the key, or NULL if not found - **/ -static Bucket *searchHopList(IntMap *map __attribute__((unused)), - Bucket *bucket, - uint64_t key, - Bucket **previousPtr) -{ - Bucket *previous = NULL; - unsigned int nextHop = bucket->firstHop; - while (nextHop != NULL_HOP_OFFSET) { - // Check the neighboring bucket indexed by the offset for the desired key. - Bucket *entry = dereferenceHop(bucket, nextHop); - if ((key == entry->key) && (entry->value != NULL)) { - if (previousPtr != NULL) { - *previousPtr = previous; - } - return entry; - } - nextHop = entry->nextHop; - previous = entry; - } - return NULL; -} - -/**********************************************************************/ -void *intMapGet(IntMap *map, uint64_t key) -{ - Bucket *match = searchHopList(map, selectBucket(map, key), key, NULL); - return ((match != NULL) ? match->value : NULL); -} - -/** - * Increase the number of hash buckets and rehash all the existing entries, - * storing them in the new buckets. - * - * @param map the map to resize - **/ -static int resizeBuckets(IntMap *map) -{ - // Copy the top-level map data to the stack. - IntMap oldMap = *map; - - // Re-initialize the map to be empty and 50% larger. - size_t newCapacity = map->capacity / 2 * 3; - logInfo("%s: attempting resize from %zu to %zu, current size=%zu", - __func__, map->capacity, newCapacity, map->size); - int result = allocateBuckets(map, newCapacity); - if (result != UDS_SUCCESS) { - *map = oldMap; - return result; - } - - // Populate the new hash table from the entries in the old bucket array. 
- for (size_t i = 0; i < oldMap.bucketCount; i++) { - Bucket *entry = &oldMap.buckets[i]; - if (entry->value == NULL) { - continue; - } - - result = intMapPut(map, entry->key, entry->value, true, NULL); - if (result != UDS_SUCCESS) { - // Destroy the new partial map and restore the map from the stack. - freeBuckets(map); - *map = oldMap; - return result; - } - } - - // Destroy the old bucket array. - freeBuckets(&oldMap); - return UDS_SUCCESS; -} - -/** - * Probe the bucket array starting at the given bucket for the next empty - * bucket, returning a pointer to it. NULL will be returned if - * the search reaches the end of the bucket array or if the number of linear - * probes exceeds a specified limit. - * - * @param map the map containing the buckets to search - * @param bucket the bucket at which to start probing - * @param maxProbes the maximum number of buckets to search - * - * @return the next empty bucket, or NULL if the search failed - **/ -static Bucket *findEmptyBucket(IntMap *map, - Bucket *bucket, - unsigned int maxProbes) -{ - // Limit the search to either the nearer of the end of the bucket array or a - // fixed distance beyond the initial bucket. - size_t remaining = &map->buckets[map->bucketCount] - bucket; - Bucket *sentinel = &bucket[minSizeT(remaining, maxProbes)]; - - for (Bucket *entry = bucket; entry < sentinel; entry++) { - if (entry->value == NULL) { - return entry; - } - } - return NULL; -} - -/** - * Move an empty bucket closer to the start of the bucket array. This searches - * the neighborhoods that contain the empty bucket for a non-empty bucket - * closer to the start of the array. If such a bucket is found, this swaps the - * two buckets by moving the entry to the empty bucket. - * - * @param map the map containing the bucket - * @param hole the empty bucket to fill with an entry that precedes it in one - * of its enclosing neighborhoods - * - * @return the bucket that was vacated by moving its entry to the provided - * hole, or NULL if no entry could be moved - **/ -static Bucket *moveEmptyBucket(IntMap *map __attribute__((unused)), - Bucket *hole) -{ - /* - * Examine every neighborhood that the empty bucket is part of, starting - * with the one in which it is the last bucket. No boundary check is needed - * for the negative array arithmetic since this function is only called when - * hole is at least NEIGHBORHOOD cells deeper into the array than a valid - * bucket. - */ - for (Bucket *bucket = &hole[1 - NEIGHBORHOOD]; bucket < hole; bucket++) { - // Find the entry that is nearest to the bucket, which means it will be - // nearest to the hash bucket whose neighborhood is full. - Bucket *newHole = dereferenceHop(bucket, bucket->firstHop); - if (newHole == NULL) { - // There are no buckets in this neighborhood that are in use by this one - // (they must all be owned by overlapping neighborhoods). - continue; - } - - // Skip this bucket if its first entry is actually further away than the - // hole that we're already trying to fill. - if (hole < newHole) { - continue; - } - - /* - * We've found an entry in this neighborhood that we can "hop" further - * away, moving the hole closer to the hash bucket, if not all the way - * into its neighborhood. - */ - - // The entry that will be the new hole is the first bucket in the list, - // so setting firstHop is all that's needed remove it from the list. - bucket->firstHop = newHole->nextHop; - newHole->nextHop = NULL_HOP_OFFSET; - - // Move the entry into the original hole. 
- hole->key = newHole->key; - hole->value = newHole->value; - newHole->value = NULL; - - // Insert the filled hole into the hop list for the neighborhood. - insertInHopList(bucket, hole); - return newHole; - } - - // We couldn't find an entry to relocate to the hole. - return NULL; -} - -/** - * Find and update any existing mapping for a given key, returning the value - * associated with the key in the provided pointer. - * - * @param [in] map the IntMap to attempt to modify - * @param [in] neighborhood the first bucket in the neighborhood that - * would contain the search key - * @param [in] key the key with which to associate the new value - * @param [in] newValue the value to be associated with the key - * @param [in] update whether to overwrite an existing value - * @param [out] oldValuePtr a pointer in which to store the old value - * (unmodified if no mapping was found) - * - * @return true if the map contains a mapping for the key - * false if it does not - **/ -static bool updateMapping(IntMap *map, - Bucket *neighborhood, - uint64_t key, - void *newValue, - bool update, - void **oldValuePtr) -{ - Bucket *bucket = searchHopList(map, neighborhood, key, NULL); - if (bucket == NULL) { - // There is no bucket containing the key in the neighborhood. - return false; - } - - // Return the value of the current mapping (if desired) and update the - // mapping with the new value (if desired). - if (oldValuePtr != NULL) { - *oldValuePtr = bucket->value; - } - if (update) { - bucket->value = newValue; - } - return true; -} - -/** - * Find an empty bucket in a specified neighborhood for a new mapping or - * attempt to re-arrange mappings so there is such a bucket. This operation - * may fail (returning NULL) if an empty bucket is not available or could not - * be relocated to the neighborhood. - * - * @param map the IntMap to search or modify - * @param neighborhood the first bucket in the neighborhood in which - * an empty bucket is needed for a new mapping - * - * @return a pointer to an empty bucket in the desired neighborhood, or - * NULL if a vacancy could not be found or arranged - **/ -static Bucket *findOrMakeVacancy(IntMap *map, Bucket *neighborhood) -{ - // Probe within and beyond the neighborhood for the first empty bucket. - Bucket *hole = findEmptyBucket(map, neighborhood, MAX_PROBES); - - // Keep trying until the empty bucket is in the bucket's neighborhood or we - // are unable to move it any closer by swapping it with a filled bucket. - while (hole != NULL) { - int distance = hole - neighborhood; - if (distance < NEIGHBORHOOD) { - // We've found or relocated an empty bucket close enough to the initial - // hash bucket to be referenced by its hop vector. - return hole; - } - - // The nearest empty bucket isn't within the neighborhood that must - // contain the new entry, so try to swap it with bucket that is closer. - hole = moveEmptyBucket(map, hole); - } - - return NULL; -} - -/**********************************************************************/ -int intMapPut(IntMap *map, - uint64_t key, - void *newValue, - bool update, - void **oldValuePtr) -{ - if (newValue == NULL) { - return UDS_INVALID_ARGUMENT; - } - - // Select the bucket at the start of the neighborhood that must contain any - // entry for the provided key. - Bucket *neighborhood = selectBucket(map, key); - - // Check whether the neighborhood already contains an entry for the key, in - // which case we optionally update it, returning the old value. 
- if (updateMapping(map, neighborhood, key, newValue, update, oldValuePtr)) { - return UDS_SUCCESS; - } - - /* - * Find an empty bucket in the desired neighborhood for the new entry or - * re-arrange entries in the map so there is such a bucket. This operation - * will usually succeed; the loop body will only be executed on the rare - * occasions that we have to resize the map. - */ - Bucket *bucket; - while ((bucket = findOrMakeVacancy(map, neighborhood)) == NULL) { - /* - * There is no empty bucket in which to put the new entry in the current - * map, so we're forced to allocate a new bucket array with a larger - * capacity, re-hash all the entries into those buckets, and try again (a - * very expensive operation for large maps). - */ - int result = resizeBuckets(map); - if (result != UDS_SUCCESS) { - return result; - } - - // Resizing the map invalidates all pointers to buckets, so recalculate - // the neighborhood pointer. - neighborhood = selectBucket(map, key); - } - - // Put the new entry in the empty bucket, adding it to the neighborhood. - bucket->key = key; - bucket->value = newValue; - insertInHopList(neighborhood, bucket); - map->size += 1; - - // There was no existing entry, so there was no old value to be returned. - if (oldValuePtr != NULL) { - *oldValuePtr = NULL; - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -void *intMapRemove(IntMap *map, uint64_t key) -{ - // Select the bucket to search and search it for an existing entry. - Bucket *bucket = selectBucket(map, key); - Bucket *previous; - Bucket *victim = searchHopList(map, bucket, key, &previous); - - if (victim == NULL) { - // There is no matching entry to remove. - return NULL; - } - - // We found an entry to remove. Save the mapped value to return later and - // empty the bucket. - map->size -= 1; - void *value = victim->value; - victim->value = NULL; - victim->key = 0; - - // The victim bucket is now empty, but it still needs to be spliced out of - // the hop list. - if (previous == NULL) { - // The victim is the head of the list, so swing firstHop. - bucket->firstHop = victim->nextHop; - } else { - previous->nextHop = victim->nextHop; - } - victim->nextHop = NULL_HOP_OFFSET; - - return value; -} diff --git a/vdo/base/intMap.h b/vdo/base/intMap.h deleted file mode 100644 index 0b18209..0000000 --- a/vdo/base/intMap.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/intMap.h#1 $ - */ - -#ifndef INT_MAP_H -#define INT_MAP_H - -#include "common.h" - -/** - * IntMap associates pointers (void *) with integer keys - * (uint64_t). NULL pointer values are not - * supported. 
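/*
 * Editor's aside, not part of the original header: a minimal sketch of the
 * usual IntMap lifecycle using the functions declared later in this header.
 * The function name and the stored string are hypothetical, and error
 * handling is abbreviated to keep the example short.
 */
static int intMapExample(void)
{
  IntMap *map;
  int result = makeIntMap(0, 0, &map);   // zeros select the default capacity and load
  if (result != UDS_SUCCESS) {
    return result;
  }

  static char payload[] = "payload";
  result = intMapPut(map, 42, payload, true, NULL);
  if (result == UDS_SUCCESS) {
    void *found = intMapGet(map, 42);       // found == payload
    void *removed = intMapRemove(map, 42);  // removed == payload; the map is empty again
    (void) found;
    (void) removed;
  }

  freeIntMap(&map);                         // also sets map to NULL
  return result;
}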
- * - * The map is implemented as hash table, which should provide constant-time - * insert, query, and remove operations, although the insert may occasionally - * grow the table, which is linear in the number of entries in the map. The - * table will grow as needed to hold new entries, but will not shrink as - * entries are removed. - **/ - -typedef struct intMap IntMap; - -/** - * Allocate and initialize an IntMap. - * - * @param [in] initialCapacity the number of entries the map should - * initially be capable of holding (zero tells - * the map to use its own small default) - * @param [in] initialLoad the load factor of the map, expressed as an - * integer percentage (typically in the range - * 50 to 90, with zero telling the map to use - * its own default) - * @param [out] mapPtr a pointer to hold the new IntMap - * - * @return UDS_SUCCESS or an error code - **/ -int makeIntMap(size_t initialCapacity, - unsigned int initialLoad, - IntMap **mapPtr) - __attribute__((warn_unused_result)); - -/** - * Free an IntMap and null out the reference to it. NOTE: The map does not own - * the pointer values stored in the map and they are not freed by this call. - * - * @param [in,out] mapPtr the reference to the IntMap to free - **/ -void freeIntMap(IntMap **mapPtr); - -/** - * Get the number of entries stored in an IntMap. - * - * @param map the IntMap to query - * - * @return the number of entries in the map - **/ -size_t intMapSize(const IntMap *map); - -/** - * Retrieve the value associated with a given key from the IntMap. - * - * @param map the IntMap to query - * @param key the key to look up - * - * @return the value associated with the given key, or NULL - * if the key is not mapped to any value - **/ -void *intMapGet(IntMap *map, uint64_t key); - -/** - * Try to associate a value (a pointer) with an integer in an IntMap. If the - * map already contains a mapping for the provided key, the old value is - * only replaced with the specified value if update is true. In either case - * the old value is returned. If the map does not already contain a value for - * the specified key, the new value is added regardless of the value of update. - * - * @param [in] map the IntMap to attempt to modify - * @param [in] key the key with which to associate the new value - * @param [in] newValue the value to be associated with the key - * @param [in] update whether to overwrite an existing value - * @param [out] oldValuePtr a pointer in which to store either the old value - * (if the key was already mapped) or - * NULL if the map did not contain the - * key; NULL may be provided if the - * caller does not need to know the old value - * - * @return UDS_SUCCESS or an error code - **/ -int intMapPut(IntMap *map, - uint64_t key, - void *newValue, - bool update, - void **oldValuePtr) - __attribute__((warn_unused_result)); - -/** - * Remove the mapping for a given key from the IntMap. - * - * @param map the IntMap from which to remove the mapping - * @param key the key whose mapping is to be removed - * - * @return the value that was associated with the key, or - * NULL if it was not mapped - **/ -void *intMapRemove(IntMap *map, uint64_t key); - -#endif /* INT_MAP_H */ diff --git a/vdo/base/journalPoint.h b/vdo/base/journalPoint.h deleted file mode 100644 index 30d44cd..0000000 --- a/vdo/base/journalPoint.h +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/journalPoint.h#2 $ - */ - -#ifndef JOURNAL_POINT_H -#define JOURNAL_POINT_H - -#include "numeric.h" -#include "types.h" - -typedef uint16_t JournalEntryCount; - -/** - * The absolute position of an entry in a recovery journal or slab journal. - **/ -typedef struct { - SequenceNumber sequenceNumber; - JournalEntryCount entryCount; -} JournalPoint; - -/** - * A packed, platform-independent encoding of a JournalPoint. - **/ -typedef struct { - /** - * The packed representation is the little-endian 64-bit representation of - * the low-order 48 bits of the sequence number, shifted up 16 bits, or'ed - * with the 16-bit entry count. - * - * Very long-term, the top 16 bits of the sequence number may not always be - * zero, as this encoding assumes--see BZ 1523240. - **/ - byte encodedPoint[8]; -} __attribute__((packed)) PackedJournalPoint; - -/** - * Move the given journal point forward by one entry. - * - * @param point the journal point to adjust - * @param entriesPerBlock the number of entries in one full block - **/ -static inline void advanceJournalPoint(JournalPoint *point, - JournalEntryCount entriesPerBlock) -{ - point->entryCount++; - if (point->entryCount == entriesPerBlock) { - point->sequenceNumber++; - point->entryCount = 0; - } -} - -/** - * Check whether a journal point is valid. - * - * @param point the journal point - * - * @return true if the journal point is valid - **/ -static inline bool isValidJournalPoint(const JournalPoint *point) -{ - return ((point != NULL) && (point->sequenceNumber > 0)); -} - -/** - * Check whether the first point precedes the second point. - * - * @param first the first journal point - * @param second the second journal point - - * - * @return true if the first point precedes the second point. - **/ -static inline bool beforeJournalPoint(const JournalPoint *first, - const JournalPoint *second) -{ - return ((first->sequenceNumber < second->sequenceNumber) - || ((first->sequenceNumber == second->sequenceNumber) - && (first->entryCount < second->entryCount))); -} - -/** - * Check whether the first point is the same as the second point. - * - * @param first the first journal point - * @param second the second journal point - * - * @return true if both points reference the same logical - * position of an entry the journal - **/ -static inline bool areEquivalentJournalPoints(const JournalPoint *first, - const JournalPoint *second) -{ - return ((first->sequenceNumber == second->sequenceNumber) - && (first->entryCount == second->entryCount)); -} - -/** - * Encode the journal location represented by a JournalPoint into a - * PackedJournalPoint. 
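/*
 * Editor's aside, not part of the original file: a worked example of the
 * encoding performed below. With sequenceNumber = 5 and entryCount = 3, the
 * helper computes native = (5 << 16) | 3 = 0x50003 and stores it little-endian
 * in encodedPoint; unpackJournalPoint() recovers the two fields by shifting
 * right 16 bits and masking with 0xffff.
 */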
- *
- * @param unpacked The unpacked input point
- * @param packed   The packed output point
- **/
-static inline void packJournalPoint(const JournalPoint *unpacked,
-                                    PackedJournalPoint *packed)
-{
-  uint64_t native = ((unpacked->sequenceNumber << 16) | unpacked->entryCount);
-  storeUInt64LE(packed->encodedPoint, native);
-}
-
-/**
- * Decode the journal location represented by a PackedJournalPoint into a
- * JournalPoint.
- *
- * @param packed   The packed input point
- * @param unpacked The unpacked output point
- **/
-static inline void unpackJournalPoint(const PackedJournalPoint *packed,
-                                      JournalPoint *unpacked)
-{
-  uint64_t native = getUInt64LE(packed->encodedPoint);
-  unpacked->sequenceNumber = (native >> 16);
-  unpacked->entryCount = (native & 0xffff);
-}
-
-#endif // JOURNAL_POINT_H
diff --git a/vdo/base/lockCounter.c b/vdo/base/lockCounter.c
deleted file mode 100644
index e762576..0000000
--- a/vdo/base/lockCounter.c
+++ /dev/null
@@ -1,391 +0,0 @@
-/*
- * Copyright (c) 2020 Red Hat, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- *
- * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/lockCounter.c#3 $
- */
-
-#include "lockCounter.h"
-
-#include "atomic.h"
-#include "memoryAlloc.h"
-
-/**
- * LockCounter is intended to keep all of the locks for the blocks in the
- * recovery journal. The per-zone counters are all kept in a single array which
- * is arranged by zone (i.e. zone 0's lock 0 is at index 0, zone 0's lock 1 is
- * at index 1, and zone 1's lock 0 is at index 'locks'). This arrangement is
- * intended to minimize cache-line contention for counters from different
- * zones.
- *
- * The locks are implemented as a single object instead of as a lock counter
- * per lock both to afford this opportunity to reduce cache line contention and
- * also to eliminate the need to have a completion per lock.
- *
- * Lock sets are laid out with the set for the recovery journal first, followed
- * by the logical zones, and then the physical zones.
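/*
 * Editor's aside, not part of the original file: a worked example of the
 * layout described above, assuming locks = 8. Logical zone 0's counters for
 * locks 0..7 occupy indexes 0..7 of logicalCounters, and logical zone 2's
 * counter for lock 5 is at index (8 * 2) + 5 = 21, matching the arithmetic in
 * getCounter() below.
 */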
- **/ -typedef enum lockCounterState { - LOCK_COUNTER_STATE_NOT_NOTIFYING = 0, - LOCK_COUNTER_STATE_NOTIFYING, - LOCK_COUNTER_STATE_SUSPENDED, -} LockCounterState; - -struct lockCounter { - /** The completion for notifying the owner of a lock release */ - VDOCompletion completion; - /** The number of logical zones which may hold locks */ - ZoneCount logicalZones; - /** The number of physical zones which may hold locks */ - ZoneCount physicalZones; - /** The number of locks */ - BlockCount locks; - /** Whether the lock release notification is in flight */ - Atomic32 state; - /** The number of logical zones which hold each lock */ - Atomic32 *logicalZoneCounts; - /** The number of physical zones which hold each lock */ - Atomic32 *physicalZoneCounts; - /** The per-zone, per-lock counts for the journal zone */ - uint16_t *journalCounters; - /** The per-zone, per-lock decrement counts for the journal zone */ - Atomic32 *journalDecrementCounts; - /** The per-zone, per-lock reference counts for logical zones */ - uint16_t *logicalCounters; - /** The per-zone, per-lock reference counts for physical zones */ - uint16_t *physicalCounters; -}; - -/**********************************************************************/ -int makeLockCounter(PhysicalLayer *layer, - void *parent, - VDOAction callback, - ThreadID threadID, - ZoneCount logicalZones, - ZoneCount physicalZones, - BlockCount locks, - LockCounter **lockCounterPtr) -{ - LockCounter *lockCounter; - - int result = ALLOCATE(1, LockCounter, __func__, &lockCounter); - if (result != VDO_SUCCESS) { - return result; - } - - result = ALLOCATE(locks, uint16_t, __func__, &lockCounter->journalCounters); - if (result != VDO_SUCCESS) { - freeLockCounter(&lockCounter); - return result; - } - - result = ALLOCATE(locks, Atomic32, __func__, - &lockCounter->journalDecrementCounts); - if (result != VDO_SUCCESS) { - freeLockCounter(&lockCounter); - return result; - } - - result = ALLOCATE(locks * logicalZones, uint16_t, __func__, - &lockCounter->logicalCounters); - if (result != VDO_SUCCESS) { - freeLockCounter(&lockCounter); - return result; - } - - result = ALLOCATE(locks, Atomic32, __func__, - &lockCounter->logicalZoneCounts); - if (result != VDO_SUCCESS) { - freeLockCounter(&lockCounter); - return result; - } - - result = ALLOCATE(locks * physicalZones, uint16_t, __func__, - &lockCounter->physicalCounters); - if (result != VDO_SUCCESS) { - freeLockCounter(&lockCounter); - return result; - } - - result = ALLOCATE(locks, Atomic32, __func__, - &lockCounter->physicalZoneCounts); - if (result != VDO_SUCCESS) { - freeLockCounter(&lockCounter); - return result; - } - - result = initializeEnqueueableCompletion(&lockCounter->completion, - LOCK_COUNTER_COMPLETION, layer); - if (result != VDO_SUCCESS) { - freeLockCounter(&lockCounter); - return result; - } - - setCallbackWithParent(&lockCounter->completion, callback, threadID, parent); - lockCounter->logicalZones = logicalZones; - lockCounter->physicalZones = physicalZones; - lockCounter->locks = locks; - *lockCounterPtr = lockCounter; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freeLockCounter(LockCounter **lockCounterPtr) -{ - if (*lockCounterPtr == NULL) { - return; - } - - LockCounter *lockCounter = *lockCounterPtr; - destroyEnqueueable(&lockCounter->completion); - freeVolatile(lockCounter->physicalZoneCounts); - freeVolatile(lockCounter->logicalZoneCounts); - freeVolatile(lockCounter->journalDecrementCounts); - FREE(lockCounter->journalCounters); - 
FREE(lockCounter->logicalCounters); - FREE(lockCounter->physicalCounters); - FREE(lockCounter); - *lockCounterPtr = NULL; -} - -/** - * Get a pointer to the zone count for a given lock on a given zone. - * - * @param counter The lock counter - * @param lockNumber The lock to get - * @param zoneType The zone type whose count is desired - * - * @return A pointer to the zone count for the given lock and zone - **/ -static inline Atomic32 *getZoneCountPtr(LockCounter *counter, - BlockCount lockNumber, - ZoneType zoneType) -{ - return ((zoneType == ZONE_TYPE_LOGICAL) - ? &counter->logicalZoneCounts[lockNumber] - : &counter->physicalZoneCounts[lockNumber]); -} - -/** - * Get the zone counter for a given lock on a given zone. - * - * @param counter The lock counter - * @param lockNumber The lock to get - * @param zoneType The zone type whose count is desired - * @param zoneID The zone index whose count is desired - * - * @return The counter for the given lock and zone - **/ -static inline uint16_t *getCounter(LockCounter *counter, - BlockCount lockNumber, - ZoneType zoneType, - ZoneCount zoneID) -{ - BlockCount zoneCounter = (counter->locks * zoneID) + lockNumber; - if (zoneType == ZONE_TYPE_JOURNAL) { - return &counter->journalCounters[zoneCounter]; - } - - if (zoneType == ZONE_TYPE_LOGICAL) { - return &counter->logicalCounters[zoneCounter]; - } - - return &counter->physicalCounters[zoneCounter]; -} - -/** - * Check whether the journal zone is locked for a given lock. - * - * @param counter The LockCounter - * @param lockNumber The lock to check - * - * @return true if the journal zone is locked - **/ -static bool isJournalZoneLocked(LockCounter *counter, BlockCount lockNumber) -{ - uint16_t journalValue - = *(getCounter(counter, lockNumber, ZONE_TYPE_JOURNAL, 0)); - uint32_t decrements - = atomicLoad32(&(counter->journalDecrementCounts[lockNumber])); - ASSERT_LOG_ONLY((decrements <= journalValue), - "journal zone lock counter must not underflow"); - - return (journalValue != decrements); -} - -/**********************************************************************/ -bool isLocked(LockCounter *lockCounter, - BlockCount lockNumber, - ZoneType zoneType) -{ - ASSERT_LOG_ONLY((zoneType != ZONE_TYPE_JOURNAL), - "isLocked() called for non-journal zone"); - return (isJournalZoneLocked(lockCounter, lockNumber) - || (atomicLoad32(getZoneCountPtr(lockCounter, lockNumber, zoneType)) - != 0)); -} - -/** - * Check that we are on the journal thread. 
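/*
 * Editor's aside, not part of the original file: a worked example of the
 * journal-zone bookkeeping used by isJournalZoneLocked() above. After
 * initializeLockCount(counter, 7, 3), lock 7's journal counter is 3 and its
 * decrement count is 0, so the lock is held. Three later calls to
 * releaseJournalZoneReferenceFromOtherZone(counter, 7) raise the decrement
 * count to 3; the two values are then equal and the lock is considered
 * released.
 */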
- * - * @param counter The LockCounter - * @param caller The name of the caller (for logging) - **/ -static void assertOnJournalThread(LockCounter *counter, const char *caller) -{ - ASSERT_LOG_ONLY((getCallbackThreadID() - == counter->completion.callbackThreadID), - "%s() called from journal zone", caller); -} - -/**********************************************************************/ -void initializeLockCount(LockCounter *counter, - BlockCount lockNumber, - uint16_t value) -{ - assertOnJournalThread(counter, __func__); - uint16_t *journalValue = getCounter(counter, lockNumber, ZONE_TYPE_JOURNAL, - 0); - Atomic32 *decrementCount = &(counter->journalDecrementCounts[lockNumber]); - ASSERT_LOG_ONLY((*journalValue == atomicLoad32(decrementCount)), - "count to be initialized not in use"); - - *journalValue = value; - atomicStore32(decrementCount, 0); -} - -/**********************************************************************/ -void acquireLockCountReference(LockCounter *counter, - BlockCount lockNumber, - ZoneType zoneType, - ZoneCount zoneID) -{ - ASSERT_LOG_ONLY((zoneType != ZONE_TYPE_JOURNAL), - "invalid lock count increment from journal zone"); - - uint16_t *currentValue = getCounter(counter, lockNumber, zoneType, zoneID); - ASSERT_LOG_ONLY(*currentValue < UINT16_MAX, - "increment of lock counter must not overflow"); - - if (*currentValue == 0) { - // This zone is acquiring this lock for the first time. - atomicAdd32(getZoneCountPtr(counter, lockNumber, zoneType), 1); - } - *currentValue += 1; -} - -/** - * Decrement a non-atomic counter. - * - * @param counter The LockCounter - * @param lockNumber Which lock to decrement - * @param zoneType The type of the zone releasing the reference - * @param zoneID The ID of the zone releasing the reference - * - * @return The new value of the counter - **/ -static uint16_t releaseReference(LockCounter *counter, - BlockCount lockNumber, - ZoneType zoneType, - ZoneCount zoneID) -{ - uint16_t *currentValue = getCounter(counter, lockNumber, zoneType, zoneID); - ASSERT_LOG_ONLY((*currentValue >= 1), - "decrement of lock counter must not underflow"); - - *currentValue -= 1; - return *currentValue; -} - -/** - * Attempt to notify the owner of this LockCounter that some lock has been - * released for some zone type. Will do nothing if another notification is - * already in progress. - * - * @param counter The LockCounter - **/ -static void attemptNotification(LockCounter *counter) -{ - if (compareAndSwap32(&counter->state, - LOCK_COUNTER_STATE_NOT_NOTIFYING, - LOCK_COUNTER_STATE_NOTIFYING)) { - resetCompletion(&counter->completion); - invokeCallback(&counter->completion); - } -} - -/**********************************************************************/ -void releaseLockCountReference(LockCounter *counter, - BlockCount lockNumber, - ZoneType zoneType, - ZoneCount zoneID) -{ - ASSERT_LOG_ONLY((zoneType != ZONE_TYPE_JOURNAL), - "invalid lock count decrement from journal zone"); - if (releaseReference(counter, lockNumber, zoneType, zoneID) != 0) { - return; - } - - if (atomicAdd32(getZoneCountPtr(counter, lockNumber, zoneType), -1) == 0) { - // This zone was the last lock holder of its type, so try to notify the - // owner. 
- attemptNotification(counter); - } -} - -/**********************************************************************/ -void releaseJournalZoneReference(LockCounter *counter, BlockCount lockNumber) -{ - assertOnJournalThread(counter, __func__); - releaseReference(counter, lockNumber, ZONE_TYPE_JOURNAL, 0); - if (!isJournalZoneLocked(counter, lockNumber)) { - // The journal zone is not locked, so try to notify the owner. - attemptNotification(counter); - } -} - -/**********************************************************************/ -void releaseJournalZoneReferenceFromOtherZone(LockCounter *counter, - BlockCount lockNumber) -{ - atomicAdd32(&(counter->journalDecrementCounts[lockNumber]), 1); -} - -/**********************************************************************/ -void acknowledgeUnlock(LockCounter *counter) -{ - atomicStore32(&counter->state, LOCK_COUNTER_STATE_NOT_NOTIFYING); -} - -/**********************************************************************/ -bool suspendLockCounter(LockCounter *counter) -{ - assertOnJournalThread(counter, __func__); - return ((atomicLoad32(&counter->state) == LOCK_COUNTER_STATE_SUSPENDED) - || compareAndSwap32(&counter->state, - LOCK_COUNTER_STATE_NOT_NOTIFYING, - LOCK_COUNTER_STATE_SUSPENDED)); -} - -/**********************************************************************/ -bool resumeLockCounter(LockCounter *counter) -{ - assertOnJournalThread(counter, __func__); - return compareAndSwap32(&counter->state, - LOCK_COUNTER_STATE_SUSPENDED, - LOCK_COUNTER_STATE_NOT_NOTIFYING); -} diff --git a/vdo/base/lockCounter.h b/vdo/base/lockCounter.h deleted file mode 100644 index cbda7bd..0000000 --- a/vdo/base/lockCounter.h +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/lockCounter.h#2 $ - */ - -#ifndef LOCK_COUNTER_H -#define LOCK_COUNTER_H - -#include "completion.h" -#include "types.h" - -/** - * LockCounter provides a set of shared reference count locks which is safe - * across multiple zones with a minimum of cross-thread synchronization - * operations. For each lock in the set, it maintains a set of per-zone lock - * counts, and a single, atomic count of the number of zones holding locks. - * Whenever a zone's individual counter for a lock goes from 0 to 1, the - * zone count for that lock is incremented. Whenever a zone's individual - * counter for a lock goes from 1 to 0, the zone count for that lock is - * decremented. If the zone count goes to 0, and the lock counter's - * completion is not in use, the completion is launched to inform the counter's - * owner that some lock has been released. 
It is the owner's responsibility to
- * check for which locks have been released, and to inform the lock counter
- * that it has received the notification by calling acknowledgeUnlock().
- **/
-
-/**
- * Create a lock counter.
- *
- * @param [in]  layer           The physical layer of the VDO
- * @param [in]  parent          The parent to notify when the lock count goes
- *                              to zero
- * @param [in]  callback        The function to call when the lock count goes
- *                              to zero
- * @param [in]  threadID        The ID of the thread on which to run the
- *                              callback
- * @param [in]  logicalZones    The total number of logical zones
- * @param [in]  physicalZones   The total number of physical zones
- * @param [in]  locks           The number of locks
- * @param [out] lockCounterPtr  A pointer to hold the new counter
- *
- * @return VDO_SUCCESS or an error
- **/
-int makeLockCounter(PhysicalLayer *layer,
-                    void *parent,
-                    VDOAction callback,
-                    ThreadID threadID,
-                    ZoneCount logicalZones,
-                    ZoneCount physicalZones,
-                    BlockCount locks,
-                    LockCounter **lockCounterPtr)
-  __attribute__((warn_unused_result));
-
-/**
- * Destroy a lock counter and NULL out the reference to it.
- *
- * @param lockCounterPtr A pointer to the lock counter reference to free
- **/
-void freeLockCounter(LockCounter **lockCounterPtr);
-
-/**
- * Check whether a lock is locked for a zone type. If the recovery journal has
- * a lock on the lock number, both logical and physical zones are considered
- * locked.
- *
- * @param lockCounter The set of locks to check
- * @param lockNumber  The lock to check
- * @param zoneType    The type of the zone
- *
- * @return true if the specified lock has references (is locked)
- **/
-bool isLocked(LockCounter *lockCounter,
-              BlockCount lockNumber,
-              ZoneType zoneType)
-  __attribute__((warn_unused_result));
-
-/**
- * Initialize the value of the journal zone's counter for a given lock. This
- * must be called from the journal zone.
- *
- * @param counter    The counter to initialize
- * @param lockNumber Which lock to initialize
- * @param value      The value to set
- **/
-void initializeLockCount(LockCounter *counter,
-                         BlockCount lockNumber,
-                         uint16_t value);
-
-/**
- * Acquire a reference to a given lock in the specified zone. This method must
- * not be used from the journal zone.
- *
- * @param counter    The LockCounter
- * @param lockNumber Which lock to increment
- * @param zoneType   The type of the zone acquiring the reference
- * @param zoneID     The ID of the zone acquiring the reference
- **/
-void acquireLockCountReference(LockCounter *counter,
-                               BlockCount lockNumber,
-                               ZoneType zoneType,
-                               ZoneCount zoneID);
-
-/**
- * Release a reference to a given lock in the specified zone. This method
- * must not be used from the journal zone.
- *
- * @param counter    The LockCounter
- * @param lockNumber Which lock to decrement
- * @param zoneType   The type of the zone releasing the reference
- * @param zoneID     The ID of the zone releasing the reference
- **/
-void releaseLockCountReference(LockCounter *counter,
-                               BlockCount lockNumber,
-                               ZoneType zoneType,
-                               ZoneCount zoneID);
-
-/**
- * Release a single journal zone reference from the journal zone. This method
- * must be called from the journal zone.
- *
- * @param counter    The counter from which to release a reference
- * @param lockNumber The lock from which to release a reference
- **/
-void releaseJournalZoneReference(LockCounter *counter, BlockCount lockNumber);
-
-/**
- * Release a single journal zone reference from any zone.
This method shouldn't - * be called from the journal zone as it would be inefficient; use - * releaseJournalZoneReference() instead. - * - * @param counter The counter from which to release a reference - * @param lockNumber The lock from which to release a reference - **/ -void releaseJournalZoneReferenceFromOtherZone(LockCounter *counter, - BlockCount lockNumber); - -/** - * Inform a lock counter that an unlock notification was received by the - * caller. - * - * @param counter The counter to inform - **/ -void acknowledgeUnlock(LockCounter *counter); - -/** - * Prevent the lock counter from issuing notifications. - * - * @param counter The counter - * - * @return true if the lock counter was not notifying and hence - * the suspend was efficacious - **/ -bool suspendLockCounter(LockCounter *counter) - __attribute__((warn_unused_result)); - -/** - * Re-allow notifications from a suspended lock counter. - * - * @param counter The counter - * - * @return true if the lock counter was suspended - **/ -bool resumeLockCounter(LockCounter *counter) - __attribute__((warn_unused_result)); - -#endif // LOCK_COUNTER_H diff --git a/vdo/base/logicalZone.c b/vdo/base/logicalZone.c deleted file mode 100644 index 0834ff1..0000000 --- a/vdo/base/logicalZone.c +++ /dev/null @@ -1,463 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/logicalZone.c#6 $ - */ - -#include "logicalZone.h" - -#include "logger.h" -#include "memoryAlloc.h" - -#include "actionManager.h" -#include "adminState.h" -#include "allocationSelector.h" -#include "atomic.h" -#include "blockMap.h" -#include "completion.h" -#include "constants.h" -#include "dataVIO.h" -#include "flush.h" -#include "intMap.h" -#include "vdoInternal.h" - -struct logicalZone { - /** The completion for flush notifications */ - VDOCompletion completion; - /** The owner of this zone */ - LogicalZones *zones; - /** Which logical zone this is */ - ZoneCount zoneNumber; - /** The thread id for this zone */ - ThreadID threadID; - /** In progress operations keyed by LBN */ - IntMap *lbnOperations; - /** The logical to physical map */ - BlockMapZone *blockMapZone; - /** The current flush generation */ - SequenceNumber flushGeneration; - /** The oldest active generation in this zone */ - SequenceNumber oldestActiveGeneration; - /** The number of IOs in the current flush generation */ - BlockCount iosInFlushGeneration; - /** - * The oldest locked generation in this zone (an atomic copy of - * oldestActiveGeneration) - **/ - Atomic64 oldestLockedGeneration; - /** The youngest generation of the current notification */ - SequenceNumber notificationGeneration; - /** Whether a notification is in progress */ - bool notifying; - /** The queue of active data write VIOs */ - RingNode writeVIOs; - /** The administrative state of the zone */ - AdminState state; - /** The selector for determining which physical zone to allocate from */ - AllocationSelector *selector; -}; - -struct logicalZones { - /** The VDO whose zones these are */ - VDO *vdo; - /** The manager for administrative actions */ - ActionManager *manager; - /** The number of zones */ - ZoneCount zoneCount; - /** The logical zones themselves */ - LogicalZone zones[]; -}; - -/** - * Convert a generic VDOCompletion to a LogicalZone. - * - * @param completion The completion to convert - * - * @return The completion as a LogicalZone - **/ -static LogicalZone *asLogicalZone(VDOCompletion *completion) -{ - STATIC_ASSERT(offsetof(LogicalZone, completion) == 0); - assertCompletionType(completion->type, GENERATION_FLUSHED_COMPLETION); - return (LogicalZone *) completion; -} - -/**********************************************************************/ -LogicalZone *getLogicalZone(LogicalZones *zones, ZoneCount zoneNumber) -{ - return (zoneNumber < zones->zoneCount) ? &zones->zones[zoneNumber] : NULL; -} - -/** - * Implements ZoneThreadGetter - **/ -static ThreadID getThreadIDForZone(void *context, ZoneCount zoneNumber) -{ - return getLogicalZoneThreadID(getLogicalZone(context, zoneNumber)); -} - -/** - * Initialize a logical zone. 
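asLogicalZone() above relies on the embedded completion being the first member of LogicalZone, so the downcast is a plain pointer cast guarded by a compile-time assertion. A self-contained sketch of that pattern with placeholder types (not the VDO ones):

#include <assert.h>
#include <stddef.h>

typedef struct { int type; } completion;

typedef struct {
  completion completion;   /* must stay first for the cast below to be valid */
  int        zone_number;
} zone;

/* Recover the container from its embedded completion; keeping the member at
 * offset 0 makes the cast a no-op, and the assertion catches any reordering. */
static zone *as_zone(completion *c)
{
  static_assert(offsetof(zone, completion) == 0, "completion must be first");
  return (zone *) c;
}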
- * - * @param zones The LogicalZones to which this zone belongs - * @param zoneNumber The LogicalZone's index - **/ -static int initializeZone(LogicalZones *zones, ZoneCount zoneNumber) -{ - LogicalZone *zone = &zones->zones[zoneNumber]; - zone->zones = zones; - int result = makeIntMap(LOCK_MAP_CAPACITY, 0, &zone->lbnOperations); - if (result != VDO_SUCCESS) { - return result; - } - - VDO *vdo = zones->vdo; - result = initializeEnqueueableCompletion(&zone->completion, - GENERATION_FLUSHED_COMPLETION, - vdo->layer); - if (result != VDO_SUCCESS) { - return result; - } - - zone->zoneNumber = zoneNumber; - zone->threadID = getLogicalZoneThread(getThreadConfig(vdo), - zoneNumber); - zone->blockMapZone = getBlockMapZone(vdo->blockMap, zoneNumber); - initializeRing(&zone->writeVIOs); - atomicStore64(&zone->oldestLockedGeneration, 0); - - return makeAllocationSelector(getThreadConfig(vdo)->physicalZoneCount, - zone->threadID, &zone->selector); -} - -/**********************************************************************/ -int makeLogicalZones(VDO *vdo, LogicalZones **zonesPtr) -{ - const ThreadConfig *threadConfig = getThreadConfig(vdo); - if (threadConfig->logicalZoneCount == 0) { - return VDO_SUCCESS; - } - - LogicalZones *zones; - int result = ALLOCATE_EXTENDED(LogicalZones, threadConfig->logicalZoneCount, - LogicalZone, __func__, &zones); - if (result != VDO_SUCCESS) { - return result; - } - - zones->vdo = vdo; - zones->zoneCount = threadConfig->logicalZoneCount; - for (ZoneCount zone = 0; zone < threadConfig->logicalZoneCount; zone++) { - result = initializeZone(zones, zone); - if (result != VDO_SUCCESS) { - freeLogicalZones(&zones); - return result; - } - } - - result = makeActionManager(zones->zoneCount, getThreadIDForZone, - getAdminThread(threadConfig), zones, NULL, - vdo->layer, &zones->manager); - if (result != VDO_SUCCESS) { - freeLogicalZones(&zones); - return result; - } - - *zonesPtr = zones; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freeLogicalZones(LogicalZones **zonesPtr) -{ - LogicalZones *zones = *zonesPtr; - if (zones == NULL) { - return; - } - - freeActionManager(&zones->manager); - - for (ZoneCount index = 0; index < zones->zoneCount; index++) { - LogicalZone *zone = &zones->zones[index]; - freeAllocationSelector(&zone->selector); - destroyEnqueueable(&zone->completion); - freeIntMap(&zone->lbnOperations); - } - - FREE(zones); - *zonesPtr = NULL; -} - -/**********************************************************************/ -static inline void assertOnZoneThread(LogicalZone *zone, const char *what) -{ - ASSERT_LOG_ONLY((getCallbackThreadID() == zone->threadID), - "%s() called on correct thread", what); -} - -/** - * Check whether this zone has drained. - * - * @param zone The zone to check - **/ -static void checkForDrainComplete(LogicalZone *zone) -{ - if (!isDraining(&zone->state) || zone->notifying - || !isRingEmpty(&zone->writeVIOs)) { - return; - } - - finishDraining(&zone->state); -} - -/** - * Initiate a drain. - * - * Implements AdminInitiator. - **/ -static void initiateDrain(AdminState *state) -{ - checkForDrainComplete(container_of(state, LogicalZone, state)); -} - -/** - * Drain a logical zone. - * - *
Implements ZoneAction. - **/ -static void drainLogicalZone(void *context, - ZoneCount zoneNumber, - VDOCompletion *parent) -{ - LogicalZone *zone = getLogicalZone(context, zoneNumber); - startDraining(&zone->state, getCurrentManagerOperation(zone->zones->manager), - parent, initiateDrain); -} - -/**********************************************************************/ -void drainLogicalZones(LogicalZones *zones, - AdminStateCode operation, - VDOCompletion *parent) -{ - scheduleOperation(zones->manager, operation, NULL, drainLogicalZone, NULL, - parent); -} - -/** - * Resume a logical zone. - * - *
Implements ZoneAction. - **/ -static void resumeLogicalZone(void *context, - ZoneCount zoneNumber, - VDOCompletion *parent) -{ - LogicalZone *zone = getLogicalZone(context, zoneNumber); - finishCompletion(parent, resumeIfQuiescent(&zone->state)); -} - -/**********************************************************************/ -void resumeLogicalZones(LogicalZones *zones, VDOCompletion *parent) -{ - scheduleOperation(zones->manager, ADMIN_STATE_RESUMING, NULL, - resumeLogicalZone, NULL, parent); -} - -/**********************************************************************/ -ThreadID getLogicalZoneThreadID(const LogicalZone *zone) -{ - return zone->threadID; -} - -/**********************************************************************/ -BlockMapZone *getBlockMapForZone(const LogicalZone *zone) -{ - return zone->blockMapZone; -} - -/**********************************************************************/ -IntMap *getLBNLockMap(const LogicalZone *zone) -{ - return zone->lbnOperations; -} - -/**********************************************************************/ -LogicalZone *getNextLogicalZone(const LogicalZone *zone) -{ - return getLogicalZone(zone->zones, zone->zoneNumber + 1); -} - -/** - * Convert a RingNode to a DataVIO. - * - * @param ringNode The RingNode to convert - * - * @return The DataVIO which owns the RingNode - **/ -static inline DataVIO *dataVIOFromRingNode(RingNode *ringNode) -{ - return (DataVIO *) ((byte *) ringNode - offsetof(DataVIO, writeNode)); -} - -/** - * Update the oldest active generation. If it has changed, update the - * atomic copy as well. - * - * @param zone The zone - * - * @return true if the oldest active generation has changed - **/ -static bool updateOldestActiveGeneration(LogicalZone *zone) -{ - SequenceNumber currentOldest = zone->oldestActiveGeneration; - if (isRingEmpty(&zone->writeVIOs)) { - zone->oldestActiveGeneration = zone->flushGeneration; - } else { - zone->oldestActiveGeneration - = dataVIOFromRingNode(zone->writeVIOs.next)->flushGeneration; - } - - if (zone->oldestActiveGeneration == currentOldest) { - return false; - } - - atomicStore64(&zone->oldestLockedGeneration, zone->oldestActiveGeneration); - return true; -} - -/**********************************************************************/ -void incrementFlushGeneration(LogicalZone *zone, - SequenceNumber expectedGeneration) -{ - assertOnZoneThread(zone, __func__); - ASSERT_LOG_ONLY((zone->flushGeneration == expectedGeneration), - "logical zone %u flush generation %" PRIu64 - " should be %llu before increment", - zone->zoneNumber, zone->flushGeneration, - expectedGeneration); - - zone->flushGeneration++; - zone->iosInFlushGeneration = 0; - updateOldestActiveGeneration(zone); -} - -/**********************************************************************/ -SequenceNumber getOldestLockedGeneration(const LogicalZone *zone) -{ - return (SequenceNumber) atomicLoad64(&zone->oldestLockedGeneration); -} - -/**********************************************************************/ -int acquireFlushGenerationLock(DataVIO *dataVIO) -{ - LogicalZone *zone = dataVIO->logical.zone; - assertOnZoneThread(zone, __func__); - if (!isNormal(&zone->state)) { - return VDO_INVALID_ADMIN_STATE; - } - - dataVIO->flushGeneration = zone->flushGeneration; - pushRingNode(&zone->writeVIOs, &dataVIO->writeNode); - dataVIO->hasFlushGenerationLock = true; - zone->iosInFlushGeneration++; - return VDO_SUCCESS; -} - -/**********************************************************************/ -static void 
attemptGenerationCompleteNotification(VDOCompletion *completion); - -/** - * Notify the flush that at least one generation no longer has active VIOs. - * This callback is registered in attemptGenerationCompleteNotification(). - * - * @param completion The zone completion - **/ -static void notifyFlusher(VDOCompletion *completion) -{ - LogicalZone *zone = asLogicalZone(completion); - completeFlushes(zone->zones->vdo->flusher); - launchCallback(completion, attemptGenerationCompleteNotification, - zone->threadID); -} - -/** - * Notify the flusher if some generation no longer has active VIOs. - * - * @param completion The zone completion - **/ -static void attemptGenerationCompleteNotification(VDOCompletion *completion) -{ - LogicalZone *zone = asLogicalZone(completion); - assertOnZoneThread(zone, __func__); - if (zone->oldestActiveGeneration <= zone->notificationGeneration) { - zone->notifying = false; - checkForDrainComplete(zone); - return; - } - - zone->notifying = true; - zone->notificationGeneration = zone->oldestActiveGeneration; - launchCallback(&zone->completion, notifyFlusher, - getFlusherThreadID(zone->zones->vdo->flusher)); -} - -/**********************************************************************/ -void releaseFlushGenerationLock(DataVIO *dataVIO) -{ - LogicalZone *zone = dataVIO->logical.zone; - assertOnZoneThread(zone, __func__); - if (isRingEmpty(&dataVIO->writeNode)) { - // This VIO never got a lock, either because it is a read, or because - // we are in read-only mode. - ASSERT_LOG_ONLY(!dataVIO->hasFlushGenerationLock, - "hasFlushGenerationLock false for VIO not on active list"); - return; - } - - unspliceRingNode(&dataVIO->writeNode); - dataVIO->hasFlushGenerationLock = false; - ASSERT_LOG_ONLY(zone->oldestActiveGeneration <= dataVIO->flushGeneration, - "DataVIO releasing lock on generation %" PRIu64 - " is not older than oldest active generation %llu", - dataVIO->flushGeneration, zone->oldestActiveGeneration); - - if (!updateOldestActiveGeneration(zone) || zone->notifying) { - return; - } - - attemptGenerationCompleteNotification(&zone->completion); -} - -/**********************************************************************/ -AllocationSelector *getAllocationSelector(LogicalZone *zone) -{ - return zone->selector; -} - -/**********************************************************************/ -void dumpLogicalZone(const LogicalZone *zone) -{ - logInfo("LogicalZone %u", zone->zoneNumber); - logInfo(" flushGeneration=%llu oldestActiveGeneration=%" PRIu64 - " oldestLockedGeneration=%llu notificationGeneration=%" PRIu64 - " notifying=%s iosInCurrentGeneration=%llu", - zone->flushGeneration, zone->oldestActiveGeneration, - relaxedLoad64(&zone->oldestLockedGeneration), - zone->notificationGeneration, boolToString(zone->notifying), - zone->iosInFlushGeneration); -} diff --git a/vdo/base/logicalZone.h b/vdo/base/logicalZone.h deleted file mode 100644 index 8e0eae6..0000000 --- a/vdo/base/logicalZone.h +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
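releaseFlushGenerationLock() above recomputes the oldest active flush generation from the head of the zone's queue of active writes, and only attempts to notify the flusher when that value advances. A self-contained sketch of that bookkeeping, assuming a simple singly linked FIFO rather than VDO's RingNode (names are illustrative):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

typedef struct write_lock {
  struct write_lock *next;
  uint64_t           generation;  /* generation this write belongs to */
} write_lock;

typedef struct {
  write_lock *oldest;            /* head of the FIFO of active writes */
  uint64_t    flush_generation;  /* generation currently being assigned */
  uint64_t    oldest_active;     /* cached oldest generation still locked */
} flush_tracker;

/* Recompute the oldest active generation after a write releases its lock;
 * returns true when it advanced, i.e. when pending flushes may now finish. */
static bool update_oldest_active(flush_tracker *tracker)
{
  uint64_t previous = tracker->oldest_active;
  tracker->oldest_active = (tracker->oldest == NULL)
                           ? tracker->flush_generation
                           : tracker->oldest->generation;
  return (tracker->oldest_active != previous);
}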
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/logicalZone.h#3 $ - */ - -#ifndef LOGICAL_ZONE_H -#define LOGICAL_ZONE_H - -#include "adminState.h" -#include "intMap.h" -#include "types.h" - -/** - * Get a logical zone by number. - * - * @param zones A set of logical zones - * @param zoneNumber The number of the zone to get - * - * @return The requested zone - **/ -LogicalZone *getLogicalZone(LogicalZones *zones, ZoneCount zoneNumber) - __attribute__((warn_unused_result)); - -/** - * Create a set of logical zones. - * - * @param [in] vdo The VDO to which the zones will belong - * @param [out] zonesPtr A pointer to hold the new zones - * - * @return VDO_SUCCESS or an error code - **/ -int makeLogicalZones(VDO *vdo, LogicalZones **zonesPtr) - __attribute__((warn_unused_result)); - -/** - * Free a set of logical zones and null out the reference to it. - * - * @param zonePtr A pointer to the zone to free - **/ -void freeLogicalZones(LogicalZones **zonePtr); - -/** - * Drain a set of logical zones. - * - * @param zones The logical zones to suspend - * @param operation The type of drain to perform - * @param completion The object to notify when the zones are suspended - **/ -void drainLogicalZones(LogicalZones *zones, - AdminStateCode operation, - VDOCompletion *completion); - -/** - * Resume a set of logical zones. - * - * @param zones The logical zones to resume - * @param parent The object to notify when the zones have resumed - **/ -void resumeLogicalZones(LogicalZones *zones, VDOCompletion *parent); - -/** - * Get the ID of a logical zone's thread. - * - * @param zone The zone - * - * @return The zone's thread ID - **/ -ThreadID getLogicalZoneThreadID(const LogicalZone *zone) - __attribute__((warn_unused_result)); - -/** - * Get the portion of the block map for this zone. - * - * @param zone The zone - * - * @return The block map zone - **/ -BlockMapZone *getBlockMapForZone(const LogicalZone *zone) - __attribute__((warn_unused_result)); - -/** - * Get the logical lock map for this zone. - * - * @param zone The zone - * - * @return The logical lock map for the zone - **/ -IntMap *getLBNLockMap(const LogicalZone *zone) - __attribute__((warn_unused_result)); - -/** - * Get the next-highest-numbered logical zone, or NULL if the - * zone is the highest-numbered zone in its VDO. - * - * @param zone The logical zone to query - * - * @return The logical zone whose zone number is one greater than the given - * zone, or NULL if there is no such zone - **/ -LogicalZone *getNextLogicalZone(const LogicalZone *zone) - __attribute__((warn_unused_result)); - -/** - * Increment the flush generation in a logical zone. - * - * @param zone The logical zone - * @param expectedGeneration The expected value of the flush generation - * before the increment - **/ -void incrementFlushGeneration(LogicalZone *zone, - SequenceNumber expectedGeneration); - -/** - * Get the oldest flush generation which is locked by a logical zone. - * - * @param zone The logical zone - * - * @return The oldest generation locked by the zone - **/ -SequenceNumber getOldestLockedGeneration(const LogicalZone *zone) - __attribute__((warn_unused_result)); - -/** - * Acquire the shared lock on a flush generation by a write DataVIO. 
- * - * @param dataVIO The DataVIO - * - * @return VDO_SUCCESS or an error code - **/ -int acquireFlushGenerationLock(DataVIO *dataVIO) - __attribute__((warn_unused_result)); - -/** - * Release the shared lock on a flush generation held by a write DataVIO. If - * there are pending flushes, and this DataVIO completes the oldest generation - * active in this zone, an attempt will be made to finish any flushes which may - * now be complete. - * - * @param dataVIO The DataVIO whose lock is to be released - **/ -void releaseFlushGenerationLock(DataVIO *dataVIO); - -/** - * Get the selector for deciding which physical zone should be allocated from - * next for activities in a logical zone. - * - * @param zone The logical zone of the operation which needs an allocation - * - * @return The allocation selector for this zone - **/ -AllocationSelector *getAllocationSelector(LogicalZone *zone) - __attribute__((warn_unused_result)); - -/** - * Dump information about a logical zone to the log for debugging, in a - * thread-unsafe fashion. - * - * @param zone The zone to dump - **/ -void dumpLogicalZone(const LogicalZone *zone); - -#endif // LOGICAL_ZONE_H diff --git a/vdo/base/lz4.c b/vdo/base/lz4.c deleted file mode 100644 index 1114aa8..0000000 --- a/vdo/base/lz4.c +++ /dev/null @@ -1,886 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/lz4.c#2 $ - */ - -// Get the memcpy fixup from common.h. -#include "common.h" - -/* - LZ4 - Fast LZ compression algorithm - Copyright (C) 2011-2012, Yann Collet. - BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following disclaimer - in the documentation and/or other materials provided with the - distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - You can contact the author at : - - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html - - LZ4 source repository : http://code.google.com/p/lz4/ -*/ -/* - * With authors permission dual licensed as BSD/GPL for linux kernel - * - * Origin: http://lz4.googlecode.com/svn/trunk - * Revision: 88 - */ - -//************************************** -// Tuning parameters -//************************************** -// MEMORY_USAGE : -// Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) -// Increasing memory usage improves compression ratio -// Reduced memory usage can improve speed, due to cache effect -// Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache -#define MEMORY_USAGE 14 - -// NOTCOMPRESSIBLE_DETECTIONLEVEL : -// Decreasing this value will make the algorithm skip faster data segments considered "incompressible" -// This may decrease compression ratio dramatically, but will be faster on incompressible data -// Increasing this value will make the algorithm search more before declaring a segment "incompressible" -// This could improve compression a bit, but will be slower on incompressible data -// The default value (6) is recommended -#define NOTCOMPRESSIBLE_DETECTIONLEVEL 6 - -// BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE : -// This will provide a small boost to performance for big endian cpu, but the resulting compressed stream will be incompatible with little-endian CPU. -// You can set this option to 1 in situations where data will remain within closed environment -// This option is useless on Little_Endian CPU (such as x86) -//#define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 - - - -//************************************** -// CPU Feature Detection -//************************************** -// 32 or 64 bits ? -#if (defined(__x86_64__) || defined(__x86_64) || defined(__amd64__) || defined(__amd64) || defined(__ppc64__) || defined(_WIN64) || defined(__LP64__) || defined(_LP64) ) // Detects 64 bits mode -# define LZ4_ARCH64 1 -#else -# define LZ4_ARCH64 0 -#endif - -// Little Endian or Big Endian ? -// GCC normally defines these three macros (and PDP-endian which we ignore). -#if !defined(__ORDER_LITTLE_ENDIAN__) || !defined(__ORDER_BIG_ENDIAN__) \ - || !defined(__BYTE_ORDER__) -#error "GCC byte order macros not defined?" -#endif -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -# define LZ4_BIG_ENDIAN 1 -#elif __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ -# error "fix byte order check" -#endif - -// Unaligned memory access is automatically enabled for "common" CPU, such as x86. 
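The endianness detection above leans entirely on the byte-order macros that GCC and Clang predefine; a tiny self-contained program can confirm what a given target reports:

#include <stdio.h>

int main(void)
{
#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) \
    || !defined(__ORDER_BIG_ENDIAN__)
#  error "GCC byte order macros not defined"
#endif
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  puts("big-endian target");
#else
  puts("little-endian target");
#endif
  return 0;
}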
-// For others CPU, the compiler will be more cautious, and insert extra code to ensure aligned access is respected -// If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance -#if defined(__ARM_FEATURE_UNALIGNED) -# define LZ4_FORCE_UNALIGNED_ACCESS 1 -#endif - -// Define this parameter if your target system or compiler does not support hardware bit count -#if defined(_MSC_VER) && defined(_WIN32_WCE) // Visual Studio for Windows CE does not support Hardware bit count -# define LZ4_FORCE_SW_BITCOUNT -#endif - - -//************************************** -// Compiler Options -//************************************** -#if __STDC_VERSION__ >= 199901L // C99 -/* "restrict" is a known keyword */ -#else -# define restrict // Disable restrict -#endif - -#define _GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) - -#ifdef _MSC_VER // Visual Studio -# include // For Visual 2005 -# if LZ4_ARCH64 // 64-bit -# pragma intrinsic(_BitScanForward64) // For Visual 2005 -# pragma intrinsic(_BitScanReverse64) // For Visual 2005 -# else -# pragma intrinsic(_BitScanForward) // For Visual 2005 -# pragma intrinsic(_BitScanReverse) // For Visual 2005 -# endif -#endif - -#ifdef _MSC_VER -# define lz4_bswap16(x) _byteswap_ushort(x) -#else -# define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8))) -#endif - -#if (_GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) -# define expect(expr,value) (__builtin_expect ((expr),(value)) ) -#else -# define expect(expr,value) (expr) -#endif - -//************************************** -// Includes -//************************************** -#ifdef __KERNEL__ -# include // for memset -#else /* __KERNEL__ */ -# include // for malloc -# include // for memset -#endif /* __KERNEL__ */ -#include "lz4.h" - - -//************************************** -// Basic Types -//************************************** -#if defined(_MSC_VER) // Visual Studio does not support 'stdint' natively -# define BYTE unsigned __int8 -# define U16 unsigned __int16 -# define U32 unsigned __int32 -# define S32 __int32 -# define U64 unsigned __int64 -#else -# ifdef __KERNEL__ -# include -# else /* __KERNEL__ */ -# include -# endif /* __KERNEL__ */ -# define BYTE uint8_t -# define U16 uint16_t -# define U32 uint32_t -# define S32 int32_t -# define U64 uint64_t -#endif - -#ifndef LZ4_FORCE_UNALIGNED_ACCESS -# pragma pack(push, 1) -#endif - -typedef struct _U16_S { U16 v; } U16_S; -typedef struct _U32_S { U32 v; } U32_S; -typedef struct _U64_S { U64 v; } U64_S; - -#ifndef LZ4_FORCE_UNALIGNED_ACCESS -# pragma pack(pop) -#endif - -#define A64(x) (((U64_S *)(x))->v) -#define A32(x) (((U32_S *)(x))->v) -#define A16(x) (((U16_S *)(x))->v) - - -//************************************** -// Constants -//************************************** -#define MINMATCH 4 - -#define HASH_LOG (MEMORY_USAGE-2) -#define HASHTABLESIZE (1 << HASH_LOG) -#define HASH_MASK (HASHTABLESIZE - 1) - -#define SKIPSTRENGTH (NOTCOMPRESSIBLE_DETECTIONLEVEL>2?NOTCOMPRESSIBLE_DETECTIONLEVEL:2) -#define STACKLIMIT 13 -#define HEAPMODE (HASH_LOG>STACKLIMIT) // Defines if memory is allocated into the stack (local variable), or into the heap (malloc()). 
-#define COPYLENGTH 8 -#define LASTLITERALS 5 -#define MFLIMIT (COPYLENGTH+MINMATCH) -#define MINLENGTH (MFLIMIT+1) - -#define MAXD_LOG 16 -#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) - -#define ML_BITS 4 -#define ML_MASK ((1U<> ((MINMATCH*8)-HASH_LOG)) -#define LZ4_HASH_VALUE(p) LZ4_HASH_FUNCTION(A32(p)) -#define LZ4_WILDCOPY(s,d,e) do { LZ4_COPYPACKET(s,d) } while (d>3); - #elif defined(__GNUC__) && (_GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_clzll(val) >> 3); - #else - int r; - if (!(val>>32)) { r=4; } else { r=0; val>>=32; } - if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } - r += (!val); - return r; - #endif -#else - #if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r = 0; - _BitScanForward64( &r, val ); - return (int)(r>>3); - #elif defined(__GNUC__) && (_GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_ctzll(val) >> 3); - #else - static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; - return DeBruijnBytePos[((U64)((val & -val) * 0x0218A392CDABBD3F)) >> 58]; - #endif -#endif -} - -#else - -static inline int LZ4_NbCommonBytes (register U32 val) -{ -#if defined(LZ4_BIG_ENDIAN) - #if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r = 0; - _BitScanReverse( &r, val ); - return (int)(r>>3); - #elif defined(__GNUC__) && (_GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_clz(val) >> 3); - #else - int r; - if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } - r += (!val); - return r; - #endif -#else - #if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) - unsigned long r = 0; - _BitScanForward( &r, val ); - return (int)(r>>3); - #elif defined(__GNUC__) && (_GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) - return (__builtin_ctz(val) >> 3); - #else - static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; - return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; - #endif -#endif -} - -#endif - - - -//****************************** -// Compression functions -//****************************** - -// LZ4_compressCtx : -// ----------------- -// Compress 'isize' bytes from 'source' into an output buffer 'dest' of maximum size 'maxOutputSize'. -// If it cannot achieve it, compression will stop, and result of the function will be zero. 
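The length-coding macros above describe LZ4's token byte: the upper four bits hold the literal run length and the lower four bits the match length, with the maximum field value (15) meaning that additional length bytes follow, as the 255-byte extension loops in the compression routines below show. A small, self-contained decode of that split, assuming the standard ML_BITS = 4 layout:

#include <stdint.h>
#include <stdio.h>

#define ML_BITS  4
#define ML_MASK  ((1U << ML_BITS) - 1)
#define RUN_BITS (8 - ML_BITS)
#define RUN_MASK ((1U << RUN_BITS) - 1)

int main(void)
{
  uint8_t token = 0x53;  /* example token byte */
  unsigned literals = (token >> ML_BITS) & RUN_MASK;  /* 5 literal bytes */
  unsigned match    = token & ML_MASK;                /* match length code 3 */
  printf("literals=%u match=%u more_literals=%d more_match=%d\n",
         literals, match, literals == RUN_MASK, match == ML_MASK);
  return 0;
}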
-// return : the number of bytes written in buffer 'dest', or 0 if the compression fails - -static inline int LZ4_compressCtx(void** ctx, - const char* source, - char* dest, - int isize, - int maxOutputSize) -{ -#if HEAPMODE - struct refTables *srt = (struct refTables *) (*ctx); - HTYPE* HashTable; -#else - HTYPE HashTable[HASHTABLESIZE] = {0}; -#endif - - const BYTE* ip = (BYTE*) source; - INITBASE(base); - const BYTE* anchor = ip; - const BYTE* const iend = ip + isize; - const BYTE* const mflimit = iend - MFLIMIT; -#define matchlimit (iend - LASTLITERALS) - - BYTE* op = (BYTE*) dest; - BYTE* const oend = op + maxOutputSize; - - int len, length; - const int skipStrength = SKIPSTRENGTH; - U32 forwardH; - - - // Init - if (isizehashTable); - memset((void*)HashTable, 0, sizeof(srt->hashTable)); -#else - (void) ctx; -#endif - - - // First Byte - HashTable[LZ4_HASH_VALUE(ip)] = ip - base; - ip++; forwardH = LZ4_HASH_VALUE(ip); - - // Main Loop - for ( ; ; ) - { - int findMatchAttempts = (1U << skipStrength) + 3; - const BYTE* forwardIp = ip; - const BYTE* ref; - BYTE* token; - - // Find a match - do { - U32 h = forwardH; - int step = findMatchAttempts++ >> skipStrength; - ip = forwardIp; - forwardIp = ip + step; - - if (unlikely(forwardIp > mflimit)) { goto _last_literals; } - - forwardH = LZ4_HASH_VALUE(forwardIp); - ref = base + HashTable[h]; - HashTable[h] = ip - base; - - } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip))); - - // Catch up - while ((ip>anchor) && (ref>(BYTE*)source) && unlikely(ip[-1]==ref[-1])) { ip--; ref--; } - - // Encode Literal length - length = (int)(ip - anchor); - token = op++; - if (unlikely(op + length + (2 + 1 + LASTLITERALS) + (length>>8) > oend)) return 0; // Check output limit -#ifdef _MSC_VER - if (length>=(int)RUN_MASK) - { - int len = length-RUN_MASK; - *token=(RUN_MASK<254) - { - do { *op++ = 255; len -= 255; } while (len>254); - *op++ = (BYTE)len; - memcpy(op, anchor, length); - op += length; - goto _next_match; - } - else - *op++ = (BYTE)len; - } - else *token = (length<=(int)RUN_MASK) { *token=(RUN_MASK< 254 ; len-=255) *op++ = 255; *op++ = (BYTE)len; } - else *token = (length<>8) > oend)) return 0; // Check output limit - if (len>=(int)ML_MASK) { *token+=ML_MASK; len-=ML_MASK; for(; len > 509 ; len-=510) { *op++ = 255; *op++ = 255; } if (len > 254) { len-=255; *op++ = 255; } *op++ = (BYTE)len; } - else *token += len; - - // Test end of chunk - if (ip > mflimit) { anchor = ip; break; } - - // Fill table - HashTable[LZ4_HASH_VALUE(ip-2)] = ip - 2 - base; - - // Test next position - ref = base + HashTable[LZ4_HASH_VALUE(ip)]; - HashTable[LZ4_HASH_VALUE(ip)] = ip - base; - if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) { token = op++; *token=0; goto _next_match; } - - // Prepare next loop - anchor = ip++; - forwardH = LZ4_HASH_VALUE(ip); - } - -_last_literals: - // Encode Last Literals - { - int lastRun = (int)(iend - anchor); - if (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize) return 0; - if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK< 254 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } - else *op++ = (lastRun<> ((MINMATCH*8)-HASHLOG64K)) -#define LZ4_HASH64K_VALUE(p) LZ4_HASH64K_FUNCTION(A32(p)) -static inline int LZ4_compress64kCtx(void** ctx, - const char* source, - char* dest, - int isize, - int maxOutputSize) -{ -#if HEAPMODE - struct refTables *srt = (struct refTables *) (*ctx); - U16* HashTable; -#else - U16 HashTable[HASH64KTABLESIZE] = {0}; -#endif - - const BYTE* ip = 
(BYTE*) source; - const BYTE* anchor = ip; - const BYTE* const base = ip; - const BYTE* const iend = ip + isize; - const BYTE* const mflimit = iend - MFLIMIT; -#define matchlimit (iend - LASTLITERALS) - - BYTE* op = (BYTE*) dest; - BYTE* const oend = op + maxOutputSize; - - int len, length; - const int skipStrength = SKIPSTRENGTH; - U32 forwardH; - - - // Init - if (isizehashTable); - memset((void*)HashTable, 0, sizeof(srt->hashTable)); -#else - (void) ctx; -#endif - - - // First Byte - ip++; forwardH = LZ4_HASH64K_VALUE(ip); - - // Main Loop - for ( ; ; ) - { - int findMatchAttempts = (1U << skipStrength) + 3; - const BYTE* forwardIp = ip; - const BYTE* ref; - BYTE* token; - - // Find a match - do { - U32 h = forwardH; - int step = findMatchAttempts++ >> skipStrength; - ip = forwardIp; - forwardIp = ip + step; - - if (forwardIp > mflimit) { goto _last_literals; } - - forwardH = LZ4_HASH64K_VALUE(forwardIp); - ref = base + HashTable[h]; - HashTable[h] = (U16)(ip - base); - - } while (A32(ref) != A32(ip)); - - // Catch up - while ((ip>anchor) && (ref>(BYTE*)source) && (ip[-1]==ref[-1])) { ip--; ref--; } - - // Encode Literal length - length = (int)(ip - anchor); - token = op++; - if (unlikely(op + length + (2 + 1 + LASTLITERALS) + (length>>8) > oend)) return 0; // Check output limit -#ifdef _MSC_VER - if (length>=(int)RUN_MASK) - { - int len = length-RUN_MASK; - *token=(RUN_MASK<254) - { - do { *op++ = 255; len -= 255; } while (len>254); - *op++ = (BYTE)len; - memcpy(op, anchor, length); - op += length; - goto _next_match; - } - else - *op++ = (BYTE)len; - } - else *token = (length<=(int)RUN_MASK) { *token=(RUN_MASK< 254 ; len-=255) *op++ = 255; *op++ = (BYTE)len; } - else *token = (length<>8) > oend)) return 0; // Check output limit - if (len>=(int)ML_MASK) { *token+=ML_MASK; len-=ML_MASK; for(; len > 509 ; len-=510) { *op++ = 255; *op++ = 255; } if (len > 254) { len-=255; *op++ = 255; } *op++ = (BYTE)len; } - else *token += len; - - // Test end of chunk - if (ip > mflimit) { anchor = ip; break; } - - // Fill table - HashTable[LZ4_HASH64K_VALUE(ip-2)] = (U16)(ip - 2 - base); - - // Test next position - ref = base + HashTable[LZ4_HASH64K_VALUE(ip)]; - HashTable[LZ4_HASH64K_VALUE(ip)] = (U16)(ip - base); - if (A32(ref) == A32(ip)) { token = op++; *token=0; goto _next_match; } - - // Prepare next loop - anchor = ip++; - forwardH = LZ4_HASH64K_VALUE(ip); - } - -_last_literals: - // Encode Last Literals - { - int lastRun = (int)(iend - anchor); - if (op + lastRun + 1 + (lastRun-RUN_MASK+255)/255 > oend) return 0; - if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK< 254 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } - else *op++ = (lastRun<>ML_BITS)) == RUN_MASK) { size_t len; for (;(len=*ip++)==255;length+=255){} length += len; } - - // copy literals - cpy = op+length; - if (unlikely(cpy>oend-COPYLENGTH)) - { - if (cpy != oend) goto _output_error; // Error : not enough place for another match (min 4) + 5 literals - memcpy(op, ip, length); - ip += length; - break; // EOF - } - LZ4_WILDCOPY(ip, op, cpy); ip -= (op-cpy); op = cpy; - - // get offset - LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2; - if (unlikely(ref < (BYTE* const)dest)) goto _output_error; // Error : offset create reference outside destination buffer - - // get matchlength - if ((length=(token&ML_MASK)) == ML_MASK) { for (;*ip==255;length+=255) {ip++;} length += *ip++; } - - // copy repeated sequence - if (unlikely((op-ref)oend-COPYLENGTH) - { - if (cpy > oend) goto _output_error; // Error : request to write beyond destination 
buffer - LZ4_SECURECOPY(ref, op, (oend-COPYLENGTH)); - while(op>ML_BITS)) == RUN_MASK) { int s=255; while ((ipoend-COPYLENGTH) || (ip+length>iend-COPYLENGTH)) - { - if (cpy > oend) goto _output_error; // Error : writes beyond output buffer - if (ip+length != iend) goto _output_error; // Error : LZ4 format requires to consume all input at this stage - memcpy(op, ip, length); - op += length; - break; // Necessarily EOF, due to parsing restrictions - } - LZ4_WILDCOPY(ip, op, cpy); ip -= (op-cpy); op = cpy; - - // get offset - LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2; - if (ref < (BYTE* const)dest) goto _output_error; // Error : offset creates reference outside of destination buffer - - // get matchlength - if ((length=(token&ML_MASK)) == ML_MASK) { while (ipoend-COPYLENGTH) - { - if (cpy > oend) goto _output_error; // Error : request to write outside of destination buffer - LZ4_SECURECOPY(ref, op, (oend-COPYLENGTH)); - while(op 0) && ((n & (n - 1)) == 0); -} - -/** - * Efficiently calculate the base-2 logarithm of a number truncated to an - * integer value. - * - * This also happens to be the bit index of the highest-order non-zero bit in - * the binary representation of the number, which can easily be used to - * calculate the bit shift corresponding to a bit mask or an array capacity, - * or to calculate the binary floor or ceiling (next lowest or highest power - * of two). - * - * @param n The input value - * - * @return the integer log2 of the value, or -1 if the value is zero - **/ -static inline int logBaseTwo(uint64_t n) -{ - if (n == 0) { - return -1; - } - // Many CPUs, including x86, directly support this calculation, so use the - // GCC function for counting the number of leading high-order zero bits. - return 63 - __builtin_clzll(n); -} - -/** - * Find the minimum of two physical block numbers. - **/ -__attribute__((warn_unused_result)) -static inline PhysicalBlockNumber minBlock(PhysicalBlockNumber a, - PhysicalBlockNumber b) -{ - return (a < b) ? a : b; -} - -/** - * Find the maximum of two physical block numbers. - **/ -__attribute__((warn_unused_result)) -static inline PhysicalBlockNumber maxBlock(PhysicalBlockNumber a, - PhysicalBlockNumber b) -{ - return (a > b) ? a : b; -} - -/** - * Find the minimum of two block counts. - **/ -__attribute__((warn_unused_result)) -static inline BlockCount minBlockCount(BlockCount a, BlockCount b) -{ - return (a < b) ? a : b; -} - -/** - * Find the maximum of two block counts. - **/ -__attribute__((warn_unused_result)) -static inline BlockCount maxBlockCount(BlockCount a, BlockCount b) -{ - return (a > b) ? a : b; -} - -/** - * Find the minimum of two sequence numbers. - **/ -__attribute__((warn_unused_result)) -static inline SequenceNumber minSequenceNumber(SequenceNumber a, - SequenceNumber b) -{ - return (a < b) ? a : b; -} - -/** - * Return the minimum of two page counts. - **/ -__attribute__((warn_unused_result)) -static inline PageCount minPageCount(PageCount a, PageCount b) -{ - return (a < b) ? a : b; -} - -/** - * Return the maximum of two page counts. - **/ -__attribute__((warn_unused_result)) -static inline PageCount maxPageCount(PageCount a, PageCount b) -{ - return (a > b) ? a : b; -} - -/** - * Round upward towards the nearest multiple of quantum. 
- * - * @param number a number - * @param quantum the quantum - * - * @return the least multiple of quantum not less than number - **/ -__attribute__((warn_unused_result)) -static inline size_t roundUpToMultipleSizeT(size_t number, size_t quantum) -{ - return number + quantum - 1 - ((number + quantum - 1) % quantum); -} - -/** - * Round upward towards the nearest multiple of quantum for uint64_t - * - * @param number a number - * @param quantum the quantum - * - * @return the least multiple of quantum not less than number - **/ -__attribute__((warn_unused_result)) -static inline uint64_t roundUpToMultipleUInt64T(uint64_t number, - uint64_t quantum) -{ - return number + quantum - 1 - ((number + quantum - 1) % quantum); -} - -/** - * Check whether the given value is between the lower and upper bounds, - * within a cyclic range of values from 0 to (modulus - 1). The value - * and both bounds must be smaller than the modulus. - * - * @param lower The lowest value to accept - * @param value The value to check - * @param upper The highest value to accept - * @param modulus The size of the cyclic space, no more than 2^15 - * - * @return true if the value is in range - **/ -static inline bool inCyclicRange(uint16_t lower, - uint16_t value, - uint16_t upper, - uint16_t modulus) -{ - if (value < lower) { - value += modulus; - } - if (upper < lower) { - upper += modulus; - } - return (value <= upper); -} - -/** - * Compute the number of buckets of a given size which are required to hold a - * given number of objects. - * - * @param objectCount The number of objects to hold - * @param bucketSize The size of a bucket - * - * @return The number of buckets required - **/ -static inline uint64_t computeBucketCount(uint64_t objectCount, - uint64_t bucketSize) -{ - uint64_t quotient = objectCount / bucketSize; - if ((objectCount % bucketSize) > 0) { - ++quotient; - } - return quotient; -} - -#endif // NUM_UTILS_H diff --git a/vdo/base/packedRecoveryJournalBlock.h b/vdo/base/packedRecoveryJournalBlock.h deleted file mode 100644 index b592225..0000000 --- a/vdo/base/packedRecoveryJournalBlock.h +++ /dev/null @@ -1,192 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
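The numeric helpers above are small enough to exercise directly; a short self-contained program shows logBaseTwo() truncating, the round-up formula landing on the next multiple, and inCyclicRange() handling a window that wraps past zero (the helpers are re-declared locally so the example stands alone):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static int log_base_two(uint64_t n)
{
  return (n == 0) ? -1 : 63 - __builtin_clzll(n);
}

static uint64_t round_up(uint64_t number, uint64_t quantum)
{
  return number + quantum - 1 - ((number + quantum - 1) % quantum);
}

static bool in_cyclic_range(uint16_t lower, uint16_t value,
                            uint16_t upper, uint16_t modulus)
{
  if (value < lower) value += modulus;
  if (upper < lower) upper += modulus;
  return (value <= upper);
}

int main(void)
{
  printf("%d %d\n", log_base_two(4096), log_base_two(4097));       /* 12 12 */
  printf("%llu\n", (unsigned long long) round_up(4097, 4096));     /* 8192 */
  /* modulus 8, window [6, 2] wraps: 7 and 1 are inside, 3 is not */
  printf("%d %d %d\n", in_cyclic_range(6, 7, 2, 8),
         in_cyclic_range(6, 1, 2, 8), in_cyclic_range(6, 3, 2, 8)); /* 1 1 0 */
  return 0;
}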
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/packedRecoveryJournalBlock.h#3 $ - */ - -#ifndef PACKED_RECOVERY_JOURNAL_BLOCK_H -#define PACKED_RECOVERY_JOURNAL_BLOCK_H - -#include "numeric.h" - -#include "constants.h" -#include "recoveryJournalEntry.h" -#include "types.h" - -typedef struct { - SequenceNumber blockMapHead; // Block map head sequence number - SequenceNumber slabJournalHead; // Slab journal head sequence number - SequenceNumber sequenceNumber; // Sequence number for this block - Nonce nonce; // A given VDO instance's nonce - BlockCount logicalBlocksUsed; // Count of logical blocks in use - BlockCount blockMapDataBlocks; // Count of allocated block map pages - JournalEntryCount entryCount; // Number of entries written - uint8_t checkByte; // The protection check byte - uint8_t recoveryCount; // The number of recoveries completed - VDOMetadataType metadataType; // Metadata type -} RecoveryBlockHeader; - -/** - * The packed, on-disk representation of a recovery journal block header. - * All fields are kept in little-endian byte order. - **/ -typedef union __attribute__((packed)) { - struct __attribute__((packed)) { - /** Block map head 64-bit sequence number */ - byte blockMapHead[8]; - - /** Slab journal head 64-bit sequence number */ - byte slabJournalHead[8]; - - /** The 64-bit sequence number for this block */ - byte sequenceNumber[8]; - - /** A given VDO instance's 64-bit nonce */ - byte nonce[8]; - - /** 8-bit metadata type (should always be one for the recovery journal) */ - uint8_t metadataType; - - /** 16-bit count of the entries encoded in the block */ - byte entryCount[2]; - - /** 64-bit count of the logical blocks used when this block was opened */ - byte logicalBlocksUsed[8]; - - /** 64-bit count of the block map blocks used when this block was opened */ - byte blockMapDataBlocks[8]; - - /** The protection check byte */ - uint8_t checkByte; - - /** The number of recoveries completed */ - uint8_t recoveryCount; - } fields; - - // A raw view of the packed encoding. - uint8_t raw[8 + 8 + 8 + 8 + 1 + 2 + 8 + 8 + 1 + 1]; - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - // This view is only valid on little-endian machines and is only present for - // ease of directly examining packed entries in GDB. - struct __attribute__((packed)) { - SequenceNumber blockMapHead; - SequenceNumber slabJournalHead; - SequenceNumber sequenceNumber; - Nonce nonce; - VDOMetadataType metadataType; - JournalEntryCount entryCount; - BlockCount logicalBlocksUsed; - BlockCount blockMapDataBlocks; - uint8_t checkByte; - uint8_t recoveryCount; - } littleEndian; -#endif -} PackedJournalHeader; - -typedef struct { - /** The protection check byte */ - uint8_t checkByte; - - /** The number of recoveries completed */ - uint8_t recoveryCount; - - /** The number of entries in this sector */ - uint8_t entryCount; - - /** Journal entries for this sector */ - PackedRecoveryJournalEntry entries[]; -} __attribute__((packed)) PackedJournalSector; - -enum { - // Allowing more than 311 entries in each block changes the math - // concerning the amortization of metadata writes and recovery speed. 
- RECOVERY_JOURNAL_ENTRIES_PER_BLOCK = 311, - /** The number of entries in each sector (except the last) when filled */ - RECOVERY_JOURNAL_ENTRIES_PER_SECTOR - = ((VDO_SECTOR_SIZE - sizeof(PackedJournalSector)) - / sizeof(PackedRecoveryJournalEntry)), - /** The number of entries in the last sector when a block is full */ - RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR - = (RECOVERY_JOURNAL_ENTRIES_PER_BLOCK - % RECOVERY_JOURNAL_ENTRIES_PER_SECTOR), -}; - -/** - * Find the recovery journal sector from the block header and sector number. - * - * @param header The header of the recovery journal block - * @param sectorNumber The index of the sector (1-based) - * - * @return A packed recovery journal sector - **/ -__attribute__((warn_unused_result)) -static inline -PackedJournalSector *getJournalBlockSector(PackedJournalHeader *header, - int sectorNumber) -{ - char *sectorData = ((char *) header) + (VDO_SECTOR_SIZE * sectorNumber); - return (PackedJournalSector *) sectorData; -} - -/** - * Generate the packed representation of a recovery block header. - * - * @param header The header containing the values to encode - * @param packed The header into which to pack the values - **/ -static inline void packRecoveryBlockHeader(const RecoveryBlockHeader *header, - PackedJournalHeader *packed) -{ - storeUInt64LE(packed->fields.blockMapHead, header->blockMapHead); - storeUInt64LE(packed->fields.slabJournalHead, header->slabJournalHead); - storeUInt64LE(packed->fields.sequenceNumber, header->sequenceNumber); - storeUInt64LE(packed->fields.nonce, header->nonce); - storeUInt64LE(packed->fields.logicalBlocksUsed, header->logicalBlocksUsed); - storeUInt64LE(packed->fields.blockMapDataBlocks, header->blockMapDataBlocks); - storeUInt16LE(packed->fields.entryCount, header->entryCount); - - packed->fields.checkByte = header->checkByte; - packed->fields.recoveryCount = header->recoveryCount; - packed->fields.metadataType = header->metadataType; -} - -/** - * Decode the packed representation of a recovery block header. - * - * @param packed The packed header to decode - * @param header The header into which to unpack the values - **/ -static inline void unpackRecoveryBlockHeader(const PackedJournalHeader *packed, - RecoveryBlockHeader *header) -{ - *header = (RecoveryBlockHeader) { - .blockMapHead = getUInt64LE(packed->fields.blockMapHead), - .slabJournalHead = getUInt64LE(packed->fields.slabJournalHead), - .sequenceNumber = getUInt64LE(packed->fields.sequenceNumber), - .nonce = getUInt64LE(packed->fields.nonce), - .logicalBlocksUsed = getUInt64LE(packed->fields.logicalBlocksUsed), - .blockMapDataBlocks = getUInt64LE(packed->fields.blockMapDataBlocks), - .entryCount = getUInt16LE(packed->fields.entryCount), - .checkByte = packed->fields.checkByte, - .recoveryCount = packed->fields.recoveryCount, - .metadataType = packed->fields.metadataType, - }; -} - -#endif // PACKED_RECOVERY_JOURNAL_BLOCK_H diff --git a/vdo/base/packer.c b/vdo/base/packer.c deleted file mode 100644 index efb4dd4..0000000 --- a/vdo/base/packer.c +++ /dev/null @@ -1,1023 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/packer.c#8 $ - */ - -#include "packerInternals.h" - -#include "logger.h" -#include "memoryAlloc.h" - -#include "adminState.h" -#include "allocatingVIO.h" -#include "allocationSelector.h" -#include "compressionState.h" -#include "dataVIO.h" -#include "hashLock.h" -#include "pbnLock.h" -#include "vdo.h" -#include "vdoInternal.h" - -/** - * Check that we are on the packer thread. - * - * @param packer The packer - * @param caller The function which is asserting - **/ -static inline void assertOnPackerThread(Packer *packer, const char *caller) -{ - ASSERT_LOG_ONLY((getCallbackThreadID() == packer->threadID), - "%s() called from packer thread", caller); -} - -/**********************************************************************/ -__attribute__((warn_unused_result)) -static inline InputBin *inputBinFromRingNode(RingNode *node) -{ - STATIC_ASSERT(offsetof(InputBin, ring) == 0); - return (InputBin *) node; -} - -/**********************************************************************/ -__attribute__((warn_unused_result)) -static inline OutputBin *outputBinFromRingNode(RingNode *node) -{ - STATIC_ASSERT(offsetof(OutputBin, ring) == 0); - return (OutputBin *) node; -} - -/**********************************************************************/ -InputBin *nextBin(const Packer *packer, InputBin *bin) -{ - if (bin->ring.next == &packer->inputBins) { - return NULL; - } else { - return inputBinFromRingNode(bin->ring.next); - } -} - -/**********************************************************************/ -InputBin *getFullestBin(const Packer *packer) -{ - if (isRingEmpty(&packer->inputBins)) { - return NULL; - } else { - return inputBinFromRingNode(packer->inputBins.next); - } -} - -/** - * Insert an input bin to the list, which is in ascending order of free space. - * Since all bins are already in the list, this actually moves the bin to the - * correct position in the list. - * - * @param packer The packer - * @param bin The input bin to move to its sorted position - **/ -static void insertInSortedList(Packer *packer, InputBin *bin) -{ - for (InputBin *activeBin = getFullestBin(packer); - activeBin != NULL; - activeBin = nextBin(packer, activeBin)) { - if (activeBin->freeSpace > bin->freeSpace) { - pushRingNode(&activeBin->ring, &bin->ring); - return; - } - } - - pushRingNode(&packer->inputBins, &bin->ring); -} - -/** - * Allocate an input bin and put it into the packer's list. - * - * @param packer The packer - **/ -__attribute__((warn_unused_result)) -static int makeInputBin(Packer *packer) -{ - InputBin *bin; - int result = ALLOCATE_EXTENDED(InputBin, MAX_COMPRESSION_SLOTS, VIO *, - __func__, &bin); - if (result != VDO_SUCCESS) { - return result; - } - - bin->freeSpace = packer->binDataSize; - initializeRing(&bin->ring); - pushRingNode(&packer->inputBins, &bin->ring); - return VDO_SUCCESS; -} - -/** - * Push an output bin onto the stack of idle bins. 
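insertInSortedList() above keeps the input bins ordered by ascending free space, so getFullestBin() can always take the head of the list. A self-contained analogue using a singly linked list instead of VDO's RingNode (it inserts a new node rather than re-sorting an existing member; names are illustrative):

#include <stddef.h>

typedef struct bin {
  struct bin *next;
  size_t      free_space;
} bin;

/* Insert so the list stays ordered by ascending free space: the fullest bin
 * (least free space) remains at the head, ready to be picked first. */
static void insert_sorted(bin **head, bin *node)
{
  bin **link = head;
  while ((*link != NULL) && ((*link)->free_space <= node->free_space)) {
    link = &(*link)->next;
  }
  node->next = *link;
  *link = node;
}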
- * - * @param packer The packer - * @param bin The output bin - **/ -static void pushOutputBin(Packer *packer, OutputBin *bin) -{ - ASSERT_LOG_ONLY(!hasWaiters(&bin->outgoing), - "idle output bin has no waiters"); - packer->idleOutputBins[packer->idleOutputBinCount++] = bin; -} - -/** - * Pop an output bin off the end of the stack of idle bins. - * - * @param packer The packer - * - * @return an idle output bin, or NULL if there are no idle bins - **/ -__attribute__((warn_unused_result)) -static OutputBin *popOutputBin(Packer *packer) -{ - if (packer->idleOutputBinCount == 0) { - return NULL; - } - - size_t index = --packer->idleOutputBinCount; - OutputBin *bin = packer->idleOutputBins[index]; - packer->idleOutputBins[index] = NULL; - return bin; -} - -/** - * Allocate a new output bin and push it onto the packer's stack of idle bins. - * - * @param packer The packer - * @param layer The physical layer that will receive the compressed block - * writes from the output bin - * - * @return VDO_SUCCESS or an error code - **/ -__attribute__((warn_unused_result)) -static int makeOutputBin(Packer *packer, PhysicalLayer *layer) -{ - OutputBin *output; - int result = ALLOCATE(1, OutputBin, __func__, &output); - if (result != VDO_SUCCESS) { - return result; - } - - // Add the bin to the stack even before it's fully initialized so it will - // be freed even if we fail to initialize it below. - initializeRing(&output->ring); - pushRingNode(&packer->outputBins, &output->ring); - pushOutputBin(packer, output); - - result = ALLOCATE_EXTENDED(CompressedBlock, packer->binDataSize, char, - "compressed block", &output->block); - if (result != VDO_SUCCESS) { - return result; - } - - return layer->createCompressedWriteVIO(layer, output, (char *) output->block, - &output->writer); -} - -/** - * Free an idle output bin and null out the reference to it. - * - * @param binPtr The reference to the output bin to free - **/ -static void freeOutputBin(OutputBin **binPtr) -{ - OutputBin *bin = *binPtr; - if (bin == NULL) { - return; - } - - unspliceRingNode(&bin->ring); - - VIO *vio = allocatingVIOAsVIO(bin->writer); - freeVIO(&vio); - FREE(bin->block); - FREE(bin); - *binPtr = NULL; -} - -/**********************************************************************/ -int makePacker(PhysicalLayer *layer, - BlockCount inputBinCount, - BlockCount outputBinCount, - const ThreadConfig *threadConfig, - Packer **packerPtr) -{ - Packer *packer; - int result = ALLOCATE_EXTENDED(Packer, outputBinCount, - OutputBin *, __func__, &packer); - if (result != VDO_SUCCESS) { - return result; - } - - packer->threadID = getPackerZoneThread(threadConfig); - packer->binDataSize = VDO_BLOCK_SIZE - sizeof(CompressedBlockHeader); - packer->size = inputBinCount; - packer->maxSlots = MAX_COMPRESSION_SLOTS; - packer->outputBinCount = outputBinCount; - initializeRing(&packer->inputBins); - initializeRing(&packer->outputBins); - - result = makeAllocationSelector(threadConfig->physicalZoneCount, - packer->threadID, &packer->selector); - if (result != VDO_SUCCESS) { - freePacker(&packer); - return result; - } - - for (BlockCount i = 0; i < inputBinCount; i++) { - int result = makeInputBin(packer); - if (result != VDO_SUCCESS) { - freePacker(&packer); - return result; - } - } - - /* - * The canceled bin can hold up to half the number of user VIOs. Every - * canceled VIO in the bin must have a canceler for which it is waiting, and - * any canceler will only have canceled one lock holder at a time. 
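pushOutputBin() and popOutputBin() above manage the idle output bins as a simple LIFO stack backed by an array and a count, so the most recently idled bin is reused first. A self-contained sketch of that discipline (illustrative types, not the VDO ones):

#include <stddef.h>

enum { MAX_IDLE_BINS = 16 };

typedef struct { char *block; } output_bin;

typedef struct {
  output_bin *idle[MAX_IDLE_BINS];
  size_t      idle_count;
} idle_stack;

static void push_idle(idle_stack *stack, output_bin *bin)
{
  stack->idle[stack->idle_count++] = bin;
}

/* Returns NULL when every output bin is busy writing. */
static output_bin *pop_idle(idle_stack *stack)
{
  if (stack->idle_count == 0) {
    return NULL;
  }
  output_bin *bin = stack->idle[--stack->idle_count];
  stack->idle[stack->idle_count] = NULL;
  return bin;
}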
- */ - result = ALLOCATE_EXTENDED(InputBin, MAXIMUM_USER_VIOS / 2, VIO *, __func__, - &packer->canceledBin); - if (result != VDO_SUCCESS) { - freePacker(&packer); - return result; - } - - for (BlockCount i = 0; i < outputBinCount; i++) { - int result = makeOutputBin(packer, layer); - if (result != VDO_SUCCESS) { - freePacker(&packer); - return result; - } - } - - *packerPtr = packer; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freePacker(Packer **packerPtr) -{ - Packer *packer = *packerPtr; - if (packer == NULL) { - return; - } - - InputBin *input; - while ((input = getFullestBin(packer)) != NULL) { - unspliceRingNode(&input->ring); - FREE(input); - } - - FREE(packer->canceledBin); - - OutputBin *output; - while ((output = popOutputBin(packer)) != NULL) { - freeOutputBin(&output); - } - - freeAllocationSelector(&packer->selector); - FREE(packer); - *packerPtr = NULL; -} - -/** - * Get the Packer from a DataVIO. - * - * @param dataVIO The DataVIO - * - * @return The Packer from the VDO to which the DataVIO belongs - **/ -static inline Packer *getPackerFromDataVIO(DataVIO *dataVIO) -{ - return getVDOFromDataVIO(dataVIO)->packer; -} - -/**********************************************************************/ -bool isSufficientlyCompressible(DataVIO *dataVIO) -{ - Packer *packer = getPackerFromDataVIO(dataVIO); - return (dataVIO->compression.size < packer->binDataSize); -} - -/**********************************************************************/ -ThreadID getPackerThreadID(Packer *packer) -{ - return packer->threadID; -} - -/**********************************************************************/ -PackerStatistics getPackerStatistics(const Packer *packer) -{ - /* - * This is called from getVDOStatistics(), which is called from outside the - * packer thread. These are just statistics with no semantics that could - * rely on memory order, so unfenced reads are sufficient. - */ - return (PackerStatistics) { - .compressedFragmentsWritten = relaxedLoad64(&packer->fragmentsWritten), - .compressedBlocksWritten = relaxedLoad64(&packer->blocksWritten), - .compressedFragmentsInPacker = relaxedLoad64(&packer->fragmentsPending), - }; -} - -/** - * Abort packing a DataVIO. - * - * @param dataVIO The DataVIO to abort - **/ -static void abortPacking(DataVIO *dataVIO) -{ - setCompressionDone(dataVIO); - relaxedAdd64(&getPackerFromDataVIO(dataVIO)->fragmentsPending, -1); - dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); - continueDataVIO(dataVIO, VDO_SUCCESS); -} - -/** - * This continues the VIO completion without packing the VIO. - * - * @param waiter The wait queue entry of the VIO to continue - * @param unused An argument required so this function may be called - * from notifyAllWaiters - **/ -static void continueVIOWithoutPacking(Waiter *waiter, - void *unused __attribute__((unused))) -{ - abortPacking(waiterAsDataVIO(waiter)); -} - -/** - * Check whether the packer has drained. - * - * @param packer The packer - **/ -static void checkForDrainComplete(Packer *packer) -{ - if (isDraining(&packer->state) - && (packer->canceledBin->slotsUsed == 0) - && (packer->idleOutputBinCount == packer->outputBinCount)) { - finishDraining(&packer->state); - } -} - -/**********************************************************************/ -static void writePendingBatches(Packer *packer); - -/** - * Ensure that a completion is running on the packer thread. 
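- * If it is not, the completion's callback thread is switched to the packer thread and the completion is re-invoked there.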
- * - * @param completion The compressed write VIO - * - * @return true if the completion is on the packer thread - **/ -__attribute__((warn_unused_result)) -static bool switchToPackerThread(VDOCompletion *completion) -{ - VIO *vio = asVIO(completion); - ThreadID threadID = vio->vdo->packer->threadID; - if (completion->callbackThreadID == threadID) { - return true; - } - - completion->callbackThreadID = threadID; - invokeCallback(completion); - return false; -} - -/** - * Finish processing an output bin whose write has completed. If there was - * an error, any DataVIOs waiting on the bin write will be notified. - * - * @param packer The packer which owns the bin - * @param bin The bin which has finished - **/ -static void finishOutputBin(Packer *packer, OutputBin *bin) -{ - if (hasWaiters(&bin->outgoing)) { - notifyAllWaiters(&bin->outgoing, continueVIOWithoutPacking, NULL); - } else { - // No waiters implies no error, so the compressed block was written. - relaxedAdd64(&packer->fragmentsPending, -bin->slotsUsed); - relaxedAdd64(&packer->fragmentsWritten, bin->slotsUsed); - relaxedAdd64(&packer->blocksWritten, 1); - } - - bin->slotsUsed = 0; - pushOutputBin(packer, bin); -} - -/** - * This finishes the bin write process after the bin is written to disk. This - * is the VIO callback function registered by writeOutputBin(). - * - * @param completion The compressed write VIO - **/ -static void completeOutputBin(VDOCompletion *completion) -{ - if (!switchToPackerThread(completion)) { - return; - } - - VIO *vio = asVIO(completion); - if (completion->result != VDO_SUCCESS) { - updateVIOErrorStats(vio, - "Completing compressed write VIO for physical block %" - PRIu64 " with error", - vio->physical); - } - - Packer *packer = vio->vdo->packer; - finishOutputBin(packer, completion->parent); - writePendingBatches(packer); - checkForDrainComplete(packer); -} - -/** - * Implements WaiterCallback. Continues the DataVIO waiter. - **/ -static void continueWaiter(Waiter *waiter, - void *context __attribute__((unused))) -{ - DataVIO *dataVIO = waiterAsDataVIO(waiter); - continueDataVIO(dataVIO, VDO_SUCCESS); -} - -/** - * Implements WaiterCallback. Updates the DataVIO waiter to refer to its slot - * in the compressed block, gives the DataVIO a share of the PBN lock on that - * block, and reserves a reference count increment on the lock. - **/ -static void shareCompressedBlock(Waiter *waiter, void *context) -{ - DataVIO *dataVIO = waiterAsDataVIO(waiter); - OutputBin *bin = context; - - dataVIO->newMapped = (ZonedPBN) { - .pbn = bin->writer->allocation, - .zone = bin->writer->zone, - .state = getStateForSlot(dataVIO->compression.slot), - }; - dataVIOAsVIO(dataVIO)->physical = dataVIO->newMapped.pbn; - - shareCompressedWriteLock(dataVIO, bin->writer->allocationLock); - - // Wait again for all the waiters to get a share. - int result = enqueueWaiter(&bin->outgoing, waiter); - // Cannot fail since this waiter was just dequeued. - ASSERT_LOG_ONLY(result == VDO_SUCCESS, "impossible enqueueWaiter error"); -} - -/** - * Finish a compressed block write. This callback is registered in - * continueAfterAllocation(). - * - * @param completion The compressed write completion - **/ -static void finishCompressedWrite(VDOCompletion *completion) -{ - OutputBin *bin = completion->parent; - assertInPhysicalZone(bin->writer); - - if (completion->result != VDO_SUCCESS) { - releaseAllocationLock(bin->writer); - // Invokes completeOutputBin() on the packer thread, which will deal with - // the waiters. 
- vioDoneCallback(completion); - return; - } - - // First give every DataVIO/HashLock a share of the PBN lock to ensure it - // can't be released until they've all done their incRefs. - notifyAllWaiters(&bin->outgoing, shareCompressedBlock, bin); - - // The waiters now hold the (downgraded) PBN lock. - bin->writer->allocationLock = NULL; - - // Invokes the callbacks registered before entering the packer. - notifyAllWaiters(&bin->outgoing, continueWaiter, NULL); - - // Invokes completeOutputBin() on the packer thread. - vioDoneCallback(completion); -} - -/** - * Continue the write path for a compressed write AllocatingVIO now that block - * allocation is complete (the AllocatingVIO may or may not have actually - * received an allocation). - * - * @param allocatingVIO The AllocatingVIO which has finished the allocation - * process - **/ -static void continueAfterAllocation(AllocatingVIO *allocatingVIO) -{ - VIO *vio = allocatingVIOAsVIO(allocatingVIO); - VDOCompletion *completion = vioAsCompletion(vio); - if (allocatingVIO->allocation == ZERO_BLOCK) { - completion->requeue = true; - setCompletionResult(completion, VDO_NO_SPACE); - vioDoneCallback(completion); - return; - } - - setPhysicalZoneCallback(allocatingVIO, finishCompressedWrite, - THIS_LOCATION("$F(meta);cb=finishCompressedWrite")); - completion->layer->writeCompressedBlock(allocatingVIO); -} - -/** - * Launch an output bin. - * - * @param packer The packer which owns the bin - * @param bin The output bin to launch - **/ -static void launchCompressedWrite(Packer *packer, OutputBin *bin) -{ - if (isReadOnly(getVDOFromAllocatingVIO(bin->writer)->readOnlyNotifier)) { - finishOutputBin(packer, bin); - return; - } - - VIO *vio = allocatingVIOAsVIO(bin->writer); - resetCompletion(vioAsCompletion(vio)); - vio->callback = completeOutputBin; - vio->priority = VIO_PRIORITY_COMPRESSED_DATA; - allocateDataBlock(bin->writer, packer->selector, VIO_COMPRESSED_WRITE_LOCK, - continueAfterAllocation); -} - -/** - * Consume from the pending queue the next batch of VIOs that can be packed - * together in a single compressed block. VIOs that have been mooted since - * being placed in the pending queue will not be returned. - * - * @param packer The packer - * @param batch The counted array to fill with the next batch of VIOs - **/ -static void getNextBatch(Packer *packer, OutputBatch *batch) -{ - BlockSize spaceRemaining = packer->binDataSize; - batch->slotsUsed = 0; - - DataVIO *dataVIO; - while ((dataVIO = waiterAsDataVIO(getFirstWaiter(&packer->batchedDataVIOs))) - != NULL) { - // If there's not enough space for the next DataVIO, the batch is done. - if ((dataVIO->compression.size > spaceRemaining) - || (batch->slotsUsed == packer->maxSlots)) { - break; - } - - // Remove the next DataVIO from the queue and put it in the output batch. - dequeueNextWaiter(&packer->batchedDataVIOs); - batch->slots[batch->slotsUsed++] = dataVIO; - spaceRemaining -= dataVIO->compression.size; - } -} - -/** - * Pack the next batch of compressed VIOs from the batched queue into an - * output bin and write the output bin. - * - * @param packer The packer - * @param output The output bin to fill - * - * @return true if a write was issued for the output bin - **/ -__attribute__((warn_unused_result)) -static bool writeNextBatch(Packer *packer, OutputBin *output) -{ - OutputBatch batch; - getNextBatch(packer, &batch); - - if (batch.slotsUsed == 0) { - // The pending queue must now be empty (there may have been mooted VIOs). 
- return false; - } - - // If the batch contains only a single VIO, then we save nothing by saving - // the compressed form. Continue processing the single VIO in the batch. - if (batch.slotsUsed == 1) { - abortPacking(batch.slots[0]); - return false; - } - - resetCompressedBlockHeader(&output->block->header); - - size_t spaceUsed = 0; - for (SlotNumber slot = 0; slot < batch.slotsUsed; slot++) { - DataVIO *dataVIO = batch.slots[slot]; - dataVIO->compression.slot = slot; - putCompressedBlockFragment(output->block, slot, spaceUsed, - dataVIO->compression.data, - dataVIO->compression.size); - spaceUsed += dataVIO->compression.size; - - int result = enqueueDataVIO(&output->outgoing, dataVIO, - THIS_LOCATION(NULL)); - if (result != VDO_SUCCESS) { - abortPacking(dataVIO); - continue; - } - - output->slotsUsed += 1; - } - - launchCompressedWrite(packer, output); - return true; -} - -/** - * Put a DataVIO in a specific InputBin in which it will definitely fit. - * - * @param bin The bin in which to put the DataVIO - * @param dataVIO The DataVIO to add - **/ -static void addToInputBin(InputBin *bin, DataVIO *dataVIO) -{ - dataVIO->compression.bin = bin; - dataVIO->compression.slot = bin->slotsUsed; - bin->incoming[bin->slotsUsed++] = dataVIO; -} - -/** - * Start a new batch of VIOs in an InputBin, moving the existing batch, if - * any, to the queue of pending batched VIOs in the packer. - * - * @param packer The packer - * @param bin The bin to prepare - **/ -static void startNewBatch(Packer *packer, InputBin *bin) -{ - // Move all the DataVIOs in the current batch to the batched queue so they - // will get packed into the next free output bin. - for (SlotNumber slot = 0; slot < bin->slotsUsed; slot++) { - DataVIO *dataVIO = bin->incoming[slot]; - dataVIO->compression.bin = NULL; - - if (!mayWriteCompressedDataVIO(dataVIO)) { - /* - * Compression of this DataVIO was canceled while it was waiting; put it - * in the canceled bin so it can rendezvous with the canceling - * DataVIO. - */ - addToInputBin(packer->canceledBin, dataVIO); - continue; - } - - int result = enqueueDataVIO(&packer->batchedDataVIOs, dataVIO, - THIS_LOCATION(NULL)); - if (result != VDO_SUCCESS) { - // Impossible but we're required to check the result from enqueue. - abortPacking(dataVIO); - } - } - - // The bin is now empty. - bin->slotsUsed = 0; - bin->freeSpace = packer->binDataSize; -} - -/** - * Add a DataVIO to a bin's incoming queue, update the bin's free space, and - * restore the bin's position in the freeSpace-sorted list. - * - * @param packer The packer - * @param bin The bin to which to add the DataVIO - * @param dataVIO The DataVIO to add to the bin's queue - **/ -static void addDataVIOToInputBin(Packer *packer, - InputBin *bin, - DataVIO *dataVIO) -{ - // If the selected bin doesn't have room, start a new batch to make room. - if (bin->freeSpace < dataVIO->compression.size) { - startNewBatch(packer, bin); - } - - addToInputBin(bin, dataVIO); - bin->freeSpace -= dataVIO->compression.size; - - // If we happen to exactly fill the bin, start a new input batch. - if ((bin->slotsUsed == packer->maxSlots) || (bin->freeSpace == 0)) { - startNewBatch(packer, bin); - } - - // Now that we've finished changing the free space, restore the sort order. - insertInSortedList(packer, bin); -} - -/** - * Move DataVIOs in pending batches from the batchedDataVIOs to all free output - * bins, issuing writes for the output bins as they are packed.
This will loop - * until either the pending queue is drained or all output bins are busy - * writing a compressed block. - * - * @param packer The packer - **/ -static void writePendingBatches(Packer *packer) -{ - if (packer->writingBatches) { - /* - * We've attempted to re-enter this function recursively due to completion - * handling, which can lead to kernel stack overflow as in VDO-1340. It's - * perfectly safe to break the recursion and do nothing since we know any - * pending batches will eventually be handled by the earlier call. - */ - return; - } - - // Record that we are in this function for the above check. IMPORTANT: never - // return from this function without clearing this flag. - packer->writingBatches = true; - - OutputBin *output; - while (hasWaiters(&packer->batchedDataVIOs) - && ((output = popOutputBin(packer)) != NULL)) { - if (!writeNextBatch(packer, output)) { - // We didn't use the output bin to write, so push it back on the stack. - pushOutputBin(packer, output); - } - } - - packer->writingBatches = false; -} - -/** - * Select the input bin that should be used to pack the compressed data in a - * DataVIO with other DataVIOs. - * - * @param packer The packer - * @param dataVIO The DataVIO - **/ -__attribute__((warn_unused_result)) -static InputBin *selectInputBin(Packer *packer, DataVIO *dataVIO) -{ - // First best fit: select the bin with the least free space that has enough - // room for the compressed data in the DataVIO. - InputBin *fullestBin = getFullestBin(packer); - for (InputBin *bin = fullestBin; bin != NULL; bin = nextBin(packer, bin)) { - if (bin->freeSpace >= dataVIO->compression.size) { - return bin; - } - } - - /* - * None of the bins have enough space for the DataVIO. We're not allowed to - * create new bins, so we have to overflow one of the existing bins. It's - * pretty intuitive to select the fullest bin, since that "wastes" the least - * amount of free space in the compressed block. But if the space currently - * used in the fullest bin is smaller than the compressed size of the - * incoming block, it seems wrong to force that bin to write when giving up - * on compressing the incoming DataVIO would likewise "waste" the least - * amount of free space. - */ - if (dataVIO->compression.size - >= (packer->binDataSize - fullestBin->freeSpace)) { - return NULL; - } - - // The fullest bin doesn't have room, but writing it out and starting a new - // batch with the incoming DataVIO will increase the packer's free space. - return fullestBin; -} - -/**********************************************************************/ -void attemptPacking(DataVIO *dataVIO) -{ - Packer *packer = getPackerFromDataVIO(dataVIO); - assertOnPackerThread(packer, __func__); - - VIOCompressionState state = getCompressionState(dataVIO); - int result = ASSERT((state.status == VIO_COMPRESSING), - "attempt to pack DataVIO not ready for packing, state: " - "%u", - state.status); - if (result != VDO_SUCCESS) { - return; - } - - /* - * Increment the counter whether or not this DataVIO will be packed, since - * abortPacking() always decrements it. - */ - relaxedAdd64(&packer->fragmentsPending, 1); - - // If packing of this DataVIO is disallowed for administrative reasons, give - // up before making any state changes.
- if (!isNormal(&packer->state) - || (dataVIO->flushGeneration < packer->flushGeneration)) { - abortPacking(dataVIO); - return; - } - - /* - * The check of mayBlockInPacker() here will set the DataVIO's compression - * state to VIO_PACKING if the DataVIO is allowed to be compressed (if it has - * already been canceled, we'll fall out here). Once the DataVIO is in the - * VIO_PACKING state, it must be guaranteed to be put in an input bin before - * any more requests can be processed by the packer thread. Otherwise, a - * canceling DataVIO could attempt to remove the canceled DataVIO from the - * packer and fail to rendezvous with it (VDO-2809). We must also make sure - * that we will actually bin the DataVIO and not give up on it as being - * larger than the space used in the fullest bin. Hence we must call - * selectInputBin() before calling mayBlockInPacker() (VDO-2826). - */ - InputBin *bin = selectInputBin(packer, dataVIO); - if ((bin == NULL) || !mayBlockInPacker(dataVIO)) { - abortPacking(dataVIO); - return; - } - - addDataVIOToInputBin(packer, bin, dataVIO); - writePendingBatches(packer); -} - -/** - * Force a pending write for all non-empty bins on behalf of a flush or - * suspend. - * - * @param packer The packer being flushed - **/ -static void writeAllNonEmptyBins(Packer *packer) -{ - for (InputBin *bin = getFullestBin(packer); - bin != NULL; - bin = nextBin(packer, bin)) { - startNewBatch(packer, bin); - // We don't need to re-sort the bin here since this loop will make every - // bin have the same amount of free space, so every ordering is sorted. - } - - writePendingBatches(packer); -} - -/**********************************************************************/ -void flushPacker(Packer *packer) -{ - assertOnPackerThread(packer, __func__); - if (isNormal(&packer->state)) { - writeAllNonEmptyBins(packer); - } -} - -/* - * This method is only exposed for unit tests and should not normally be called - * directly; use removeLockHolderFromPacker() instead. - */ -void removeFromPacker(DataVIO *dataVIO) -{ - InputBin *bin = dataVIO->compression.bin; - ASSERT_LOG_ONLY((bin != NULL), "DataVIO in packer has an input bin"); - - SlotNumber slot = dataVIO->compression.slot; - bin->slotsUsed--; - if (slot < bin->slotsUsed) { - bin->incoming[slot] = bin->incoming[bin->slotsUsed]; - bin->incoming[slot]->compression.slot = slot; - } - - dataVIO->compression.bin = NULL; - dataVIO->compression.slot = 0; - - Packer *packer = getPackerFromDataVIO(dataVIO); - if (bin != packer->canceledBin) { - bin->freeSpace += dataVIO->compression.size; - insertInSortedList(packer, bin); - } - - abortPacking(dataVIO); - checkForDrainComplete(packer); -} - -/**********************************************************************/ -void removeLockHolderFromPacker(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInPackerZone(dataVIO); - - DataVIO *lockHolder = dataVIO->compression.lockHolder; - dataVIO->compression.lockHolder = NULL; - removeFromPacker(lockHolder); -} - -/**********************************************************************/ -void incrementPackerFlushGeneration(Packer *packer) -{ - assertOnPackerThread(packer, __func__); - packer->flushGeneration++; - flushPacker(packer); -} - -/** - * Initiate a drain. - * - * Implements AdminInitiator. 
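- * - * @param state The AdminState embedded in the packer being drained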
- **/ -static void initiateDrain(AdminState *state) -{ - Packer *packer = container_of(state, Packer, state); - writeAllNonEmptyBins(packer); - checkForDrainComplete(packer); -} - -/**********************************************************************/ -void drainPacker(Packer *packer, VDOCompletion *completion) -{ - assertOnPackerThread(packer, __func__); - startDraining(&packer->state, ADMIN_STATE_SUSPENDING, completion, - initiateDrain); -} - -/**********************************************************************/ -void resumePacker(Packer *packer, VDOCompletion *parent) -{ - assertOnPackerThread(packer, __func__); - finishCompletion(parent, resumeIfQuiescent(&packer->state)); -} - -/**********************************************************************/ -void resetSlotCount(Packer *packer, CompressedFragmentCount slots) -{ - if (slots > MAX_COMPRESSION_SLOTS) { - return; - } - - packer->maxSlots = slots; -} - -/**********************************************************************/ -static void dumpInputBin(const InputBin *bin, bool canceled) -{ - if (bin->slotsUsed == 0) { - // Don't dump empty input bins. - return; - } - - logInfo(" %sBin slotsUsed=%u freeSpace=%zu", - (canceled ? "Canceled" : "Input"), bin->slotsUsed, bin->freeSpace); - - // XXX dump VIOs in bin->incoming? The VIOs should have been dumped from the - // VIO pool. Maybe just dump their addresses so it's clear they're here? -} - -/**********************************************************************/ -static void dumpOutputBin(const OutputBin *bin) -{ - size_t count = countWaiters(&bin->outgoing); - if (bin->slotsUsed == 0) { - // Don't dump empty output bins. - return; - } - - logInfo(" OutputBin contains %zu outgoing waiters", count); - - // XXX dump VIOs in bin->outgoing? The VIOs should have been dumped from the - // VIO pool. Maybe just dump their addresses so it's clear they're here? - - // XXX dump writer VIO? -} - -/**********************************************************************/ -void dumpPacker(const Packer *packer) -{ - logInfo("Packer"); - logInfo(" flushGeneration=%llu state %s writingBatches=%s", - packer->flushGeneration, getAdminStateName(&packer->state), - boolToString(packer->writingBatches)); - - logInfo(" inputBinCount=%llu", packer->size); - for (InputBin *bin = getFullestBin(packer); - bin != NULL; - bin = nextBin(packer, bin)) { - dumpInputBin(bin, false); - } - - dumpInputBin(packer->canceledBin, true); - - logInfo(" outputBinCount=%zu idleOutputBinCount=%zu", - packer->outputBinCount, packer->idleOutputBinCount); - const RingNode *head = &packer->outputBins; - for (RingNode *node = head->next; node != head; node = node->next) { - dumpOutputBin(outputBinFromRingNode(node)); - } -} diff --git a/vdo/base/packer.h b/vdo/base/packer.h deleted file mode 100644 index 6661552..0000000 --- a/vdo/base/packer.h +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/packer.h#3 $ - */ - -#ifndef PACKER_H -#define PACKER_H - -#include "completion.h" -#include "physicalLayer.h" -#include "statistics.h" -#include "threadConfig.h" -#include "types.h" - -enum { - DEFAULT_PACKER_INPUT_BINS = 16, - DEFAULT_PACKER_OUTPUT_BINS = 256, -}; - -typedef struct packer Packer; - -/** - * Make a new block packer. - * - * @param [in] layer The physical layer to which compressed blocks - * will be written - * @param [in] inputBinCount The number of partial bins to keep in memory - * @param [in] outputBinCount The number of compressed blocks that can be - * written concurrently - * @param [in] threadConfig The thread configuration of the VDO - * @param [out] packerPtr A pointer to hold the new packer - * - * @return VDO_SUCCESS or an error - **/ -int makePacker(PhysicalLayer *layer, - BlockCount inputBinCount, - BlockCount outputBinCount, - const ThreadConfig *threadConfig, - Packer **packerPtr) - __attribute__((warn_unused_result)); - -/** - * Free a block packer and null out the reference to it. - * - * @param packerPtr A pointer to the packer to free - **/ -void freePacker(Packer **packerPtr); - -/** - * Check whether the compressed data in a DataVIO will fit in a packer bin. - * - * @param dataVIO The DataVIO - * - * @return true if the DataVIO will fit in a bin - **/ -bool isSufficientlyCompressible(DataVIO *dataVIO) - __attribute__((warn_unused_result)); - -/** - * Get the thread ID of the packer's zone. - * - * @param packer The packer - * - * @return The packer's thread ID - **/ -ThreadID getPackerThreadID(Packer *packer); - -/** - * Get the current statistics from the packer. - * - * @param packer The packer to query - * - * @return a copy of the current statistics for the packer - **/ -PackerStatistics getPackerStatistics(const Packer *packer) - __attribute__((warn_unused_result)); - -/** - * Attempt to rewrite the data in this DataVIO as part of a compressed block. - * - * @param dataVIO The DataVIO to pack - **/ -void attemptPacking(DataVIO *dataVIO); - -/** - * Request that the packer flush asynchronously. All bins with at least two - * compressed data blocks will be written out, and any solitary pending VIOs - * will be released from the packer. While flushing is in progress, any VIOs - * submitted to attemptPacking() will be continued immediately without - * attempting to pack them. - * - * @param packer The packer to flush - **/ -void flushPacker(Packer *packer); - -/** - * Remove a lock holder from the packer. - * - * @param completion The DataVIO which needs a lock held by a DataVIO in the - * packer. The dataVIO's compressedVIO.lockHolder field will - * point to the DataVIO to remove. - **/ -void removeLockHolderFromPacker(VDOCompletion *completion); - -/** - * Increment the flush generation in the packer. This will also cause the - * packer to flush so that any VIOs from previous generations will exit the - * packer. - * - * @param packer The packer - **/ -void incrementPackerFlushGeneration(Packer *packer); - -/** - * Drain the packer by preventing any more VIOs from entering the packer and - * then flushing. 
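- * The drain completes once the canceled bin is empty and every output bin is idle.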
- * - * @param packer The packer to drain - * @param completion The completion to finish when the packer has drained - **/ -void drainPacker(Packer *packer, VDOCompletion *completion); - -/** - * Resume a packer which has been suspended. - * - * @param packer The packer to resume - * @param parent The completion to finish when the packer has resumed - * - * @return VDO_SUCCESS or an error - **/ -void resumePacker(Packer *packer, VDOCompletion *parent); - -/** - * Dump the packer, in a thread-unsafe fashion. - * - * @param packer The packer - **/ -void dumpPacker(const Packer *packer); - -#endif /* PACKER_H */ diff --git a/vdo/base/packerInternals.h b/vdo/base/packerInternals.h deleted file mode 100644 index e5aa500..0000000 --- a/vdo/base/packerInternals.h +++ /dev/null @@ -1,164 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/packerInternals.h#4 $ - */ - -#ifndef PACKER_INTERNALS_H -#define PACKER_INTERNALS_H - -#include "packer.h" - -#include "atomic.h" - -#include "adminState.h" -#include "compressedBlock.h" -#include "header.h" -#include "types.h" -#include "waitQueue.h" - -/** - * Each InputBin holds an incomplete batch of DataVIOs that only partially fill - * a compressed block. The InputBins are kept in a ring sorted by the amount of - * unused space so the first bin with enough space to hold a newly-compressed - * DataVIO can easily be found. When the bin fills up or is flushed, the - * incoming DataVIOs are moved to the Packer's batchedDataVIOs queue, from - * which they will eventually be routed to an idle OutputBin. - * - * There is one special input bin which is used to hold DataVIOs which have - * been canceled and removed from their input bin by the packer. These DataVIOs - * need to wait for the canceller to rendezvous with them (VDO-2809) and so - * they sit in this special bin. - **/ -struct inputBin { - /** List links for Packer.sortedBins */ - RingNode ring; - /** The number of items in the bin */ - SlotNumber slotsUsed; - /** The number of compressed block bytes remaining in the current batch */ - size_t freeSpace; - /** The current partial batch of DataVIOs, waiting for more */ - DataVIO *incoming[]; -}; - -/** - * Each OutputBin allows a single compressed block to be packed and written. - * When it is not idle, it holds a batch of DataVIOs that have been packed - * into the compressed block, written asynchronously, and are waiting for the - * write to complete. 
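- * Idle OutputBins are kept on a stack in the Packer; writeNextBatch() fills a popped bin, launchCompressedWrite() issues its write, and completeOutputBin() returns it to the stack.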
- **/ -typedef struct { - /** List links for Packer.outputBins */ - RingNode ring; - /** The storage for encoding the compressed block representation */ - CompressedBlock *block; - /** The AllocatingVIO wrapping the compressed block for writing */ - AllocatingVIO *writer; - /** The number of compression slots used in the compressed block */ - SlotNumber slotsUsed; - /** The DataVIOs packed into the block, waiting for the write to complete */ - WaitQueue outgoing; -} OutputBin; - -/** - * A counted array holding a batch of DataVIOs that should be packed into an - * output bin. - **/ -typedef struct { - size_t slotsUsed; - DataVIO *slots[MAX_COMPRESSION_SLOTS]; -} OutputBatch; - -struct packer { - /** The ID of the packer's callback thread */ - ThreadID threadID; - /** The selector for determining which physical zone to allocate from */ - AllocationSelector *selector; - /** The number of input bins */ - BlockCount size; - /** The block size minus header size */ - size_t binDataSize; - /** The number of compression slots */ - size_t maxSlots; - /** A ring of all InputBins, kept sorted by freeSpace */ - RingNode inputBins; - /** A ring of all OutputBins */ - RingNode outputBins; - /** - * A bin to hold DataVIOs which were canceled out of the packer and are - * waiting to rendezvous with the canceling DataVIO. - **/ - InputBin *canceledBin; - - /** The current flush generation */ - SequenceNumber flushGeneration; - - /** The administrative state of the packer */ - AdminState state; - /** True when writing batched DataVIOs */ - bool writingBatches; - - // Atomic counters corresponding to the fields of PackerStatistics: - - /** Number of compressed data items written since startup */ - Atomic64 fragmentsWritten; - /** Number of blocks containing compressed items written since startup */ - Atomic64 blocksWritten; - /** Number of DataVIOs that are pending in the packer */ - Atomic64 fragmentsPending; - - /** Queue of batched DataVIOs waiting to be packed */ - WaitQueue batchedDataVIOs; - - /** The total number of output bins allocated */ - size_t outputBinCount; - /** The number of idle output bins on the stack */ - size_t idleOutputBinCount; - /** The stack of idle output bins (0=bottom) */ - OutputBin *idleOutputBins[]; -}; - -/** - * This returns the first bin in the freeSpace-sorted list. - **/ -InputBin *getFullestBin(const Packer *packer); - -/** - * This returns the next bin in the freeSpace-sorted list. - **/ -InputBin *nextBin(const Packer *packer, InputBin *bin); - -/** - * Change the maxiumum number of compression slots the packer will use. The new - * number of slots must be less than or equal to MAX_COMPRESSION_SLOTS. Bins - * which already have fragments will not be resized until they are next written - * out. - * - * @param packer The packer - * @param slots The new number of slots - **/ -void resetSlotCount(Packer *packer, CompressedFragmentCount slots); - -/** - * Remove a DataVIO from the packer. This method is exposed for testing. - * - * @param dataVIO The DataVIO to remove - **/ -void removeFromPacker(DataVIO *dataVIO); - -#endif /* PACKER_INTERNALS_H */ diff --git a/vdo/base/partitionCopy.c b/vdo/base/partitionCopy.c deleted file mode 100644 index d5fa6de..0000000 --- a/vdo/base/partitionCopy.c +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/partitionCopy.c#2 $ - */ - -#include "partitionCopy.h" - -#include "memoryAlloc.h" - -#include "completion.h" -#include "constants.h" -#include "extent.h" -#include "numUtils.h" - -enum { - STRIDE_LENGTH = 2048 -}; - -/** - * A partition copy completion. - **/ -typedef struct { - /** completion header */ - VDOCompletion completion; - /** the source partition to copy from */ - Partition *source; - /** the target partition to copy to */ - Partition *target; - /** the current in-partition PBN the copy is beginning at */ - PhysicalBlockNumber currentIndex; - /** the last block to copy */ - PhysicalBlockNumber endingIndex; - /** the backing data used by the extent */ - char *data; - /** the extent being used to copy */ - VDOExtent *extent; -} CopyCompletion; - -/** - * Convert a VDOCompletion to a CopyCompletion. - * - * @param completion The completion to convert - * - * @return the completion as a CopyCompletion - **/ -__attribute__((warn_unused_result)) -static inline -CopyCompletion *asCopyCompletion(VDOCompletion *completion) -{ - STATIC_ASSERT(offsetof(CopyCompletion, completion) == 0); - assertCompletionType(completion->type, PARTITION_COPY_COMPLETION); - return (CopyCompletion *) completion; -} - -/**********************************************************************/ -int makeCopyCompletion(PhysicalLayer *layer, VDOCompletion **completionPtr) -{ - CopyCompletion *copy; - int result = ALLOCATE(1, CopyCompletion, __func__, &copy); - if (result != VDO_SUCCESS) { - return result; - } - initializeCompletion(&copy->completion, PARTITION_COPY_COMPLETION, layer); - - result = ALLOCATE((VDO_BLOCK_SIZE * STRIDE_LENGTH), char, - "partition copy extent", &copy->data); - if (result != VDO_SUCCESS) { - VDOCompletion *completion = &copy->completion; - freeCopyCompletion(&completion); - return result; - } - - result = createExtent(layer, VIO_TYPE_PARTITION_COPY, VIO_PRIORITY_HIGH, - STRIDE_LENGTH, copy->data, &copy->extent); - if (result != VDO_SUCCESS) { - VDOCompletion *completion = &copy->completion; - freeCopyCompletion(&completion); - return result; - } - - *completionPtr = &copy->completion; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freeCopyCompletion(VDOCompletion **completionPtr) -{ - if (*completionPtr == NULL) { - return; - } - - CopyCompletion *copy = asCopyCompletion(*completionPtr); - freeExtent(&copy->extent); - FREE(copy->data); - FREE(copy); - *completionPtr = NULL; -} - -/**********************************************************************/ -static void copyPartitionStride(CopyCompletion *copy); - -/** - * Determine the number of blocks to copy in the current stride.
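- * The final stride may be shorter than STRIDE_LENGTH when fewer blocks remain to be copied.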
- * - * @param copy The copy completion - * - * @return The number of blocks to copy in the current stride - **/ -static inline BlockCount getStrideSize(CopyCompletion *copy) -{ - return minBlockCount(STRIDE_LENGTH, copy->endingIndex - copy->currentIndex); -} - -/** - * Process a completed write during a partition copy. - * - * @param completion The extent which has just completed writing - **/ -static void completeWriteForCopy(VDOCompletion *completion) -{ - CopyCompletion *copy = asCopyCompletion(completion->parent); - copy->currentIndex += getStrideSize(copy); - if (copy->currentIndex >= copy->endingIndex) { - // We're done. - finishCompletion(completion->parent, VDO_SUCCESS); - return; - } - copyPartitionStride(copy); -} - -/** - * Process a completed read during a partition copy, and launch the - * corresponding write to the new partition. - * - * @param completion The extent which has just completed reading - **/ -static void completeReadForCopy(VDOCompletion *completion) -{ - CopyCompletion *copy = asCopyCompletion(completion->parent); - PhysicalBlockNumber layerStartBlock; - int result = translateToPBN(copy->target, copy->currentIndex, - &layerStartBlock); - if (result != VDO_SUCCESS) { - finishCompletion(completion->parent, result); - return; - } - - completion->callback = completeWriteForCopy; - writePartialMetadataExtent(asVDOExtent(completion), layerStartBlock, - getStrideSize(copy)); -} - -/** - * Copy a stride from one partition to the new partition. - * - * @param copy The CopyCompletion - **/ -static void copyPartitionStride(CopyCompletion *copy) -{ - PhysicalBlockNumber layerStartBlock; - int result = translateToPBN(copy->source, copy->currentIndex, - &layerStartBlock); - if (result != VDO_SUCCESS) { - finishCompletion(&copy->completion, result); - return; - } - - prepareCompletion(&copy->extent->completion, completeReadForCopy, - finishParentCallback, copy->completion.callbackThreadID, - &copy->completion); - readPartialMetadataExtent(copy->extent, layerStartBlock, - getStrideSize(copy)); -} - -/** - * Verify that the source can be copied to the target safely.
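- * The target must be at least as large as the source and must not overlap it.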
- * - * @param source The source partition - * @param target The target partition - * - * @return VDO_SUCCESS or an error code - **/ -static int validatePartitionCopy(Partition *source, Partition *target) -{ - BlockCount sourceSize = getFixedLayoutPartitionSize(source); - BlockCount targetSize = getFixedLayoutPartitionSize(target); - - PhysicalBlockNumber sourceStart = getFixedLayoutPartitionOffset(source); - PhysicalBlockNumber sourceEnd = sourceStart + sourceSize; - PhysicalBlockNumber targetStart = getFixedLayoutPartitionOffset(target); - PhysicalBlockNumber targetEnd = targetStart + targetSize; - - int result = ASSERT(sourceSize <= targetSize, - "target partition must not be smaller than source" - " partition"); - if (result != UDS_SUCCESS) { - return result; - } - - return ASSERT(((sourceEnd <= targetStart) || (targetEnd <= sourceStart)), - "target partition must not overlap source partition"); -} - -/**********************************************************************/ -void copyPartitionAsync(VDOCompletion *completion, - Partition *source, - Partition *target, - VDOCompletion *parent) -{ - int result = validatePartitionCopy(source, target); - if (result != VDO_SUCCESS) { - finishCompletion(parent, result); - return; - } - - CopyCompletion *copy = asCopyCompletion(completion); - prepareToFinishParent(&copy->completion, parent); - copy->source = source; - copy->target = target; - copy->currentIndex = 0; - copy->endingIndex = getFixedLayoutPartitionSize(source); - copyPartitionStride(copy); -} diff --git a/vdo/base/partitionCopy.h b/vdo/base/partitionCopy.h deleted file mode 100644 index 574ac13..0000000 --- a/vdo/base/partitionCopy.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/partitionCopy.h#2 $ - */ - -#ifndef PARTITION_COPY_H -#define PARTITION_COPY_H - -#include "fixedLayout.h" -#include "physicalLayer.h" -#include "types.h" - -/** - * Make a copy completion. - * - * @param [in] layer The layer on which the partitions reside - * @param [out] completionPtr A pointer to hold the copy completion - * - * @return VDO_SUCCESS or an error - **/ -int makeCopyCompletion(PhysicalLayer *layer, VDOCompletion **completionPtr) - __attribute__((warn_unused_result)); - -/** - * Free a copy completion and NULL out the reference to it. - * - * @param completionPtr A pointer to the completion to be freed - **/ -void freeCopyCompletion(VDOCompletion **completionPtr); - -/** - * Copy a partition.
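- * The copy runs asynchronously; the parent completion is finished when the copy completes or fails.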
- * - * @param completion The copy completion to use - * @param source The partition to copy from - * @param target The partition to copy to - * @param parent The parent to finish when the copy is complete - **/ -void copyPartitionAsync(VDOCompletion *completion, - Partition *source, - Partition *target, - VDOCompletion *parent); - -#endif /* PARTITION_COPY_H */ diff --git a/vdo/base/pbnLock.c b/vdo/base/pbnLock.c deleted file mode 100644 index 5e9a274..0000000 --- a/vdo/base/pbnLock.c +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pbnLock.c#3 $ - */ - -#include "pbnLock.h" - -#include "logger.h" - -#include "blockAllocator.h" -#include "referenceBlock.h" - -struct pbnLockImplementation { - PBNLockType type; - const char *name; - const char *releaseReason; -}; - -/** - * This array must have an entry for every PBNLockType value. - **/ -static const PBNLockImplementation LOCK_IMPLEMENTATIONS[] = { - [VIO_READ_LOCK] = { - .type = VIO_READ_LOCK, - .name = "read", - .releaseReason = "candidate duplicate", - }, - [VIO_WRITE_LOCK] = { - .type = VIO_WRITE_LOCK, - .name = "write", - .releaseReason = "newly allocated", - }, - [VIO_COMPRESSED_WRITE_LOCK] = { - .type = VIO_COMPRESSED_WRITE_LOCK, - .name = "compressed write", - .releaseReason = "failed compression", - }, - [VIO_BLOCK_MAP_WRITE_LOCK] = { - .type = VIO_BLOCK_MAP_WRITE_LOCK, - .name = "block map write", - .releaseReason = "block map write", - }, -}; - -/**********************************************************************/ -static inline bool hasLockType(const PBNLock *lock, PBNLockType type) -{ - return (lock->implementation == &LOCK_IMPLEMENTATIONS[type]); -} - -/**********************************************************************/ -bool isPBNReadLock(const PBNLock *lock) -{ - return hasLockType(lock, VIO_READ_LOCK); -} - -/**********************************************************************/ -static inline void setPBNLockType(PBNLock *lock, PBNLockType type) -{ - lock->implementation = &LOCK_IMPLEMENTATIONS[type]; -} - -/**********************************************************************/ -void initializePBNLock(PBNLock *lock, PBNLockType type) -{ - lock->holderCount = 0; - setPBNLockType(lock, type); -} - -/**********************************************************************/ -void downgradePBNWriteLock(PBNLock *lock) -{ - ASSERT_LOG_ONLY(!isPBNReadLock(lock), - "PBN lock must not already have been downgraded"); - ASSERT_LOG_ONLY(!hasLockType(lock, VIO_BLOCK_MAP_WRITE_LOCK), - "must not downgrade block map write locks"); - ASSERT_LOG_ONLY(lock->holderCount == 1, - "PBN write lock should have one holder but has %u", - lock->holderCount); - if (hasLockType(lock, VIO_WRITE_LOCK)) { - // DataVIO write locks are downgraded 
in place--the writer retains the - // hold on the lock. They've already had a single incRef journaled. - lock->incrementLimit = MAXIMUM_REFERENCE_COUNT - 1; - } else { - // Compressed block write locks are downgraded when they are shared with - // all their hash locks. The writer is releasing its hold on the lock. - lock->holderCount = 0; - lock->incrementLimit = MAXIMUM_REFERENCE_COUNT; - } - setPBNLockType(lock, VIO_READ_LOCK); -} - -/**********************************************************************/ -bool claimPBNLockIncrement(PBNLock *lock) -{ - /* - * Claim the next free reference atomically since hash locks from multiple - * hash zone threads might be concurrently deduplicating against a single - * PBN lock on compressed block. As long as hitting the increment limit will - * lead to the PBN lock being released in a sane time-frame, we won't - * overflow a 32-bit claim counter, allowing a simple add instead of a - * compare-and-swap. - */ - uint32_t claimNumber = atomicAdd32(&lock->incrementsClaimed, 1); - return (claimNumber <= lock->incrementLimit); -} - -/**********************************************************************/ -void assignProvisionalReference(PBNLock *lock) -{ - ASSERT_LOG_ONLY(!lock->hasProvisionalReference, - "lock does not have a provisional reference"); - lock->hasProvisionalReference = true; -} - -/**********************************************************************/ -void unassignProvisionalReference(PBNLock *lock) -{ - lock->hasProvisionalReference = false; -} - -/**********************************************************************/ -void releaseProvisionalReference(PBNLock *lock, - PhysicalBlockNumber lockedPBN, - BlockAllocator *allocator) -{ - if (hasProvisionalReference(lock)) { - releaseBlockReference(allocator, lockedPBN, - lock->implementation->releaseReason); - unassignProvisionalReference(lock); - } -} diff --git a/vdo/base/pbnLock.h b/vdo/base/pbnLock.h deleted file mode 100644 index bd6512b..0000000 --- a/vdo/base/pbnLock.h +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pbnLock.h#3 $ - */ - -#ifndef PBN_LOCK_H -#define PBN_LOCK_H - -#include "atomic.h" -#include "types.h" - -/** - * The type of a PBN lock. - **/ -typedef enum { - VIO_READ_LOCK = 0, - VIO_WRITE_LOCK, - VIO_COMPRESSED_WRITE_LOCK, - VIO_BLOCK_MAP_WRITE_LOCK, -} PBNLockType; - -typedef struct pbnLockImplementation PBNLockImplementation; - -/** - * A PBN lock. 
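- * A lock is initialized with a PBNLockType (see initializePBNLock()) and may be held or shared by multiple VIOs, as tracked by holderCount.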
- **/ -struct pbnLock { - /** The implementation of the lock */ - const PBNLockImplementation *implementation; - - /** The number of VIOs holding or sharing this lock */ - VIOCount holderCount; - /** - * The number of compressed block writers holding a share of this lock while - * they are acquiring a reference to the PBN. - **/ - uint8_t fragmentLocks; - - /** - * Whether the locked PBN has been provisionally referenced on behalf of the - * lock holder. - **/ - bool hasProvisionalReference; - - /** - * For read locks, the number of references that were known to be available - * on the locked block at the time the lock was acquired. - **/ - uint8_t incrementLimit; - - /** - * For read locks, the number of DataVIOs that have tried to claim one of - * the available increments during the lifetime of the lock. Each claim will - * first increment this counter, so it can exceed the increment limit. - **/ - Atomic32 incrementsClaimed; -}; - -/** - * Initialize a PBNLock. - * - * @param lock The lock to initialize - * @param type The type of the lock - **/ -void initializePBNLock(PBNLock *lock, PBNLockType type); - -/** - * Check whether a PBNLock is a read lock. - * - * @param lock The lock to check - * - * @return true if the lock is a read lock - **/ -bool isPBNReadLock(const PBNLock *lock) - __attribute__((warn_unused_result)); - -/** - * Downgrade a PBN write lock to a PBN read lock. The lock holder count is - * cleared and the caller is responsible for setting the new count. - * - * @param lock The PBN write lock to downgrade - **/ -void downgradePBNWriteLock(PBNLock *lock); - -/** - * Try to claim one of the available reference count increments on a read - * lock. Claims may be attempted from any thread. A claim is only valid until - * the PBN lock is released. - * - * @param lock The PBN read lock from which to claim an increment - * - * @return true if the claim succeeded, guaranteeing one - * increment can be made without overflowing the PBN's reference count - **/ -bool claimPBNLockIncrement(PBNLock *lock) - __attribute__((warn_unused_result)); - -/** - * Check whether a PBN lock has a provisional reference. - * - * @param lock The PBN lock - **/ -static inline bool hasProvisionalReference(PBNLock *lock) -{ - return ((lock != NULL) && lock->hasProvisionalReference); -} - -/** - * Inform a PBN lock that it is responsible for a provisional reference. - * - * @param lock The PBN lock - **/ -void assignProvisionalReference(PBNLock *lock); - -/** - * Inform a PBN lock that it is no longer responsible for a provisional - * reference. - * - * @param lock The PBN lock - **/ -void unassignProvisionalReference(PBNLock *lock); - -/** - * If the lock is responsible for a provisional reference, release that - * reference. This method is called when the lock is released. - * - * @param lock The lock - * @param lockedPBN The PBN covered by the lock - * @param allocator The block allocator from which to release the reference - **/ -void releaseProvisionalReference(PBNLock *lock, - PhysicalBlockNumber lockedPBN, - BlockAllocator *allocator); - -#endif /* PBN_LOCK_H */ diff --git a/vdo/base/pbnLockPool.c b/vdo/base/pbnLockPool.c deleted file mode 100644 index 38e2f32..0000000 --- a/vdo/base/pbnLockPool.c +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pbnLockPool.c#2 $ - */ - -#include "pbnLockPool.h" - -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" - -#include "ringNode.h" -#include "pbnLock.h" - -/** - * Unused (idle) PBN locks are kept in a ring. Just like in a malloc - * implementation, the lock structure is unused memory, so we can save a bit - * of space (and not pollute the lock structure proper) by using a union to - * overlay the lock structure with the free list. - **/ -typedef union idlePBNLock { - /** Only used while locks are in the pool */ - RingNode node; - /** Only used while locks are not in the pool */ - PBNLock lock; -} IdlePBNLock; - -/** - * The lock pool is little more than the memory allocated for the locks. - **/ -struct pbnLockPool { - /** The number of locks allocated for the pool */ - size_t capacity; - /** The number of locks currently borrowed from the pool */ - size_t borrowed; - /** A ring containing all idle PBN lock instances */ - RingNode idleRing; - /** The memory for all the locks allocated by this pool */ - IdlePBNLock locks[]; -}; - -/**********************************************************************/ -int makePBNLockPool(size_t capacity, PBNLockPool **poolPtr) -{ - PBNLockPool *pool; - int result = ALLOCATE_EXTENDED(PBNLockPool, capacity, IdlePBNLock, __func__, - &pool); - if (result != VDO_SUCCESS) { - return result; - } - - pool->capacity = capacity; - pool->borrowed = capacity; - initializeRing(&pool->idleRing); - - for (size_t i = 0; i < capacity; i++) { - PBNLock *lock = &pool->locks[i].lock; - returnPBNLockToPool(pool, &lock); - } - - *poolPtr = pool; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freePBNLockPool(PBNLockPool **poolPtr) -{ - if (*poolPtr == NULL) { - return; - } - - PBNLockPool *pool = *poolPtr; - ASSERT_LOG_ONLY(pool->borrowed == 0, - "All PBN locks must be returned to the pool before it is" - " freed, but %zu locks are still on loan", - pool->borrowed); - FREE(pool); - *poolPtr = NULL; -} - -/**********************************************************************/ -int borrowPBNLockFromPool(PBNLockPool *pool, - PBNLockType type, - PBNLock **lockPtr) -{ - if (pool->borrowed >= pool->capacity) { - return logErrorWithStringError(VDO_LOCK_ERROR, - "no free PBN locks left to borrow"); - } - pool->borrowed += 1; - - RingNode *idleNode = popRingNode(&pool->idleRing); - // The lock was zeroed when it was placed in the pool, but the overlapping - // ring pointers are non-zero after a pop. 
- memset(idleNode, 0, sizeof(*idleNode)); - - STATIC_ASSERT(offsetof(IdlePBNLock, node) == offsetof(IdlePBNLock, lock)); - PBNLock *lock = (PBNLock *) idleNode; - initializePBNLock(lock, type); - - *lockPtr = lock; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void returnPBNLockToPool(PBNLockPool *pool, PBNLock **lockPtr) -{ - // Take what should be the last lock reference from the caller - PBNLock *lock = *lockPtr; - *lockPtr = NULL; - - // A bit expensive, but will promptly catch some use-after-free errors. - memset(lock, 0, sizeof(*lock)); - - RingNode *idleNode = (RingNode *) lock; - initializeRing(idleNode); - pushRingNode(&pool->idleRing, idleNode); - - ASSERT_LOG_ONLY(pool->borrowed > 0, "shouldn't return more than borrowed"); - pool->borrowed -= 1; -} diff --git a/vdo/base/pbnLockPool.h b/vdo/base/pbnLockPool.h deleted file mode 100644 index 6853f84..0000000 --- a/vdo/base/pbnLockPool.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pbnLockPool.h#1 $ - */ - -#ifndef PBN_LOCK_POOL_H -#define PBN_LOCK_POOL_H - -#include "pbnLock.h" -#include "types.h" - -typedef struct pbnLockPool PBNLockPool; - -/** - * Create a new PBN lock pool and all the lock instances it can loan out. - * - * @param [in] capacity The number of PBN locks to allocate for the pool - * @param [out] poolPtr A pointer to receive the new pool - * - * @return VDO_SUCCESS or an error code - **/ -int makePBNLockPool(size_t capacity, PBNLockPool **poolPtr) - __attribute__((warn_unused_result)); - -/** - * Free a PBN lock pool and null out the reference to it. This also frees all - * the PBN locks it allocated, so the caller must ensure that all locks have - * been returned to the pool. - * - * @param [in,out] poolPtr The reference to the lock pool to free - **/ -void freePBNLockPool(PBNLockPool **poolPtr); - -/** - * Borrow a PBN lock from the pool and initialize it with the provided type. - * Pools do not grow on demand or allocate memory, so this will fail if the - * pool is empty. Borrowed locks are still associated with this pool and must - * be returned to only this pool. - * - * @param [in] pool The pool from which to borrow - * @param [in] type The type with which to initialize the lock - * @param [out] lockPtr A pointer to receive the borrowed lock - * - * @return VDO_SUCCESS, or VDO_LOCK_ERROR if the pool is empty - **/ -int borrowPBNLockFromPool(PBNLockPool *pool, - PBNLockType type, - PBNLock **lockPtr) - __attribute__((warn_unused_result)); - -/** - * Return to the pool a lock that was borrowed from it, and null out the - * caller's reference to it.
It must be the last live reference, as if the - * memory were being freed (the lock memory will re-initialized or zeroed). - * - * @param [in] pool The pool from which the lock was borrowed - * @param [in,out] lockPtr The last reference to the lock being returned - **/ -void returnPBNLockToPool(PBNLockPool *pool, PBNLock **lockPtr); - -#endif // PBN_LOCK_POOL_H diff --git a/vdo/base/physicalLayer.c b/vdo/base/physicalLayer.c deleted file mode 100644 index 231a3bf..0000000 --- a/vdo/base/physicalLayer.c +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/physicalLayer.c#1 $ - */ - -#include "physicalLayer.h" - -static PhysicalLayerGetter *physicalLayerGetter; - -/**********************************************************************/ -void registerPhysicalLayerGetter(PhysicalLayerGetter *getter) -{ - physicalLayerGetter = getter; -} - -/**********************************************************************/ -PhysicalLayer *getPhysicalLayer(void) -{ - if (physicalLayerGetter != NULL) { - return (*physicalLayerGetter)(); - } - return NULL; -} diff --git a/vdo/base/physicalLayer.h b/vdo/base/physicalLayer.h deleted file mode 100644 index 18d6a20..0000000 --- a/vdo/base/physicalLayer.h +++ /dev/null @@ -1,427 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/physicalLayer.h#2 $ - */ - -#ifndef PHYSICAL_LAYER_H -#define PHYSICAL_LAYER_H - -#include "types.h" - -static const CRC32Checksum INITIAL_CHECKSUM = 0xffffffff; - -enum { - /* The size of a CRC-32 checksum */ - CHECKSUM_SIZE = sizeof(CRC32Checksum), -}; - -/** - * A function to destroy a physical layer and NULL out the reference to it. - * - * @param layerPtr A pointer to the layer to destroy - **/ -typedef void LayerDestructor(PhysicalLayer **layerPtr); - -/** - * A function to update a running CRC-32 checksum. 
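The header here only fixes the shape of the checksum updater and the convention that a running value starts at INITIAL_CHECKSUM (0xffffffff). As a hedged illustration, this is a minimal bitwise updater for the common reflected CRC-32 polynomial 0xEDB88320; whether the layer implementations use this exact variant, a table-driven version, or a final bit inversion is not shown in this header, so treat those details as assumptions.

#include <stddef.h>
#include <stdint.h>

/* Start a running checksum at 0xffffffff (compare INITIAL_CHECKSUM above)
 * and fold each buffer through in turn:
 *
 *     uint32_t crc = crc32Update(0xffffffff, buffer, length);
 */
static uint32_t crc32Update(uint32_t crc, const uint8_t *buffer, size_t length)
{
  for (size_t i = 0; i < length; i++) {
    crc ^= buffer[i];
    for (int bit = 0; bit < 8; bit++) {
      if (crc & 1) {
        crc = (crc >> 1) ^ 0xedb88320u;  /* reflected CRC-32 polynomial */
      } else {
        crc >>= 1;
      }
    }
  }
  return crc;
}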
- * - * @param crc The current value of the crc - * @param buffer The data to add to the checksum - * @param length The length of the data - * - * @return The updated value of the checksum - **/ -typedef uint32_t CRC32Updater(CRC32Checksum crc, - const byte *buffer, - size_t length); - -/** - * A function to report the block count of a physicalLayer. - * - * @param layer The layer - * - * @return The block count of the layer - **/ -typedef BlockCount BlockCountGetter(PhysicalLayer *layer); - -/** - * A function which can allocate a buffer suitable for use in an - * ExtentReader or ExtentWriter. - * - * @param [in] layer The physical layer in question - * @param [in] bytes The size of the buffer, in bytes. - * @param [in] why The occasion for allocating the buffer - * @param [out] bufferPtr A pointer to hold the buffer - * - * @return a success or error code - **/ -typedef int BufferAllocator(PhysicalLayer *layer, - size_t bytes, - const char *why, - char **bufferPtr); - -/** - * A function which can read an extent from a physicalLayer. - * - * @param [in] layer The physical layer from which to read - * @param [in] startBlock The physical block number of the start of the - * extent - * @param [in] blockCount The number of blocks in the extent - * @param [out] buffer A buffer to hold the extent - * @param [out] blocksRead A pointer to hold the number of blocks read (may be - * NULL) - * - * @return a success or error code - **/ -typedef int ExtentReader(PhysicalLayer *layer, - PhysicalBlockNumber startBlock, - size_t blockCount, - char *buffer, - size_t *blocksRead); - -/** - * A function which can write an extent to a physicalLayer. - * - * @param [in] layer The physical layer to which to write - * @param [in] startBlock The physical block number of the start of the - * extent - * @param [in] blockCount The number of blocks in the extent - * @param [in] buffer The buffer which contains the data - * @param [out] blocksWritten A pointer to hold the number of blocks written - * (may be NULL) - * - * @return a success or error code - **/ -typedef int ExtentWriter(PhysicalLayer *layer, - PhysicalBlockNumber startBlock, - size_t blockCount, - char *buffer, - size_t *blocksWritten); - -/** - * A function to allocate a metadata VIO. - * - * @param [in] layer The physical layer - * @param [in] vioType The type of VIO to create - * @param [in] priority The relative priority to assign to the VIOs - * @param [in] parent The parent of this VIO - * @param [in] data The buffer - * @param [out] vioPtr A pointer to hold the new VIO - * - * @return VDO_SUCCESS or an error - **/ -typedef int MetadataVIOCreator(PhysicalLayer *layer, - VIOType vioType, - VIOPriority priority, - void *parent, - char *data, - VIO **vioPtr); - -/** - * A function to allocate an AllocatingVIO for compressed writes. - * - * @param [in] layer The physical layer - * @param [in] parent The parent of this VIO - * @param [in] data The buffer - * @param [out] allocatingVIOPtr A pointer to hold the new AllocatingVIO - * - * @return VDO_SUCCESS or an error - **/ -typedef int CompressedWriteVIOCreator(PhysicalLayer *layer, - void *parent, - char *data, - AllocatingVIO **allocatingVIOPtr); - -/** - * A function to destroy a VIO. The pointer to the VIO will be nulled out. - * - * @param vioPtr A pointer to the VIO to destroy - **/ -typedef void VIODestructor(VIO **vioPtr); - -/** - * A function to zero the contents of a DataVIO. 
- * - * @param dataVIO The DataVIO to zero - **/ -typedef AsyncDataOperation DataVIOZeroer; - -/** - * A function to copy the contents of a DataVIO into another DataVIO. - * - * @param source The dataVIO to copy from - * @param destination The dataVIO to copy to - **/ -typedef void DataCopier(DataVIO *source, DataVIO *destination); - -/** - * A function to apply a partial write to a DataVIO which has completed the - * read portion of a read-modify-write operation. - * - * @param dataVIO The dataVIO to modify - **/ -typedef AsyncDataOperation DataModifier; - -/** - * A function to asynchronously hash the block data, setting the chunk name of - * the DataVIO. This is asynchronous to allow the computation to be done on - * different threads. - * - * @param dataVIO The DataVIO to hash - **/ -typedef AsyncDataOperation DataHasher; - -/** - * A function to determine whether a block is a duplicate. This function - * expects the 'physical' field of the DataVIO to be set to the physical block - * where the block will be written if it is not a duplicate. If the block does - * turn out to be a duplicate, the DataVIO's 'isDuplicate' field will be set to - * true, and the DataVIO's 'advice' field will be set to the physical block and - * mapping state of the already stored copy of the block. - * - * @param dataVIO The DataVIO containing the block to check. - **/ -typedef AsyncDataOperation DuplicationChecker; - -/** - * A function to verify the duplication advice by examining an already-stored - * data block. This function expects the 'physical' field of the DataVIO to be - * set to the physical block where the block will be written if it is not a - * duplicate, and the 'duplicate' field to be set to the physical block and - * mapping state where a copy of the data may already exist. If the block is - * not a duplicate, the DataVIO's 'isDuplicate' field will be cleared. - * - * @param dataVIO The dataVIO containing the block to check. - **/ -typedef AsyncDataOperation DuplicationVerifier; - -/** - * A function to read a single DataVIO from the layer. - * - * If the DataVIO does not describe a read-modify-write operation, the - * physical layer may safely acknowledge the related user I/O request - * as complete. - * - * @param dataVIO The DataVIO to read - **/ -typedef AsyncDataOperation DataReader; - -/** - * A function to read a single metadata VIO from the layer. - * - * @param vio The vio to read - **/ -typedef AsyncOperation MetadataReader; - -/** - * A function to write a single DataVIO to the layer - * - * @param dataVIO The DataVIO to write - **/ -typedef AsyncDataOperation DataWriter; - -/** - * A function to write a single metadata VIO from the layer. - * - * @param vio The vio to write - **/ -typedef AsyncOperation MetadataWriter; - -/** - * A function to inform the layer that a DataVIO's related I/O request can be - * safely acknowledged as complete, even though the DataVIO itself may have - * further processing to do. - * - * @param dataVIO The DataVIO to acknowledge - **/ -typedef AsyncDataOperation DataAcknowledger; - -/** - * A function to compare the contents of a DataVIO to another DataVIO. - * - * @param first The first DataVIO to compare - * @param second The second DataVIO to compare - * - * @return true if the contents of the two DataVIOs are the same - **/ -typedef bool DataVIOComparator(DataVIO *first, DataVIO *second); - -/** - * A function to compress the data in a DataVIO. 
- * - * @param dataVIO The DataVIO to compress - **/ -typedef AsyncDataOperation DataCompressor; - -/** - * Update albireo. - * - * @param dataVIO The DataVIO which needs to change the entry for its data - **/ -typedef AsyncDataOperation AlbireoUpdater; - -/** - * A function to finish flush requests - * - * @param vdoFlush The flush requests - **/ -typedef void FlushComplete(VDOFlush **vdoFlush); - -/** - * A function to query the write policy of the layer. - * - * @param layer The layer to query - * - * @return the write policy of the layer - **/ -typedef WritePolicy WritePolicyGetter(PhysicalLayer *layer); - -/** - * A function to create an object that can be enqueued to run in a specified - * thread. The Enqueueable will be put into the 'enqueueable' field of the - * supplied completion. - * - * @param completion The completion to invoke the callback of - * - * @return VDO_SUCCESS or an error code - **/ -typedef int EnqueueableCreator(VDOCompletion *completion); - -/** - * A function to destroy and deallocate an Enqueueable object. - * - * @param enqueueablePtr Pointer to the object pointer to be destroyed - **/ -typedef void EnqueueableDestructor(Enqueueable **enqueueablePtr); - -/** - * A function to enqueue the Enqueueable object to run on the thread specified - * by its associated completion. - * - * @param enqueueable The object to be enqueued - **/ -typedef void Enqueuer(Enqueueable *enqueueable); - -/** - * A function to wait for an admin operation to complete. This function should - * not be called from a base-code thread. - * - * @param layer The layer on which to wait - **/ -typedef void OperationWaiter(PhysicalLayer *layer); - -/** - * A function to inform the layer of the result of an admin operation. - * - * @param layer The layer to inform - **/ -typedef void OperationComplete(PhysicalLayer *layer); - -/** - * A function to get the id of the current thread. - * - * @return The id of the current thread - **/ -typedef ThreadID ThreadIDGetter(void); - -/** - * A function to return the physical layer pointer for the current thread. - * - * @return The physical layer pointer - **/ -typedef PhysicalLayer *PhysicalLayerGetter(void); - -/** - * An abstraction representing the underlying physical layer. 
- **/ -struct physicalLayer { - // Management interface - LayerDestructor *destroy; - - // Synchronous interface - CRC32Updater *updateCRC32; - BlockCountGetter *getBlockCount; - - // Synchronous IO interface - BufferAllocator *allocateIOBuffer; - ExtentReader *reader; - ExtentWriter *writer; - - WritePolicyGetter *getWritePolicy; - - // Synchronous interfaces (vio-based) - MetadataVIOCreator *createMetadataVIO; - CompressedWriteVIOCreator *createCompressedWriteVIO; - VIODestructor *freeVIO; - DataVIOZeroer *zeroDataVIO; - DataCopier *copyData; - DataModifier *applyPartialWrite; - - // Asynchronous interface (vio-based) - DataHasher *hashData; - DuplicationChecker *checkForDuplication; - DuplicationVerifier *verifyDuplication; - DataReader *readData; - DataWriter *writeData; - CompressedWriter *writeCompressedBlock; - MetadataReader *readMetadata; - MetadataWriter *writeMetadata; - MetadataWriter *flush; - DataAcknowledger *acknowledgeDataVIO; - DataVIOComparator *compareDataVIOs; - DataCompressor *compressDataVIO; - AlbireoUpdater *updateAlbireo; - - // Asynchronous interface (other) - FlushComplete *completeFlush; - EnqueueableCreator *createEnqueueable; - EnqueueableDestructor *destroyEnqueueable; - Enqueuer *enqueue; - OperationWaiter *waitForAdminOperation; - OperationComplete *completeAdminOperation; - - // Thread specific interface - ThreadIDGetter *getCurrentThreadID; -}; - -/** - * Register the layer-specific implementation of getPhysicalLayer(). - * - * @param getter The function to be called - **/ -void registerPhysicalLayerGetter(PhysicalLayerGetter *getter); - -/** - * Fetch the physical layer pointer for the current thread. - * - * @return The physical layer pointer - **/ -PhysicalLayer *getPhysicalLayer(void); - -/** - * Get the id of the callback thread on which a completion is current running. - * - * @return the current thread ID - **/ -static inline ThreadID getCallbackThreadID(void) -{ - return getPhysicalLayer()->getCurrentThreadID(); -} - -#endif // PHYSICAL_LAYER_H diff --git a/vdo/base/physicalZone.c b/vdo/base/physicalZone.c deleted file mode 100644 index accb631..0000000 --- a/vdo/base/physicalZone.c +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/physicalZone.c#3 $ - */ - -#include "physicalZone.h" - -#include "memoryAlloc.h" - -#include "blockAllocator.h" -#include "blockMap.h" -#include "completion.h" -#include "constants.h" -#include "dataVIO.h" -#include "flush.h" -#include "hashLock.h" -#include "intMap.h" -#include "pbnLock.h" -#include "pbnLockPool.h" -#include "slabDepot.h" -#include "vdoInternal.h" - -enum { - // Each user DataVIO needs a PBN read lock and write lock, and each packer - // output bin has an AllocatingVIO that needs a PBN write lock. 
- LOCK_POOL_CAPACITY = 2 * MAXIMUM_USER_VIOS + DEFAULT_PACKER_OUTPUT_BINS, -}; - -struct physicalZone { - /** Which physical zone this is */ - ZoneCount zoneNumber; - /** The thread ID for this zone */ - ThreadID threadID; - /** In progress operations keyed by PBN */ - IntMap *pbnOperations; - /** Pool of unused PBNLock instances */ - PBNLockPool *lockPool; - /** The block allocator for this zone */ - BlockAllocator *allocator; -}; - -/**********************************************************************/ -int makePhysicalZone(VDO *vdo, ZoneCount zoneNumber, PhysicalZone **zonePtr) -{ - PhysicalZone *zone; - int result = ALLOCATE(1, PhysicalZone, __func__, &zone); - if (result != VDO_SUCCESS) { - return result; - } - - result = makeIntMap(LOCK_MAP_CAPACITY, 0, &zone->pbnOperations); - if (result != VDO_SUCCESS) { - freePhysicalZone(&zone); - return result; - } - - result = makePBNLockPool(LOCK_POOL_CAPACITY, &zone->lockPool); - if (result != VDO_SUCCESS) { - freePhysicalZone(&zone); - return result; - } - - zone->zoneNumber = zoneNumber; - zone->threadID = getPhysicalZoneThread(getThreadConfig(vdo), zoneNumber); - zone->allocator = getBlockAllocatorForZone(vdo->depot, zoneNumber); - - *zonePtr = zone; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freePhysicalZone(PhysicalZone **zonePtr) -{ - if (*zonePtr == NULL) { - return; - } - - PhysicalZone *zone = *zonePtr; - freePBNLockPool(&zone->lockPool); - freeIntMap(&zone->pbnOperations); - FREE(zone); - *zonePtr = NULL; -} - -/**********************************************************************/ -ZoneCount getPhysicalZoneNumber(const PhysicalZone *zone) -{ - return zone->zoneNumber; -} - -/**********************************************************************/ -ThreadID getPhysicalZoneThreadID(const PhysicalZone *zone) -{ - return zone->threadID; -} - -/**********************************************************************/ -BlockAllocator *getBlockAllocator(const PhysicalZone *zone) -{ - return zone->allocator; -} - -/**********************************************************************/ -PBNLock *getPBNLock(PhysicalZone *zone, PhysicalBlockNumber pbn) -{ - return ((zone == NULL) ? NULL : intMapGet(zone->pbnOperations, pbn)); -} - -/**********************************************************************/ -int attemptPBNLock(PhysicalZone *zone, - PhysicalBlockNumber pbn, - PBNLockType type, - PBNLock **lockPtr) -{ - // Borrow and prepare a lock from the pool so we don't have to do two IntMap - // accesses in the common case of no lock contention. - PBNLock *newLock; - int result = borrowPBNLockFromPool(zone->lockPool, type, &newLock); - if (result != VDO_SUCCESS) { - ASSERT_LOG_ONLY(false, "must always be able to borrow a PBN lock"); - return result; - } - - PBNLock *lock; - result = intMapPut(zone->pbnOperations, pbn, newLock, false, - (void **) &lock); - if (result != VDO_SUCCESS) { - returnPBNLockToPool(zone->lockPool, &newLock); - return result; - } - - if (lock != NULL) { - // The lock is already held, so we don't need the borrowed lock. 
- returnPBNLockToPool(zone->lockPool, &newLock); - - result = ASSERT(lock->holderCount > 0, - "physical block %llu lock held", pbn); - if (result != VDO_SUCCESS) { - return result; - } - *lockPtr = lock; - } else { - *lockPtr = newLock; - } - return VDO_SUCCESS; -} - -/**********************************************************************/ -void releasePBNLock(PhysicalZone *zone, - PhysicalBlockNumber lockedPBN, - PBNLock **lockPtr) -{ - PBNLock *lock = *lockPtr; - if (lock == NULL) { - return; - } - *lockPtr = NULL; - - ASSERT_LOG_ONLY(lock->holderCount > 0, - "should not be releasing a lock that is not held"); - - lock->holderCount -= 1; - if (lock->holderCount > 0) { - // The lock was shared and is still referenced, so don't release it yet. - return; - } - - PBNLock *holder = intMapRemove(zone->pbnOperations, lockedPBN); - ASSERT_LOG_ONLY((lock == holder), - "physical block lock mismatch for block %llu", - lockedPBN); - - releaseProvisionalReference(lock, lockedPBN, zone->allocator); - - returnPBNLockToPool(zone->lockPool, &lock); -} - -/**********************************************************************/ -void dumpPhysicalZone(const PhysicalZone *zone) -{ - dumpBlockAllocator(zone->allocator); -} diff --git a/vdo/base/physicalZone.h b/vdo/base/physicalZone.h deleted file mode 100644 index 2c02bbe..0000000 --- a/vdo/base/physicalZone.h +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/physicalZone.h#1 $ - */ - -#ifndef PHYSICAL_ZONE_H -#define PHYSICAL_ZONE_H - -#include "pbnLock.h" -#include "types.h" - -/** - * Create a physical zone. - * - * @param [in] vdo The VDO to which the zone will belong - * @param [in] zoneNumber The number of the zone to create - * @param [out] zonePtr A pointer to hold the new PhysicalZone - * - * @return VDO_SUCCESS or an error code - **/ -int makePhysicalZone(VDO *vdo, ZoneCount zoneNumber, PhysicalZone **zonePtr) - __attribute__((warn_unused_result)); - -/** - * Free a physical zone and null out the reference to it. - * - * @param zonePtr A pointer to the zone to free - **/ -void freePhysicalZone(PhysicalZone **zonePtr); - -/** - * Get the zone number of a physical zone. - * - * @param zone The zone - * - * @return The number of the zone - **/ -ZoneCount getPhysicalZoneNumber(const PhysicalZone *zone) - __attribute__((warn_unused_result)); - -/** - * Get the ID of a physical zone's thread. - * - * @param zone The zone - * - * @return The zone's thread ID - **/ -ThreadID getPhysicalZoneThreadID(const PhysicalZone *zone) - __attribute__((warn_unused_result)); - -/** - * Get the block allocator from a physical zone. 
- * - * @param zone The zone - * - * @return The zone's allocator - **/ -BlockAllocator *getBlockAllocator(const PhysicalZone *zone) - __attribute__((warn_unused_result)); - -/** - * Get the lock on a PBN if one exists. - * - * @param zone The physical zone responsible for the PBN - * @param pbn The physical block number whose lock is desired - * - * @return The lock or NULL if the PBN is not locked - **/ -PBNLock *getPBNLock(PhysicalZone *zone, PhysicalBlockNumber pbn) - __attribute__((warn_unused_result)); - -/** - * Attempt to lock a physical block in the zone responsible for it. If the PBN - * is already locked, the existing lock will be returned. Otherwise, a new - * lock instance will be borrowed from the pool, initialized, and returned. - * The lock owner will be NULL for a new lock acquired by the caller, who is - * responsible for setting that field promptly. The lock owner will be - * non-NULL when there is already an existing lock on the PBN. - * - * @param [in] zone The physical zone responsible for the PBN - * @param [in] pbn The physical block number to lock - * @param [in] type The type with which to initialize a new lock - * @param [out] lockPtr A pointer to receive the lock, existing or new - * - * @return VDO_SUCCESS or an error - **/ -int attemptPBNLock(PhysicalZone *zone, - PhysicalBlockNumber pbn, - PBNLockType type, - PBNLock **lockPtr) - __attribute__((warn_unused_result)); - -/** - * Release a physical block lock if it is held, return it to the lock pool, - * and null out the caller's reference to it. It must be the last live - * reference, as if the memory were being freed (the lock memory will - * re-initialized or zeroed). - * - * @param [in] zone The physical zone in which the lock was obtained - * @param [in] lockedPBN The physical block number to unlock - * @param [in,out] lockPtr The last reference to the lock being released - **/ -void releasePBNLock(PhysicalZone *zone, - PhysicalBlockNumber lockedPBN, - PBNLock **lockPtr); - -/** - * Dump information about a physical zone to the log for debugging. - * - * @param zone The zone to dump - **/ -void dumpPhysicalZone(const PhysicalZone *zone); - -#endif // PHYSICAL_ZONE_H diff --git a/vdo/base/pointerMap.c b/vdo/base/pointerMap.c deleted file mode 100644 index 395f266..0000000 --- a/vdo/base/pointerMap.c +++ /dev/null @@ -1,633 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pointerMap.c#1 $ - */ - -/** - * Hash table implementation of a map from integers to pointers, implemented - * using the Hopscotch Hashing algorithm by Herlihy, Shavit, and Tzafrir (see - * http://en.wikipedia.org/wiki/Hopscotch_hashing). 
This implementation does - * not contain any of the locking/concurrency features of the algorithm, just - * the collision resolution scheme. - * - * Hopscotch Hashing is based on hashing with open addressing and linear - * probing. All the entries are stored in a fixed array of buckets, with no - * dynamic allocation for collisions. Unlike linear probing, all the entries - * that hash to a given bucket are stored within a fixed neighborhood starting - * at that bucket. Chaining is effectively represented as a bit vector - * relative to each bucket instead of as pointers or explicit offsets. - * - * When an empty bucket cannot be found within a given neighborhood, - * subsequent neighborhoods are searched, and one or more entries will "hop" - * into those neighborhoods. When this process works, an empty bucket will - * move into the desired neighborhood, allowing the entry to be added. When - * that process fails (typically when the buckets are around 90% full), the - * table must be resized and the all entries rehashed and added to the - * expanded table. - * - * Unlike linear probing, the number of buckets that must be searched in the - * worst case has a fixed upper bound (the size of the neighborhood). Those - * entries occupy a small number of memory cache lines, leading to improved - * use of the cache (fewer misses on both successful and unsuccessful - * searches). Hopscotch hashing outperforms linear probing at much higher load - * factors, so even with the increased memory burden for maintaining the hop - * vectors, less memory is needed to achieve that performance. Hopscotch is - * also immune to "contamination" from deleting entries since entries are - * genuinely removed instead of being replaced by a placeholder. - * - * The published description of the algorithm used a bit vector, but the paper - * alludes to an offset scheme which is used by this implementation. Since the - * entries in the neighborhood are within N entries of the hash bucket at the - * start of the neighborhood, a pair of small offset fields each log2(N) bits - * wide is all that's needed to maintain the hops as a linked list. In order - * to encode "no next hop" (i.e. NULL) as the natural initial value of zero, - * the offsets are biased by one (i.e. 0 => NULL, 1 => offset=0, 2 => - * offset=1, etc.) We can represent neighborhoods of up to 255 entries with - * just 8+8=16 bits per entry. The hop list is sorted by hop offset so the - * first entry in the list is always the bucket closest to the start of the - * neighborhood. - * - * While individual accesses tend to be very fast, the table resize operations - * are very very expensive. If an upper bound on the latency of adding an - * entry to the table is needed, we either need to ensure the table is - * pre-sized to be large enough so no resize is ever needed, or we'll need to - * develop an approach to incrementally resize the table. - **/ - -#include "pointerMap.h" - -#include "errors.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "numeric.h" -#include "permassert.h" - -enum { - DEFAULT_CAPACITY = 16, // the number of neighborhoods in a new table - NEIGHBORHOOD = 255, // the number of buckets in each neighborhood - MAX_PROBES = 1024, // limit on the number of probes for a free bucket - NULL_HOP_OFFSET = 0, // the hop offset value terminating the hop list - DEFAULT_LOAD = 75 // a compromise between memory use and performance -}; - -/** - * Buckets are packed together to reduce memory usage and improve cache - * efficiency. 
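The offset bias described above (zero means "no next hop", so a stored offset is the real offset plus one) is small enough to show directly. This sketch restates that encoding with hypothetical helper names; it mirrors what the dereferenceHop() helper further down does.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

enum { NULL_HOP_OFFSET = 0 };  /* the biased value meaning "no next hop" */

/* A bucket k slots past the start of its neighborhood is stored as k + 1,
 * freeing zero to mean NULL. */
static uint8_t encodeHop(size_t offsetInNeighborhood)
{
  return (uint8_t) (offsetInNeighborhood + 1);
}

/* The inverse, mirroring dereferenceHop() below. */
static size_t decodeHop(uint8_t biased)
{
  assert(biased != NULL_HOP_OFFSET);
  return (size_t) (biased - 1);
}

int main(void)
{
  assert(encodeHop(0) == 1);      /* offset 0 encodes as 1, not 0 */
  assert(decodeHop(255) == 254);  /* 8 bits still cover a 255-entry hood */
  return 0;
}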
It would be tempting to encode the hop offsets separately and - * maintain alignment of key/value pairs, but it's crucial to keep the hop - * fields near the buckets that they use them so they'll tend to share cache - * lines. - **/ -typedef struct __attribute__((packed)) bucket { - uint8_t firstHop; // the biased offset of the first entry in the hop list - // of the neighborhood that hashes to this bucket - uint8_t nextHop; // the biased offset of the next bucket in the hop list - - const void *key; // the key stored in this bucket - void *value; // the value stored in this bucket (NULL if empty) -} Bucket; - -/** - * The concrete definition of the opaque PointerMap type. To avoid having to - * wrap the neighborhoods of the last entries back around to the start of the - * bucket array, we allocate a few more buckets at the end of the array - * instead, which is why capacity and bucketCount are different. - **/ -struct pointerMap { - /** the number of entries stored in the map */ - size_t size; - /** the number of neighborhoods in the map */ - size_t capacity; - /** the number of buckets in the bucket array */ - size_t bucketCount; - /** the array of hash buckets */ - Bucket *buckets; - /** the function for comparing keys for equality */ - PointerKeyComparator *comparator; - /** the function for getting a hash code from a key */ - PointerKeyHasher *hasher; -}; - -/** - * Initialize a PointerMap. - * - * @param map the map to initialize - * @param capacity the initial capacity of the map - * - * @return UDS_SUCCESS or an error code - **/ -static int allocateBuckets(PointerMap *map, size_t capacity) -{ - map->size = 0; - map->capacity = capacity; - - // Allocate NEIGHBORHOOD - 1 extra buckets so the last bucket can have a - // full neighborhood without have to wrap back around to element zero. - map->bucketCount = capacity + (NEIGHBORHOOD - 1); - return ALLOCATE(map->bucketCount, Bucket, "PointerMap buckets", - &map->buckets); -} - -/**********************************************************************/ -int makePointerMap(size_t initialCapacity, - unsigned int initialLoad, - PointerKeyComparator comparator, - PointerKeyHasher hasher, - PointerMap **mapPtr) -{ - // Use the default initial load if the caller did not specify one. - if (initialLoad == 0) { - initialLoad = DEFAULT_LOAD; - } - if (initialLoad > 100) { - return UDS_INVALID_ARGUMENT; - } - - PointerMap *map; - int result = ALLOCATE(1, PointerMap, "PointerMap", &map); - if (result != UDS_SUCCESS) { - return result; - } - - map->hasher = hasher; - map->comparator = comparator; - - // Use the default capacity if the caller did not specify one. - size_t capacity = (initialCapacity > 0) ? initialCapacity : DEFAULT_CAPACITY; - - // Scale up the capacity by the specified initial load factor. - // (i.e to hold 1000 entries at 80% load we need a capacity of 1250) - capacity = capacity * 100 / initialLoad; - - result = allocateBuckets(map, capacity); - if (result != UDS_SUCCESS) { - freePointerMap(&map); - return result; - } - - *mapPtr = map; - return UDS_SUCCESS; -} - -/** - * Free the bucket array for the map. 
- * - * @param map the map whose bucket array is to be freed - **/ -static void freeBuckets(PointerMap *map) -{ - FREE(map->buckets); - map->buckets = NULL; -} - -/**********************************************************************/ -void freePointerMap(PointerMap **mapPtr) -{ - if (*mapPtr != NULL) { - freeBuckets(*mapPtr); - FREE(*mapPtr); - *mapPtr = NULL; - } -} - -/**********************************************************************/ -size_t pointerMapSize(const PointerMap *map) -{ - return map->size; -} - -/** - * Convert a biased hop offset within a neighborhood to a pointer to the - * bucket it references. - * - * @param neighborhood the first bucket in the neighborhood - * @param hopOffset the biased hop offset to the desired bucket - * - * @return NULL if hopOffset is zero, otherwise a pointer to - * the bucket in the neighborhood at hopOffset - 1 - **/ -static Bucket *dereferenceHop(Bucket *neighborhood, unsigned int hopOffset) -{ - if (hopOffset == NULL_HOP_OFFSET) { - return NULL; - } - - STATIC_ASSERT(NULL_HOP_OFFSET == 0); - return &neighborhood[hopOffset - 1]; -} - -/** - * Add a bucket into the hop list for the neighborhood, inserting it into the - * list so the hop list remains sorted by hop offset. - * - * @param neighborhood the first bucket in the neighborhood - * @param newBucket the bucket to add to the hop list - **/ -static void insertInHopList(Bucket *neighborhood, Bucket *newBucket) -{ - // Zero indicates a NULL hop offset, so bias the hop offset by one. - int hopOffset = 1 + (newBucket - neighborhood); - - // Handle the special case of adding a bucket at the start of the list. - int nextHop = neighborhood->firstHop; - if ((nextHop == NULL_HOP_OFFSET) || (nextHop > hopOffset)) { - newBucket->nextHop = nextHop; - neighborhood->firstHop = hopOffset; - return; - } - - // Search the hop list for the insertion point that maintains the sort - // order. - for (;;) { - Bucket *bucket = dereferenceHop(neighborhood, nextHop); - nextHop = bucket->nextHop; - - if ((nextHop == NULL_HOP_OFFSET) || (nextHop > hopOffset)) { - newBucket->nextHop = nextHop; - bucket->nextHop = hopOffset; - return; - } - } -} - -/** - * Select and return the hash bucket for a given search key. - * - * @param map the map to search - * @param key the mapping key - **/ -static Bucket *selectBucket(const PointerMap *map, const void *key) -{ - /* - * Scale the 32-bit hash to a bucket index by treating it as a binary - * fraction and multiplying that by the capacity. If the hash is uniformly - * distributed over [0 .. 2^32-1], then (hash * capacity / 2^32) should be - * uniformly distributed over [0 .. capacity-1]. The multiply and shift is - * much faster than a divide (modulus) on X86 CPUs. - */ - uint64_t hash = map->hasher(key); - return &map->buckets[(hash * map->capacity) >> 32]; -} - -/** - * Search the hop list associated with given hash bucket for a given search - * key. If the key is found, returns a pointer to the entry (bucket or - * collision), otherwise returns NULL. 
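The fixed-point scaling used by selectBucket() above is worth a worked example: the 32-bit hash is read as a fraction of 2^32, so one 64-bit multiply and a shift replace a modulus. The helper name scaleHash is hypothetical.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* hash / 2^32 is a fraction in [0, 1); multiplying by the capacity and
 * shifting right by 32 lands in [0, capacity - 1] with no division. */
static size_t scaleHash(uint32_t hash, size_t capacity)
{
  return (size_t) (((uint64_t) hash * capacity) >> 32);
}

int main(void)
{
  printf("%zu\n", scaleHash(0x80000000u, 1000));  /* 0.5  of 1000 -> 500 */
  printf("%zu\n", scaleHash(0xffffffffu, 1000));  /* ~1.0 of 1000 -> 999 */
  printf("%zu\n", scaleHash(0u, 1000));           /* 0.0  of 1000 -> 0   */
  return 0;
}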
- * - * @param [in] map the map being searched - * @param [in] bucket the map bucket to search for the key - * @param [in] key the mapping key - * @param [out] previousPtr if not NULL, a pointer in which to - * store the bucket in the list preceding the one - * that had the matching key - * - * @return an entry that matches the key, or NULL if not found - **/ -static Bucket *searchHopList(PointerMap *map, - Bucket *bucket, - const void *key, - Bucket **previousPtr) -{ - Bucket *previous = NULL; - unsigned int nextHop = bucket->firstHop; - while (nextHop != NULL_HOP_OFFSET) { - // Check the neighboring bucket indexed by the offset for the desired key. - Bucket *entry = dereferenceHop(bucket, nextHop); - if ((entry->value != NULL) && map->comparator(key, entry->key)) { - if (previousPtr != NULL) { - *previousPtr = previous; - } - return entry; - } - nextHop = entry->nextHop; - previous = entry; - } - return NULL; -} - -/**********************************************************************/ -void *pointerMapGet(PointerMap *map, const void *key) -{ - Bucket *match = searchHopList(map, selectBucket(map, key), key, NULL); - return ((match != NULL) ? match->value : NULL); -} - -/** - * Increase the number of hash buckets and rehash all the existing entries, - * storing them in the new buckets. - * - * @param map the map to resize - **/ -static int resizeBuckets(PointerMap *map) -{ - // Copy the top-level map data to the stack. - PointerMap oldMap = *map; - - // Re-initialize the map to be empty and 50% larger. - size_t newCapacity = map->capacity / 2 * 3; - logInfo("%s: attempting resize from %zu to %zu, current size=%zu", - __func__, map->capacity, newCapacity, map->size); - int result = allocateBuckets(map, newCapacity); - if (result != UDS_SUCCESS) { - *map = oldMap; - return result; - } - - // Populate the new hash table from the entries in the old bucket array. - for (size_t i = 0; i < oldMap.bucketCount; i++) { - Bucket *entry = &oldMap.buckets[i]; - if (entry->value == NULL) { - continue; - } - - result = pointerMapPut(map, entry->key, entry->value, true, NULL); - if (result != UDS_SUCCESS) { - // Destroy the new partial map and restore the map from the stack. - freeBuckets(map); - *map = oldMap; - return result; - } - } - - // Destroy the old bucket array. - freeBuckets(&oldMap); - return UDS_SUCCESS; -} - -/** - * Probe the bucket array starting at the given bucket for the next empty - * bucket, returning a pointer to it. NULL will be returned if - * the search reaches the end of the bucket array or if the number of linear - * probes exceeds a specified limit. - * - * @param map the map containing the buckets to search - * @param bucket the bucket at which to start probing - * @param maxProbes the maximum number of buckets to search - * - * @return the next empty bucket, or NULL if the search failed - **/ -static Bucket *findEmptyBucket(PointerMap *map, - Bucket *bucket, - unsigned int maxProbes) -{ - // Limit the search to either the nearer of the end of the bucket array or a - // fixed distance beyond the initial bucket. - size_t remaining = &map->buckets[map->bucketCount] - bucket; - Bucket *sentinel = &bucket[minSizeT(remaining, maxProbes)]; - - for (Bucket *entry = bucket; entry < sentinel; entry++) { - if (entry->value == NULL) { - return entry; - } - } - return NULL; -} - -/** - * Move an empty bucket closer to the start of the bucket array. This searches - * the neighborhoods that contain the empty bucket for a non-empty bucket - * closer to the start of the array. 
If such a bucket is found, this swaps the - * two buckets by moving the entry to the empty bucket. - * - * @param map the map containing the bucket - * @param hole the empty bucket to fill with an entry that precedes it in one - * of its enclosing neighborhoods - * - * @return the bucket that was vacated by moving its entry to the provided - * hole, or NULL if no entry could be moved - **/ -static Bucket *moveEmptyBucket(PointerMap *map __attribute__((unused)), - Bucket *hole) -{ - /* - * Examine every neighborhood that the empty bucket is part of, starting - * with the one in which it is the last bucket. No boundary check is needed - * for the negative array arithmetic since this function is only called when - * hole is at least NEIGHBORHOOD cells deeper into the array than a valid - * bucket. - */ - for (Bucket *bucket = &hole[1 - NEIGHBORHOOD]; bucket < hole; bucket++) { - // Find the entry that is nearest to the bucket, which means it will be - // nearest to the hash bucket whose neighborhood is full. - Bucket *newHole = dereferenceHop(bucket, bucket->firstHop); - if (newHole == NULL) { - // There are no buckets in this neighborhood that are in use by this one - // (they must all be owned by overlapping neighborhoods). - continue; - } - - // Skip this bucket if its first entry is actually further away than the - // hole that we're already trying to fill. - if (hole < newHole) { - continue; - } - - /* - * We've found an entry in this neighborhood that we can "hop" further - * away, moving the hole closer to the hash bucket, if not all the way - * into its neighborhood. - */ - - // The entry that will be the new hole is the first bucket in the list, - // so setting firstHop is all that's needed remove it from the list. - bucket->firstHop = newHole->nextHop; - newHole->nextHop = NULL_HOP_OFFSET; - - // Move the entry into the original hole. - hole->key = newHole->key; - hole->value = newHole->value; - newHole->value = NULL; - - // Insert the filled hole into the hop list for the neighborhood. - insertInHopList(bucket, hole); - return newHole; - } - - // We couldn't find an entry to relocate to the hole. - return NULL; -} - -/** - * Find and update any existing mapping for a given key, returning the value - * associated with the key in the provided pointer. - * - * @param [in] map the PointerMap to attempt to modify - * @param [in] neighborhood the first bucket in the neighborhood that - * would contain the search key - * @param [in] key the key with which to associate the new value - * @param [in] newValue the value to be associated with the key - * @param [in] update whether to overwrite an existing value - * @param [out] oldValuePtr a pointer in which to store the old value - * (unmodified if no mapping was found) - * - * @return true if the map contains a mapping for the key - * false if it does not - **/ -static bool updateMapping(PointerMap *map, - Bucket *neighborhood, - const void *key, - void *newValue, - bool update, - void **oldValuePtr) -{ - Bucket *bucket = searchHopList(map, neighborhood, key, NULL); - if (bucket == NULL) { - // There is no bucket containing the key in the neighborhood. - return false; - } - - // Return the value of the current mapping (if desired) and update the - // mapping with the new value (if desired). - if (oldValuePtr != NULL) { - *oldValuePtr = bucket->value; - } - if (update) { - // We're dropping the old key pointer on the floor here, assuming it's a - // property of the value or that it's otherwise safe to just forget. 
- bucket->key = key; - bucket->value = newValue; - } - return true; -} - -/** - * Find an empty bucket in a specified neighborhood for a new mapping or - * attempt to re-arrange mappings so there is such a bucket. This operation - * may fail (returning NULL) if an empty bucket is not available or could not - * be relocated to the neighborhood. - * - * @param map the PointerMap to search or modify - * @param neighborhood the first bucket in the neighborhood in which - * an empty bucket is needed for a new mapping - * - * @return a pointer to an empty bucket in the desired neighborhood, or - * NULL if a vacancy could not be found or arranged - **/ -static Bucket *findOrMakeVacancy(PointerMap *map, Bucket *neighborhood) -{ - // Probe within and beyond the neighborhood for the first empty bucket. - Bucket *hole = findEmptyBucket(map, neighborhood, MAX_PROBES); - - // Keep trying until the empty bucket is in the bucket's neighborhood or we - // are unable to move it any closer by swapping it with a filled bucket. - while (hole != NULL) { - int distance = hole - neighborhood; - if (distance < NEIGHBORHOOD) { - // We've found or relocated an empty bucket close enough to the initial - // hash bucket to be referenced by its hop vector. - return hole; - } - - // The nearest empty bucket isn't within the neighborhood that must - // contain the new entry, so try to swap it with bucket that is closer. - hole = moveEmptyBucket(map, hole); - } - - return NULL; -} - -/**********************************************************************/ -int pointerMapPut(PointerMap *map, - const void *key, - void *newValue, - bool update, - void **oldValuePtr) -{ - if (newValue == NULL) { - return UDS_INVALID_ARGUMENT; - } - - // Select the bucket at the start of the neighborhood that must contain any - // entry for the provided key. - Bucket *neighborhood = selectBucket(map, key); - - // Check whether the neighborhood already contains an entry for the key, in - // which case we optionally update it, returning the old value. - if (updateMapping(map, neighborhood, key, newValue, update, oldValuePtr)) { - return UDS_SUCCESS; - } - - /* - * Find an empty bucket in the desired neighborhood for the new entry or - * re-arrange entries in the map so there is such a bucket. This operation - * will usually succeed; the loop body will only be executed on the rare - * occasions that we have to resize the map. - */ - Bucket *bucket; - while ((bucket = findOrMakeVacancy(map, neighborhood)) == NULL) { - /* - * There is no empty bucket in which to put the new entry in the current - * map, so we're forced to allocate a new bucket array with a larger - * capacity, re-hash all the entries into those buckets, and try again (a - * very expensive operation for large maps). - */ - int result = resizeBuckets(map); - if (result != UDS_SUCCESS) { - return result; - } - - // Resizing the map invalidates all pointers to buckets, so recalculate - // the neighborhood pointer. - neighborhood = selectBucket(map, key); - } - - // Put the new entry in the empty bucket, adding it to the neighborhood. - bucket->key = key; - bucket->value = newValue; - insertInHopList(neighborhood, bucket); - map->size += 1; - - // There was no existing entry, so there was no old value to be returned. 
- if (oldValuePtr != NULL) { - *oldValuePtr = NULL; - } - return UDS_SUCCESS; -} - -/**********************************************************************/ -void *pointerMapRemove(PointerMap *map, const void *key) -{ - // Select the bucket to search and search it for an existing entry. - Bucket *bucket = selectBucket(map, key); - Bucket *previous; - Bucket *victim = searchHopList(map, bucket, key, &previous); - - if (victim == NULL) { - // There is no matching entry to remove. - return NULL; - } - - // We found an entry to remove. Save the mapped value to return later and - // empty the bucket. - map->size -= 1; - void *value = victim->value; - victim->value = NULL; - victim->key = 0; - - // The victim bucket is now empty, but it still needs to be spliced out of - // the hop list. - if (previous == NULL) { - // The victim is the head of the list, so swing firstHop. - bucket->firstHop = victim->nextHop; - } else { - previous->nextHop = victim->nextHop; - } - victim->nextHop = NULL_HOP_OFFSET; - - return value; -} diff --git a/vdo/base/pointerMap.h b/vdo/base/pointerMap.h deleted file mode 100644 index 1bd0bd2..0000000 --- a/vdo/base/pointerMap.h +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/pointerMap.h#1 $ - */ - -#ifndef POINTER_MAP_H -#define POINTER_MAP_H - -#include "common.h" - -/** - * PointerMap associates pointer values (void *) with the data - * referenced by pointer keys (void *). NULL pointer - * values are not supported. A NULL key value is supported when - * the instance's key comparator and hasher functions support it. - * - * The map is implemented as hash table, which should provide constant-time - * insert, query, and remove operations, although the insert may occasionally - * grow the table, which is linear in the number of entries in the map. The - * table will grow as needed to hold new entries, but will not shrink as - * entries are removed. - * - * The key and value pointers passed to the map are retained and used by the - * map, but are not owned by the map. Freeing the map does not attempt to free - * the pointers. The client is entirely responsible for the memory managment - * of the keys and values. The current interface and implementation assume - * that keys will be properties of the values, or that keys will not be memory - * managed, or that keys will not need to be freed as a result of being - * replaced when a key is re-mapped. - **/ - -typedef struct pointerMap PointerMap; - -/** - * The prototype of functions that compare the referents of two pointer keys - * for equality. If two keys are equal, then both keys must have the same the - * hash code associated with them by the hasher function defined below. 
- - * @param thisKey The first element to compare - * @param thatKey The second element to compare - * - * @return true if and only if the referents of the two - * key pointers are to be treated as the same key by the map - **/ -typedef bool PointerKeyComparator(const void *thisKey, const void *thatKey); - -/** - * The prototype of functions that get or calculate a hash code associated - * with the referent of pointer key. The hash code must be uniformly - * distributed over all uint32_t values. The hash code associated with a given - * key must not change while the key is in the map. If the comparator function - * says two keys are equal, then this function must return the same hash code - * for both keys. This function may be called many times for a key while an - * entry is stored for it in the map. - * - * @param key The pointer key to hash - * - * @return the hash code for the key - **/ -typedef uint32_t PointerKeyHasher(const void *key); - -/** - * Allocate and initialize a PointerMap. - * - * @param [in] initialCapacity The number of entries the map should - * initially be capable of holding (zero tells - * the map to use its own small default) - * @param [in] initialLoad The load factor of the map, expressed as an - * integer percentage (typically in the range - * 50 to 90, with zero telling the map to use - * its own default) - * @param [in] comparator The function to use to compare the referents - * of two pointer keys for equality - * @param [in] hasher The function to use obtain the hash code - * associated with each pointer key - * @param [out] mapPtr A pointer to hold the new PointerMap - * - * @return UDS_SUCCESS or an error code - **/ -int makePointerMap(size_t initialCapacity, - unsigned int initialLoad, - PointerKeyComparator comparator, - PointerKeyHasher hasher, - PointerMap **mapPtr) - __attribute__((warn_unused_result)); - -/** - * Free a PointerMap and null out the reference to it. NOTE: The map does not - * own the pointer keys and values stored in the map and they are not freed by - * this call. - * - * @param [in,out] mapPtr The reference to the PointerMap to free - **/ -void freePointerMap(PointerMap **mapPtr); - -/** - * Get the number of entries stored in a PointerMap. - * - * @param map The PointerMap to query - * - * @return the number of entries in the map - **/ -size_t pointerMapSize(const PointerMap *map); - -/** - * Retrieve the value associated with a given key from the PointerMap. - * - * @param map The PointerMap to query - * @param key The key to look up (may be NULL if the - * comparator and hasher functions support it) - * - * @return the value associated with the given key, or NULL - * if the key is not mapped to any value - **/ -void *pointerMapGet(PointerMap *map, const void *key); - -/** - * Try to associate a value (a pointer) with an integer in a PointerMap. - * If the map already contains a mapping for the provided key, the old value is - * only replaced with the specified value if update is true. In either case - * the old value is returned. If the map does not already contain a value for - * the specified key, the new value is added regardless of the value of update. - * - * If the value stored in the map is updated, then the key stored in the map - * will also be updated with the key provided by this call. The old key will - * not be returned due to the memory managment assumptions described in the - * interface header comment. 
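Putting the pieces of this interface together, a usage sketch might look like the following. It assumes only the declarations in this header plus the project's usual status codes (UDS_SUCCESS); the string comparator and FNV-1a hasher are illustrative choices, since any key type works as long as the two functions agree with each other.

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#include "pointerMap.h"

/* Keys here are NUL-terminated strings. */
static bool compareStrings(const void *thisKey, const void *thatKey)
{
  return (strcmp(thisKey, thatKey) == 0);
}

/* FNV-1a: one simple way to get a uniformly distributed 32-bit code. */
static uint32_t hashString(const void *key)
{
  uint32_t hash = 2166136261u;
  for (const char *s = key; *s != '\0'; s++) {
    hash = (hash ^ (uint8_t) *s) * 16777619u;
  }
  return hash;
}

static int demo(void)
{
  PointerMap *map;
  int result = makePointerMap(0, 0, compareStrings, hashString, &map);
  if (result != UDS_SUCCESS) {
    return result;
  }

  static int answer = 42;
  void *oldValue;
  /* update == true replaces any existing mapping; the prior value (or
   * NULL) is handed back through the final parameter. */
  result = pointerMapPut(map, "answer", &answer, true, &oldValue);
  if (result == UDS_SUCCESS) {
    int *found = pointerMapGet(map, "answer");     /* &answer */
    void *gone = pointerMapRemove(map, "answer");  /* &answer again */
    (void) found;
    (void) gone;
  }

  freePointerMap(&map);  /* the keys and values themselves are not freed */
  return result;
}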
- * - * @param [in] map The PointerMap to attempt to modify - * @param [in] key The key with which to associate the new value - * (may be NULL if the comparator and - * hasher functions support it) - * @param [in] newValue The value to be associated with the key - * @param [in] update Whether to overwrite an existing value - * @param [out] oldValuePtr A pointer in which to store either the old value - * (if the key was already mapped) or - * NULL if the map did not contain the - * key; NULL may be provided if the - * caller does not need to know the old value - * - * @return UDS_SUCCESS or an error code - **/ -int pointerMapPut(PointerMap *map, - const void *key, - void *newValue, - bool update, - void **oldValuePtr) - __attribute__((warn_unused_result)); - -/** - * Remove the mapping for a given key from the PointerMap. - * - * @param map The PointerMap from which to remove the mapping - * @param key The key whose mapping is to be removed (may be NULL - * if the comparator and hasher functions support it) - * - * @return the value that was associated with the key, or - * NULL if it was not mapped - **/ -void *pointerMapRemove(PointerMap *map, const void *key); - -#endif /* POINTER_MAP_H */ diff --git a/vdo/base/priorityTable.c b/vdo/base/priorityTable.c deleted file mode 100644 index deb423b..0000000 --- a/vdo/base/priorityTable.c +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/priorityTable.c#1 $ - */ - -#include "priorityTable.h" - -#include "errors.h" -#include "memoryAlloc.h" -#include "numUtils.h" - -#include "statusCodes.h" - -/** We use a single 64-bit search vector, so the maximum priority is 63 */ -enum { MAX_PRIORITY = 63 }; - -/** - * All the entries with the same priority are queued in a circular list in a - * bucket for that priority. The table is essentially an array of buckets. - **/ -typedef struct bucket { - /** The head of a queue of table entries, all having the same priority */ - RingNode queue; - /** The priority of all the entries in this bucket */ - unsigned int priority; -} Bucket; - -/** - * A priority table is an array of buckets, indexed by priority. New entries - * are added to the end of the queue in the appropriate bucket. The dequeue - * operation finds the highest-priority non-empty bucket by searching a bit - * vector represented as a single 8-byte word, which is very fast with - * compiler and CPU support. 
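The "searching a bit vector" step above comes down to finding the highest set bit of a 64-bit word. The dequeue code below reaches that answer through the project's logBaseTwo() helper, whose implementation is not shown here; as one plausible illustration, a compiler intrinsic with a portable fallback does the same job.

#include <stdint.h>
#include <stdio.h>

/* Index of the highest set bit, or -1 for an empty vector. */
static int highestSetBit(uint64_t searchVector)
{
  if (searchVector == 0) {
    return -1;                /* every bucket is empty */
  }
#if defined(__GNUC__) || defined(__clang__)
  return 63 - __builtin_clzll(searchVector);
#else
  int bit = 0;
  while (searchVector >>= 1) {
    bit++;
  }
  return bit;
#endif
}

int main(void)
{
  uint64_t vector = (1ULL << 3) | (1ULL << 10);   /* priorities 3 and 10 */
  printf("%d\n", highestSetBit(vector));          /* next dequeue: 10 */
  printf("%d\n", highestSetBit(0));               /* table is empty: -1 */
  return 0;
}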
- **/ -struct priorityTable { - /** The maximum priority of entries that may be stored in this table */ - unsigned int maxPriority; - /** A bit vector flagging all buckets that are currently non-empty */ - uint64_t searchVector; - /** The array of all buckets, indexed by priority */ - Bucket buckets[]; -}; - -/** - * Convert a queue head to to the bucket that contains it. - * - * @param head The bucket queue ring head pointer to convert - * - * @return the enclosing bucket - **/ -static inline Bucket *asBucket(RingNode *head) -{ - STATIC_ASSERT(offsetof(Bucket, queue) == 0); - return (Bucket *) head; -} - -/**********************************************************************/ -int makePriorityTable(unsigned int maxPriority, PriorityTable **tablePtr) -{ - if (maxPriority > MAX_PRIORITY) { - return UDS_INVALID_ARGUMENT; - } - - PriorityTable *table; - int result = ALLOCATE_EXTENDED(PriorityTable, maxPriority + 1, Bucket, - __func__, &table); - if (result != VDO_SUCCESS) { - return result; - } - - for (unsigned int priority = 0; priority <= maxPriority; priority++) { - Bucket *bucket = &table->buckets[priority]; - bucket->priority = priority; - initializeRing(&bucket->queue); - } - - table->maxPriority = maxPriority; - table->searchVector = 0; - - *tablePtr = table; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freePriorityTable(PriorityTable **tablePtr) -{ - PriorityTable *table = *tablePtr; - if (table == NULL) { - return; - } - - // Unlink the buckets from any entries still in the table so the entries - // won't be left with dangling pointers to freed memory. - resetPriorityTable(table); - - FREE(table); - *tablePtr = NULL; -} - -/**********************************************************************/ -void resetPriorityTable(PriorityTable *table) -{ - table->searchVector = 0; - for (unsigned int priority = 0; priority <= table->maxPriority; priority++) { - unspliceRingNode(&table->buckets[priority].queue); - } -} - -/**********************************************************************/ -void priorityTableEnqueue(PriorityTable *table, - unsigned int priority, - RingNode *entry) -{ - ASSERT_LOG_ONLY((priority <= table->maxPriority), - "entry priority must be valid for the table"); - - // Append the entry to the queue in the specified bucket. - pushRingNode(&table->buckets[priority].queue, entry); - - // Flag the bucket in the search vector since it must be non-empty. - table->searchVector |= (1ULL << priority); -} - -/**********************************************************************/ -static inline void markBucketEmpty(PriorityTable *table, Bucket *bucket) -{ - table->searchVector &= ~(1ULL << bucket->priority); -} - -/**********************************************************************/ -RingNode *priorityTableDequeue(PriorityTable *table) -{ - // Find the highest priority non-empty bucket by finding the highest-order - // non-zero bit in the search vector. - int topPriority = logBaseTwo(table->searchVector); - - if (topPriority < 0) { - // All buckets are empty. - return NULL; - } - - // Dequeue the first entry in the bucket. - Bucket *bucket = &table->buckets[topPriority]; - RingNode *entry = unspliceRingNode(bucket->queue.next); - - // Clear the bit in the search vector if the bucket has been emptied. 
- if (isRingEmpty(&bucket->queue)) { - markBucketEmpty(table, bucket); - } - - return entry; -} - -/**********************************************************************/ -void priorityTableRemove(PriorityTable *table, RingNode *entry) -{ - // We can't guard against calls where the entry is on a ring for a different - // table, but it's easy to deal with an entry not in any table or ring. - if (isRingEmpty(entry)) { - return; - } - - // Remove the entry from the bucket ring, remembering a pointer to another - // entry in the ring. - RingNode *nextNode = entry->next; - unspliceRingNode(entry); - - // If the rest of the ring is now empty, the next node must be the ring head - // in the bucket and we can use it to update the search vector. - if (isRingEmpty(nextNode)) { - markBucketEmpty(table, asBucket(nextNode)); - } -} - -/**********************************************************************/ -bool isPriorityTableEmpty(PriorityTable *table) -{ - return (table->searchVector == 0); -} diff --git a/vdo/base/priorityTable.h b/vdo/base/priorityTable.h deleted file mode 100644 index d48a570..0000000 --- a/vdo/base/priorityTable.h +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/priorityTable.h#2 $ - */ - -#ifndef PRIORITY_TABLE_H -#define PRIORITY_TABLE_H - -#include "ringNode.h" - -/** - * A PriorityTable is a simple implementation of a priority queue for entries - * with priorities that are small non-negative integer values. It implements - * the obvious priority queue operations of enqueuing an entry and dequeuing - * an entry with the maximum priority. It also supports removing an arbitrary - * entry. The priority of an entry already in the table can be changed by - * removing it and re-enqueuing it with a different priority. All operations - * have O(1) complexity. - * - * The links for the table entries must be embedded in the entries themselves. - * RingNode is used to link entries in the table and no wrapper type is - * declared, so an existing RingNode link in an object can also be used to - * queue it in a PriorityTable, assuming the field is not used for anything - * else while so queued. - * - * The table is implemented as an array of queues (circular lists) indexed by - * priority, along with a hint for which queues are non-empty. Steven Skiena - * calls a very similar structure a "bounded height priority queue", but given - * the resemblance to a hash table, "priority table" seems both shorter and - * more apt, if somewhat novel. - **/ - -typedef struct priorityTable PriorityTable; - -/** - * Allocate and initialize a new PriorityTable. 
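The dequeue path above leans entirely on the 64-bit search vector: bit N is set exactly when bucket N holds entries, so the highest-priority non-empty bucket is simply the highest set bit. A minimal standalone sketch of that lookup (substituting the GCC/Clang __builtin_clzll builtin for the module's own logBaseTwo() helper, purely for illustration):

#include <stdint.h>

/* Illustrative only: find the highest set bit of a non-empty search
 * vector, i.e. the highest priority that currently has queued entries. */
static int highestNonEmptyPriority(uint64_t searchVector)
{
  if (searchVector == 0) {
    return -1;  /* every bucket is empty */
  }
  return 63 - __builtin_clzll(searchVector);
}

/* Example: buckets 2 and 5 are flagged.
 * highestNonEmptyPriority((1ULL << 2) | (1ULL << 5)) == 5 */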
- * - * @param [in] maxPriority The maximum priority value for table entries - * @param [out] tablePtr A pointer to hold the new table - * - * @return VDO_SUCCESS or an error code - **/ -int makePriorityTable(unsigned int maxPriority, PriorityTable **tablePtr) - __attribute__((warn_unused_result)); - -/** - * Free a PriorityTable and null out the reference to it. NOTE: The table does - * not own the entries stored in it and they are not freed by this call. - * - * @param [in,out] tablePtr The reference to the table to free - **/ -void freePriorityTable(PriorityTable **tablePtr); - -/** - * Add a new entry to the priority table, appending it to the queue for - * entries with the specified priority. - * - * @param table The table in which to store the entry - * @param priority The priority of the entry - * @param entry The RingNode embedded in the entry to store in the table - * (the caller must have initialized it) - **/ -void priorityTableEnqueue(PriorityTable *table, - unsigned int priority, - RingNode *entry); - -/** - * Reset a priority table, leaving it in the same empty state as when newly - * constructed. NOTE: The table does not own the entries stored in it and they - * are not freed (or even unlinked from each other) by this call. - * - * @param table The table to reset - **/ -void resetPriorityTable(PriorityTable *table); - -/** - * Find the highest-priority entry in the table, remove it from the table, and - * return it. If there are multiple entries with the same priority, the one - * that has been in the table with that priority the longest will be returned. - * - * @param table The priority table from which to remove an entry - * - * @return the dequeued entry, or NULL if the table is currently empty - **/ -RingNode *priorityTableDequeue(PriorityTable *table) - __attribute__((warn_unused_result)); - -/** - * Remove a specified entry from its priority table. - * - * @param table The table from which to remove the entry - * @param entry The entry to remove from the table - **/ -void priorityTableRemove(PriorityTable *table, RingNode *entry); - -/** - * Return whether the priority table is empty. - * - * @param table The table to check - * - * @return true if the table is empty - **/ -bool isPriorityTableEmpty(PriorityTable *table) - __attribute__((warn_unused_result)); - -#endif /* PRIORITY_TABLE_H */ diff --git a/vdo/base/readOnlyNotifier.c b/vdo/base/readOnlyNotifier.c deleted file mode 100644 index ba837ac..0000000 --- a/vdo/base/readOnlyNotifier.c +++ /dev/null @@ -1,393 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- *
- * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/readOnlyNotifier.c#5 $
- */
-
-#include "readOnlyNotifier.h"
-
-#include "atomic.h"
-#include "logger.h"
-#include "memoryAlloc.h"
-#include "permassert.h"
-
-#include "completion.h"
-#include "physicalLayer.h"
-#include "threadConfig.h"
-
-/**
- * A ReadOnlyNotifier has a single completion which is used to perform
- * read-only notifications; however, enterReadOnlyMode() may be called from any
- * base thread. A pair of atomic fields is used to control the read-only mode
- * entry process. The first field holds the read-only error. The second is the
- * state field, which may hold any of the four special values enumerated here.
- *
- * When enterReadOnlyMode() is called from some base thread, a compare-and-swap
- * is done on the readOnlyError, setting it to the supplied error if the value
- * was VDO_SUCCESS. If this fails, some other thread has already initiated
- * read-only entry or scheduled a pending entry, so the call exits. Otherwise,
- * a compare-and-swap is done on the state, setting it to NOTIFYING if the
- * value was MAY_NOTIFY. If this succeeds, the caller initiates the
- * notification. If this failed due to notifications being disallowed, the
- * notifier will be in the MAY_NOT_NOTIFY state but readOnlyError will not be
- * VDO_SUCCESS. This configuration will indicate to allowReadOnlyModeEntry()
- * that there is a pending notification to perform.
- **/
-enum {
-  /** Notifications are allowed but not in progress */
-  MAY_NOTIFY = 0,
-  /** A notification is in progress */
-  NOTIFYING,
-  /** Notifications are not allowed */
-  MAY_NOT_NOTIFY,
-  /** A notification has completed */
-  NOTIFIED,
-};
-
-/**
- * An object to be notified when the VDO enters read-only mode
- **/
-typedef struct readOnlyListener ReadOnlyListener;
-
-struct readOnlyListener {
-  /** The listener */
-  void *listener;
-  /** The method to call to notify the listener */
-  ReadOnlyNotification *notify;
-  /** A pointer to the next listener */
-  ReadOnlyListener *next;
-};
-
-/**
- * Data associated with each base code thread.
- **/
-typedef struct threadData {
-  /**
-   * Each thread maintains its own notion of whether the VDO is read-only so
-   * that the read-only state can be checked from any base thread without
-   * worrying about synchronization or thread safety. This does mean that
-   * knowledge of the VDO going read-only does not occur simultaneously across
-   * the VDO's threads, but that does not seem to cause any problems.
-   */
-  bool isReadOnly;
-  /**
-   * A list of objects waiting to be notified on this thread that the VDO has
-   * entered read-only mode.
-   **/
-  ReadOnlyListener *listeners;
-} ThreadData;
-
-struct readOnlyNotifier {
-  /** The completion for entering read-only mode */
-  VDOCompletion completion;
-  /** A completion waiting for notifications to be drained or enabled */
-  VDOCompletion *waiter;
-  /** The code of the error which put the VDO into read-only mode */
-  Atomic32 readOnlyError;
-  /** The current state of the notifier (values described above) */
-  Atomic32 state;
-  /** The thread config of the VDO */
-  const ThreadConfig *threadConfig;
-  /** The array of per-thread data */
-  ThreadData threadData[];
-};
-
-/**
- * Convert a generic VDOCompletion to a ReadOnlyNotifier.
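As a reading aid, the two-step compare-and-swap protocol described in the comment above can be modeled in isolation. This sketch uses C11 atomics and simplified values in place of the Atomic32 helpers and VDO status codes, so it is illustrative only:

#include <stdatomic.h>
#include <stdbool.h>

enum { MODEL_MAY_NOTIFY, MODEL_NOTIFYING, MODEL_MAY_NOT_NOTIFY, MODEL_NOTIFIED };

static _Atomic int modelError = 0;                /* 0 stands in for VDO_SUCCESS */
static _Atomic int modelState = MODEL_MAY_NOTIFY;

/* Returns true if the caller won the right to perform the notification. */
static bool modelEnterReadOnlyMode(int errorCode)
{
  int expected = 0;
  /* First CAS: record the error only if no error has been recorded yet. */
  if (!atomic_compare_exchange_strong(&modelError, &expected, errorCode)) {
    return false;  /* another thread already initiated or scheduled entry */
  }

  int expectedState = MODEL_MAY_NOTIFY;
  /* Second CAS: start notifying only if notifications are currently allowed. */
  if (atomic_compare_exchange_strong(&modelState, &expectedState,
                                     MODEL_NOTIFYING)) {
    return true;   /* the caller initiates the notification */
  }

  /* Notifications are disallowed; the recorded error remains so that
   * allowReadOnlyModeEntry() can see a notification is still pending. */
  return false;
}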
- * - * @param completion The completion to convert - * - * @return The completion as a ReadOnlyNotifier - **/ -static inline ReadOnlyNotifier *asNotifier(VDOCompletion *completion) -{ - STATIC_ASSERT(offsetof(ReadOnlyNotifier, completion) == 0); - assertCompletionType(completion->type, READ_ONLY_MODE_COMPLETION); - return (ReadOnlyNotifier *) completion; -} - -/**********************************************************************/ -int makeReadOnlyNotifier(bool isReadOnly, - const ThreadConfig *threadConfig, - PhysicalLayer *layer, - ReadOnlyNotifier **notifierPtr) -{ - ReadOnlyNotifier *notifier; - int result = ALLOCATE_EXTENDED(ReadOnlyNotifier, - threadConfig->baseThreadCount, ThreadData, - __func__, ¬ifier); - if (result != VDO_SUCCESS) { - return result; - } - - notifier->threadConfig = threadConfig; - if (isReadOnly) { - atomicStore32(¬ifier->readOnlyError, (uint32_t) VDO_READ_ONLY); - atomicStore32(¬ifier->state, NOTIFIED); - } else { - atomicStore32(¬ifier->state, MAY_NOTIFY); - } - result = initializeEnqueueableCompletion(¬ifier->completion, - READ_ONLY_MODE_COMPLETION, layer); - if (result != VDO_SUCCESS) { - freeReadOnlyNotifier(¬ifier); - return result; - } - - for (ThreadCount id = 0; id < threadConfig->baseThreadCount; id++) { - notifier->threadData[id].isReadOnly = isReadOnly; - } - - *notifierPtr = notifier; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freeReadOnlyNotifier(ReadOnlyNotifier **notifierPtr) -{ - ReadOnlyNotifier *notifier = *notifierPtr; - if (notifier == NULL) { - return; - } - - for (ThreadCount id = 0; id < notifier->threadConfig->baseThreadCount; - id++) { - ThreadData *threadData = ¬ifier->threadData[id]; - ReadOnlyListener *listener = threadData->listeners; - while (listener != NULL) { - ReadOnlyListener *toFree = listener; - listener = listener->next; - FREE(toFree); - } - } - - destroyEnqueueable(¬ifier->completion); - FREE(notifier); - *notifierPtr = NULL; -} - -/** - * Check that a function was called on the admin thread. - * - * @param notifier The notifier - * @param caller The name of the function (for logging) - **/ -static void assertOnAdminThread(ReadOnlyNotifier *notifier, const char *caller) -{ - ThreadID threadID = getCallbackThreadID(); - ASSERT_LOG_ONLY((getAdminThread(notifier->threadConfig) == threadID), - "%s called on admin thread", caller); -} - - -/**********************************************************************/ -void waitUntilNotEnteringReadOnlyMode(ReadOnlyNotifier *notifier, - VDOCompletion *parent) -{ - if (notifier == NULL) { - finishCompletion(parent, VDO_SUCCESS); - return; - } - - assertOnAdminThread(notifier, __func__); - if (notifier->waiter != NULL) { - finishCompletion(parent, VDO_COMPONENT_BUSY); - return; - } - - uint32_t state = atomicLoad32(¬ifier->state); - if ((state == MAY_NOT_NOTIFY) || (state == NOTIFIED)) { - // Notifications are already done or disallowed. - completeCompletion(parent); - return; - } - - if (compareAndSwap32(¬ifier->state, MAY_NOTIFY, MAY_NOT_NOTIFY)) { - // A notification was not in progress, and now they are disallowed. - completeCompletion(parent); - return; - } - - /* - * A notification is in progress, so wait for it to finish. There is no race - * here since the notification can't finish while the admin thread is in this - * method. - */ - notifier->waiter = parent; -} - -/** - * Complete the process of entering read only mode. 
- * - * @param completion The read-only mode completion - **/ -static void finishEnteringReadOnlyMode(VDOCompletion *completion) -{ - ReadOnlyNotifier *notifier = asNotifier(completion); - assertOnAdminThread(notifier, __func__); - atomicStore32(¬ifier->state, NOTIFIED); - - VDOCompletion *waiter = notifier->waiter; - if (waiter != NULL) { - notifier->waiter = NULL; - finishCompletion(waiter, completion->result); - } -} - -/** - * Inform each thread that the VDO is in read-only mode. - * - * @param completion The read-only mode completion - **/ -static void makeThreadReadOnly(VDOCompletion *completion) -{ - ThreadID threadID = completion->callbackThreadID; - ReadOnlyNotifier *notifier = asNotifier(completion); - ReadOnlyListener *listener = completion->parent; - if (listener == NULL) { - // This is the first call on this thread - ThreadData *threadData = ¬ifier->threadData[threadID]; - threadData->isReadOnly = true; - listener = threadData->listeners; - if (threadID == 0) { - // Note: This message must be recognizable by Permabit::UserMachine. - logErrorWithStringError((int) atomicLoad32(¬ifier->readOnlyError), - "Unrecoverable error, entering read-only mode"); - } - } else { - // We've just finished notifying a listener - listener = listener->next; - } - - if (listener != NULL) { - // We have a listener to notify - prepareCompletion(completion, makeThreadReadOnly, makeThreadReadOnly, - threadID, listener); - listener->notify(listener->listener, completion); - return; - } - - // We're done with this thread - if (++threadID >= notifier->threadConfig->baseThreadCount) { - // There are no more threads - prepareCompletion(completion, finishEnteringReadOnlyMode, - finishEnteringReadOnlyMode, - getAdminThread(notifier->threadConfig), NULL); - } else { - prepareCompletion(completion, makeThreadReadOnly, makeThreadReadOnly, - threadID, NULL); - } - - invokeCallback(completion); -} - -/**********************************************************************/ -void allowReadOnlyModeEntry(ReadOnlyNotifier *notifier, VDOCompletion *parent) -{ - assertOnAdminThread(notifier, __func__); - if (notifier->waiter != NULL) { - finishCompletion(parent, VDO_COMPONENT_BUSY); - return; - } - - if (!compareAndSwap32(¬ifier->state, MAY_NOT_NOTIFY, MAY_NOTIFY)) { - // Notifications were already allowed or complete - completeCompletion(parent); - return; - } - - if ((int) atomicLoad32(¬ifier->readOnlyError) == VDO_SUCCESS) { - // We're done - completeCompletion(parent); - return; - } - - // There may have been a pending notification - if (!compareAndSwap32(¬ifier->state, MAY_NOTIFY, NOTIFYING)) { - /* - * There wasn't, the error check raced with a thread calling - * enterReadOnlyMode() after we set the state to MAY_NOTIFY. It has already - * started the notification. - */ - completeCompletion(parent); - return; - } - - // Do the pending notification. - notifier->waiter = parent; - makeThreadReadOnly(¬ifier->completion); -} - -/**********************************************************************/ -void enterReadOnlyMode(ReadOnlyNotifier *notifier, int errorCode) -{ - ThreadData *threadData = ¬ifier->threadData[getCallbackThreadID()]; - if (threadData->isReadOnly) { - // This thread has already gone read-only. - return; - } - - // Record for this thread that the VDO is read-only. 
-  threadData->isReadOnly = true;
-
-  if (!compareAndSwap32(&notifier->readOnlyError, (uint32_t) VDO_SUCCESS,
-                        (uint32_t) errorCode)) {
-    // The notifier is already aware of a read-only error
-    return;
-  }
-
-  if (compareAndSwap32(&notifier->state, MAY_NOTIFY, NOTIFYING)) {
-    // Initiate a notification starting on the lowest numbered thread.
-    launchCallback(&notifier->completion, makeThreadReadOnly, 0);
-  }
-}
-
-/**********************************************************************/
-bool isReadOnly(ReadOnlyNotifier *notifier)
-{
-  return notifier->threadData[getCallbackThreadID()].isReadOnly;
-}
-
-/**********************************************************************/
-bool isOrWillBeReadOnly(ReadOnlyNotifier *notifier)
-{
-  return (((int) relaxedLoad32(&notifier->readOnlyError)) != VDO_SUCCESS);
-}
-
-/**********************************************************************/
-int registerReadOnlyListener(ReadOnlyNotifier *notifier,
-                             void *listener,
-                             ReadOnlyNotification *notification,
-                             ThreadID threadID)
-{
-  ReadOnlyListener *readOnlyListener;
-  int result = ALLOCATE(1, ReadOnlyListener, __func__, &readOnlyListener);
-  if (result != VDO_SUCCESS) {
-    return result;
-  }
-
-  ThreadData *threadData = &notifier->threadData[threadID];
-  *readOnlyListener = (ReadOnlyListener) {
-    .listener = listener,
-    .notify = notification,
-    .next = threadData->listeners,
-  };
-
-  threadData->listeners = readOnlyListener;
-  return VDO_SUCCESS;
-}
diff --git a/vdo/base/readOnlyNotifier.h b/vdo/base/readOnlyNotifier.h
deleted file mode 100644
index b5eb322..0000000
--- a/vdo/base/readOnlyNotifier.h
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright (c) 2020 Red Hat, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- *
- * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/readOnlyNotifier.h#3 $
- */
-
-/*
- * A ReadOnlyNotifier is responsible for propagating the fact that the VDO
- * has encountered an unrecoverable error to all base threads. It also persists
- * the read-only state to the super block.
- *
- * The notifier also provides the ability to wait for any notifications to be
- * complete in order to not cause super block write races when shutting down
- * the VDO.
- */
-
-#ifndef READ_ONLY_NOTIFIER_H
-#define READ_ONLY_NOTIFIER_H
-
-#include "completion.h"
-
-/**
- * A function to notify a listener that the VDO has gone read-only.
- *
- * @param listener  The object to notify
- * @param parent    The completion to notify in order to acknowledge the
- *                  notification
- **/
-typedef void ReadOnlyNotification(void *listener, VDOCompletion *parent);
-
-/**
- * Create a read-only notifier.
- * - * @param [in] isReadOnly Whether the VDO is already read-only - * @param [in] threadConfig The thread configuration of the VDO - * @param [in] layer The physical layer of the VDO - * @param [out] notifierPtr A pointer to receive the new notifier - * - * @return VDO_SUCCESS or an error - **/ -int makeReadOnlyNotifier(bool isReadOnly, - const ThreadConfig *threadConfig, - PhysicalLayer *layer, - ReadOnlyNotifier **notifierPtr) - __attribute__((warn_unused_result)); - -/** - * Free a ReadOnlyNotifier and null out the reference to it. - * - * @param notifierPtr The reference to the notifier to free - **/ -void freeReadOnlyNotifier(ReadOnlyNotifier **notifierPtr); - -/** - * Wait until no read-only notifications are in progress and prevent any - * subsequent notifications. Notifications may be re-enabled by calling - * allowReadOnlyModeEntry(). - * - * @param notifier The read-only notifier on which to wait - * @param parent The completion to notify when no threads are entering - * read-only mode - **/ -void waitUntilNotEnteringReadOnlyMode(ReadOnlyNotifier *notifier, - VDOCompletion *parent); - -/** - * Allow the notifier to put the VDO into read-only mode, reversing the effects - * of waitUntilNotEnteringReadOnlyMode(). If some thread tried to put the VDO - * into read-only mode while notifications were disallowed, it will be done - * when this method is called. If that happens, the parent will not be notified - * until the VDO has actually entered read-only mode and attempted to save the - * super block. - * - *

This method may only be called from the admin thread. - * - * @param notifier The notifier - * @param parent The object to notify once the operation is complete - **/ -void allowReadOnlyModeEntry(ReadOnlyNotifier *notifier, - VDOCompletion *parent); - -/** - * Put a VDO into read-only mode and save the read-only state in the super - * block. This method is a no-op if the VDO is already read-only. - * - * @param notifier The read-only notifier of the VDO - * @param errorCode The error which caused the VDO to enter read-only - * mode - **/ -void enterReadOnlyMode(ReadOnlyNotifier *notifier, int errorCode); - -/** - * Check whether the VDO is read-only. This method may be called from any - * thread, as opposed to examining the VDO's state field which is only safe - * to check from the admin thread. - * - * @param notifier The read-only notifier of the VDO - * - * @return true if the VDO is read-only - **/ -bool isReadOnly(ReadOnlyNotifier *notifier) - __attribute__((warn_unused_result)); - -/** - * Check whether the VDO is or will be read-only (i.e. some thread has started - * the process of entering read-only mode, but not all threads have been - * notified yet). This method should only be called in cases where the expense - * of reading atomic state is not a problem. It was introduced in order to allow - * suppresion of spurious error messages resulting from VIO cleanup racing with - * read-only notification. - * - * @param notifier The read-only notifier of the VDO - * - * @return true if the VDO has started (and possibly finished) - * the process of entering read-only mode - **/ -bool isOrWillBeReadOnly(ReadOnlyNotifier *notifier) - __attribute__((warn_unused_result)); - -/** - * Register a listener to be notified when the VDO goes read-only. - * - * @param notifier The notifier to register with - * @param listener The object to notify - * @param notification The function to call to send the notification - * @param threadID The id of the thread on which to send the notification - * - * @return VDO_SUCCESS or an error - **/ -int registerReadOnlyListener(ReadOnlyNotifier *notifier, - void *listener, - ReadOnlyNotification *notification, - ThreadID threadID); - -#endif /* READ_ONLY_NOTIFIER_H */ diff --git a/vdo/base/readOnlyRebuild.c b/vdo/base/readOnlyRebuild.c deleted file mode 100644 index 7e9df0c..0000000 --- a/vdo/base/readOnlyRebuild.c +++ /dev/null @@ -1,421 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
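With the full listener interface now declared, a small sketch of how a component might hook into it may help. MyComponent, notifyMyComponent, and setUpMyComponent are invented names for illustration; the calls themselves (registerReadOnlyListener(), completeCompletion()) are the ones shown in this patch, and the notification signature matches ReadOnlyNotification above:

/* Hypothetical listener: not part of the deleted sources. */
typedef struct {
  bool sawReadOnly;
} MyComponent;

static void notifyMyComponent(void *listener, VDOCompletion *parent)
{
  MyComponent *component = listener;
  component->sawReadOnly = true;
  completeCompletion(parent);  /* acknowledge so the notifier can move on */
}

static int setUpMyComponent(ReadOnlyNotifier *notifier,
                            MyComponent *component,
                            ThreadID threadID)
{
  /* The notification will be delivered on the registered thread. */
  return registerReadOnlyListener(notifier, component, notifyMyComponent,
                                  threadID);
}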
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/readOnlyRebuild.c#9 $ - */ - -#include "readOnlyRebuild.h" - -#include "logger.h" -#include "memoryAlloc.h" - -#include "blockMapInternals.h" -#include "blockMapRecovery.h" -#include "completion.h" -#include "numUtils.h" -#include "packedRecoveryJournalBlock.h" -#include "recoveryJournalInternals.h" -#include "recoveryUtils.h" -#include "referenceCountRebuild.h" -#include "slabDepot.h" -#include "vdoInternal.h" -#include "vdoPageCache.h" - -typedef struct { - /** The completion header */ - VDOCompletion completion; - /** A sub task completion */ - VDOCompletion subTaskCompletion; - /** The VDO in question */ - VDO *vdo; - /** A buffer to hold the data read off disk */ - char *journalData; - /** The entry data for the block map rebuild */ - NumberedBlockMapping *entries; - /** The number of entries in the entry array */ - size_t entryCount; - /** The sequence number of the first valid block of the journal (if known) */ - SequenceNumber head; - /** The sequence number of the last valid block of the journal (if known) */ - SequenceNumber tail; - /** The number of logical blocks in use */ - BlockCount logicalBlocksUsed; - /** The number of allocated block map pages */ - BlockCount blockMapDataBlocks; -} ReadOnlyRebuildCompletion; - -/** - * Convert a generic completion to a ReadOnlyRebuildCompletion. - * - * @param completion The completion to convert - * - * @return the journal rebuild completion - **/ -__attribute__((warn_unused_result)) -static inline ReadOnlyRebuildCompletion * -asReadOnlyRebuildCompletion(VDOCompletion *completion) -{ - STATIC_ASSERT(offsetof(ReadOnlyRebuildCompletion, completion) == 0); - assertCompletionType(completion->type, READ_ONLY_REBUILD_COMPLETION); - return (ReadOnlyRebuildCompletion *) completion; -} - -/** - * Free a rebuild completion and all underlying structures. - * - * @param rebuildPtr A pointer to the rebuild completion to free - */ -static void freeRebuildCompletion(ReadOnlyRebuildCompletion **rebuildPtr) -{ - ReadOnlyRebuildCompletion *rebuild = *rebuildPtr; - if (rebuild == NULL) { - return; - } - - destroyEnqueueable(&rebuild->subTaskCompletion); - FREE(rebuild->journalData); - FREE(rebuild->entries); - FREE(rebuild); - *rebuildPtr = NULL; -} - -/** - * Allocate and initialize a read only rebuild completion. - * - * @param [in] vdo The VDO in question - * @param [out] rebuildPtr A pointer to return the created rebuild completion - * - * @return VDO_SUCCESS or an error code - **/ -static int makeRebuildCompletion(VDO *vdo, - ReadOnlyRebuildCompletion **rebuildPtr) -{ - ReadOnlyRebuildCompletion *rebuild; - int result = ALLOCATE(1, ReadOnlyRebuildCompletion, __func__, &rebuild); - if (result != VDO_SUCCESS) { - return result; - } - - initializeCompletion(&rebuild->completion, READ_ONLY_REBUILD_COMPLETION, - vdo->layer); - - result = initializeEnqueueableCompletion(&rebuild->subTaskCompletion, - SUB_TASK_COMPLETION, vdo->layer); - if (result != VDO_SUCCESS) { - freeRebuildCompletion(&rebuild); - return result; - } - - rebuild->vdo = vdo; - *rebuildPtr = rebuild; - return VDO_SUCCESS; -} - -/** - * Clean up the rebuild process, whether or not it succeeded, by freeing the - * rebuild completion and notifying the parent of the outcome. 
- * - * @param completion The rebuild completion - **/ -static void completeRebuild(VDOCompletion *completion) -{ - VDOCompletion *parent = completion->parent; - int result = completion->result; - ReadOnlyRebuildCompletion *rebuild = asReadOnlyRebuildCompletion(completion); - VDO *vdo = rebuild->vdo; - setVDOPageCacheRebuildMode(getBlockMap(vdo)->zones[0].pageCache, false); - freeRebuildCompletion(&rebuild); - finishCompletion(parent, result); -} - -/** - * Finish rebuilding, free the rebuild completion and notify the parent. - * - * @param completion The rebuild completion - **/ -static void finishRebuild(VDOCompletion *completion) -{ - ReadOnlyRebuildCompletion *rebuild = asReadOnlyRebuildCompletion(completion); - initializeRecoveryJournalPostRebuild(rebuild->vdo->recoveryJournal, - rebuild->vdo->completeRecoveries, - rebuild->tail, - rebuild->logicalBlocksUsed, - rebuild->blockMapDataBlocks); - logInfo("Read-only rebuild complete"); - completeRebuild(completion); -} - -/** - * Handle a rebuild error. - * - * @param completion The rebuild completion - **/ -static void abortRebuild(VDOCompletion *completion) -{ - logInfo("Read-only rebuild aborted"); - completeRebuild(completion); -} - -/** - * Abort a rebuild if there is an error. - * - * @param result The result to check - * @param rebuild The journal rebuild completion - * - * @return true if the result was an error - **/ -__attribute__((warn_unused_result)) -static bool abortRebuildOnError(int result, - ReadOnlyRebuildCompletion *rebuild) -{ - if (result == VDO_SUCCESS) { - return false; - } - - finishCompletion(&rebuild->completion, result); - return true; -} - -/** - * Clean up after finishing the reference count rebuild. This callback is - * registered in launchReferenceCountRebuild(). - * - * @param completion The sub-task completion - **/ -static void finishReferenceCountRebuild(VDOCompletion *completion) -{ - ReadOnlyRebuildCompletion *rebuild = completion->parent; - VDO *vdo = rebuild->vdo; - assertOnAdminThread(vdo, __func__); - if (vdo->loadState != VDO_REBUILD_FOR_UPGRADE) { - // A "rebuild" for upgrade should not increment this count. - vdo->completeRecoveries++; - } - - logInfo("Saving rebuilt state"); - prepareToFinishParent(completion, &rebuild->completion); - drainSlabDepot(vdo->depot, ADMIN_STATE_REBUILDING, completion); -} - -/** - * Rebuild the reference counts from the block map now that all journal entries - * have been applied to the block map. This callback is registered in - * applyJournalEntries(). - * - * @param completion The sub-task completion - **/ -static void launchReferenceCountRebuild(VDOCompletion *completion) -{ - ReadOnlyRebuildCompletion *rebuild = completion->parent; - VDO *vdo = rebuild->vdo; - - // We must allocate RefCounts before we can rebuild them. - int result = allocateSlabRefCounts(vdo->depot); - if (abortRebuildOnError(result, rebuild)) { - return; - } - - prepareCompletion(completion, finishReferenceCountRebuild, - finishParentCallback, getAdminThread(getThreadConfig(vdo)), - completion->parent); - rebuildReferenceCounts(vdo, completion, &rebuild->logicalBlocksUsed, - &rebuild->blockMapDataBlocks); -} - -/** - * Append an array of recovery journal entries from a journal block sector to - * the array of numbered mappings in the rebuild completion, numbering each - * entry in the order they are appended. 
- * - * @param rebuild The journal rebuild completion - * @param sector The recovery journal sector with entries - * @param entryCount The number of entries to append - **/ -static void appendSectorEntries(ReadOnlyRebuildCompletion *rebuild, - PackedJournalSector *sector, - JournalEntryCount entryCount) -{ - for (JournalEntryCount i = 0; i < entryCount; i++) { - RecoveryJournalEntry entry - = unpackRecoveryJournalEntry(§or->entries[i]); - int result = validateRecoveryJournalEntry(rebuild->vdo, &entry); - if (result != VDO_SUCCESS) { - // When recovering from read-only mode, ignore damaged entries. - continue; - } - - if (isIncrementOperation(entry.operation)) { - rebuild->entries[rebuild->entryCount] = (NumberedBlockMapping) { - .blockMapSlot = entry.slot, - .blockMapEntry = packPBN(entry.mapping.pbn, entry.mapping.state), - .number = rebuild->entryCount, - }; - rebuild->entryCount++; - } - } -} - -/** - * Create an array of all valid journal entries, in order, and store - * it in the rebuild completion. - * - * @param rebuild The journal rebuild completion - * - * @return VDO_SUCCESS or an error code - **/ -static int extractJournalEntries(ReadOnlyRebuildCompletion *rebuild) -{ - VDO *vdo = rebuild->vdo; - RecoveryJournal *journal = vdo->recoveryJournal; - SequenceNumber first = rebuild->head; - SequenceNumber last = rebuild->tail; - BlockCount maxCount = ((last - first + 1) * journal->entriesPerBlock); - - // Allocate a NumberedBlockMapping array large enough to transcribe every - // PackedRecoveryJournalEntry from every valid journal block. - int result = ALLOCATE(maxCount, NumberedBlockMapping, __func__, - &rebuild->entries); - if (result != VDO_SUCCESS) { - return result; - } - - for (SequenceNumber i = first; i <= last; i++) { - PackedJournalHeader *packedHeader - = getJournalBlockHeader(journal, rebuild->journalData, i); - RecoveryBlockHeader header; - unpackRecoveryBlockHeader(packedHeader, &header); - - if (!isExactRecoveryJournalBlock(journal, &header, i)) { - // This block is invalid, so skip it. - continue; - } - - // Don't extract more than the expected maximum entries per block. - JournalEntryCount blockEntries = minBlock(journal->entriesPerBlock, - header.entryCount); - for (uint8_t j = 1; j < SECTORS_PER_BLOCK; j++) { - // Stop when all entries counted in the header are applied or skipped. - if (blockEntries == 0) { - break; - } - - PackedJournalSector *sector = getJournalBlockSector(packedHeader, j); - if (!isValidRecoveryJournalSector(&header, sector)) { - blockEntries -= minBlock(blockEntries, - RECOVERY_JOURNAL_ENTRIES_PER_SECTOR); - continue; - } - - // Don't extract more than the expected maximum entries per sector. - JournalEntryCount sectorEntries - = minBlock(sector->entryCount, RECOVERY_JOURNAL_ENTRIES_PER_SECTOR); - // Only extract as many as the block header calls for. - sectorEntries = minBlock(sectorEntries, blockEntries); - appendSectorEntries(rebuild, sector, sectorEntries); - // Even if the sector wasn't full, count it as full when counting up - // to the entry count the block header claims. - blockEntries -= minBlock(blockEntries, - RECOVERY_JOURNAL_ENTRIES_PER_SECTOR); - } - } - - return VDO_SUCCESS; -} - -/** - * Determine the limits of the valid recovery journal and apply all - * valid entries to the block map. This callback is registered in - * rebuildJournalAsync(). 
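One detail of extractJournalEntries() above that is easy to miss is how the entry array is sized: it is provisioned for the worst case across the whole valid block range before any per-sector clamping happens. A worked restatement with invented numbers:

#include <stdint.h>

/* If the first valid journal block is 10 and the last is 13, four blocks
 * are scanned, so space is reserved for (13 - 10 + 1) * entriesPerBlock
 * mappings even though damaged sectors may later be skipped. */
static uint64_t maxRebuildEntries(uint64_t firstValid,
                                  uint64_t lastValid,
                                  uint64_t entriesPerBlock)
{
  return (lastValid - firstValid + 1) * entriesPerBlock;
}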
- * - * @param completion The sub-task completion - **/ -static void applyJournalEntries(VDOCompletion *completion) -{ - ReadOnlyRebuildCompletion *rebuild - = asReadOnlyRebuildCompletion(completion->parent); - VDO *vdo = rebuild->vdo; - - logInfo("Finished reading recovery journal"); - assertOnLogicalZoneThread(vdo, 0, __func__); - - bool foundEntries = findHeadAndTail(vdo->recoveryJournal, - rebuild->journalData, &rebuild->tail, - &rebuild->head, NULL); - if (foundEntries) { - int result = extractJournalEntries(rebuild); - if (abortRebuildOnError(result, rebuild)) { - return; - } - } - - // Suppress block map errors. - setVDOPageCacheRebuildMode(getBlockMap(vdo)->zones[0].pageCache, true); - - // Play the recovery journal into the block map. - prepareCompletion(completion, launchReferenceCountRebuild, - finishParentCallback, completion->callbackThreadID, - completion->parent); - recoverBlockMap(vdo, rebuild->entryCount, rebuild->entries, completion); -} - -/** - * Begin loading the journal. - * - * @param completion The sub task completion - **/ -static void loadJournal(VDOCompletion *completion) -{ - ReadOnlyRebuildCompletion *rebuild - = asReadOnlyRebuildCompletion(completion->parent); - VDO *vdo = rebuild->vdo; - assertOnLogicalZoneThread(vdo, 0, __func__); - - prepareCompletion(completion, applyJournalEntries, finishParentCallback, - completion->callbackThreadID, completion->parent); - loadJournalAsync(vdo->recoveryJournal, completion, &rebuild->journalData); -} - -/**********************************************************************/ -void launchRebuild(VDO *vdo, VDOCompletion *parent) -{ - // Note: These messages must be recognizable by Permabit::VDODeviceBase. - if (vdo->loadState == VDO_REBUILD_FOR_UPGRADE) { - logWarning("Rebuilding reference counts for upgrade"); - } else { - logWarning("Rebuilding reference counts to clear read-only mode"); - vdo->readOnlyRecoveries++; - } - - ReadOnlyRebuildCompletion *rebuild; - int result = makeRebuildCompletion(vdo, &rebuild); - if (result != VDO_SUCCESS) { - finishCompletion(parent, result); - return; - } - - VDOCompletion *completion = &rebuild->completion; - prepareCompletion(completion, finishRebuild, abortRebuild, - parent->callbackThreadID, parent); - - VDOCompletion *subTaskCompletion = &rebuild->subTaskCompletion; - prepareCompletion(subTaskCompletion, loadJournal, finishParentCallback, - getLogicalZoneThread(getThreadConfig(vdo), 0), - completion); - loadSlabDepot(vdo->depot, ADMIN_STATE_LOADING_FOR_REBUILD, - subTaskCompletion, NULL); -} diff --git a/vdo/base/readOnlyRebuild.h b/vdo/base/readOnlyRebuild.h deleted file mode 100644 index 9f40ce6..0000000 --- a/vdo/base/readOnlyRebuild.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/readOnlyRebuild.h#1 $ - */ - -#ifndef READ_ONLY_REBUILD_H -#define READ_ONLY_REBUILD_H - -#include "completion.h" -#include "vdo.h" - -/** - * Construct a ReadOnlyRebuildCompletion and launch it. Apply all valid journal - * block entries to all VDO structures. Must be launched from logical zone 0. - * - * @param vdo The VDO to rebuild - * @param parent The completion to notify when the rebuild is complete - **/ -void launchRebuild(VDO *vdo, VDOCompletion *parent); - -#endif // READ_ONLY_REBUILD_H diff --git a/vdo/base/recoveryJournal.c b/vdo/base/recoveryJournal.c deleted file mode 100644 index c44053c..0000000 --- a/vdo/base/recoveryJournal.c +++ /dev/null @@ -1,1403 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournal.c#30 $ - */ - -#include "recoveryJournal.h" -#include "recoveryJournalInternals.h" - -#include "buffer.h" -#include "logger.h" -#include "memoryAlloc.h" - -#include "blockMap.h" -#include "constants.h" -#include "dataVIO.h" -#include "extent.h" -#include "header.h" -#include "numUtils.h" -#include "packedRecoveryJournalBlock.h" -#include "recoveryJournalBlock.h" -#include "slabDepot.h" -#include "slabJournal.h" -#include "waitQueue.h" - -typedef struct { - SequenceNumber journalStart; // Sequence number to start the journal - BlockCount logicalBlocksUsed; // Number of logical blocks used by VDO - BlockCount blockMapDataBlocks; // Number of block map pages allocated -} __attribute__((packed)) RecoveryJournalState7_0; - -static const Header RECOVERY_JOURNAL_HEADER_7_0 = { - .id = RECOVERY_JOURNAL, - .version = { - .majorVersion = 7, - .minorVersion = 0, - }, - .size = sizeof(RecoveryJournalState7_0), -}; - -static const uint64_t RECOVERY_COUNT_MASK = 0xff; - -enum { - /* - * The number of reserved blocks must be large enough to prevent a - * new recovery journal block write from overwriting a block which - * appears to still be a valid head block of the journal. Currently, - * that means reserving enough space for all 2048 VIOs, or 8 blocks. - */ - RECOVERY_JOURNAL_RESERVED_BLOCKS = 8, -}; - -/**********************************************************************/ -const char *getJournalOperationName(JournalOperation operation) -{ - switch (operation) { - case DATA_DECREMENT: - return "data decrement"; - - case DATA_INCREMENT: - return "data increment"; - - case BLOCK_MAP_DECREMENT: - return "block map decrement"; - - case BLOCK_MAP_INCREMENT: - return "block map increment"; - - default: - return "unknown journal operation"; - } -} - -/** - * Get a block from the end of the free list. 
- * - * @param journal The journal - * - * @return The block or NULL if the list is empty - **/ -static RecoveryJournalBlock *popFreeList(RecoveryJournal *journal) -{ - return blockFromRingNode(popRingNode(&journal->freeTailBlocks)); -} - -/** - * Get a block from the end of the active list. - * - * @param journal The journal - * - * @return The block or NULL if the list is empty - **/ -static RecoveryJournalBlock *popActiveList(RecoveryJournal *journal) -{ - return blockFromRingNode(popRingNode(&journal->activeTailBlocks)); -} - -/** - * Assert that we are running on the journal thread. - * - * @param journal The journal - * @param functionName The function doing the check (for logging) - **/ -static void assertOnJournalThread(RecoveryJournal *journal, - const char *functionName) -{ - ASSERT_LOG_ONLY((getCallbackThreadID() == journal->threadID), - "%s() called on journal thread", functionName); -} - -/** - * WaiterCallback implementation invoked whenever a DataVIO is to be released - * from the journal, either because its entry was committed to disk, - * or because there was an error. - **/ -static void continueWaiter(Waiter *waiter, void *context) -{ - DataVIO *dataVIO = waiterAsDataVIO(waiter); - dataVIOAddTraceRecord(dataVIO, - THIS_LOCATION("$F($j-$js);" - "cb=continueJournalWaiter($j-$js)")); - int waitResult = *((int *) context); - continueDataVIO(dataVIO, waitResult); -} - -/** - * Check whether the journal has any waiters on any blocks. - * - * @param journal The journal in question - * - * @return true if any block has a waiter - **/ -static inline bool hasBlockWaiters(RecoveryJournal *journal) -{ - // Either the first active tail block (if it exists) has waiters, - // or no active tail block has waiters. - if (isRingEmpty(&journal->activeTailBlocks)) { - return false; - } - - RecoveryJournalBlock *block - = blockFromRingNode(journal->activeTailBlocks.next); - return (hasWaiters(&block->entryWaiters) - || hasWaiters(&block->commitWaiters)); -} - -/**********************************************************************/ -static void recycleJournalBlocks(RecoveryJournal *block); -static void recycleJournalBlock(RecoveryJournalBlock *block); -static void notifyCommitWaiters(RecoveryJournal *journal); - -/** - * Check whether the journal has drained. - * - * @param journal The journal which may have just drained - **/ -static void checkForDrainComplete(RecoveryJournal *journal) -{ - int result = VDO_SUCCESS; - if (isReadOnly(journal->readOnlyNotifier)) { - result = VDO_READ_ONLY; - /* - * Clean up any full active blocks which were not written due to being - * in read-only mode. - * - * XXX: This would probably be better as a short-circuit in writeBlock(). - */ - notifyCommitWaiters(journal); - recycleJournalBlocks(journal); - - // Release any DataVIOs waiting to be assigned entries. 
-    notifyAllWaiters(&journal->decrementWaiters, continueWaiter, &result);
-    notifyAllWaiters(&journal->incrementWaiters, continueWaiter, &result);
-  }
-
-  if (!isDraining(&journal->state)
-      || journal->reaping || hasBlockWaiters(journal)
-      || hasWaiters(&journal->incrementWaiters)
-      || hasWaiters(&journal->decrementWaiters)
-      || !suspendLockCounter(journal->lockCounter)) {
-    return;
-  }
-
-  if (isSaving(&journal->state)) {
-    if (journal->activeBlock != NULL) {
-      ASSERT_LOG_ONLY(((result == VDO_READ_ONLY)
-                       || !isRecoveryBlockDirty(journal->activeBlock)),
-                      "journal being saved has clean active block");
-      recycleJournalBlock(journal->activeBlock);
-    }
-
-    ASSERT_LOG_ONLY(isRingEmpty(&journal->activeTailBlocks),
-                    "all blocks in a journal being saved must be inactive");
-  }
-
-  finishDrainingWithResult(&journal->state, result);
-}
-
-/**
- * Notify a recovery journal that the VDO has gone read-only.
- *
- *

Implements ReadOnlyNotification. - * - * @param listener The journal - * @param parent The completion to notify in order to acknowledge the - * notification - **/ -static void notifyRecoveryJournalOfReadOnlyMode(void *listener, - VDOCompletion *parent) -{ - checkForDrainComplete(listener); - completeCompletion(parent); -} - -/** - * Put the journal in read-only mode. All attempts to add entries after - * this function is called will fail. All VIOs waiting for commits will be - * awakened with an error. - * - * @param journal The journal which has failed - * @param errorCode The error result triggering this call - **/ -static void enterJournalReadOnlyMode(RecoveryJournal *journal, int errorCode) -{ - enterReadOnlyMode(journal->readOnlyNotifier, errorCode); - checkForDrainComplete(journal); -} - -/**********************************************************************/ -SequenceNumber getCurrentJournalSequenceNumber(RecoveryJournal *journal) -{ - return journal->tail; -} - -/** - * Get the head of the recovery journal, which is the lowest sequence number of - * the block map head and the slab journal head. - * - * @param journal The journal - * - * @return the head of the journal - **/ -static inline SequenceNumber getRecoveryJournalHead(RecoveryJournal *journal) -{ - return minSequenceNumber(journal->blockMapHead, journal->slabJournalHead); -} - -/** - * Compute the recovery count byte for a given recovery count. - * - * @param recoveryCount The recovery count - * - * @return The byte corresponding to the recovery count - **/ -__attribute__((warn_unused_result)) -static inline uint8_t computeRecoveryCountByte(uint64_t recoveryCount) -{ - return (uint8_t) (recoveryCount & RECOVERY_COUNT_MASK); -} - -/** - * Check whether the journal is over the threshold, and if so, force the oldest - * slab journal tail block to commit. - * - * @param journal The journal - **/ -static void checkSlabJournalCommitThreshold(RecoveryJournal *journal) -{ - BlockCount currentLength = journal->tail - journal->slabJournalHead; - if (currentLength > journal->slabJournalCommitThreshold) { - journal->events.slabJournalCommitsRequested++; - commitOldestSlabJournalTailBlocks(journal->depot, - journal->slabJournalHead); - } -} - -/**********************************************************************/ -static void reapRecoveryJournal(RecoveryJournal *journal); -static void assignEntries(RecoveryJournal *journal); - -/** - * Finish reaping the journal. - * - * @param journal The journal being reaped - **/ -static void finishReaping(RecoveryJournal *journal) -{ - SequenceNumber oldHead = getRecoveryJournalHead(journal); - journal->blockMapHead = journal->blockMapReapHead; - journal->slabJournalHead = journal->slabJournalReapHead; - BlockCount blocksReaped = getRecoveryJournalHead(journal) - oldHead; - journal->availableSpace += blocksReaped * journal->entriesPerBlock; - journal->reaping = false; - checkSlabJournalCommitThreshold(journal); - assignEntries(journal); - checkForDrainComplete(journal); -} - -/** - * Finish reaping the journal after flushing the lower layer. This is the - * callback registered in reapRecoveryJournal(). - * - * @param completion The journal's flush VIO - **/ -static void completeReaping(VDOCompletion *completion) -{ - RecoveryJournal *journal = completion->parent; - finishReaping(journal); - - // Try reaping again in case more locks were released while flush was out. - reapRecoveryJournal(journal); -} - -/** - * Handle an error when flushing the lower layer due to reaping. 
- * - * @param completion The journal's flush VIO - **/ -static void handleFlushError(VDOCompletion *completion) -{ - RecoveryJournal *journal = completion->parent; - journal->reaping = false; - enterJournalReadOnlyMode(journal, completion->result); -} - -/** - * Set all journal fields appropriately to start journaling from the current - * active block. - * - * @param journal The journal to be reset based on its active block - **/ -static void initializeJournalState(RecoveryJournal *journal) -{ - journal->appendPoint.sequenceNumber = journal->tail; - journal->lastWriteAcknowledged = journal->tail; - journal->blockMapHead = journal->tail; - journal->slabJournalHead = journal->tail; - journal->blockMapReapHead = journal->tail; - journal->slabJournalReapHead = journal->tail; - journal->blockMapHeadBlockNumber - = getRecoveryJournalBlockNumber(journal, journal->blockMapHead); - journal->slabJournalHeadBlockNumber - = getRecoveryJournalBlockNumber(journal, journal->slabJournalHead); -} - -/**********************************************************************/ -BlockCount getRecoveryJournalLength(BlockCount journalSize) -{ - BlockCount reservedBlocks = journalSize / 4; - if (reservedBlocks > RECOVERY_JOURNAL_RESERVED_BLOCKS) { - reservedBlocks = RECOVERY_JOURNAL_RESERVED_BLOCKS; - } - return (journalSize - reservedBlocks); -} - -/** - * Attempt to reap the journal now that all the locks on some journal block - * have been released. This is the callback registered with the lock counter. - * - * @param completion The lock counter completion - **/ -static void reapRecoveryJournalCallback(VDOCompletion *completion) -{ - RecoveryJournal *journal = (RecoveryJournal *) completion->parent; - // The acknowledgement must be done before reaping so that there is no - // race between acknowledging the notification and unlocks wishing to notify. - acknowledgeUnlock(journal->lockCounter); - - if (isQuiescing(&journal->state)) { - // Don't start reaping when the journal is trying to quiesce. Do check if - // this notification is the last thing the drain is waiting on. - checkForDrainComplete(journal); - return; - } - - reapRecoveryJournal(journal); - checkSlabJournalCommitThreshold(journal); -} - -/********************************************************************** - * Set the journal's tail sequence number. - * - * @param journal The journal whose tail is to be set - * @param tail The new tail value - **/ -static void setJournalTail(RecoveryJournal *journal, SequenceNumber tail) -{ - // VDO does not support sequence numbers above 1 << 48 in the slab journal. 
- if (tail >= (1ULL << 48)) { - enterJournalReadOnlyMode(journal, VDO_JOURNAL_OVERFLOW); - } - - journal->tail = tail; -} - -/**********************************************************************/ -int makeRecoveryJournal(Nonce nonce, - PhysicalLayer *layer, - Partition *partition, - uint64_t recoveryCount, - BlockCount journalSize, - BlockCount tailBufferSize, - ReadOnlyNotifier *readOnlyNotifier, - const ThreadConfig *threadConfig, - RecoveryJournal **journalPtr) -{ - RecoveryJournal *journal; - int result = ALLOCATE(1, RecoveryJournal, __func__, &journal); - if (result != VDO_SUCCESS) { - return result; - } - - initializeRing(&journal->freeTailBlocks); - initializeRing(&journal->activeTailBlocks); - initializeWaitQueue(&journal->pendingWrites); - - journal->threadID = getJournalZoneThread(threadConfig); - journal->partition = partition; - journal->nonce = nonce; - journal->recoveryCount = computeRecoveryCountByte(recoveryCount); - journal->size = journalSize; - journal->readOnlyNotifier = readOnlyNotifier; - journal->tail = 1; - journal->slabJournalCommitThreshold = (journalSize * 2) / 3; - initializeJournalState(journal); - - journal->entriesPerBlock = RECOVERY_JOURNAL_ENTRIES_PER_BLOCK; - BlockCount journalLength = getRecoveryJournalLength(journalSize); - journal->availableSpace = journal->entriesPerBlock * journalLength; - - // Only make the tail buffer and VIO in normal operation since the formatter - // doesn't need them. - if (layer->createMetadataVIO != NULL) { - for (BlockCount i = 0; i < tailBufferSize; i++) { - RecoveryJournalBlock *block; - result = makeRecoveryBlock(layer, journal, &block); - if (result != VDO_SUCCESS) { - freeRecoveryJournal(&journal); - return result; - } - - pushRingNode(&journal->freeTailBlocks, &block->ringNode); - } - - result = makeLockCounter(layer, journal, reapRecoveryJournalCallback, - journal->threadID, threadConfig->logicalZoneCount, - threadConfig->physicalZoneCount, journal->size, - &journal->lockCounter); - if (result != VDO_SUCCESS) { - freeRecoveryJournal(&journal); - return result; - } - - result = ALLOCATE(VDO_BLOCK_SIZE, char, "journal flush data", - &journal->unusedFlushVIOData); - if (result != VDO_SUCCESS) { - freeRecoveryJournal(&journal); - return result; - } - - result = createVIO(layer, VIO_TYPE_RECOVERY_JOURNAL, VIO_PRIORITY_HIGH, - journal, journal->unusedFlushVIOData, - &journal->flushVIO); - if (result != VDO_SUCCESS) { - freeRecoveryJournal(&journal); - return result; - } - - result = registerReadOnlyListener(readOnlyNotifier, journal, - notifyRecoveryJournalOfReadOnlyMode, - journal->threadID); - if (result != VDO_SUCCESS) { - freeRecoveryJournal(&journal); - return result; - } - - journal->flushVIO->completion.callbackThreadID = journal->threadID; - } - - *journalPtr = journal; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freeRecoveryJournal(RecoveryJournal **journalPtr) -{ - RecoveryJournal *journal = *journalPtr; - if (journal == NULL) { - return; - } - - freeLockCounter(&journal->lockCounter); - freeVIO(&journal->flushVIO); - FREE(journal->unusedFlushVIOData); - - // XXX: eventually, the journal should be constructed in a quiescent state - // which requires opening before use. 
-  if (!isQuiescent(&journal->state)) {
-    ASSERT_LOG_ONLY(isRingEmpty(&journal->activeTailBlocks),
-                    "journal being freed has no active tail blocks");
-  } else if (!isSaved(&journal->state)
-             && !isRingEmpty(&journal->activeTailBlocks)) {
-    logWarning("journal being freed has uncommitted entries");
-  }
-
-  RecoveryJournalBlock *block;
-  while ((block = popActiveList(journal)) != NULL) {
-    freeRecoveryBlock(&block);
-  }
-
-  while ((block = popFreeList(journal)) != NULL) {
-    freeRecoveryBlock(&block);
-  }
-
-  FREE(journal);
-  *journalPtr = NULL;
-}
-
-/**********************************************************************/
-void setRecoveryJournalPartition(RecoveryJournal *journal,
-                                 Partition *partition)
-{
-  journal->partition = partition;
-}
-
-/**********************************************************************/
-void initializeRecoveryJournalPostRecovery(RecoveryJournal *journal,
-                                           uint64_t recoveryCount,
-                                           SequenceNumber tail)
-{
-  setJournalTail(journal, tail + 1);
-  journal->recoveryCount = computeRecoveryCountByte(recoveryCount);
-  initializeJournalState(journal);
-}
-
-/**********************************************************************/
-void initializeRecoveryJournalPostRebuild(RecoveryJournal *journal,
-                                          uint64_t recoveryCount,
-                                          SequenceNumber tail,
-                                          BlockCount logicalBlocksUsed,
-                                          BlockCount blockMapDataBlocks)
-{
-  initializeRecoveryJournalPostRecovery(journal, recoveryCount, tail);
-  journal->logicalBlocksUsed = logicalBlocksUsed;
-  journal->blockMapDataBlocks = blockMapDataBlocks;
-}
-
-/**********************************************************************/
-BlockCount getJournalBlockMapDataBlocksUsed(RecoveryJournal *journal)
-{
-  return journal->blockMapDataBlocks;
-}
-
-/**********************************************************************/
-void setJournalBlockMapDataBlocksUsed(RecoveryJournal *journal,
-                                      BlockCount pages)
-{
-  journal->blockMapDataBlocks = pages;
-}
-
-/**********************************************************************/
-ThreadID getRecoveryJournalThreadID(RecoveryJournal *journal)
-{
-  return journal->threadID;
-}
-
-/**********************************************************************/
-void openRecoveryJournal(RecoveryJournal *journal,
-                         SlabDepot *depot,
-                         BlockMap *blockMap)
-{
-  journal->depot = depot;
-  journal->blockMap = blockMap;
-  journal->state.state = ADMIN_STATE_NORMAL_OPERATION;
-}
-
-/**********************************************************************/
-size_t getRecoveryJournalEncodedSize(void)
-{
-  return ENCODED_HEADER_SIZE + sizeof(RecoveryJournalState7_0);
-}
-
-/**********************************************************************/
-int encodeRecoveryJournal(RecoveryJournal *journal, Buffer *buffer)
-{
-  SequenceNumber journalStart;
-  if (isSaved(&journal->state)) {
-    // If the journal is saved, we should start one past the active block
-    // (since the active block is not guaranteed to be empty).
-    journalStart = journal->tail;
-  } else {
-    // When we're merely suspended or have gone read-only, we must record the
-    // first block that might have entries that need to be applied.
-    journalStart = getRecoveryJournalHead(journal);
-  }
-
-  int result = encodeHeader(&RECOVERY_JOURNAL_HEADER_7_0, buffer);
-  if (result != UDS_SUCCESS) {
-    return result;
-  }
-
-  size_t initialLength = contentLength(buffer);
-
-  result = putUInt64LEIntoBuffer(buffer, journalStart);
-  if (result != UDS_SUCCESS) {
-    return result;
-  }
-
-  result = putUInt64LEIntoBuffer(buffer, journal->logicalBlocksUsed);
-  if (result != UDS_SUCCESS) {
-    return result;
-  }
-
-  result = putUInt64LEIntoBuffer(buffer, journal->blockMapDataBlocks);
-  if (result != UDS_SUCCESS) {
-    return result;
-  }
-
-  size_t encodedSize = contentLength(buffer) - initialLength;
-  return ASSERT(RECOVERY_JOURNAL_HEADER_7_0.size == encodedSize,
-                "encoded recovery journal component size"
-                " must match header size");
-}
-
-/**
- * Decode recovery journal component state version 7.0 from a buffer.
- *
- * @param buffer A buffer positioned at the start of the encoding
- * @param state The state structure to receive the decoded values
- *
- * @return UDS_SUCCESS or an error code
- **/
-static int decodeRecoveryJournalState_7_0(Buffer *buffer,
-                                          RecoveryJournalState7_0 *state)
-{
-  size_t initialLength = contentLength(buffer);
-
-  SequenceNumber journalStart;
-  int result = getUInt64LEFromBuffer(buffer, &journalStart);
-  if (result != UDS_SUCCESS) {
-    return result;
-  }
-
-  BlockCount logicalBlocksUsed;
-  result = getUInt64LEFromBuffer(buffer, &logicalBlocksUsed);
-  if (result != UDS_SUCCESS) {
-    return result;
-  }
-
-  BlockCount blockMapDataBlocks;
-  result = getUInt64LEFromBuffer(buffer, &blockMapDataBlocks);
-  if (result != UDS_SUCCESS) {
-    return result;
-  }
-
-  *state = (RecoveryJournalState7_0) {
-    .journalStart = journalStart,
-    .logicalBlocksUsed = logicalBlocksUsed,
-    .blockMapDataBlocks = blockMapDataBlocks,
-  };
-
-  size_t decodedSize = initialLength - contentLength(buffer);
-  return ASSERT(RECOVERY_JOURNAL_HEADER_7_0.size == decodedSize,
-                "decoded recovery journal component size must match header size");
-}
-
-/**********************************************************************/
-int decodeRecoveryJournal(RecoveryJournal *journal, Buffer *buffer)
-{
-  Header header;
-  int result = decodeHeader(buffer, &header);
-  if (result != VDO_SUCCESS) {
-    return result;
-  }
-
-  result = validateHeader(&RECOVERY_JOURNAL_HEADER_7_0, &header,
-                          true, __func__);
-  if (result != VDO_SUCCESS) {
-    return result;
-  }
-
-  RecoveryJournalState7_0 state;
-  result = decodeRecoveryJournalState_7_0(buffer, &state);
-  if (result != VDO_SUCCESS) {
-    return result;
-  }
-
-  // Update recovery journal in-memory information.
-  setJournalTail(journal, state.journalStart);
-  journal->logicalBlocksUsed = state.logicalBlocksUsed;
-  journal->blockMapDataBlocks = state.blockMapDataBlocks;
-  initializeJournalState(journal);
-
-  // XXX: this is a hack until we make initial resume of a VDO a real resume
-  journal->state.state = ADMIN_STATE_SUSPENDED;
-  return VDO_SUCCESS;
-}
-
-/**********************************************************************/
-int decodeSodiumRecoveryJournal(RecoveryJournal *journal, Buffer *buffer)
-{
-  // Sodium uses version 7.0, same as head, currently.
-  return decodeRecoveryJournal(journal, buffer);
-}
-
-/**
- * Advance the tail of the journal.
- * - * @param journal The journal whose tail should be advanced - * - * @return true if the tail was advanced - **/ -static bool advanceTail(RecoveryJournal *journal) -{ - journal->activeBlock = popFreeList(journal); - if (journal->activeBlock == NULL) { - return false; - } - - pushRingNode(&journal->activeTailBlocks, &journal->activeBlock->ringNode); - initializeRecoveryBlock(journal->activeBlock); - setJournalTail(journal, journal->tail + 1); - advanceBlockMapEra(journal->blockMap, journal->tail); - return true; -} - -/** - * Check whether there is space to make a given type of entry. - * - * @param journal The journal to check - * @param increment Set to true if the desired entry is an - * increment - * - * @return true if there is space in the journal to make an - * entry of the specified type - **/ -static bool checkForEntrySpace(RecoveryJournal *journal, bool increment) -{ - if (increment) { - return ((journal->availableSpace - journal->pendingDecrementCount) > 1); - } - - return (journal->availableSpace > 0); -} - -/** - * Prepare the currently active block to receive an entry and check whether - * an entry of the given type may be assigned at this time. - * - * @param journal The journal receiving an entry - * @param increment Set to true if the desired entry is an - * increment - * - * @return true if there is space in the journal to store an - * entry of the specified type - **/ -static bool prepareToAssignEntry(RecoveryJournal *journal, bool increment) -{ - if (!checkForEntrySpace(journal, increment)) { - if (!increment) { - // There must always be room to make a decrement entry. - logError("No space for decrement entry in recovery journal"); - enterJournalReadOnlyMode(journal, VDO_RECOVERY_JOURNAL_FULL); - } - return false; - } - - if (isRecoveryBlockFull(journal->activeBlock) && !advanceTail(journal)) { - return false; - } - - if (!isRecoveryBlockEmpty(journal->activeBlock)) { - return true; - } - - if ((journal->tail - getRecoveryJournalHead(journal)) > journal->size) { - // Cannot use this block since the journal is full. - journal->events.diskFull++; - return false; - } - - /* - * Don't allow the new block to be reaped until all of its entries have been - * committed to the block map and until the journal block has been fully - * committed as well. Because the block map update is done only after any - * slab journal entries have been made, the per-entry lock for the block map - * entry serves to protect those as well. - */ - initializeLockCount(journal->lockCounter, journal->activeBlock->blockNumber, - journal->entriesPerBlock + 1); - return true; -} - -/**********************************************************************/ -static void writeBlocks(RecoveryJournal *journal); - -/** - * Queue a block for writing. The block is expected to be full. If the block - * is currently writing, this is a noop as the block will be queued for - * writing when the write finishes. The block must not currently be queued - * for writing. 
- * - * @param journal The journal in question - * @param block The block which is now ready to write - **/ -static void scheduleBlockWrite(RecoveryJournal *journal, - RecoveryJournalBlock *block) -{ - if (block->committing) { - return; - } - - int result = enqueueWaiter(&journal->pendingWrites, &block->writeWaiter); - if (result != VDO_SUCCESS) { - enterJournalReadOnlyMode(journal, result); - return; - } - - PhysicalLayer *layer = vioAsCompletion(journal->flushVIO)->layer; - if ((layer->getWritePolicy(layer) == WRITE_POLICY_ASYNC)) { - /* - * At the end of adding entries, or discovering this partial block - * is now full and ready to rewrite, we will call writeBlocks() and - * write a whole batch. - */ - return; - } - writeBlocks(journal); -} - -/** - * Release a reference to a journal block. - * - * @param block The journal block from which to release a reference - **/ -static void releaseJournalBlockReference(RecoveryJournalBlock *block) -{ - releaseJournalZoneReference(block->journal->lockCounter, block->blockNumber); -} - -/** - * Implements WaiterCallback. Assign an entry waiter to the active block. - **/ -static void assignEntry(Waiter *waiter, void *context) -{ - DataVIO *dataVIO = waiterAsDataVIO(waiter); - RecoveryJournalBlock *block = (RecoveryJournalBlock *) context; - RecoveryJournal *journal = block->journal; - - // Record the point at which we will make the journal entry. - dataVIO->recoveryJournalPoint = (JournalPoint) { - .sequenceNumber = block->sequenceNumber, - .entryCount = block->entryCount, - }; - - switch (dataVIO->operation.type) { - case DATA_INCREMENT: - if (dataVIO->operation.state != MAPPING_STATE_UNMAPPED) { - journal->logicalBlocksUsed++; - } - journal->pendingDecrementCount++; - break; - - case DATA_DECREMENT: - if (dataVIO->operation.state != MAPPING_STATE_UNMAPPED) { - journal->logicalBlocksUsed--; - } - - // Per-entry locks need not be held for decrement entries since the lock - // held for the incref entry will protect this entry as well. - releaseJournalBlockReference(block); - ASSERT_LOG_ONLY((journal->pendingDecrementCount != 0), - "decrement follows increment"); - journal->pendingDecrementCount--; - break; - - case BLOCK_MAP_INCREMENT: - journal->blockMapDataBlocks++; - break; - - default: - logError("Invalid journal operation %u", dataVIO->operation.type); - enterJournalReadOnlyMode(journal, VDO_NOT_IMPLEMENTED); - continueDataVIO(dataVIO, VDO_NOT_IMPLEMENTED); - return; - } - - journal->availableSpace--; - int result = enqueueRecoveryBlockEntry(block, dataVIO); - if (result != VDO_SUCCESS) { - enterJournalReadOnlyMode(journal, result); - continueDataVIO(dataVIO, result); - } - - if (isRecoveryBlockFull(block)) { - // The block is full, so we can write it anytime henceforth. If it is - // already committing, we'll queue it for writing when it comes back. - scheduleBlockWrite(journal, block); - } - - // Force out slab journal tail blocks when threshold is reached. 
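-  // (journal->slabJournalCommitThreshold is set to two thirds of the
-  // on-disk journal size when the journal is created.)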
- checkSlabJournalCommitThreshold(journal); -} - -/**********************************************************************/ -static bool assignEntriesFromQueue(RecoveryJournal *journal, - WaitQueue *queue, - bool increment) -{ - while (hasWaiters(queue)) { - if (!prepareToAssignEntry(journal, increment)) { - return false; - } - - notifyNextWaiter(queue, assignEntry, journal->activeBlock); - } - - return true; -} - -/**********************************************************************/ -static void assignEntries(RecoveryJournal *journal) -{ - if (journal->addingEntries) { - // Protect against re-entrancy. - return; - } - - journal->addingEntries = true; - if (assignEntriesFromQueue(journal, &journal->decrementWaiters, false)) { - assignEntriesFromQueue(journal, &journal->incrementWaiters, true); - } - - // Now that we've finished with entries, see if we have a batch of blocks to - // write. - writeBlocks(journal); - journal->addingEntries = false; -} - -/** - * Prepare an in-memory journal block to be reused now that it has been fully - * committed. - * - * @param block The block to be recycled - **/ -static void recycleJournalBlock(RecoveryJournalBlock *block) -{ - RecoveryJournal *journal = block->journal; - pushRingNode(&journal->freeTailBlocks, &block->ringNode); - - // Release any unused entry locks. - for (BlockCount i = block->entryCount; i < journal->entriesPerBlock; i++) { - releaseJournalBlockReference(block); - } - - // Release our own lock against reaping now that the block is completely - // committed, or we're giving up because we're in read-only mode. - if (block->entryCount > 0) { - releaseJournalBlockReference(block); - } - - if (block == journal->activeBlock) { - journal->activeBlock = NULL; - } -} - -/** - * WaiterCallback implementation invoked whenever a VIO is to be released - * from the journal because its entry was committed to disk. - **/ -static void continueCommittedWaiter(Waiter *waiter, void *context) -{ - DataVIO *dataVIO = waiterAsDataVIO(waiter); - RecoveryJournal *journal = (RecoveryJournal *) context; - ASSERT_LOG_ONLY(beforeJournalPoint(&journal->commitPoint, - &dataVIO->recoveryJournalPoint), - "DataVIOs released from recovery journal in order. " - "Recovery journal point is (%llu, %" PRIu16 "), " - "but commit waiter point is (%llu, %" PRIu16 ")", - journal->commitPoint.sequenceNumber, - journal->commitPoint.entryCount, - dataVIO->recoveryJournalPoint.sequenceNumber, - dataVIO->recoveryJournalPoint.entryCount); - journal->commitPoint = dataVIO->recoveryJournalPoint; - - int result - = (isReadOnly(journal->readOnlyNotifier) ? VDO_READ_ONLY : VDO_SUCCESS); - continueWaiter(waiter, &result); -} - -/** - * Notify any VIOs whose entries have now committed. - * - * @param journal The recovery journal to update - **/ -static void notifyCommitWaiters(RecoveryJournal *journal) -{ - if (isRingEmpty(&journal->activeTailBlocks)) { - return; - } - - for (RingNode *node = journal->activeTailBlocks.next; - node != &journal->activeTailBlocks; - node = node->next) { - RecoveryJournalBlock *block = blockFromRingNode(node); - - if (block->committing) { - return; - } - - notifyAllWaiters(&block->commitWaiters, continueCommittedWaiter, journal); - if (isReadOnly(journal->readOnlyNotifier)) { - notifyAllWaiters(&block->entryWaiters, continueCommittedWaiter, journal); - } else if (isRecoveryBlockDirty(block) || !isRecoveryBlockFull(block)) { - // Stop at partially-committed or partially-filled blocks. 
- return; - } - } -} - -/** - * Recycle any journal blocks which have been fully committed. - * - * @param journal The recovery journal to update - **/ -static void recycleJournalBlocks(RecoveryJournal *journal) -{ - while (!isRingEmpty(&journal->activeTailBlocks)) { - RecoveryJournalBlock *block - = blockFromRingNode(journal->activeTailBlocks.next); - - if (block->committing) { - // Don't recycle committing blocks. - return; - } - - if (!isReadOnly(journal->readOnlyNotifier) - && (isRecoveryBlockDirty(block) - || !isRecoveryBlockFull(block))) { - // Don't recycle partially written or partially full - // blocks, except in read-only mode. - return; - } - recycleJournalBlock(block); - } -} - -/** - * Handle post-commit processing. This is the callback registered by - * writeBlock(). If more entries accumulated in the block being committed while - * the commit was in progress, another commit will be initiated. - * - * @param completion The completion of the VIO writing this block - **/ -static void completeWrite(VDOCompletion *completion) -{ - RecoveryJournalBlock *block = completion->parent; - RecoveryJournal *journal = block->journal; - assertOnJournalThread(journal, __func__); - - journal->pendingWriteCount -= 1; - journal->events.blocks.committed += 1; - journal->events.entries.committed += block->entriesInCommit; - block->uncommittedEntryCount -= block->entriesInCommit; - block->entriesInCommit = 0; - block->committing = false; - - // If this block is the latest block to be acknowledged, record that fact. - if (block->sequenceNumber > journal->lastWriteAcknowledged) { - journal->lastWriteAcknowledged = block->sequenceNumber; - } - - RecoveryJournalBlock *lastActiveBlock - = blockFromRingNode(journal->activeTailBlocks.next); - ASSERT_LOG_ONLY((block->sequenceNumber >= lastActiveBlock->sequenceNumber), - "completed journal write is still active"); - - notifyCommitWaiters(journal); - - // Is this block now full? Reaping, and adding entries, might have already - // sent it off for rewriting; else, queue it for rewrite. - if (isRecoveryBlockDirty(block) && isRecoveryBlockFull(block)) { - scheduleBlockWrite(journal, block); - } - - recycleJournalBlocks(journal); - writeBlocks(journal); - - checkForDrainComplete(journal); -} - -/**********************************************************************/ -static void handleWriteError(VDOCompletion *completion) -{ - RecoveryJournalBlock *block = completion->parent; - RecoveryJournal *journal = block->journal; - logErrorWithStringError(completion->result, - "cannot write recovery journal block %llu", - block->sequenceNumber); - enterJournalReadOnlyMode(journal, completion->result); - completeWrite(completion); -} - -/** - * Issue a block for writing. Implements WaiterCallback. - **/ -static void writeBlock(Waiter *waiter, void *context __attribute__((unused))) -{ - RecoveryJournalBlock *block = blockFromWaiter(waiter); - if (isReadOnly(block->journal->readOnlyNotifier)) { - return; - } - - int result = commitRecoveryBlock(block, completeWrite, handleWriteError); - if (result != VDO_SUCCESS) { - enterJournalReadOnlyMode(block->journal, result); - } -} - -/** - * Attempt to commit blocks, according to write policy. 
- *
- * @param journal The recovery journal
- **/
-static void writeBlocks(RecoveryJournal *journal)
-{
-  assertOnJournalThread(journal, __func__);
-  /*
-   * In sync and async-unsafe modes, we call this function each time we queue
-   * a full block on pending writes; in addition, in all cases we call this
-   * function after adding entries to the journal and finishing a block write.
-   * Thus, when this function terminates we must either have no VIOs waiting
-   * in the journal or have some outstanding IO to provide a future wakeup.
-   *
-   * In all modes, if there are no outstanding writes and some unwritten
-   * entries, we must issue a block, even if it's the active block and it
-   * isn't full. Otherwise, in sync/async-unsafe modes, we want to issue
-   * all full blocks every time; since we call it each time we fill a block,
-   * this is equivalent to issuing every full block as soon as it's full. In
-   * async mode, we want to only issue full blocks if there are no
-   * pending writes.
-   */
-
-  PhysicalLayer *layer = vioAsCompletion(journal->flushVIO)->layer;
-  if ((layer->getWritePolicy(layer) != WRITE_POLICY_ASYNC)
-      || (journal->pendingWriteCount == 0)) {
-    // Write all the full blocks.
-    notifyAllWaiters(&journal->pendingWrites, writeBlock, NULL);
-  }
-
-  // Do we need to write the active block? Only if we have no outstanding
-  // writes, even after issuing all of the full writes.
-  if ((journal->pendingWriteCount == 0)
-      && canCommitRecoveryBlock(journal->activeBlock)) {
-    writeBlock(&journal->activeBlock->writeWaiter, NULL);
-  }
-}
-
-/**********************************************************************/
-void addRecoveryJournalEntry(RecoveryJournal *journal, DataVIO *dataVIO)
-{
-  assertOnJournalThread(journal, __func__);
-  if (!isNormal(&journal->state)) {
-    continueDataVIO(dataVIO, VDO_INVALID_ADMIN_STATE);
-    return;
-  }
-
-  if (isReadOnly(journal->readOnlyNotifier)) {
-    continueDataVIO(dataVIO, VDO_READ_ONLY);
-    return;
-  }
-
-  bool increment = isIncrementOperation(dataVIO->operation.type);
-  ASSERT_LOG_ONLY((!increment || (dataVIO->recoverySequenceNumber == 0)),
-                  "journal lock not held for increment");
-
-  advanceJournalPoint(&journal->appendPoint, journal->entriesPerBlock);
-  int result = enqueueDataVIO((increment
-                               ? &journal->incrementWaiters
-                               : &journal->decrementWaiters), dataVIO,
-                              THIS_LOCATION("$F($j-$js);io=journal($j-$js)"));
-  if (result != VDO_SUCCESS) {
-    enterJournalReadOnlyMode(journal, result);
-    continueDataVIO(dataVIO, result);
-    return;
-  }
-
-  assignEntries(journal);
-}
-
-/**
- * Conduct a sweep on a recovery journal to reclaim unreferenced blocks.
- *
- * @param journal The recovery journal
- **/
-static void reapRecoveryJournal(RecoveryJournal *journal)
-{
-  if (journal->reaping) {
-    // We already have an outstanding reap in progress. We need to wait for it
-    // to finish.
-    return;
-  }
-
-  if (isQuiescent(&journal->state)) {
-    // We are supposed to not do IO. Don't botch it by reaping.
-    return;
-  }
-
-  // Start reclaiming blocks only when the journal head has no references. Then
-  // stop when a block is referenced.
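-  // There are two independent reap heads: the block map reap head advances
-  // while its block holds no logical-zone (block map) locks, and the slab
-  // journal reap head advances while its block holds no physical-zone (slab
-  // journal) locks. Each stops at the first still-locked block, or at the
-  // last acknowledged write.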
- while ((journal->blockMapReapHead < journal->lastWriteAcknowledged) - && !isLocked(journal->lockCounter, journal->blockMapHeadBlockNumber, - ZONE_TYPE_LOGICAL)) { - journal->blockMapReapHead++; - if (++journal->blockMapHeadBlockNumber == journal->size) { - journal->blockMapHeadBlockNumber = 0; - } - } - - while ((journal->slabJournalReapHead < journal->lastWriteAcknowledged) - && !isLocked(journal->lockCounter, - journal->slabJournalHeadBlockNumber, - ZONE_TYPE_PHYSICAL)) { - journal->slabJournalReapHead++; - if (++journal->slabJournalHeadBlockNumber == journal->size) { - journal->slabJournalHeadBlockNumber = 0; - } - } - - if ((journal->blockMapReapHead == journal->blockMapHead) - && (journal->slabJournalReapHead == journal->slabJournalHead)) { - // Nothing happened. - return; - } - - PhysicalLayer *layer = vioAsCompletion(journal->flushVIO)->layer; - if (layer->getWritePolicy(layer) != WRITE_POLICY_SYNC) { - /* - * If the block map head will advance, we must flush any block map page - * modified by the entries we are reaping. If the slab journal head will - * advance, we must flush the slab summary update covering the slab journal - * that just released some lock. - * - * In sync mode, this is unnecessary because we won't record these numbers - * on disk until the next journal block write, and in sync mode every - * journal block write is preceded by a flush, which does the block map - * page and slab summary update flushing itself. - */ - journal->reaping = true; - launchFlush(journal->flushVIO, completeReaping, handleFlushError); - return; - } - - finishReaping(journal); -} - -/**********************************************************************/ -void acquireRecoveryJournalBlockReference(RecoveryJournal *journal, - SequenceNumber sequenceNumber, - ZoneType zoneType, - ZoneCount zoneID) -{ - if (sequenceNumber == 0) { - return; - } - - BlockCount blockNumber - = getRecoveryJournalBlockNumber(journal, sequenceNumber); - acquireLockCountReference(journal->lockCounter, blockNumber, zoneType, - zoneID); -} - -/**********************************************************************/ -void releaseRecoveryJournalBlockReference(RecoveryJournal *journal, - SequenceNumber sequenceNumber, - ZoneType zoneType, - ZoneCount zoneID) -{ - if (sequenceNumber == 0) { - return; - } - - BlockCount blockNumber - = getRecoveryJournalBlockNumber(journal, sequenceNumber); - releaseLockCountReference(journal->lockCounter, blockNumber, zoneType, - zoneID); -} - -/**********************************************************************/ -void releasePerEntryLockFromOtherZone(RecoveryJournal *journal, - SequenceNumber sequenceNumber) -{ - if (sequenceNumber == 0) { - return; - } - - BlockCount blockNumber - = getRecoveryJournalBlockNumber(journal, sequenceNumber); - releaseJournalZoneReferenceFromOtherZone(journal->lockCounter, blockNumber); -} - -/** - * Initiate a drain. - * - * Implements AdminInitiator. 
- **/ -static void initiateDrain(AdminState *state) -{ - checkForDrainComplete(container_of(state, RecoveryJournal, state)); -} - -/**********************************************************************/ -void drainRecoveryJournal(RecoveryJournal *journal, - AdminStateCode operation, - VDOCompletion *parent) -{ - assertOnJournalThread(journal, __func__); - startDraining(&journal->state, operation, parent, initiateDrain); -} - -/**********************************************************************/ -void resumeRecoveryJournal(RecoveryJournal *journal, VDOCompletion *parent) -{ - assertOnJournalThread(journal, __func__); - bool saved = isSaved(&journal->state); - setCompletionResult(parent, resumeIfQuiescent(&journal->state)); - - if (isReadOnly(journal->readOnlyNotifier)) { - finishCompletion(parent, VDO_READ_ONLY); - return; - } - - if (saved) { - initializeJournalState(journal); - } - - if (resumeLockCounter(journal->lockCounter)) { - // We might have missed a notification. - reapRecoveryJournal(journal); - } - - completeCompletion(parent); -} - -/**********************************************************************/ -BlockCount getJournalLogicalBlocksUsed(const RecoveryJournal *journal) -{ - return journal->logicalBlocksUsed; -} - -/**********************************************************************/ -RecoveryJournalStatistics -getRecoveryJournalStatistics(const RecoveryJournal *journal) -{ - return journal->events; -} - -/**********************************************************************/ -void dumpRecoveryJournalStatistics(const RecoveryJournal *journal) -{ - RecoveryJournalStatistics stats = getRecoveryJournalStatistics(journal); - logInfo("Recovery Journal"); - logInfo(" blockMapHead=%llu slabJournalHead=%" PRIu64 - " lastWriteAcknowledged=%llu tail=%" PRIu64 - " blockMapReapHead=%llu slabJournalReapHead=%" PRIu64 - " diskFull=%llu slabJournalCommitsRequested=%" PRIu64 - " incrementWaiters=%zu decrementWaiters=%zu", - journal->blockMapHead, journal->slabJournalHead, - journal->lastWriteAcknowledged, journal->tail, - journal->blockMapReapHead, journal->slabJournalReapHead, - stats.diskFull, stats.slabJournalCommitsRequested, - countWaiters(&journal->incrementWaiters), - countWaiters(&journal->decrementWaiters)); - logInfo(" entries: started=%llu written=%llu committed=%" - PRIu64, - stats.entries.started, stats.entries.written, - stats.entries.committed); - logInfo(" blocks: started=%llu written=%llu committed=%" - PRIu64, - stats.blocks.started, stats.blocks.written, - stats.blocks.committed); - - logInfo(" active blocks:"); - const RingNode *head = &journal->activeTailBlocks; - for (RingNode *node = head->next; node != head; node = node->next) { - dumpRecoveryBlock(blockFromRingNode(node)); - } -} diff --git a/vdo/base/recoveryJournal.h b/vdo/base/recoveryJournal.h deleted file mode 100644 index 8ae7de0..0000000 --- a/vdo/base/recoveryJournal.h +++ /dev/null @@ -1,416 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournal.h#5 $ - */ - -#ifndef RECOVERY_JOURNAL_H -#define RECOVERY_JOURNAL_H - -#include "buffer.h" - -#include "adminState.h" -#include "completion.h" -#include "fixedLayout.h" -#include "flush.h" -#include "readOnlyNotifier.h" -#include "statistics.h" -#include "trace.h" -#include "types.h" - -/** - * The RecoveryJournal provides a log of all block mapping changes - * which have not yet been stably written to the block map. It exists - * to help provide resiliency guarantees by allowing synchronous - * writes to be acknowledged as soon as the corresponding journal - * entry is committed instead of having to wait for the block map - * update. For asynchronous writes, the journal aids in meeting the - * five second data loss window by ensuring that writes will not be - * lost as long as they are committed to the journal before the window - * expires. This should be less work than committing all of the - * required block map pages. - * - * The journal consists of a set of on-disk blocks arranged as a - * circular log with monotonically increasing sequence numbers. Three - * sequence numbers serve to define the active extent of the - * journal. The 'head' is the oldest active block in the journal. The - * 'tail' is the end of the half-open interval containing the active - * blocks. 'active' is the number of the block actively receiving - * entries. In an empty journal, head == active == tail. Once any - * entries are added, tail = active + 1, and head may be any value in - * the interval [tail - size, active]. - * - * The journal also contains a set of in-memory blocks which are used - * to buffer up entries until they can be committed. In general the - * number of in-memory blocks ('tailBufferCount') will be less than - * the on-disk size. Each in-memory block is also a VDOCompletion. - * Each in-memory block has a VDOExtent which is used to commit that - * block to disk. The extent's data is a PackedJournalBlock (which is a - * formatted journal block). In addition each in-memory block has a - * buffer which is used to accumulate entries while a partial commit - * of the block is in progress. In-memory blocks are kept on two - * rings. Free blocks live on the 'freeTailBlocks' ring. When a block - * becomes active (see below) it is moved to the 'activeTailBlocks' - * ring. When a block is fully committed, it is moved back to the - * 'freeTailBlocks' ring. - * - * When entries are added to the journal, they are added to the active - * in-memory block, as indicated by the 'activeBlock' field. If the - * caller wishes to wait for the entry to be committed, the requesting - * VIO will be attached to the in-memory block to which the caller's - * entry was added. If the caller does wish to wait, or if the entry - * filled the active block, an attempt will be made to commit that - * block to disk. If there is already another commit in progress, the - * attempt will be ignored and then automatically retried when the - * in-progress commit completes. If there is no commit in progress, - * any VIOs waiting on the block are transferred to the extent. The - * extent is then written, automatically waking all of the waiters - * when it completes. 
When the extent completes, any entries which - * accumulated in the block are copied to the extent's data buffer. - * - * Finally, the journal maintains a set of counters, one for each on - * disk journal block. These counters are used as locks to prevent - * premature reaping of journal blocks. Each time a new sequence - * number is used, the counter for the corresponding block is - * incremented. The counter is subsequently decremented when that - * block is filled and then committed for the last time. This prevents - * blocks from being reaped while they are still being updated. The - * counter is also incremented once for each entry added to a block, - * and decremented once each time the block map is updated in memory - * for that request. This prevents blocks from being reaped while - * their VIOs are still active. Finally, each in-memory block map page - * tracks the oldest journal block that contains entries corresponding to - * uncommitted updates to that block map page. Each time an in-memory block - * map page is updated, it checks if the journal block for the VIO - * is earlier than the one it references, in which case it increments - * the count on the earlier journal block and decrements the count on the - * later journal block, maintaining a lock on the oldest journal block - * containing entries for that page. When a block map page has been flushed - * from the cache, the counter for the journal block it references is - * decremented. Whenever the counter for the head block goes to 0, the - * head is advanced until it comes to a block whose counter is not 0 - * or until it reaches the active block. This is the mechanism for - * reclaiming journal space on disk. - * - * If there is no in-memory space when a VIO attempts to add an entry, - * the VIO will be attached to the 'commitCompletion' and will be - * woken the next time a full block has committed. If there is no - * on-disk space when a VIO attempts to add an entry, the VIO will be - * attached to the 'reapCompletion', and will be woken the next time a - * journal block is reaped. - **/ - -/** - * Return whether a given JournalOperation is an increment type. - * - * @param operation The operation in question - * - * @return true if the type is an increment type - **/ -static inline bool isIncrementOperation(JournalOperation operation) -{ - return ((operation == DATA_INCREMENT) || (operation == BLOCK_MAP_INCREMENT)); -} - -/** - * Get the name of a journal operation. - * - * @param operation The operation to name - * - * @return The name of the operation - **/ -const char *getJournalOperationName(JournalOperation operation) - __attribute__((warn_unused_result)); - -/** - * Create a recovery journal. 
- * - * @param [in] nonce the nonce of the VDO - * @param [in] layer the physical layer for the journal - * @param [in] partition the partition for the journal - * @param [in] recoveryCount The VDO's number of completed recoveries - * @param [in] journalSize the number of blocks in the journal on disk - * @param [in] tailBufferSize the number of blocks for tail buffer - * @param [in] readOnlyNotifier the read-only mode notifier - * @param [in] threadConfig the thread configuration of the VDO - * @param [out] journalPtr the pointer to hold the new recovery journal - * - * @return a success or error code - **/ -int makeRecoveryJournal(Nonce nonce, - PhysicalLayer *layer, - Partition *partition, - uint64_t recoveryCount, - BlockCount journalSize, - BlockCount tailBufferSize, - ReadOnlyNotifier *readOnlyNotifier, - const ThreadConfig *threadConfig, - RecoveryJournal **journalPtr) - __attribute__((warn_unused_result)); - -/** - * Free a recovery journal and null out the reference to it. - * - * @param [in,out] journalPtr The reference to the recovery journal to free - **/ -void freeRecoveryJournal(RecoveryJournal **journalPtr); - -/** - * Move the backing partition pointer of the recovery journal. - * Assumes that the data in the old and the new partitions is identical. - * - * @param journal the journal being moved - * @param partition the new journal partition - **/ -void setRecoveryJournalPartition(RecoveryJournal *journal, - Partition *partition); - -/** - * Initialize the journal after a recovery. - * - * @param journal The journal in question - * @param recoveryCount The number of completed recoveries - * @param tail The new tail block sequence number - **/ -void initializeRecoveryJournalPostRecovery(RecoveryJournal *journal, - uint64_t recoveryCount, - SequenceNumber tail); - -/** - * Initialize the journal after a rebuild. - * - * @param journal The journal in question - * @param recoveryCount The number of completed recoveries - * @param tail The new tail block sequence number - * @param logicalBlocksUsed The new number of logical blocks used - * @param blockMapDataBlocks The new number of block map data blocks - **/ -void initializeRecoveryJournalPostRebuild(RecoveryJournal *journal, - uint64_t recoveryCount, - SequenceNumber tail, - BlockCount logicalBlocksUsed, - BlockCount blockMapDataBlocks); - -/** - * Get the number of block map pages, allocated from data blocks, currently - * in use. - * - * @param journal The journal in question - * - * @return The number of block map pages allocated from slabs - **/ -BlockCount getJournalBlockMapDataBlocksUsed(RecoveryJournal *journal) - __attribute__((warn_unused_result)); - -/** - * Set the number of block map pages, allocated from data blocks, currently - * in use. - * - * @param journal The journal in question - * @param pages The number of block map pages allocated from slabs - **/ -void setJournalBlockMapDataBlocksUsed(RecoveryJournal *journal, - BlockCount pages); - -/** - * Get the ID of a recovery journal's thread. - * - * @param journal The journal to query - * - * @return The ID of the journal's thread. - **/ -ThreadID getRecoveryJournalThreadID(RecoveryJournal *journal) - __attribute__((warn_unused_result)); - -/** - * Prepare the journal for new entries. 
- * - * @param journal The journal in question - * @param depot The slab depot for this VDO - * @param blockMap The block map for this VDO - **/ -void openRecoveryJournal(RecoveryJournal *journal, - SlabDepot *depot, - BlockMap *blockMap); - -/** - * Obtain the recovery journal's current sequence number. Exposed only so - * the block map can be initialized therefrom. - * - * @param journal The journal in question - * - * @return the sequence number of the tail block - **/ -SequenceNumber getCurrentJournalSequenceNumber(RecoveryJournal *journal); - -/** - * Get the number of usable recovery journal blocks. - * - * @param journalSize The size of the recovery journal in blocks - * - * @return the number of recovery journal blocks usable for entries - **/ -BlockCount getRecoveryJournalLength(BlockCount journalSize) - __attribute__((warn_unused_result)); - -/** - * Get the size of the encoded state of a recovery journal. - * - * @return the encoded size of the journal's state - **/ -size_t getRecoveryJournalEncodedSize(void) - __attribute__((warn_unused_result)); - -/** - * Encode the state of a recovery journal. - * - * @param journal the recovery journal - * @param buffer the buffer to encode into - * - * @return VDO_SUCCESS or an error code - **/ -int encodeRecoveryJournal(RecoveryJournal *journal, Buffer *buffer) - __attribute__((warn_unused_result)); - -/** - * Decode the state of a recovery journal saved in a buffer. - * - * @param journal the recovery journal - * @param buffer the buffer containing the saved state - * - * @return VDO_SUCCESS or an error code - **/ -int decodeRecoveryJournal(RecoveryJournal *journal, Buffer *buffer) - __attribute__((warn_unused_result)); - -/** - * Decode the state of a Sodium recovery journal saved in a buffer. - * - * @param journal the recovery journal - * @param buffer the buffer containing the saved state - * - * @return VDO_SUCCESS or an error code - **/ -int decodeSodiumRecoveryJournal(RecoveryJournal *journal, Buffer *buffer) - __attribute__((warn_unused_result)); - -/** - * Add an entry to a recovery journal. This method is asynchronous. The DataVIO - * will not be called back until the entry is committed to the on-disk journal. - * - * @param journal The journal in which to make an entry - * @param dataVIO The DataVIO for which to add the entry. The entry will be - * taken from the logical and newMapped fields of the - * DataVIO. The DataVIO's recoverySequenceNumber field will - * be set to the sequence number of the journal block in - * which the entry was made. - **/ -void addRecoveryJournalEntry(RecoveryJournal *journal, DataVIO *dataVIO); - -/** - * Acquire a reference to a recovery journal block from somewhere other than - * the journal itself. - * - * @param journal The recovery journal - * @param sequenceNumber The journal sequence number of the referenced block - * @param zoneType The type of the zone making the adjustment - * @param zoneID The ID of the zone making the adjustment - **/ -void acquireRecoveryJournalBlockReference(RecoveryJournal *journal, - SequenceNumber sequenceNumber, - ZoneType zoneType, - ZoneCount zoneID); - - -/** - * Release a reference to a recovery journal block from somewhere other than - * the journal itself. If this is the last reference for a given zone type, - * an attempt will be made to reap the journal. 
- * - * @param journal The recovery journal - * @param sequenceNumber The journal sequence number of the referenced block - * @param zoneType The type of the zone making the adjustment - * @param zoneID The ID of the zone making the adjustment - **/ -void releaseRecoveryJournalBlockReference(RecoveryJournal *journal, - SequenceNumber sequenceNumber, - ZoneType zoneType, - ZoneCount zoneID); - -/** - * Release a single per-entry reference count for a recovery journal block. This - * method may be called from any zone (but shouldn't be called from the journal - * zone as it would be inefficient). - * - * @param journal The recovery journal - * @param sequenceNumber The journal sequence number of the referenced block - **/ -void releasePerEntryLockFromOtherZone(RecoveryJournal *journal, - SequenceNumber sequenceNumber); - -/** - * Drain recovery journal I/O. All uncommitted entries will be written out. - * - * @param journal The journal to drain - * @param operation The drain operation (suspend or save) - * @param parent The completion to finish once the journal is drained - **/ -void drainRecoveryJournal(RecoveryJournal *journal, - AdminStateCode operation, - VDOCompletion *parent); - -/** - * Resume a recovery journal which has been drained. - * - * @param journal The journal to resume - * @param parent The completion to finish once the journal is resumed - * - * @return VDO_SUCCESS or an error - **/ -void resumeRecoveryJournal(RecoveryJournal *journal, VDOCompletion *parent); - -/** - * Get the number of logical blocks in use by the VDO - * - * @param journal the journal - * - * @return the number of logical blocks in use by the VDO - **/ -BlockCount getJournalLogicalBlocksUsed(const RecoveryJournal *journal) - __attribute__((warn_unused_result)); - -/** - * Get the current statistics from the recovery journal. - * - * @param journal The recovery journal to query - * - * @return a copy of the current statistics for the journal - **/ -RecoveryJournalStatistics -getRecoveryJournalStatistics(const RecoveryJournal *journal) - __attribute__((warn_unused_result)); - -/** - * Dump some current statistics and other debug info from the recovery - * journal. - * - * @param journal The recovery journal to dump - **/ -void dumpRecoveryJournalStatistics(const RecoveryJournal *journal); - -#endif // RECOVERY_JOURNAL_H diff --git a/vdo/base/recoveryJournalBlock.c b/vdo/base/recoveryJournalBlock.c deleted file mode 100644 index 1bbacfc..0000000 --- a/vdo/base/recoveryJournalBlock.c +++ /dev/null @@ -1,341 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournalBlock.c#13 $ - */ - -#include "recoveryJournalBlock.h" - -#include "logger.h" -#include "memoryAlloc.h" - -#include "dataVIO.h" -#include "fixedLayout.h" -#include "packedRecoveryJournalBlock.h" -#include "recoveryJournalEntry.h" -#include "recoveryJournalInternals.h" -#include "ringNode.h" -#include "vio.h" -#include "waitQueue.h" - -/**********************************************************************/ -int makeRecoveryBlock(PhysicalLayer *layer, - RecoveryJournal *journal, - RecoveryJournalBlock **blockPtr) -{ - // Ensure that a block is large enough to store - // RECOVERY_JOURNAL_ENTRIES_PER_BLOCK entries. - STATIC_ASSERT(RECOVERY_JOURNAL_ENTRIES_PER_BLOCK - <= ((VDO_BLOCK_SIZE - sizeof(PackedJournalHeader)) - / sizeof(PackedRecoveryJournalEntry))); - - RecoveryJournalBlock *block; - int result = ALLOCATE(1, RecoveryJournalBlock, __func__, &block); - if (result != VDO_SUCCESS) { - return result; - } - - // Allocate a full block for the journal block even though not all of the - // space is used since the VIO needs to write a full disk block. - result = ALLOCATE(VDO_BLOCK_SIZE, char, "PackedJournalBlock", &block->block); - if (result != VDO_SUCCESS) { - freeRecoveryBlock(&block); - return result; - } - - result = createVIO(layer, VIO_TYPE_RECOVERY_JOURNAL, VIO_PRIORITY_HIGH, - block, block->block, &block->vio); - if (result != VDO_SUCCESS) { - freeRecoveryBlock(&block); - return result; - } - - block->vio->completion.callbackThreadID = journal->threadID; - initializeRing(&block->ringNode); - block->journal = journal; - - *blockPtr = block; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freeRecoveryBlock(RecoveryJournalBlock **blockPtr) -{ - RecoveryJournalBlock *block = *blockPtr; - if (block == NULL) { - return; - } - - FREE(block->block); - freeVIO(&block->vio); - FREE(block); - *blockPtr = NULL; -} - -/** - * Get a pointer to the packed journal block header in the block buffer. - * - * @param block The recovery block - * - * @return The block's header - **/ -static inline -PackedJournalHeader *getBlockHeader(const RecoveryJournalBlock *block) -{ - return (PackedJournalHeader *) block->block; -} - -/** - * Set the current sector of the current block and initialize it. 
- * - * @param block The block to update - * @param sector A pointer to the first byte of the new sector - **/ -static void setActiveSector(RecoveryJournalBlock *block, void *sector) -{ - block->sector = (PackedJournalSector *) sector; - block->sector->checkByte = getBlockHeader(block)->fields.checkByte; - block->sector->recoveryCount = block->journal->recoveryCount; - block->sector->entryCount = 0; -} - -/**********************************************************************/ -void initializeRecoveryBlock(RecoveryJournalBlock *block) -{ - memset(block->block, 0x0, VDO_BLOCK_SIZE); - - RecoveryJournal *journal = block->journal; - block->sequenceNumber = journal->tail; - block->entryCount = 0; - block->uncommittedEntryCount = 0; - - block->blockNumber = getRecoveryJournalBlockNumber(journal, journal->tail); - - RecoveryBlockHeader unpacked = { - .metadataType = VDO_METADATA_RECOVERY_JOURNAL, - .blockMapDataBlocks = journal->blockMapDataBlocks, - .logicalBlocksUsed = journal->logicalBlocksUsed, - .nonce = journal->nonce, - .recoveryCount = journal->recoveryCount, - .sequenceNumber = journal->tail, - .checkByte = computeRecoveryCheckByte(journal, journal->tail), - }; - PackedJournalHeader *header = getBlockHeader(block); - packRecoveryBlockHeader(&unpacked, header); - - setActiveSector(block, getJournalBlockSector(header, 1)); -} - -/**********************************************************************/ -int enqueueRecoveryBlockEntry(RecoveryJournalBlock *block, DataVIO *dataVIO) -{ - // First queued entry indicates this is a journal block we've just opened - // or a committing block we're extending and will have to write again. - bool newBatch = !hasWaiters(&block->entryWaiters); - - // Enqueue the DataVIO to wait for its entry to commit. - int result = enqueueDataVIO(&block->entryWaiters, dataVIO, - THIS_LOCATION("$F($j-$js)")); - if (result != VDO_SUCCESS) { - return result; - } - - block->entryCount++; - block->uncommittedEntryCount++; - - // Update stats to reflect the journal entry we're going to write. - if (newBatch) { - block->journal->events.blocks.started++; - } - block->journal->events.entries.started++; - - return VDO_SUCCESS; -} - -/** - * Check whether the current sector of a block is full. - * - * @param block The block to check - * - * @return true if the sector is full - **/ -__attribute__((warn_unused_result)) -static bool isSectorFull(const RecoveryJournalBlock *block) -{ - return (block->sector->entryCount == RECOVERY_JOURNAL_ENTRIES_PER_SECTOR); -} - -/** - * Actually add entries from the queue to the given block. - * - * @param block The journal block - * - * @return VDO_SUCCESS or an error code - **/ -__attribute__((warn_unused_result)) -static int addQueuedRecoveryEntries(RecoveryJournalBlock *block) -{ - while (hasWaiters(&block->entryWaiters)) { - DataVIO *dataVIO - = waiterAsDataVIO(dequeueNextWaiter(&block->entryWaiters)); - if (dataVIO->operation.type == DATA_INCREMENT) { - // In order to not lose committed sectors of this partial write, we must - // flush before the partial write entries are committed. - block->hasPartialWriteEntry = (block->hasPartialWriteEntry - || dataVIO->isPartialWrite); - /* - * In order to not lose acknowledged writes with the FUA flag set, we - * must issue a flush to cover the data write and also all previous - * journal writes, and we must issue a FUA on the journal write. - */ - block->hasFUAEntry = (block->hasFUAEntry - || vioRequiresFlushAfter(dataVIOAsVIO(dataVIO))); - } - - // Compose and encode the entry. 
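-    // Each packed entry records the new mapping (PBN and mapping state), the
-    // operation type, and the block map slot being updated, taken from the
-    // DataVIO's tree lock at its current height.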
- PackedRecoveryJournalEntry *packedEntry - = &block->sector->entries[block->sector->entryCount++]; - TreeLock *lock = &dataVIO->treeLock; - RecoveryJournalEntry newEntry = { - .mapping = { - .pbn = dataVIO->operation.pbn, - .state = dataVIO->operation.state, - }, - .operation = dataVIO->operation.type, - .slot = lock->treeSlots[lock->height].blockMapSlot, - }; - *packedEntry = packRecoveryJournalEntry(&newEntry); - - if (isIncrementOperation(dataVIO->operation.type)) { - dataVIO->recoverySequenceNumber = block->sequenceNumber; - } - - // Enqueue the DataVIO to wait for its entry to commit. - int result = enqueueDataVIO(&block->commitWaiters, dataVIO, - THIS_LOCATION("$F($j-$js)")); - if (result != VDO_SUCCESS) { - continueDataVIO(dataVIO, result); - return result; - } - - if (isSectorFull(block)) { - setActiveSector(block, (char *) block->sector + VDO_SECTOR_SIZE); - } - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -__attribute__((warn_unused_result)) -static int getRecoveryBlockPBN(RecoveryJournalBlock *block, - PhysicalBlockNumber *pbnPtr) -{ - RecoveryJournal *journal = block->journal; - int result = translateToPBN(journal->partition, block->blockNumber, pbnPtr); - if (result != VDO_SUCCESS) { - logErrorWithStringError(result, - "Error translating recovery journal block " - "number %llu", block->blockNumber); - } - return result; -} - -/**********************************************************************/ -bool canCommitRecoveryBlock(RecoveryJournalBlock *block) -{ - // Cannot commit in read-only mode, if already committing the block, or - // if there are no entries to commit. - return ((block != NULL) - && !block->committing - && hasWaiters(&block->entryWaiters) - && !isReadOnly(block->journal->readOnlyNotifier)); -} - -/**********************************************************************/ -int commitRecoveryBlock(RecoveryJournalBlock *block, - VDOAction *callback, - VDOAction *errorHandler) -{ - int result = ASSERT(canCommitRecoveryBlock(block), "should never call %s" - " when the block can't be committed", __func__); - if (result != VDO_SUCCESS) { - return result; - } - - PhysicalBlockNumber blockPBN; - result = getRecoveryBlockPBN(block, &blockPBN); - if (result != VDO_SUCCESS) { - return result; - } - - block->entriesInCommit = countWaiters(&block->entryWaiters); - result = addQueuedRecoveryEntries(block); - if (result != VDO_SUCCESS) { - return result; - } - - RecoveryJournal *journal = block->journal; - PackedJournalHeader *header = getBlockHeader(block); - - // Update stats to reflect the block and entries we're about to write. - journal->pendingWriteCount += 1; - journal->events.blocks.written += 1; - journal->events.entries.written += block->entriesInCommit; - - storeUInt64LE(header->fields.blockMapHead, journal->blockMapHead); - storeUInt64LE(header->fields.slabJournalHead, journal->slabJournalHead); - storeUInt16LE(header->fields.entryCount, block->entryCount); - - block->committing = true; - - /* - * In sync or async mode, when we are writing an increment entry for a - * request with FUA, or when making the increment entry for a partial - * write, we need to make sure all the data being mapped to by this block - * is stable on disk and also that the recovery journal is stable up to - * the current block, so we must flush before writing. - * - * In sync mode, and for FUA, we also need to make sure that the write we - * are doing is stable, so we issue the write with FUA. 
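-   *
-   * In async-unsafe mode, therefore, the flush below is only issued when the
-   * block carries a FUA entry or a partial write entry.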
- */ - PhysicalLayer *layer = vioAsCompletion(block->vio)->layer; - bool fua = (block->hasFUAEntry - || (layer->getWritePolicy(layer) == WRITE_POLICY_SYNC)); - bool flush = (block->hasFUAEntry - || (layer->getWritePolicy(layer) != WRITE_POLICY_ASYNC_UNSAFE) - || block->hasPartialWriteEntry); - block->hasFUAEntry = false; - block->hasPartialWriteEntry = false; - launchWriteMetadataVIOWithFlush(block->vio, blockPBN, callback, errorHandler, - flush, fua); - - return VDO_SUCCESS; -} - -/**********************************************************************/ -void dumpRecoveryBlock(const RecoveryJournalBlock *block) -{ - logInfo(" sequence number %llu; entries %" PRIu16 - "; %s; %zu entry waiters; %zu commit waiters", - block->sequenceNumber, - block->entryCount, - (block->committing ? "committing" : "waiting"), - countWaiters(&block->entryWaiters), - countWaiters(&block->commitWaiters)); -} diff --git a/vdo/base/recoveryJournalBlock.h b/vdo/base/recoveryJournalBlock.h deleted file mode 100644 index f26f8e8..0000000 --- a/vdo/base/recoveryJournalBlock.h +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- *
- * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournalBlock.h#8 $
- */
-
-#ifndef RECOVERY_JOURNAL_BLOCK_H
-#define RECOVERY_JOURNAL_BLOCK_H
-
-#include "permassert.h"
-
-#include "packedRecoveryJournalBlock.h"
-#include "recoveryJournalInternals.h"
-#include "ringNode.h"
-#include "types.h"
-#include "waitQueue.h"
-
-struct recoveryJournalBlock {
-  /** The doubly linked pointers for the free or active lists */
-  RingNode ringNode;
-  /** The waiter for the pending full block list */
-  Waiter writeWaiter;
-  /** The journal to which this block belongs */
-  RecoveryJournal *journal;
-  /** A pointer to a block-sized buffer holding the packed block data */
-  char *block;
-  /** A pointer to the current sector in the packed block buffer */
-  PackedJournalSector *sector;
-  /** The VIO for writing this block */
-  VIO *vio;
-  /** The sequence number for this block */
-  SequenceNumber sequenceNumber;
-  /** The location of this block in the on-disk journal */
-  PhysicalBlockNumber blockNumber;
-  /** Whether this block is being committed */
-  bool committing;
-  /** Whether this block has an uncommitted increment for a partial write */
-  bool hasPartialWriteEntry;
-  /** Whether this block has an uncommitted increment for a write with FUA */
-  bool hasFUAEntry;
-  /** The total number of entries in this block */
-  JournalEntryCount entryCount;
-  /** The total number of uncommitted entries (queued or committing) */
-  JournalEntryCount uncommittedEntryCount;
-  /** The number of new entries in the current commit */
-  JournalEntryCount entriesInCommit;
-  /** The queue of VIOs which will make entries for the next commit */
-  WaitQueue entryWaiters;
-  /** The queue of VIOs waiting for the current commit */
-  WaitQueue commitWaiters;
-};
-
-/**
- * Return the block associated with a ring node.
- *
- * @param node The ring node to recast as a block
- *
- * @return The block
- **/
-static inline RecoveryJournalBlock *blockFromRingNode(RingNode *node)
-{
-  STATIC_ASSERT(offsetof(RecoveryJournalBlock, ringNode) == 0);
-  return (RecoveryJournalBlock *) node;
-}
-
-/**
- * Return the block associated with a waiter.
- *
- * @param waiter The waiter to recast as a block
- *
- * @return The block
- **/
-static inline RecoveryJournalBlock *blockFromWaiter(Waiter *waiter)
-{
-  return (RecoveryJournalBlock *)
-    ((uintptr_t) waiter - offsetof(RecoveryJournalBlock, writeWaiter));
-}
-
-/**
- * Check whether a recovery block is dirty, indicating it has any uncommitted
- * entries, which includes both entries not written and entries written but
- * not yet acknowledged.
- *
- * @param block The block to check
- *
- * @return true if the block has any uncommitted entries
- **/
-__attribute__((warn_unused_result))
-static inline bool isRecoveryBlockDirty(const RecoveryJournalBlock *block)
-{
-  return (block->uncommittedEntryCount > 0);
-}
-
-/**
- * Check whether a journal block is empty.
- *
- * @param block The block to check
- *
- * @return true if the block has no entries
- **/
-__attribute__((warn_unused_result))
-static inline bool isRecoveryBlockEmpty(const RecoveryJournalBlock *block)
-{
-  return (block->entryCount == 0);
-}
-
-/**
- * Check whether a journal block is full.
- *
- * @param block The block to check
- *
- * @return true if the block is full
- **/
-__attribute__((warn_unused_result))
-static inline bool isRecoveryBlockFull(const RecoveryJournalBlock *block)
-{
-  return ((block == NULL)
-          || (block->journal->entriesPerBlock == block->entryCount));
-}
-
-/**
- * Construct a journal block.
- * - * @param [in] layer The layer from which to construct VIOs - * @param [in] journal The journal to which the block will belong - * @param [out] blockPtr A pointer to receive the new block - * - * @return VDO_SUCCESS or an error - **/ -int makeRecoveryBlock(PhysicalLayer *layer, - RecoveryJournal *journal, - RecoveryJournalBlock **blockPtr) - __attribute__((warn_unused_result)); - -/** - * Free a tail block and null out the reference to it. - * - * @param blockPtr The reference to the tail block to free - **/ -void freeRecoveryBlock(RecoveryJournalBlock **blockPtr); - -/** - * Initialize the next active recovery journal block. - * - * @param block The journal block to initialize - **/ -void initializeRecoveryBlock(RecoveryJournalBlock *block); - -/** - * Enqueue a DataVIO to asynchronously encode and commit its next recovery - * journal entry in this block. The DataVIO will not be continued until the - * entry is committed to the on-disk journal. The caller is responsible for - * ensuring the block is not already full. - * - * @param block The journal block in which to make an entry - * @param dataVIO The DataVIO to enqueue - * - * @return VDO_SUCCESS or an error code if the DataVIO could not be enqueued - **/ -int enqueueRecoveryBlockEntry(RecoveryJournalBlock *block, DataVIO *dataVIO) - __attribute__((warn_unused_result)); - -/** - * Attempt to commit a block. If the block is not the oldest block with - * uncommitted entries or if it is already being committed, nothing will be - * done. - * - * @param block The block to write - * @param callback The function to call when the write completes - * @param errorHandler The handler for flush or write errors - * - * @return VDO_SUCCESS, or an error if the write could not be launched - **/ -int commitRecoveryBlock(RecoveryJournalBlock *block, - VDOAction *callback, - VDOAction *errorHandler) - __attribute__((warn_unused_result)); - -/** - * Dump the contents of the recovery block to the log. - * - * @param block The block to dump - **/ -void dumpRecoveryBlock(const RecoveryJournalBlock *block); - -/** - * Check whether a journal block can be committed. - * - * @param block The journal block in question - * - * @return true if the block can be committed now - **/ -bool canCommitRecoveryBlock(RecoveryJournalBlock *block) - __attribute__((warn_unused_result)); - -#endif // RECOVERY_JOURNAL_BLOCK_H diff --git a/vdo/base/recoveryJournalEntry.h b/vdo/base/recoveryJournalEntry.h deleted file mode 100644 index bf2a3e0..0000000 --- a/vdo/base/recoveryJournalEntry.h +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournalEntry.h#1 $ - */ - -#ifndef RECOVERY_JOURNAL_ENTRY_H -#define RECOVERY_JOURNAL_ENTRY_H - -#include "numeric.h" - -#include "blockMapEntry.h" -#include "journalPoint.h" -#include "types.h" - -/** - * A recovery journal entry stores two physical locations: a data location - * that is the value of a single mapping in the block map tree, and the - * location of the block map page and and slot that is either acquiring or - * releasing a reference to the data location. The journal entry also stores - * an operation code that says whether the reference is being acquired (an - * increment) or released (a decrement), and whether the mapping is for a - * logical block or for the block map tree itself. - **/ -typedef struct { - BlockMapSlot slot; - DataLocation mapping; - JournalOperation operation; -} RecoveryJournalEntry; - -/** The packed, on-disk representation of a recovery journal entry. */ -typedef union __attribute__((packed)) { - struct __attribute__((packed)) { - /** - * In little-endian bit order: - * Bits 15..12: The four highest bits of the 36-bit physical block number - * of the block map tree page - * Bits 11..2: The 10-bit block map page slot number - * Bits 1..0: The 2-bit JournalOperation of the entry - **/ -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - unsigned operation : 2; - unsigned slotLow : 6; - unsigned slotHigh : 4; - unsigned pbnHighNibble : 4; -#else - unsigned slotLow : 6; - unsigned operation : 2; - unsigned pbnHighNibble : 4; - unsigned slotHigh : 4; -#endif - - /** - * Bits 47..16: The 32 low-order bits of the block map page PBN, - * in little-endian byte order - **/ - byte pbnLowWord[4]; - - /** - * Bits 87..48: The five-byte block map entry encoding the location that - * was or will be stored in the block map page slot - **/ - BlockMapEntry blockMapEntry; - } fields; - - // A raw view of the packed encoding. - uint8_t raw[11]; - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - // This view is only valid on little-endian machines and is only present for - // ease of directly examining packed entries in GDB. - struct __attribute__((packed)) { - unsigned operation : 2; - unsigned slot : 10; - unsigned pbnHighNibble : 4; - uint32_t pbnLowWord; - BlockMapEntry blockMapEntry; - } littleEndian; -#endif -} PackedRecoveryJournalEntry; - -/** - * Return the packed, on-disk representation of a recovery journal entry. - * - * @param entry The journal entry to pack - * - * @return The packed representation of the journal entry - **/ -static inline PackedRecoveryJournalEntry -packRecoveryJournalEntry(const RecoveryJournalEntry *entry) -{ - PackedRecoveryJournalEntry packed = { - .fields = { - .operation = entry->operation, - .slotLow = entry->slot.slot & 0x3F, - .slotHigh = (entry->slot.slot >> 6) & 0x0F, - .pbnHighNibble = (entry->slot.pbn >> 32) & 0x0F, - .blockMapEntry = packPBN(entry->mapping.pbn, entry->mapping.state), - } - }; - storeUInt32LE(packed.fields.pbnLowWord, entry->slot.pbn & UINT_MAX); - return packed; -} - -/** - * Unpack the on-disk representation of a recovery journal entry. 
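To make the bit layout documented above easier to follow, here is a rough standalone model (not part of the deleted header) of how the first six bytes of the packed entry could be assembled with plain shifts: the 2-bit operation, the 10-bit slot split into a low 6-bit and a high 4-bit part, the high nibble of the 36-bit PBN, and the 32 low-order PBN bits stored little-endian. It is only a sketch of the arithmetic under a little-endian reading of the comments, not a byte-for-byte replacement for the bit-field structure.

#include <stdint.h>

/* Illustrative only: pack operation/slot/pbn into the first six bytes. */
static void packEntryPrefix(uint8_t out[6],
                            unsigned operation,  /* 2 bits  */
                            unsigned slot,       /* 10 bits */
                            uint64_t pbn)        /* 36 bits */
{
  out[0] = (uint8_t) ((operation & 0x03) | ((slot & 0x3F) << 2));
  out[1] = (uint8_t) (((slot >> 6) & 0x0F) | (((pbn >> 32) & 0x0F) << 4));
  uint32_t low = (uint32_t) (pbn & 0xFFFFFFFF);
  for (int i = 0; i < 4; i++) {
    out[2 + i] = (uint8_t) (low >> (8 * i));  /* little-endian low word */
  }
}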
- * - * @param entry The recovery journal entry to unpack - * - * @return The unpacked entry - **/ -static inline RecoveryJournalEntry -unpackRecoveryJournalEntry(const PackedRecoveryJournalEntry *entry) -{ - PhysicalBlockNumber low32 = getUInt32LE(entry->fields.pbnLowWord); - PhysicalBlockNumber high4 = entry->fields.pbnHighNibble; - return (RecoveryJournalEntry) { - .operation = entry->fields.operation, - .slot = { - .pbn = ((high4 << 32) | low32), - .slot = (entry->fields.slotLow | (entry->fields.slotHigh << 6)), - }, - .mapping = unpackBlockMapEntry(&entry->fields.blockMapEntry), - }; -} - -#endif // RECOVERY_JOURNAL_ENTRY_H diff --git a/vdo/base/recoveryJournalInternals.h b/vdo/base/recoveryJournalInternals.h deleted file mode 100644 index 0266990..0000000 --- a/vdo/base/recoveryJournalInternals.h +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryJournalInternals.h#10 $ - */ - -#ifndef RECOVERY_JOURNAL_INTERNALS_H -#define RECOVERY_JOURNAL_INTERNALS_H - -#include "numeric.h" - -#include "adminState.h" -#include "fixedLayout.h" -#include "journalPoint.h" -#include "lockCounter.h" -#include "recoveryJournal.h" -#include "ringNode.h" -#include "statistics.h" -#include "types.h" -#include "waitQueue.h" - -typedef struct recoveryJournalBlock RecoveryJournalBlock; - -struct recoveryJournal { - /** The thread ID of the journal zone */ - ThreadID threadID; - /** The slab depot which can hold locks on this journal */ - SlabDepot *depot; - /** The block map which can hold locks on this journal */ - BlockMap *blockMap; - /** The queue of VIOs waiting to make increment entries */ - WaitQueue incrementWaiters; - /** The queue of VIOs waiting to make decrement entries */ - WaitQueue decrementWaiters; - /** The number of free entries in the journal */ - uint64_t availableSpace; - /** The number of decrement entries which need to be made */ - VIOCount pendingDecrementCount; - /** - * Whether the journal is adding entries from the increment or - * decrement waiters queues - **/ - bool addingEntries; - /** The notifier for read-only mode */ - ReadOnlyNotifier *readOnlyNotifier; - /** The administrative state of the journal */ - AdminState state; - /** Whether a reap is in progress */ - bool reaping; - /** The partition which holds the journal on disk */ - Partition *partition; - /** The oldest active block in the journal on disk for block map rebuild */ - SequenceNumber blockMapHead; - /** The oldest active block in the journal on disk for slab journal replay */ - SequenceNumber slabJournalHead; - /** The newest block in the journal on disk to which a write has finished */ - SequenceNumber lastWriteAcknowledged; - /** The end of the half-open interval of the active journal */ - 
SequenceNumber tail; - /** The point at which the last entry will have been added */ - JournalPoint appendPoint; - /** The journal point of the VIO most recently released from the journal */ - JournalPoint commitPoint; - /** The nonce of the VDO */ - Nonce nonce; - /** The number of recoveries completed by the VDO */ - uint8_t recoveryCount; - /** The number of entries which fit in a single block */ - JournalEntryCount entriesPerBlock; - /** Unused in-memory journal blocks */ - RingNode freeTailBlocks; - /** In-memory journal blocks with records */ - RingNode activeTailBlocks; - /** A pointer to the active block (the one we are adding entries to now) */ - RecoveryJournalBlock *activeBlock; - /** Journal blocks that need writing */ - WaitQueue pendingWrites; - /** The new block map reap head after reaping */ - SequenceNumber blockMapReapHead; - /** The head block number for the block map rebuild range */ - BlockCount blockMapHeadBlockNumber; - /** The new slab journal reap head after reaping */ - SequenceNumber slabJournalReapHead; - /** The head block number for the slab journal replay range */ - BlockCount slabJournalHeadBlockNumber; - /** The VIO on which we can call flush (less ick, but still ick) */ - VIO *flushVIO; - /** The data block which must live in the VIO in the flush extent */ - char *unusedFlushVIOData; - /** The number of blocks in the on-disk journal */ - BlockCount size; - /** The number of logical blocks that are in-use */ - BlockCount logicalBlocksUsed; - /** The number of block map pages that are allocated */ - BlockCount blockMapDataBlocks; - /** The number of journal blocks written but not yet acknowledged */ - BlockCount pendingWriteCount; - /** The threshold at which slab journal tail blocks will be written out */ - BlockCount slabJournalCommitThreshold; - /** Counters for events in the journal that are reported as statistics */ - RecoveryJournalStatistics events; - /** The locks for each on-disk block */ - LockCounter *lockCounter; -}; - -/** - * Get the physical block number for a given sequence number. - * - * @param journal The journal - * @param sequence The sequence number of the desired block - * - * @return The block number corresponding to the sequence number - **/ -__attribute__((warn_unused_result)) -static inline PhysicalBlockNumber -getRecoveryJournalBlockNumber(const RecoveryJournal *journal, - SequenceNumber sequence) -{ - // Since journal size is a power of two, the block number modulus can just - // be extracted from the low-order bits of the sequence. - return (sequence & (journal->size - 1)); -} - -/** - * Compute the checkByte for a given sequence number. - * - * @param journal The journal - * @param sequence The sequence number - * - * @return The check byte corresponding to the sequence number - **/ -__attribute__((warn_unused_result)) -static inline uint8_t computeRecoveryCheckByte(const RecoveryJournal *journal, - SequenceNumber sequence) -{ - // The check byte must change with each trip around the journal. - return (((sequence / journal->size) & 0x7F) | 0x80); -} - -#endif // RECOVERY_JOURNAL_INTERNALS_H diff --git a/vdo/base/recoveryUtils.c b/vdo/base/recoveryUtils.c deleted file mode 100644 index 44f16ee..0000000 --- a/vdo/base/recoveryUtils.c +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
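As a side note on the two inline helpers at the end of recoveryJournalInternals.h above: because the on-disk journal size is a power of two, masking with (size - 1) is equivalent to taking the sequence number modulo the size, and the check byte advances once per full lap of the journal while always keeping its top bit set. The small self-check below is not part of the deleted sources and uses a made-up journal size of 64 blocks purely for illustration.

#include <assert.h>
#include <stdint.h>

static void demoJournalMath(void)
{
  const uint64_t size = 64;  /* hypothetical power-of-two journal size */
  for (uint64_t sequence = 0; sequence < 4 * size; sequence++) {
    /* Power-of-two masking matches the modulus. */
    assert((sequence & (size - 1)) == (sequence % size));

    /* The check byte changes each lap and always has the high bit set. */
    uint8_t checkByte = (uint8_t) (((sequence / size) & 0x7F) | 0x80);
    assert((checkByte & 0x80) != 0);
  }
}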
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryUtils.c#4 $ - */ - -#include "recoveryUtils.h" - -#include "logger.h" -#include "memoryAlloc.h" - -#include "completion.h" -#include "extent.h" -#include "packedRecoveryJournalBlock.h" -#include "recoveryJournalEntry.h" -#include "recoveryJournalInternals.h" -#include "slabDepot.h" -#include "vdoInternal.h" - -/** - * Finish loading the journal by freeing the extent and notifying the parent. - * This callback is registered in loadJournalAsync(). - * - * @param completion The load extent - **/ -static void finishJournalLoad(VDOCompletion *completion) -{ - int result = completion->result; - VDOCompletion *parent = completion->parent; - VDOExtent *extent = asVDOExtent(completion); - freeExtent(&extent); - finishCompletion(parent, result); -} - -/**********************************************************************/ -void loadJournalAsync(RecoveryJournal *journal, - VDOCompletion *parent, - char **journalDataPtr) -{ - int result = ALLOCATE(journal->size * VDO_BLOCK_SIZE, char, __func__, - journalDataPtr); - if (result != VDO_SUCCESS) { - finishCompletion(parent, result); - return; - } - - VDOExtent *extent; - result = createExtent(parent->layer, VIO_TYPE_RECOVERY_JOURNAL, - VIO_PRIORITY_METADATA, journal->size, - *journalDataPtr, &extent); - if (result != VDO_SUCCESS) { - finishCompletion(parent, result); - return; - } - - prepareCompletion(&extent->completion, finishJournalLoad, finishJournalLoad, - parent->callbackThreadID, parent); - readMetadataExtent(extent, - getFixedLayoutPartitionOffset(journal->partition)); -} - -/** - * Determine whether the given header describe a valid block for the - * given journal that could appear at the given offset in the journal. 
- * - * @param journal The journal to use - * @param header The unpacked block header to check - * @param offset An offset indicating where the block was in the journal - * - * @return True if the header matches - **/ -__attribute__((warn_unused_result)) -static bool isCongruentRecoveryJournalBlock(RecoveryJournal *journal, - const RecoveryBlockHeader *header, - PhysicalBlockNumber offset) -{ - PhysicalBlockNumber expectedOffset - = getRecoveryJournalBlockNumber(journal, header->sequenceNumber); - return ((expectedOffset == offset) - && isValidRecoveryJournalBlock(journal, header)); -} - -/**********************************************************************/ -bool findHeadAndTail(RecoveryJournal *journal, - char *journalData, - SequenceNumber *tailPtr, - SequenceNumber *blockMapHeadPtr, - SequenceNumber *slabJournalHeadPtr) -{ - SequenceNumber highestTail = journal->tail; - SequenceNumber blockMapHeadMax = 0; - SequenceNumber slabJournalHeadMax = 0; - bool foundEntries = false; - for (PhysicalBlockNumber i = 0; i < journal->size; i++) { - PackedJournalHeader *packedHeader - = getJournalBlockHeader(journal, journalData, i); - RecoveryBlockHeader header; - unpackRecoveryBlockHeader(packedHeader, &header); - - if (!isCongruentRecoveryJournalBlock(journal, &header, i)) { - // This block is old, unformatted, or doesn't belong at this location. - continue; - } - - if (header.sequenceNumber >= highestTail) { - foundEntries = true; - highestTail = header.sequenceNumber; - } - if (header.blockMapHead > blockMapHeadMax) { - blockMapHeadMax = header.blockMapHead; - } - if (header.slabJournalHead > slabJournalHeadMax) { - slabJournalHeadMax = header.slabJournalHead; - } - } - - *tailPtr = highestTail; - if (!foundEntries) { - return false; - } - - *blockMapHeadPtr = blockMapHeadMax; - if (slabJournalHeadPtr != NULL) { - *slabJournalHeadPtr = slabJournalHeadMax; - } - return true; -} - -/**********************************************************************/ -int validateRecoveryJournalEntry(const VDO *vdo, - const RecoveryJournalEntry *entry) -{ - if ((entry->slot.pbn >= vdo->config.physicalBlocks) - || (entry->slot.slot >= BLOCK_MAP_ENTRIES_PER_PAGE) - || !isValidLocation(&entry->mapping) - || !isPhysicalDataBlock(vdo->depot, entry->mapping.pbn)) { - return logErrorWithStringError(VDO_CORRUPT_JOURNAL, "Invalid entry:" - " (%llu, %" PRIu16 ") to %" PRIu64 - " (%s) is not within bounds", - entry->slot.pbn, entry->slot.slot, - entry->mapping.pbn, - getJournalOperationName(entry->operation)); - } - - if ((entry->operation == BLOCK_MAP_INCREMENT) - && (isCompressed(entry->mapping.state) - || (entry->mapping.pbn == ZERO_BLOCK))) { - return logErrorWithStringError(VDO_CORRUPT_JOURNAL, "Invalid entry:" - " (%llu, %" PRIu16 ") to %" PRIu64 - " (%s) is not a valid tree mapping", - entry->slot.pbn, entry->slot.slot, - entry->mapping.pbn, - getJournalOperationName(entry->operation)); - } - - return VDO_SUCCESS; -} diff --git a/vdo/base/recoveryUtils.h b/vdo/base/recoveryUtils.h deleted file mode 100644 index 6778af9..0000000 --- a/vdo/base/recoveryUtils.h +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/recoveryUtils.h#5 $ - */ - -#ifndef RECOVERY_UTILS_H -#define RECOVERY_UTILS_H - -#include "constants.h" -#include "packedRecoveryJournalBlock.h" -#include "recoveryJournalEntry.h" -#include "recoveryJournalInternals.h" -#include "types.h" - -/** - * Get the block header for a block at a position in the journal data. - * - * @param journal The recovery journal - * @param journalData The recovery journal data - * @param sequence The sequence number - * - * @return A pointer to a packed recovery journal block header. - **/ -__attribute__((warn_unused_result)) -static inline -PackedJournalHeader *getJournalBlockHeader(RecoveryJournal *journal, - char *journalData, - SequenceNumber sequence) -{ - off_t blockOffset = (getRecoveryJournalBlockNumber(journal, sequence) - * VDO_BLOCK_SIZE); - return (PackedJournalHeader *) &journalData[blockOffset]; -} - -/** - * Determine whether the given header describes a valid block for the - * given journal. A block is not valid if it is unformatted, or if it - * is older than the last successful recovery or reformat. - * - * @param journal The journal to use - * @param header The unpacked block header to check - * - * @return True if the header is valid - **/ -__attribute__((warn_unused_result)) -static inline -bool isValidRecoveryJournalBlock(const RecoveryJournal *journal, - const RecoveryBlockHeader *header) -{ - return ((header->metadataType == VDO_METADATA_RECOVERY_JOURNAL) - && (header->nonce == journal->nonce) - && (header->recoveryCount == journal->recoveryCount)); -} - -/** - * Determine whether the given header describes the exact block indicated. - * - * @param journal The journal to use - * @param header The unpacked block header to check - * @param sequence The expected sequence number - * - * @return True if the block matches - **/ -__attribute__((warn_unused_result)) -static inline -bool isExactRecoveryJournalBlock(const RecoveryJournal *journal, - const RecoveryBlockHeader *header, - SequenceNumber sequence) -{ - return ((header->sequenceNumber == sequence) - && isValidRecoveryJournalBlock(journal, header)); -} - -/** - * Determine whether the header of the given sector could describe a - * valid sector for the given journal block header. - * - * @param header The unpacked block header to compare against - * @param sector The packed sector to check - * - * @return True if the sector matches the block header - **/ -__attribute__((warn_unused_result)) -static inline -bool isValidRecoveryJournalSector(const RecoveryBlockHeader *header, - const PackedJournalSector *sector) -{ - return ((header->checkByte == sector->checkByte) - && (header->recoveryCount == sector->recoveryCount)); -} - -/** - * Load the journal data off the disk. 
- * - * @param [in] journal The recovery journal to load - * @param [in] parent The completion to notify when the load is - * complete - * @param [out] journalDataPtr A pointer to the journal data buffer (it is the - * caller's responsibility to free this buffer) - **/ -void loadJournalAsync(RecoveryJournal *journal, - VDOCompletion *parent, - char **journalDataPtr); - -/** - * Find the tail and the head of the journal by searching for the highest - * sequence number in a block with a valid nonce, and the highest head value - * among the blocks with valid nonces. - * - * @param [in] journal The recovery journal - * @param [in] journalData The journal data read from disk - * @param [out] tailPtr A pointer to return the tail found, or if - * no higher block is found, the value - * currently in the journal - * @param [out] blockMapHeadPtr A pointer to return the block map head - * @param [out] slabJournalHeadPtr An optional pointer to return the slab - * journal head - * - * @return True if there were valid journal blocks - **/ -bool findHeadAndTail(RecoveryJournal *journal, - char *journalData, - SequenceNumber *tailPtr, - SequenceNumber *blockMapHeadPtr, - SequenceNumber *slabJournalHeadPtr); - -/** - * Validate a recovery journal entry. - * - * @param vdo The VDO - * @param entry The entry to validate - * - * @return VDO_SUCCESS or an error - **/ -int validateRecoveryJournalEntry(const VDO *vdo, - const RecoveryJournalEntry *entry) - __attribute__((warn_unused_result)); - -#endif // RECOVERY_UTILS_H diff --git a/vdo/base/refCounts.c b/vdo/base/refCounts.c deleted file mode 100644 index daf04c4..0000000 --- a/vdo/base/refCounts.c +++ /dev/null @@ -1,1451 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/refCounts.c#9 $ - */ - -#include "refCounts.h" -#include "refCountsInternals.h" - -#include "logger.h" -#include "memoryAlloc.h" -#include "numeric.h" -#include "permassert.h" - -#include "adminState.h" -#include "blockAllocatorInternals.h" -#include "completion.h" -#include "extent.h" -#include "header.h" -#include "journalPoint.h" -#include "numUtils.h" -#include "pbnLock.h" -#include "readOnlyNotifier.h" -#include "referenceBlock.h" -#include "referenceOperation.h" -#include "slab.h" -#include "slabJournal.h" -#include "slabJournalInternals.h" -#include "slabSummary.h" -#include "statusCodes.h" -#include "stringUtils.h" -#include "vdo.h" -#include "vioPool.h" -#include "waitQueue.h" - -static const uint64_t BYTES_PER_WORD = sizeof(uint64_t); -static const bool NORMAL_OPERATION = true; - -/** - * Return the RefCounts from the RefCounts waiter. 
- * - * @param waiter The waiter to convert - * - * @return The RefCounts - **/ -__attribute__((warn_unused_result)) -static inline RefCounts *refCountsFromWaiter(Waiter *waiter) -{ - if (waiter == NULL) { - return NULL; - } - return (RefCounts *) - ((uintptr_t) waiter - offsetof(RefCounts, slabSummaryWaiter)); -} - -/** - * Convert the index of a reference counter back to the block number of the - * physical block for which it is counting references. The index is assumed to - * be valid and in-range. - * - * @param refCounts The reference counts object - * @param index The array index of the reference counter - * - * @return the physical block number corresponding to the index - **/ -static PhysicalBlockNumber indexToPBN(const RefCounts *refCounts, - uint64_t index) -{ - return (refCounts->slab->start + index); -} - -/** - * Convert a block number to the index of a reference counter for that block. - * Out of range values are pinned to the beginning or one past the end of the - * array. - * - * @param refCounts The reference counts object - * @param pbn The physical block number - * - * @return the index corresponding to the physical block number - **/ -static uint64_t pbnToIndex(const RefCounts *refCounts, PhysicalBlockNumber pbn) -{ - if (pbn < refCounts->slab->start) { - return 0; - } - uint64_t index = (pbn - refCounts->slab->start); - return minBlock(index, refCounts->blockCount); -} - -/**********************************************************************/ -ReferenceStatus referenceCountToStatus(ReferenceCount count) -{ - if (count == EMPTY_REFERENCE_COUNT) { - return RS_FREE; - } else if (count == 1) { - return RS_SINGLE; - } else if (count == PROVISIONAL_REFERENCE_COUNT) { - return RS_PROVISIONAL; - } else { - return RS_SHARED; - } -} - -/** - * Reset the free block search back to the first reference counter - * in the first reference block. - * - * @param refCounts The RefCounts object containing the search cursor - **/ -static void resetSearchCursor(RefCounts *refCounts) -{ - SearchCursor *cursor = &refCounts->searchCursor; - - cursor->block = cursor->firstBlock; - cursor->index = 0; - // Unit tests have slabs with only one reference block (and it's a runt). - cursor->endIndex = minBlock(COUNTS_PER_BLOCK, refCounts->blockCount); -} - -/** - * Advance the search cursor to the start of the next reference block, - * wrapping around to the first reference block if the current block is the - * last reference block. - * - * @param refCounts The RefCounts object containing the search cursor - * - * @return true unless the cursor was at the last reference block - **/ -static bool advanceSearchCursor(RefCounts *refCounts) -{ - SearchCursor *cursor = &refCounts->searchCursor; - - // If we just finished searching the last reference block, then wrap back - // around to the start of the array. - if (cursor->block == cursor->lastBlock) { - resetSearchCursor(refCounts); - return false; - } - - // We're not already at the end, so advance to cursor to the next block. - cursor->block++; - cursor->index = cursor->endIndex; - - if (cursor->block == cursor->lastBlock) { - // The last reference block will usually be a runt. 
- cursor->endIndex = refCounts->blockCount; - } else { - cursor->endIndex += COUNTS_PER_BLOCK; - } - return true; -} - -/**********************************************************************/ -int makeRefCounts(BlockCount blockCount, - Slab *slab, - PhysicalBlockNumber origin, - ReadOnlyNotifier *readOnlyNotifier, - RefCounts **refCountsPtr) -{ - BlockCount refBlockCount = getSavedReferenceCountSize(blockCount); - RefCounts *refCounts; - int result = ALLOCATE_EXTENDED(RefCounts, refBlockCount, ReferenceBlock, - "ref counts structure", &refCounts); - if (result != UDS_SUCCESS) { - return result; - } - - // Allocate such that the runt slab has a full-length memory array, - // plus a little padding so we can word-search even at the very end. - size_t bytes = ((refBlockCount * COUNTS_PER_BLOCK) + (2 * BYTES_PER_WORD)); - result = ALLOCATE(bytes, ReferenceCount, "ref counts array", - &refCounts->counters); - if (result != UDS_SUCCESS) { - freeRefCounts(&refCounts); - return result; - } - - refCounts->slab = slab; - refCounts->blockCount = blockCount; - refCounts->freeBlocks = blockCount; - refCounts->origin = origin; - refCounts->referenceBlockCount = refBlockCount; - refCounts->readOnlyNotifier = readOnlyNotifier; - refCounts->statistics = &slab->allocator->refCountStatistics; - refCounts->searchCursor.firstBlock = &refCounts->blocks[0]; - refCounts->searchCursor.lastBlock = &refCounts->blocks[refBlockCount - 1]; - resetSearchCursor(refCounts); - - for (size_t index = 0; index < refBlockCount; index++) { - refCounts->blocks[index] = (ReferenceBlock) { - .refCounts = refCounts, - }; - } - - *refCountsPtr = refCounts; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freeRefCounts(RefCounts **refCountsPtr) -{ - RefCounts *refCounts = *refCountsPtr; - if (refCounts == NULL) { - return; - } - - FREE(refCounts->counters); - FREE(refCounts); - *refCountsPtr = NULL; -} - -/** - * Check whether a RefCounts has active I/O. - * - * @param refCounts The RefCounts to check - * - * @return true if there is reference block I/O or a summary - * update in progress - **/ -__attribute__((warn_unused_result)) -static bool hasActiveIO(RefCounts *refCounts) -{ - return ((refCounts->activeCount > 0) || refCounts->updatingSlabSummary); -} - -/**********************************************************************/ -bool areRefCountsActive(RefCounts *refCounts) -{ - if (hasActiveIO(refCounts)) { - return true; - } - - // When not suspending or recovering, the refCounts must be clean. - AdminStateCode code = refCounts->slab->state.state; - return (hasWaiters(&refCounts->dirtyBlocks) - && (code != ADMIN_STATE_SUSPENDING) - && (code != ADMIN_STATE_RECOVERING)); -} - -/**********************************************************************/ -static void enterRefCountsReadOnlyMode(RefCounts *refCounts, int result) -{ - enterReadOnlyMode(refCounts->readOnlyNotifier, result); - checkIfSlabDrained(refCounts->slab); -} - -/** - * Enqueue a block on the dirty queue. - * - * @param block The block to enqueue - **/ -static void enqueueDirtyBlock(ReferenceBlock *block) -{ - int result = enqueueWaiter(&block->refCounts->dirtyBlocks, &block->waiter); - if (result != VDO_SUCCESS) { - // This should never happen. - enterRefCountsReadOnlyMode(block->refCounts, result); - } -} - -/** - * Mark a reference count block as dirty, potentially adding it to the dirty - * queue if it wasn't already dirty. 
- * - * @param block The reference block to mark as dirty - **/ -static void dirtyBlock(ReferenceBlock *block) -{ - if (block->isDirty) { - return; - } - - block->isDirty = true; - if (block->isWriting) { - // The conclusion of the current write will enqueue the block again. - return; - } - - enqueueDirtyBlock(block); -} - -/**********************************************************************/ -BlockCount getUnreferencedBlockCount(RefCounts *refCounts) -{ - return refCounts->freeBlocks; -} - -/**********************************************************************/ -ReferenceBlock *getReferenceBlock(RefCounts *refCounts, SlabBlockNumber index) -{ - return &refCounts->blocks[index / COUNTS_PER_BLOCK]; -} - -/** - * Get the reference counter that covers the given physical block number. - * - * @param [in] refCounts The refcounts object - * @param [in] pbn The physical block number - * @param [out] counterPtr A pointer to the reference counter - - **/ -static int getReferenceCounter(RefCounts *refCounts, - PhysicalBlockNumber pbn, - ReferenceCount **counterPtr) -{ - SlabBlockNumber index; - int result = slabBlockNumberFromPBN(refCounts->slab, pbn, &index); - if (result != VDO_SUCCESS) { - return result; - } - - *counterPtr = &refCounts->counters[index]; - - return VDO_SUCCESS; -} - -/**********************************************************************/ -uint8_t getAvailableReferences(RefCounts *refCounts, PhysicalBlockNumber pbn) -{ - ReferenceCount *counterPtr = NULL; - int result = getReferenceCounter(refCounts, pbn, &counterPtr); - if (result != VDO_SUCCESS) { - return 0; - } - - if (*counterPtr == PROVISIONAL_REFERENCE_COUNT) { - return (MAXIMUM_REFERENCE_COUNT - 1); - } - - return (MAXIMUM_REFERENCE_COUNT - *counterPtr); -} - -/** - * Increment the reference count for a data block. - * - * @param [in] refCounts The refCounts responsible for the block - * @param [in] block The reference block which contains the - * block being updated - * @param [in] slabBlockNumber The block to update - * @param [in] oldStatus The reference status of the data block - * before this increment - * @param [in] lock The PBNLock associated with this - * increment (may be NULL) - * @param [in,out] counterPtr A pointer to the count for the data block - * @param [out] freeStatusChanged A pointer which will be set to true if - * this update changed the free status of - * the block - * - * @return VDO_SUCCESS or an error - **/ -static int incrementForData(RefCounts *refCounts, - ReferenceBlock *block, - SlabBlockNumber slabBlockNumber, - ReferenceStatus oldStatus, - PBNLock *lock, - ReferenceCount *counterPtr, - bool *freeStatusChanged) -{ - switch (oldStatus) { - case RS_FREE: - *counterPtr = 1; - block->allocatedCount++; - refCounts->freeBlocks--; - *freeStatusChanged = true; - break; - - case RS_PROVISIONAL: - *counterPtr = 1; - *freeStatusChanged = false; - break; - - default: - // Single or shared - if (*counterPtr >= MAXIMUM_REFERENCE_COUNT) { - return logErrorWithStringError(VDO_REF_COUNT_INVALID, - "Incrementing a block already having" - " 254 references (slab %u, offset %" - PRIu32 ")", - refCounts->slab->slabNumber, - slabBlockNumber); - } - (*counterPtr)++; - *freeStatusChanged = false; - } - - if (lock != NULL) { - unassignProvisionalReference(lock); - } - return VDO_SUCCESS; -} - -/** - * Decrement the reference count for a data block. 
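A condensed model of the counter semantics that incrementForData() above relies on may help: a count of zero means free, one means singly referenced, a dedicated sentinel marks a provisional (not yet committed) reference, and anything else is shared, with increments refused once the shared count saturates. This sketch is not part of the deleted file, and the sentinel and maximum values below are stand-ins chosen for illustration, not the constants defined elsewhere in VDO.

#include <stdbool.h>
#include <stdint.h>

/* Stand-in values; the real constants live in VDO's reference-count headers. */
enum { EMPTY_COUNT = 0, PROVISIONAL_COUNT = 255, MAX_COUNT = 254 };

typedef enum { REF_FREE, REF_SINGLE, REF_PROVISIONAL, REF_SHARED } RefStatus;

static RefStatus countToStatus(uint8_t count)
{
  if (count == EMPTY_COUNT)       return REF_FREE;
  if (count == 1)                 return REF_SINGLE;
  if (count == PROVISIONAL_COUNT) return REF_PROVISIONAL;
  return REF_SHARED;
}

/* Try to add one data reference; false if the count is already saturated. */
static bool tryIncrement(uint8_t *count)
{
  switch (countToStatus(*count)) {
  case REF_FREE:
  case REF_PROVISIONAL:
    *count = 1;
    return true;
  default:
    if (*count >= MAX_COUNT) {
      return false;  /* mirrors the "already having 254 references" error */
    }
    (*count)++;
    return true;
  }
}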
- * - * @param [in] refCounts The refCounts responsible for the block - * @param [in] block The reference block which contains the - * block being updated - * @param [in] slabBlockNumber The block to update - * @param [in] oldStatus The reference status of the data block - * before this decrement - * @param [in] lock The PBNLock associated with the block - * being decremented (may be NULL) - * @param [in,out] counterPtr A pointer to the count for the data block - * @param [out] freeStatusChanged A pointer which will be set to true if - * this update changed the free status of - * the block - * - * @return VDO_SUCCESS or an error - **/ -static int decrementForData(RefCounts *refCounts, - ReferenceBlock *block, - SlabBlockNumber slabBlockNumber, - ReferenceStatus oldStatus, - PBNLock *lock, - ReferenceCount *counterPtr, - bool *freeStatusChanged) -{ - switch (oldStatus) { - case RS_FREE: - return logErrorWithStringError(VDO_REF_COUNT_INVALID, - "Decrementing free block at offset %" - PRIu32 " in slab %u", slabBlockNumber, - refCounts->slab->slabNumber); - - case RS_PROVISIONAL: - case RS_SINGLE: - if (lock != NULL) { - // There is a read lock on this block, so the block must not become - // unreferenced. - *counterPtr = PROVISIONAL_REFERENCE_COUNT; - *freeStatusChanged = false; - assignProvisionalReference(lock); - } else { - *counterPtr = EMPTY_REFERENCE_COUNT; - block->allocatedCount--; - refCounts->freeBlocks++; - *freeStatusChanged = true; - } - break; - - default: - // Shared - (*counterPtr)--; - *freeStatusChanged = false; - } - - return VDO_SUCCESS; -} - -/** - * Increment the reference count for a block map page. All block map increments - * should be from provisional to MAXIMUM_REFERENCE_COUNT. Since block map blocks - * never dedupe they should never be adjusted from any other state. The - * adjustment always results in MAXIMUM_REFERENCE_COUNT as this value is used to - * prevent dedupe against block map blocks. - * - * @param [in] refCounts The refCounts responsible for the block - * @param [in] block The reference block which contains the - * block being updated - * @param [in] slabBlockNumber The block to update - * @param [in] oldStatus The reference status of the block - * before this increment - * @param [in] lock The PBNLock associated with this - * increment (may be NULL) - * @param [in] normalOperation Whether we are in normal operation vs. 
- * recovery or rebuild - * @param [in,out] counterPtr A pointer to the count for the block - * @param [out] freeStatusChanged A pointer which will be set to true if - * this update changed the free status of the - * block - * - * @return VDO_SUCCESS or an error - **/ -static int incrementForBlockMap(RefCounts *refCounts, - ReferenceBlock *block, - SlabBlockNumber slabBlockNumber, - ReferenceStatus oldStatus, - PBNLock *lock, - bool normalOperation, - ReferenceCount *counterPtr, - bool *freeStatusChanged) -{ - switch (oldStatus) { - case RS_FREE: - if (normalOperation) { - return logErrorWithStringError(VDO_REF_COUNT_INVALID, - "Incrementing unallocated block map block" - " (slab %u, offset %" PRIu32 ")", - refCounts->slab->slabNumber, - slabBlockNumber); - } - - *counterPtr = MAXIMUM_REFERENCE_COUNT; - block->allocatedCount++; - refCounts->freeBlocks--; - *freeStatusChanged = true; - return VDO_SUCCESS; - - case RS_PROVISIONAL: - if (!normalOperation) { - return logErrorWithStringError(VDO_REF_COUNT_INVALID, - "Block map block had provisional " - "reference during replay" - " (slab %u, offset %" PRIu32 ")", - refCounts->slab->slabNumber, - slabBlockNumber); - } - - *counterPtr = MAXIMUM_REFERENCE_COUNT; - *freeStatusChanged = false; - if (lock != NULL) { - unassignProvisionalReference(lock); - } - return VDO_SUCCESS; - - default: - return logErrorWithStringError(VDO_REF_COUNT_INVALID, - "Incrementing a block map block which is " - "already referenced %u times (slab %u, " - "offset %" PRIu32 ")", - *counterPtr, - refCounts->slab->slabNumber, - slabBlockNumber); - } -} - -/** - * Update the reference count of a block. - * - * @param [in] refCounts The refCounts responsible for the - * block - * @param [in] block The reference block which contains the - * block being updated - * @param [in] slabBlockNumber The block to update - * @param [in] slabJournalPoint The slab journal point at which this - * update is journaled - * @param [in] operation How to update the count - * @param [in] normalOperation Whether we are in normal operation vs. 
- * recovery or rebuild - * @param [out] freeStatusChanged A pointer which will be set to true if - * this update changed the free status of - * the block - * @param [out] provisionalDecrementPtr A pointer which will be set to true if - * this update was a decrement of a - * provisional reference - * - * @return VDO_SUCCESS or an error - **/ -static int updateReferenceCount(RefCounts *refCounts, - ReferenceBlock *block, - SlabBlockNumber slabBlockNumber, - const JournalPoint *slabJournalPoint, - ReferenceOperation operation, - bool normalOperation, - bool *freeStatusChanged, - bool *provisionalDecrementPtr) -{ - ReferenceCount *counterPtr = &refCounts->counters[slabBlockNumber]; - ReferenceStatus oldStatus = referenceCountToStatus(*counterPtr); - PBNLock *lock = getReferenceOperationPBNLock(operation); - int result; - - switch (operation.type) { - case DATA_INCREMENT: - result = incrementForData(refCounts, block, slabBlockNumber, oldStatus, - lock, counterPtr, freeStatusChanged); - break; - - case DATA_DECREMENT: - result = decrementForData(refCounts, block, slabBlockNumber, oldStatus, - lock, counterPtr, freeStatusChanged); - if ((result == VDO_SUCCESS) && (oldStatus == RS_PROVISIONAL)) { - if (provisionalDecrementPtr != NULL) { - *provisionalDecrementPtr = true; - } - return VDO_SUCCESS; - } - break; - - case BLOCK_MAP_INCREMENT: - result = incrementForBlockMap(refCounts, block, slabBlockNumber, oldStatus, - lock, normalOperation, counterPtr, - freeStatusChanged); - break; - - default: - logError("Unknown reference count operation: %u", operation.type); - enterRefCountsReadOnlyMode(refCounts, VDO_NOT_IMPLEMENTED); - result = VDO_NOT_IMPLEMENTED; - } - - if (result != VDO_SUCCESS) { - return result; - } - - if (isValidJournalPoint(slabJournalPoint)) { - refCounts->slabJournalPoint = *slabJournalPoint; - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -int adjustReferenceCount(RefCounts *refCounts, - ReferenceOperation operation, - const JournalPoint *slabJournalPoint, - bool *freeStatusChanged) -{ - if (!isSlabOpen(refCounts->slab)) { - return VDO_INVALID_ADMIN_STATE; - } - - SlabBlockNumber slabBlockNumber; - int result = slabBlockNumberFromPBN(refCounts->slab, operation.pbn, - &slabBlockNumber); - if (result != VDO_SUCCESS) { - return result; - } - - ReferenceBlock *block = getReferenceBlock(refCounts, slabBlockNumber); - bool provisionalDecrement = false; - result = updateReferenceCount(refCounts, block, slabBlockNumber, - slabJournalPoint, operation, - NORMAL_OPERATION, freeStatusChanged, - &provisionalDecrement); - if ((result != VDO_SUCCESS) || provisionalDecrement) { - return result; - } - - if (block->isDirty && (block->slabJournalLock > 0)) { - /* - * This block is already dirty and a slab journal entry has been made - * for it since the last time it was clean. We must release the per-entry - * slab journal lock for the entry associated with the update we are now - * doing. - */ - result = ASSERT(isValidJournalPoint(slabJournalPoint), - "Reference count adjustments need slab journal points."); - if (result != VDO_SUCCESS) { - return result; - } - - SequenceNumber entryLock = slabJournalPoint->sequenceNumber; - adjustSlabJournalBlockReference(refCounts->slab->journal, entryLock, -1); - return VDO_SUCCESS; - } - - /* - * This may be the first time we are applying an update for which there - * is a slab journal entry to this block since the block was - * cleaned. 
Therefore, we convert the per-entry slab journal lock to an - * uncommitted reference block lock, if there is a per-entry lock. - */ - if (isValidJournalPoint(slabJournalPoint)) { - block->slabJournalLock = slabJournalPoint->sequenceNumber; - } else { - block->slabJournalLock = 0; - } - - dirtyBlock(block); - return VDO_SUCCESS; -} - -/**********************************************************************/ -int adjustReferenceCountForRebuild(RefCounts *refCounts, - PhysicalBlockNumber pbn, - JournalOperation operation) -{ - SlabBlockNumber slabBlockNumber; - int result = slabBlockNumberFromPBN(refCounts->slab, pbn, &slabBlockNumber); - if (result != VDO_SUCCESS) { - return result; - } - - ReferenceBlock *block = getReferenceBlock(refCounts, slabBlockNumber); - bool unusedFreeStatus; - ReferenceOperation physicalOperation = { - .type = operation, - }; - result = updateReferenceCount(refCounts, block, slabBlockNumber, NULL, - physicalOperation, !NORMAL_OPERATION, - &unusedFreeStatus, NULL); - if (result != VDO_SUCCESS) { - return result; - } - - dirtyBlock(block); - return VDO_SUCCESS; -} - -/**********************************************************************/ -int replayReferenceCountChange(RefCounts *refCounts, - const JournalPoint *entryPoint, - SlabJournalEntry entry) -{ - ReferenceBlock *block = getReferenceBlock(refCounts, entry.sbn); - SectorCount sector - = (entry.sbn % COUNTS_PER_BLOCK) / COUNTS_PER_SECTOR; - if (!beforeJournalPoint(&block->commitPoints[sector], entryPoint)) { - // This entry is already reflected in the existing counts, so do nothing. - return VDO_SUCCESS; - } - - // This entry is not yet counted in the reference counts. - bool unusedFreeStatus; - ReferenceOperation operation = { - .type = entry.operation - }; - int result = updateReferenceCount(refCounts, block, entry.sbn, - entryPoint, operation, !NORMAL_OPERATION, - &unusedFreeStatus, NULL); - if (result != VDO_SUCCESS) { - return result; - } - - dirtyBlock(block); - return VDO_SUCCESS; -} - -/**********************************************************************/ -int getReferenceStatus(RefCounts *refCounts, - PhysicalBlockNumber pbn, - ReferenceStatus *statusPtr) -{ - ReferenceCount *counterPtr = NULL; - int result = getReferenceCounter(refCounts, pbn, &counterPtr); - if (result != VDO_SUCCESS) { - return result; - } - - *statusPtr = referenceCountToStatus(*counterPtr); - return VDO_SUCCESS; -} - -/**********************************************************************/ -bool areEquivalentReferenceCounters(RefCounts *counterA, RefCounts *counterB) -{ - if ((counterA->blockCount != counterB->blockCount) - || (counterA->freeBlocks != counterB->freeBlocks) - || (counterA->referenceBlockCount != counterB->referenceBlockCount)) { - return false; - } - - for (size_t i = 0; i < counterA->referenceBlockCount; i++) { - ReferenceBlock *blockA = &counterA->blocks[i]; - ReferenceBlock *blockB = &counterB->blocks[i]; - if (blockA->allocatedCount != blockB->allocatedCount) { - return false; - } - } - - return (memcmp(counterA->counters, counterB->counters, - sizeof(ReferenceCount) * counterA->blockCount) == 0); -} - -/** - * Find the array index of the first zero byte in word-sized range of - * reference counters. The search does no bounds checking; the function relies - * on the array being sufficiently padded. 
- * - * @param wordPtr A pointer to the eight counter bytes to check - * @param startIndex The array index corresponding to wordPtr[0] - * @param failIndex The array index to return if no zero byte is found - - * @return the array index of the first zero byte in the word, or - * the value passed as failIndex if no zero byte was found - **/ -static inline SlabBlockNumber findZeroByteInWord(const byte *wordPtr, - SlabBlockNumber startIndex, - SlabBlockNumber failIndex) -{ - uint64_t word = getUInt64LE(wordPtr); - - // This looks like a loop, but GCC will unroll the eight iterations for us. - for (unsigned int offset = 0; offset < BYTES_PER_WORD; offset++) { - // Assumes little-endian byte order, which we have on X86. - if ((word & 0xFF) == 0) { - return (startIndex + offset); - } - word >>= 8; - } - - return failIndex; -} - -/**********************************************************************/ -bool findFreeBlock(const RefCounts *refCounts, - SlabBlockNumber startIndex, - SlabBlockNumber endIndex, - SlabBlockNumber *indexPtr) -{ - SlabBlockNumber zeroIndex; - SlabBlockNumber nextIndex = startIndex; - byte *nextCounter = &refCounts->counters[nextIndex]; - byte *endCounter = &refCounts->counters[endIndex]; - - // Search every byte of the first unaligned word. (Array is padded so - // reading past end is safe.) - zeroIndex = findZeroByteInWord(nextCounter, nextIndex, endIndex); - if (zeroIndex < endIndex) { - *indexPtr = zeroIndex; - return true; - } - - // On architectures where unaligned word access is expensive, this - // would be a good place to advance to an alignment boundary. - nextIndex += BYTES_PER_WORD; - nextCounter += BYTES_PER_WORD; - - // Now we're word-aligned; check an word at a time until we find a word - // containing a zero. (Array is padded so reading past end is safe.) - while (nextCounter < endCounter) { - /* - * The following code is currently an exact copy of the code preceding the - * loop, but if you try to merge them by using a do loop, it runs slower - * because a jump instruction gets added at the start of the iteration. - */ - zeroIndex = findZeroByteInWord(nextCounter, nextIndex, endIndex); - if (zeroIndex < endIndex) { - *indexPtr = zeroIndex; - return true; - } - - nextIndex += BYTES_PER_WORD; - nextCounter += BYTES_PER_WORD; - } - - return false; -} - -/** - * Search the reference block currently saved in the search cursor for a - * reference count of zero, starting at the saved counter index. - * - * @param [in] refCounts The RefCounts object to search - * @param [out] freeIndexPtr A pointer to receive the array index of the - * zero reference count - * - * @return true if an unreferenced counter was found - **/ -static bool searchCurrentReferenceBlock(const RefCounts *refCounts, - SlabBlockNumber *freeIndexPtr) -{ - // Don't bother searching if the current block is known to be full. - return ((refCounts->searchCursor.block->allocatedCount < COUNTS_PER_BLOCK) - && findFreeBlock(refCounts, refCounts->searchCursor.index, - refCounts->searchCursor.endIndex, freeIndexPtr)); -} - -/** - * Search each reference block for a reference count of zero, starting at the - * reference block and counter index saved in the search cursor and searching - * up to the end of the last reference block. The search does not wrap. 
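findZeroByteInWord() and findFreeBlock() above speed up the free-block search by examining eight one-byte counters at a time as a single little-endian 64-bit word. A small standalone version of the same byte-at-a-time scan is sketched below; it is not part of the deleted file, the helper name is invented, and the memcpy-based load assumes a little-endian host, as the original comment does for x86.

#include <stdint.h>
#include <string.h>

/* Return the index (0..7) of the first zero byte in the word, or 8 if none. */
static unsigned firstZeroByteIndex(const uint8_t bytes[8])
{
  uint64_t word;
  memcpy(&word, bytes, sizeof(word));  /* assumes a little-endian host */
  for (unsigned offset = 0; offset < 8; offset++) {
    if ((word & 0xFF) == 0) {
      return offset;
    }
    word >>= 8;
  }
  return 8;
}

A branch-free variant using the classic (word - 0x0101010101010101) & ~word & 0x8080808080808080 test would also find a zero byte, but the simple loop mirrors the original code, which counts on the compiler unrolling its eight iterations.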
- * - * @param [in] refCounts The RefCounts object to search - * @param [out] freeIndexPtr A pointer to receive the array index of the - * zero reference count - * - * @return true if an unreferenced counter was found - **/ -static bool searchReferenceBlocks(RefCounts *refCounts, - SlabBlockNumber *freeIndexPtr) -{ - // Start searching at the saved search position in the current block. - if (searchCurrentReferenceBlock(refCounts, freeIndexPtr)) { - return true; - } - - // Search each reference block up to the end of the slab. - while (advanceSearchCursor(refCounts)) { - if (searchCurrentReferenceBlock(refCounts, freeIndexPtr)) { - return true; - } - } - - return false; -} - -/** - * Do the bookkeeping for making a provisional reference. - * - * @param refCounts The RefCounts - * @param slabBlockNumber The block to reference - **/ -static void makeProvisionalReference(RefCounts *refCounts, - SlabBlockNumber slabBlockNumber) -{ - // Make the initial transition from an unreferenced block to a provisionally - // allocated block. - refCounts->counters[slabBlockNumber] = PROVISIONAL_REFERENCE_COUNT; - - // Account for the allocation. - ReferenceBlock *block = getReferenceBlock(refCounts, slabBlockNumber); - block->allocatedCount++; - refCounts->freeBlocks--; -} - -/**********************************************************************/ -int allocateUnreferencedBlock(RefCounts *refCounts, - PhysicalBlockNumber *allocatedPtr) -{ - if (!isSlabOpen(refCounts->slab)) { - return VDO_INVALID_ADMIN_STATE; - } - - SlabBlockNumber freeIndex; - if (!searchReferenceBlocks(refCounts, &freeIndex)) { - return VDO_NO_SPACE; - } - - ASSERT_LOG_ONLY((refCounts->counters[freeIndex] == EMPTY_REFERENCE_COUNT), - "free block must have refCount of zero"); - makeProvisionalReference(refCounts, freeIndex); - - // Update the search hint so the next search will start at the array - // index just past the free block we just found. - refCounts->searchCursor.index = (freeIndex + 1); - - *allocatedPtr = indexToPBN(refCounts, freeIndex); - return VDO_SUCCESS; -} - -/**********************************************************************/ -int provisionallyReferenceBlock(RefCounts *refCounts, - PhysicalBlockNumber pbn, - PBNLock *lock) -{ - if (!isSlabOpen(refCounts->slab)) { - return VDO_INVALID_ADMIN_STATE; - } - - SlabBlockNumber slabBlockNumber; - int result = slabBlockNumberFromPBN(refCounts->slab, pbn, &slabBlockNumber); - if (result != VDO_SUCCESS) { - return result; - } - - if (refCounts->counters[slabBlockNumber] == EMPTY_REFERENCE_COUNT) { - makeProvisionalReference(refCounts, slabBlockNumber); - if (lock != NULL) { - assignProvisionalReference(lock); - } - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -BlockCount countUnreferencedBlocks(RefCounts *refCounts, - PhysicalBlockNumber startPBN, - PhysicalBlockNumber endPBN) -{ - BlockCount freeBlocks = 0; - SlabBlockNumber startIndex = pbnToIndex(refCounts, startPBN); - SlabBlockNumber endIndex = pbnToIndex(refCounts, endPBN); - for (SlabBlockNumber index = startIndex; index < endIndex; index++) { - if (refCounts->counters[index] == EMPTY_REFERENCE_COUNT) { - freeBlocks++; - } - } - - return freeBlocks; -} - -/** - * Convert a ReferenceBlock's generic wait queue entry back into the - * ReferenceBlock. 
- * - * @param waiter The wait queue entry to convert - * - * @return The wrapping ReferenceBlock - **/ -static inline ReferenceBlock *waiterAsReferenceBlock(Waiter *waiter) -{ - STATIC_ASSERT(offsetof(ReferenceBlock, waiter) == 0); - return (ReferenceBlock *) waiter; -} - -/** - * WaitCallback to clean dirty reference blocks when resetting. - * - * @param blockWaiter The dirty block - * @param context Unused - **/ -static void -clearDirtyReferenceBlocks(Waiter *blockWaiter, - void *context __attribute__((unused))) -{ - waiterAsReferenceBlock(blockWaiter)->isDirty = false; -} - -/**********************************************************************/ -void resetReferenceCounts(RefCounts *refCounts) -{ - // We can just use memset() since each ReferenceCount is exactly one byte. - STATIC_ASSERT(sizeof(ReferenceCount) == 1); - memset(refCounts->counters, 0, refCounts->blockCount); - refCounts->freeBlocks = refCounts->blockCount; - refCounts->slabJournalPoint = (JournalPoint) { - .sequenceNumber = 0, - .entryCount = 0, - }; - - for (size_t i = 0; i < refCounts->referenceBlockCount; i++) { - refCounts->blocks[i].allocatedCount = 0; - } - - notifyAllWaiters(&refCounts->dirtyBlocks, clearDirtyReferenceBlocks, NULL); -} - -/**********************************************************************/ -BlockCount getSavedReferenceCountSize(BlockCount blockCount) -{ - return computeBucketCount(blockCount, COUNTS_PER_BLOCK); -} - -/** - * A waiter callback that resets the writing state of refCounts. - **/ -static void finishSummaryUpdate(Waiter *waiter, void *context) -{ - RefCounts *refCounts = refCountsFromWaiter(waiter); - refCounts->updatingSlabSummary = false; - - int result = *((int *) context); - if ((result == VDO_SUCCESS) || (result == VDO_READ_ONLY)) { - checkIfSlabDrained(refCounts->slab); - return; - } - - logErrorWithStringError(result, "failed to update slab summary"); - enterRefCountsReadOnlyMode(refCounts, result); -} - -/** - * Update slab summary that the RefCounts is clean. - * - * @param refCounts The RefCounts object that is being written - **/ -static void updateSlabSummaryAsClean(RefCounts *refCounts) -{ - SlabSummaryZone *summary = getSlabSummaryZone(refCounts->slab->allocator); - if (summary == NULL) { - return; - } - - // Update the slab summary to indicate this refCounts is clean. - TailBlockOffset offset - = getSummarizedTailBlockOffset(summary, refCounts->slab->slabNumber); - refCounts->updatingSlabSummary = true; - refCounts->slabSummaryWaiter.callback = finishSummaryUpdate; - updateSlabSummaryEntry(summary, &refCounts->slabSummaryWaiter, - refCounts->slab->slabNumber, offset, true, true, - getSlabFreeBlockCount(refCounts->slab)); -} - -/** - * Handle an I/O error reading or writing a reference count block. - * - * @param completion The VIO doing the I/O as a completion - **/ -static void handleIOError(VDOCompletion *completion) -{ - int result = completion->result; - VIOPoolEntry *entry = completion->parent; - RefCounts *refCounts = ((ReferenceBlock *) entry->parent)->refCounts; - returnVIO(refCounts->slab->allocator, entry); - refCounts->activeCount--; - enterRefCountsReadOnlyMode(refCounts, result); -} - -/** - * After a reference block has written, clean it, release its locks, and return - * its VIO to the pool. 
- * - * @param completion The VIO that just finished writing - **/ -static void finishReferenceBlockWrite(VDOCompletion *completion) -{ - VIOPoolEntry *entry = completion->parent; - ReferenceBlock *block = entry->parent; - RefCounts *refCounts = block->refCounts; - refCounts->activeCount--; - - // Release the slab journal lock. - adjustSlabJournalBlockReference(refCounts->slab->journal, - block->slabJournalLockToRelease, -1); - returnVIO(refCounts->slab->allocator, entry); - - /* - * We can't clear the isWriting flag earlier as releasing the slab journal - * lock may cause us to be dirtied again, but we don't want to double - * enqueue. - */ - block->isWriting = false; - - if (isReadOnly(refCounts->readOnlyNotifier)) { - checkIfSlabDrained(refCounts->slab); - return; - } - - // Re-queue the block if it was re-dirtied while it was writing. - if (block->isDirty) { - enqueueDirtyBlock(block); - if (isSlabDraining(refCounts->slab)) { - // We must be saving, and this block will otherwise not be relaunched. - saveDirtyReferenceBlocks(refCounts); - } - - return; - } - - // Mark the RefCounts as clean in the slab summary if there are no dirty - // or writing blocks and no summary update in progress. - if (!hasActiveIO(refCounts) && !hasWaiters(&refCounts->dirtyBlocks)) { - updateSlabSummaryAsClean(refCounts); - } -} - -/**********************************************************************/ -ReferenceCount *getReferenceCountersForBlock(ReferenceBlock *block) -{ - size_t blockIndex = block - block->refCounts->blocks; - return &block->refCounts->counters[blockIndex * COUNTS_PER_BLOCK]; -} - -/**********************************************************************/ -void packReferenceBlock(ReferenceBlock *block, void *buffer) -{ - PackedJournalPoint commitPoint; - packJournalPoint(&block->refCounts->slabJournalPoint, &commitPoint); - - PackedReferenceBlock *packed = buffer; - ReferenceCount *counters = getReferenceCountersForBlock(block); - for (SectorCount i = 0; i < SECTORS_PER_BLOCK; i++) { - packed->sectors[i].commitPoint = commitPoint; - memcpy(packed->sectors[i].counts, counters + (i * COUNTS_PER_SECTOR), - (sizeof(ReferenceCount) * COUNTS_PER_SECTOR)); - } -} - -/** - * After a dirty block waiter has gotten a VIO from the VIO pool, copy its - * counters and associated data into the VIO, and launch the write. - * - * @param blockWaiter The waiter of the dirty block - * @param vioContext The VIO returned by the pool - **/ -static void writeReferenceBlock(Waiter *blockWaiter, void *vioContext) -{ - VIOPoolEntry *entry = vioContext; - ReferenceBlock *block = waiterAsReferenceBlock(blockWaiter); - packReferenceBlock(block, entry->buffer); - - size_t blockOffset = (block - block->refCounts->blocks); - PhysicalBlockNumber pbn = (block->refCounts->origin + blockOffset); - block->slabJournalLockToRelease = block->slabJournalLock; - entry->parent = block; - - /* - * Mark the block as clean, since we won't be committing any updates that - * happen after this moment. As long as VIO order is preserved, two - * VIOs updating this block at once will not cause complications. - */ - block->isDirty = false; - - // Flush before writing to ensure that the recovery journal and slab journal - // entries which cover this reference update are stable (VDO-2331). 
- relaxedAdd64(&block->refCounts->statistics->blocksWritten, 1); - entry->vio->completion.callbackThreadID - = block->refCounts->slab->allocator->threadID; - launchWriteMetadataVIOWithFlush(entry->vio, pbn, finishReferenceBlockWrite, - handleIOError, true, false); -} - -/** - * Launch the write of a dirty reference block by first acquiring a VIO for it - * from the pool. This can be asynchronous since the writer will have to wait - * if all VIOs in the pool are currently in use. - * - * @param blockWaiter The waiter of the block which is starting to write - * @param context The parent refCounts of the block - **/ -static void launchReferenceBlockWrite(Waiter *blockWaiter, void *context) -{ - RefCounts *refCounts = context; - if (isReadOnly(refCounts->readOnlyNotifier)) { - return; - } - - refCounts->activeCount++; - ReferenceBlock *block = waiterAsReferenceBlock(blockWaiter); - block->isWriting = true; - blockWaiter->callback = writeReferenceBlock; - int result = acquireVIO(refCounts->slab->allocator, blockWaiter); - if (result != VDO_SUCCESS) { - // This should never happen. - refCounts->activeCount--; - enterRefCountsReadOnlyMode(refCounts, result); - } -} - -/**********************************************************************/ -void saveOldestReferenceBlock(RefCounts *refCounts) -{ - notifyNextWaiter(&refCounts->dirtyBlocks, launchReferenceBlockWrite, - refCounts); -} - -/**********************************************************************/ -void saveSeveralReferenceBlocks(RefCounts *refCounts, size_t flushDivisor) -{ - BlockCount dirtyBlockCount = countWaiters(&refCounts->dirtyBlocks); - if (dirtyBlockCount == 0) { - return; - } - - BlockCount blocksToWrite = dirtyBlockCount / flushDivisor; - // Always save at least one block. - if (blocksToWrite == 0) { - blocksToWrite = 1; - } - - for (BlockCount written = 0; written < blocksToWrite; written++) { - saveOldestReferenceBlock(refCounts); - } -} - -/**********************************************************************/ -void saveDirtyReferenceBlocks(RefCounts *refCounts) -{ - notifyAllWaiters(&refCounts->dirtyBlocks, launchReferenceBlockWrite, - refCounts); - checkIfSlabDrained(refCounts->slab); -} - -/**********************************************************************/ -void dirtyAllReferenceBlocks(RefCounts *refCounts) -{ - for (BlockCount i = 0; i < refCounts->referenceBlockCount; i++) { - dirtyBlock(&refCounts->blocks[i]); - } -} - -/** - * Clear the provisional reference counts from a reference block. - * - * @param block The block to clear - **/ -static void clearProvisionalReferences(ReferenceBlock *block) -{ - ReferenceCount *counters = getReferenceCountersForBlock(block); - for (BlockCount j = 0; j < COUNTS_PER_BLOCK; j++) { - if (counters[j] == PROVISIONAL_REFERENCE_COUNT) { - counters[j] = EMPTY_REFERENCE_COUNT; - block->allocatedCount--; - } - } -} - -/** - * Unpack reference counts blocks into the internal memory structure. 
- *
- * @param packed The written reference block to be unpacked
- * @param block The internal reference block to be loaded
- **/
-static void unpackReferenceBlock(PackedReferenceBlock *packed,
-                                 ReferenceBlock *block)
-{
-  RefCounts *refCounts = block->refCounts;
-  ReferenceCount *counters = getReferenceCountersForBlock(block);
-  for (SectorCount i = 0; i < SECTORS_PER_BLOCK; i++) {
-    PackedReferenceSector *sector = &packed->sectors[i];
-    unpackJournalPoint(&sector->commitPoint, &block->commitPoints[i]);
-    memcpy(counters + (i * COUNTS_PER_SECTOR), sector->counts,
-           (sizeof(ReferenceCount) * COUNTS_PER_SECTOR));
-    // The slabJournalPoint must be the latest point found in any sector.
-    if (beforeJournalPoint(&refCounts->slabJournalPoint,
-                           &block->commitPoints[i])) {
-      refCounts->slabJournalPoint = block->commitPoints[i];
-    }
-
-    if ((i > 0) && !areEquivalentJournalPoints(&block->commitPoints[0],
-                                               &block->commitPoints[i])) {
-      size_t blockIndex = block - block->refCounts->blocks;
-      logWarning("Torn write detected in sector %u of reference block"
-                 " %zu of slab %" PRIu16,
-                 i, blockIndex, block->refCounts->slab->slabNumber);
-    }
-  }
-
-  block->allocatedCount = 0;
-  for (BlockCount i = 0; i < COUNTS_PER_BLOCK; i++) {
-    if (counters[i] != EMPTY_REFERENCE_COUNT) {
-      block->allocatedCount++;
-    }
-  }
-}
-
-/**
- * After a reference block has been read, unpack it.
- *
- * @param completion The VIO that just finished reading
- **/
-static void finishReferenceBlockLoad(VDOCompletion *completion)
-{
-  VIOPoolEntry *entry = completion->parent;
-  ReferenceBlock *block = entry->parent;
-  unpackReferenceBlock((PackedReferenceBlock *) entry->buffer, block);
-
-  RefCounts *refCounts = block->refCounts;
-  returnVIO(refCounts->slab->allocator, entry);
-  refCounts->activeCount--;
-  clearProvisionalReferences(block);
-
-  refCounts->freeBlocks -= block->allocatedCount;
-  checkIfSlabDrained(block->refCounts->slab);
-}
-
-/**
- * After a block waiter has gotten a VIO from the VIO pool, load the block.
- *
- * @param blockWaiter The waiter of the block to load
- * @param vioContext The VIO returned by the pool
- **/
-static void loadReferenceBlock(Waiter *blockWaiter, void *vioContext)
-{
-  VIOPoolEntry *entry = vioContext;
-  ReferenceBlock *block = waiterAsReferenceBlock(blockWaiter);
-  size_t blockOffset = (block - block->refCounts->blocks);
-  PhysicalBlockNumber pbn = (block->refCounts->origin + blockOffset);
-  entry->parent = block;
-
-  entry->vio->completion.callbackThreadID
-    = block->refCounts->slab->allocator->threadID;
-  launchReadMetadataVIO(entry->vio, pbn, finishReferenceBlockLoad,
-                        handleIOError);
-}
-
-/**
- * Load reference blocks from the underlying storage into a pre-allocated
- * reference counter.
- *
- * @param refCounts The reference counter to be loaded
- **/
-static void loadReferenceBlocks(RefCounts *refCounts)
-{
-  refCounts->freeBlocks = refCounts->blockCount;
-  refCounts->activeCount = refCounts->referenceBlockCount;
-  for (BlockCount i = 0; i < refCounts->referenceBlockCount; i++) {
-    Waiter *blockWaiter = &refCounts->blocks[i].waiter;
-    blockWaiter->callback = loadReferenceBlock;
-    int result = acquireVIO(refCounts->slab->allocator, blockWaiter);
-    if (result != VDO_SUCCESS) {
-      // This should never happen.
- refCounts->activeCount -= (refCounts->referenceBlockCount - i); - enterRefCountsReadOnlyMode(refCounts, result); - return; - } - } -} - -/**********************************************************************/ -void drainRefCounts(RefCounts *refCounts) -{ - Slab *slab = refCounts->slab; - bool save = false; - switch (slab->state.state) { - case ADMIN_STATE_SCRUBBING: - if (mustLoadRefCounts(slab->allocator->summary, slab->slabNumber)) { - loadReferenceBlocks(refCounts); - return; - } - - break; - - case ADMIN_STATE_SAVE_FOR_SCRUBBING: - if (!mustLoadRefCounts(slab->allocator->summary, slab->slabNumber)) { - // These reference counts were never written, so mark them all dirty. - dirtyAllReferenceBlocks(refCounts); - } - save = true; - break; - - case ADMIN_STATE_REBUILDING: - if (shouldSaveFullyBuiltSlab(slab)) { - dirtyAllReferenceBlocks(refCounts); - save = true; - } - break; - - case ADMIN_STATE_SAVING: - save = !isUnrecoveredSlab(slab); - break; - - case ADMIN_STATE_RECOVERING: - case ADMIN_STATE_SUSPENDING: - break; - - default: - notifyRefCountsAreDrained(slab, VDO_SUCCESS); - return; - } - - if (save) { - saveDirtyReferenceBlocks(refCounts); - } -} - -/**********************************************************************/ -void acquireDirtyBlockLocks(RefCounts *refCounts) -{ - dirtyAllReferenceBlocks(refCounts); - for (BlockCount i = 0; i < refCounts->referenceBlockCount; i++) { - refCounts->blocks[i].slabJournalLock = 1; - } - - adjustSlabJournalBlockReference(refCounts->slab->journal, 1, - refCounts->referenceBlockCount); -} - -/**********************************************************************/ -void dumpRefCounts(const RefCounts *refCounts) -{ - // Terse because there are a lot of slabs to dump and syslog is lossy. - logInfo(" refCounts: free=%" PRIu32 "/%" PRIu32 " blocks=%" PRIu32 - " dirty=%zu active=%zu journal@(%llu,%" PRIu16 ")%s", - refCounts->freeBlocks, refCounts->blockCount, - refCounts->referenceBlockCount, - countWaiters(&refCounts->dirtyBlocks), - refCounts->activeCount, - refCounts->slabJournalPoint.sequenceNumber, - refCounts->slabJournalPoint.entryCount, - (refCounts->updatingSlabSummary ? " updating" : "")); -} diff --git a/vdo/base/refCounts.h b/vdo/base/refCounts.h deleted file mode 100644 index f140c8c..0000000 --- a/vdo/base/refCounts.h +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/refCounts.h#7 $ - */ - -#ifndef REF_COUNTS_H -#define REF_COUNTS_H - -#include "completion.h" -#include "journalPoint.h" -#include "slab.h" -#include "types.h" - -/** - * Create a reference counting object. - * - *

A reference counting object can keep a reference count for every physical - * block in the VDO configuration. Since we expect the vast majority of the - * blocks to have 0 or 1 reference counts, the structure is optimized for that - * situation. - * - * @param [in] blockCount The number of physical blocks that can be - * referenced - * @param [in] slab The slab of the ref counts object - * @param [in] origin The layer PBN at which to save RefCounts - * @param [in] readOnlyNotifier The context for tracking read-only mode - * @param [out] refCountsPtr The pointer to hold the new ref counts object - * - * @return a success or error code - **/ -int makeRefCounts(BlockCount blockCount, - Slab *slab, - PhysicalBlockNumber origin, - ReadOnlyNotifier *readOnlyNotifier, - RefCounts **refCountsPtr) - __attribute__((warn_unused_result)); - -/** - * Free a reference counting object and null out the reference to it. - * - * @param refCountsPtr The reference to the reference counting object to free - **/ -void freeRefCounts(RefCounts **refCountsPtr); - -/** - * Check whether a RefCounts is active. - * - * @param refCounts The RefCounts to check - **/ -bool areRefCountsActive(RefCounts *refCounts) - __attribute__((warn_unused_result)); - -/** - * Get the stored count of the number of blocks that are currently free. - * - * @param refCounts The RefCounts object - * - * @return the number of blocks with a reference count of zero - **/ -BlockCount getUnreferencedBlockCount(RefCounts *refCounts) - __attribute__((warn_unused_result)); - -/** - * Determine how many times a reference count can be incremented without - * overflowing. - * - * @param refCounts The RefCounts object - * @param pbn The physical block number - * - * @return the number of increments that can be performed - **/ -uint8_t getAvailableReferences(RefCounts *refCounts, PhysicalBlockNumber pbn) - __attribute__((warn_unused_result)); - -/** - * Adjust the reference count of a block. - * - * @param [in] refCounts The refcounts object - * @param [in] operation The operation to perform - * @param [in] slabJournalPoint The slab journal entry for this adjustment - * @param [out] freeStatusChanged A pointer which will be set to true if the - * free status of the block changed - * - * - * @return A success or error code, specifically: - * VDO_REF_COUNT_INVALID if a decrement would result in a negative - * reference count, or an increment in a - * count greater than MAXIMUM_REFS - * - **/ -int adjustReferenceCount(RefCounts *refCounts, - ReferenceOperation operation, - const JournalPoint *slabJournalPoint, - bool *freeStatusChanged) - __attribute__((warn_unused_result)); - -/** - * Adjust the reference count of a block during rebuild. - * - * @param refCounts The refcounts object - * @param pbn The number of the block to adjust - * @param operation The operation to perform on the count - * - * @return VDO_SUCCESS or an error - **/ -int adjustReferenceCountForRebuild(RefCounts *refCounts, - PhysicalBlockNumber pbn, - JournalOperation operation) - __attribute__((warn_unused_result)); - -/** - * Replay the reference count adjustment from a slab journal entry into the - * reference count for a block. The adjustment will be ignored if it was already - * recorded in the reference count. 
- * - * @param refCounts The refcounts object - * @param entryPoint The slab journal point for the entry - * @param entry The slab journal entry being replayed - * - * @return VDO_SUCCESS or an error code - **/ -int replayReferenceCountChange(RefCounts *refCounts, - const JournalPoint *entryPoint, - SlabJournalEntry entry) - __attribute__((warn_unused_result)); - -/** - * Check whether two reference counters are equivalent. This method is - * used for unit testing. - * - * @param counterA The first counter to compare - * @param counterB The second counter to compare - * - * @return true if the two counters are equivalent - **/ -bool areEquivalentReferenceCounters(RefCounts *counterA, RefCounts *counterB) - __attribute__((warn_unused_result)); - -/** - * Find a block with a reference count of zero in the range of physical block - * numbers tracked by the reference counter. If a free block is found, that - * block is allocated by marking it as provisionally referenced, and the - * allocated block number is returned. - * - * @param [in] refCounts The reference counters to scan - * @param [out] allocatedPtr A pointer to hold the physical block number of - * the block that was found and allocated - * - * @return VDO_SUCCESS if a free block was found and allocated; - * VDO_NO_SPACE if there are no unreferenced blocks; - * otherwise an error code - **/ -int allocateUnreferencedBlock(RefCounts *refCounts, - PhysicalBlockNumber *allocatedPtr) - __attribute__((warn_unused_result)); - -/** - * Provisionally reference a block if it is unreferenced. - * - * @param refCounts The reference counters - * @param pbn The PBN to reference - * @param lock The PBNLock on the block (may be NULL) - * - * @return VDO_SUCCESS or an error - **/ -int provisionallyReferenceBlock(RefCounts *refCounts, - PhysicalBlockNumber pbn, - PBNLock *lock) - __attribute__((warn_unused_result)); - -/** - * Count all unreferenced blocks in a range [startBlock, endBlock) of physical - * block numbers. - * - * @param refCounts The reference counters to scan - * @param startPBN The physical block number at which to start - * scanning (included in the scan) - * @param endPBN The physical block number at which to stop - * scanning (excluded from the scan) - * - * @return The number of unreferenced blocks - **/ -BlockCount countUnreferencedBlocks(RefCounts *refCounts, - PhysicalBlockNumber startPBN, - PhysicalBlockNumber endPBN) - __attribute__((warn_unused_result)); - -/** - * Get the number of blocks required to save a reference counts state covering - * the specified number of data blocks. - * - * @param blockCount The number of physical data blocks that can be referenced - * - * @return The number of blocks required to save reference counts with the - * given block count - **/ -BlockCount getSavedReferenceCountSize(BlockCount blockCount) - __attribute__((warn_unused_result)); - -/** - * Request a RefCounts save several dirty blocks asynchronously. This function - * currently writes 1 / flushDivisor of the dirty blocks. - * - * @param refCounts The RefCounts object to notify - * @param flushDivisor The inverse fraction of the dirty blocks to write - **/ -void saveSeveralReferenceBlocks(RefCounts *refCounts, size_t flushDivisor); - -/** - * Ask a RefCounts to save all its dirty blocks asynchronously. - * - * @param refCounts The RefCounts object to notify - **/ -void saveDirtyReferenceBlocks(RefCounts *refCounts); - -/** - * Mark all reference count blocks as dirty. 
- * - * @param refCounts The RefCounts of the reference blocks - **/ -void dirtyAllReferenceBlocks(RefCounts *refCounts); - -/** - * Drain all reference count I/O. Depending upon the type of drain being - * performed (as recorded in the RefCount's Slab), the reference blocks may - * be loaded from disk or dirty reference blocks may be written out. - * - * @param refCounts The reference counts to drain - **/ -void drainRefCounts(RefCounts *refCounts); - -/** - * Mark all reference count blocks dirty and cause them to hold locks on slab - * journal block 1. - * - * @param refCounts The RefCounts of the reference blocks - **/ -void acquireDirtyBlockLocks(RefCounts *refCounts); - -/** - * Dump information about this RefCounts structure. - * - * @param refCounts The RefCounts to dump - **/ -void dumpRefCounts(const RefCounts *refCounts); - -#endif // REF_COUNTS_H diff --git a/vdo/base/refCountsInternals.h b/vdo/base/refCountsInternals.h deleted file mode 100644 index a1bd1db..0000000 --- a/vdo/base/refCountsInternals.h +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/refCountsInternals.h#4 $ - */ - -#ifndef REF_COUNTS_INTERNALS_H -#define REF_COUNTS_INTERNALS_H - -#include "refCounts.h" - -#include "journalPoint.h" -#include "referenceBlock.h" -#include "slab.h" -#include "blockAllocatorInternals.h" -#include "waitQueue.h" - -/** - * Represents the possible status of a block. - **/ -typedef enum referenceStatus { - RS_FREE, // this block is free - RS_SINGLE, // this block is singly-referenced - RS_SHARED, // this block is shared - RS_PROVISIONAL // this block is provisionally allocated -} ReferenceStatus; - -/** - * The SearchCursor represents the saved position of a free block search. - **/ -typedef struct searchCursor { - /** The reference block containing the current search index */ - ReferenceBlock *block; - /** The position at which to start searching for the next free counter */ - SlabBlockNumber index; - /** The position just past the last valid counter in the current block */ - SlabBlockNumber endIndex; - - /** A pointer to the first reference block in the slab */ - ReferenceBlock *firstBlock; - /** A pointer to the last reference block in the slab */ - ReferenceBlock *lastBlock; -} SearchCursor; - -/* - * RefCounts structure - * - * A reference count is maintained for each PhysicalBlockNumber. The vast - * majority of blocks have a very small reference count (usually 0 or 1). - * For references less than or equal to MAXIMUM_REFS (254) the reference count - * is stored in counters[pbn]. 
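The ReferenceStatus values above correspond directly to the special counter values used throughout this patch (0 for an empty slot, 255 for a provisional reference, and 1 through 254 for real references, per the MAXIMUM_REFS remark just above). A minimal, self-contained sketch of that mapping, using locally defined stand-ins for the constants rather than the real headers; the authoritative conversion is referenceCountToStatus(), declared just below:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Local stand-ins for the special counter values used by this patch
 * (0 = empty, 255 = provisional, 1..254 = real references). */
typedef uint8_t ReferenceCount;
enum {
  EMPTY_REFERENCE_COUNT       = 0,
  MAXIMUM_REFERENCE_COUNT     = 254,
  PROVISIONAL_REFERENCE_COUNT = 255,
};

typedef enum {
  RS_FREE,        /* counter is 0 */
  RS_SINGLE,      /* counter is exactly 1 */
  RS_SHARED,      /* counter is 2..254 */
  RS_PROVISIONAL, /* counter is the sentinel 255 */
} ReferenceStatus;

/* Sketch of the counter-to-status mapping the enum comments describe;
 * not the deleted implementation, only the value ranges it documents. */
static ReferenceStatus statusForCount(ReferenceCount count)
{
  if (count == EMPTY_REFERENCE_COUNT) {
    return RS_FREE;
  }
  if (count == PROVISIONAL_REFERENCE_COUNT) {
    return RS_PROVISIONAL;
  }
  return (count == 1) ? RS_SINGLE : RS_SHARED;
}

int main(void)
{
  ReferenceCount samples[] = { 0, 1, 2, 254, 255 };
  for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
    printf("count %3u -> status %d\n", (unsigned) samples[i],
           (int) statusForCount(samples[i]));
  }
  return 0;
}

Only the helper name statusForCount is new here; the numeric boundaries come from the constants in referenceBlock.h later in this patch.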
- * - */ -struct refCounts { - /** The slab of this reference block */ - Slab *slab; - - /** The size of the counters array */ - uint32_t blockCount; - /** The number of free blocks */ - uint32_t freeBlocks; - /** The array of reference counts */ - ReferenceCount *counters; // use ALLOCATE to align data ptr - - /** The saved block pointer and array indexes for the free block search */ - SearchCursor searchCursor; - - /** A list of the dirty blocks waiting to be written out */ - WaitQueue dirtyBlocks; - /** The number of blocks which are currently writing */ - size_t activeCount; - - /** A waiter object for updating the slab summary */ - Waiter slabSummaryWaiter; - /** Whether slab summary update is in progress */ - bool updatingSlabSummary; - - /** The notifier for read-only mode */ - ReadOnlyNotifier *readOnlyNotifier; - /** The refcount statistics, shared by all refcounts in our physical zone */ - AtomicRefCountStatistics *statistics; - /** The layer PBN for the first ReferenceBlock */ - PhysicalBlockNumber origin; - /** The latest slab journal entry this RefCounts has been updated with */ - JournalPoint slabJournalPoint; - - /** The number of reference count blocks */ - uint32_t referenceBlockCount; - /** reference count block array */ - ReferenceBlock blocks[]; -}; - -/** - * Convert a reference count to a reference status. - * - * @param count The count to convert - * - * @return The appropriate reference status - **/ -__attribute__((warn_unused_result)) -ReferenceStatus referenceCountToStatus(ReferenceCount count); - -/** - * Convert a generic VDOCompletion to a RefCounts. - * - * @param completion The completion to convert - * - * @return The completion as a RefCounts - **/ -RefCounts *asRefCounts(VDOCompletion *completion) - __attribute__((warn_unused_result)); - -/** - * Get the reference block that covers the given block index (exposed for - * testing). - * - * @param refCounts The refcounts object - * @param index The block index - **/ -ReferenceBlock *getReferenceBlock(RefCounts *refCounts, SlabBlockNumber index) - __attribute__((warn_unused_result)); - -/** - * Find the reference counters for a given block (exposed for testing). - * - * @param block The ReferenceBlock in question - * - * @return A pointer to the reference counters for this block - **/ -ReferenceCount *getReferenceCountersForBlock(ReferenceBlock *block) - __attribute__((warn_unused_result)); - -/** - * Copy data from a reference block to a buffer ready to be written out - * (exposed for testing). - * - * @param block The block to copy - * @param buffer The char buffer to fill with the packed block - **/ -void packReferenceBlock(ReferenceBlock *block, void *buffer); - -/** - * Get the reference status of a block. Exposed only for unit testing. - * - * @param [in] refCounts The refcounts object - * @param [in] pbn The physical block number - * @param [out] statusPtr Where to put the status of the block - * - * @return A success or error code, specifically: - * VDO_OUT_OF_RANGE if the pbn is out of range. - **/ -int getReferenceStatus(RefCounts *refCounts, - PhysicalBlockNumber pbn, - ReferenceStatus *statusPtr) - __attribute__((warn_unused_result)); - -/** - * Find the first block with a reference count of zero in the specified range - * of reference counter indexes. Exposed for unit testing. 
- * - * @param [in] refCounts The reference counters to scan - * @param [in] startIndex The array index at which to start scanning - * (included in the scan) - * @param [in] endIndex The array index at which to stop scanning - * (excluded from the scan) - * @param [out] indexPtr A pointer to hold the array index of the free block - * - * @return true if a free block was found in the specified range - **/ -bool findFreeBlock(const RefCounts *refCounts, - SlabBlockNumber startIndex, - SlabBlockNumber endIndex, - SlabBlockNumber *indexPtr) - __attribute__((warn_unused_result)); - -/** - * Request a RefCounts save its oldest dirty block asynchronously. - * - * @param refCounts The RefCounts object to notify - **/ -void saveOldestReferenceBlock(RefCounts *refCounts); - -/** - * Reset all reference counts back to RS_FREE. - * - * @param refCounts The reference counters to reset - **/ -void resetReferenceCounts(RefCounts *refCounts); - -#endif // REF_COUNTS_INTERNALS_H diff --git a/vdo/base/referenceBlock.h b/vdo/base/referenceBlock.h deleted file mode 100644 index 8014c3b..0000000 --- a/vdo/base/referenceBlock.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/referenceBlock.h#1 $ - */ - -#ifndef REFERENCE_BLOCK_H -#define REFERENCE_BLOCK_H - -#include "constants.h" -#include "journalPoint.h" -#include "types.h" -#include "waitQueue.h" - -/** - * A type representing a reference count. - **/ -typedef uint8_t ReferenceCount; - -/** - * Special ReferenceCount values. - **/ -enum { - EMPTY_REFERENCE_COUNT = 0, - MAXIMUM_REFERENCE_COUNT = 254, - PROVISIONAL_REFERENCE_COUNT = 255, -}; - -enum { - COUNTS_PER_SECTOR = ((VDO_SECTOR_SIZE - sizeof(PackedJournalPoint)) - / sizeof(ReferenceCount)), - COUNTS_PER_BLOCK = COUNTS_PER_SECTOR * SECTORS_PER_BLOCK, -}; - -/** - * The format of a ReferenceSector on disk. - **/ -typedef struct { - PackedJournalPoint commitPoint; - ReferenceCount counts[COUNTS_PER_SECTOR]; -} __attribute__((packed)) PackedReferenceSector; - -typedef struct { - PackedReferenceSector sectors[SECTORS_PER_BLOCK]; -} PackedReferenceBlock; - -/* - * ReferenceBlock structure - * - * Blocks are used as a proxy, permitting saves of partial refcounts. 
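The COUNTS_PER_SECTOR and COUNTS_PER_BLOCK derivations above are plain integer arithmetic once the underlying sizes are known. A small stand-alone sketch that reruns the same calculation; the three ASSUMED_ values are placeholders for VDO_SECTOR_SIZE, the block size, and sizeof(PackedJournalPoint), all of which are defined in headers outside this hunk:

#include <stdint.h>
#include <stdio.h>

/* Assumed sizes (the real ones come from constants.h and journalPoint.h,
 * which are not part of this hunk). */
#define ASSUMED_SECTOR_SIZE        512u
#define ASSUMED_BLOCK_SIZE         4096u
#define ASSUMED_JOURNAL_POINT_SIZE 8u

typedef uint8_t ReferenceCount; /* one byte per counter, as in the header */

int main(void)
{
  unsigned sectorsPerBlock = ASSUMED_BLOCK_SIZE / ASSUMED_SECTOR_SIZE;
  unsigned countsPerSector = (ASSUMED_SECTOR_SIZE - ASSUMED_JOURNAL_POINT_SIZE)
                             / (unsigned) sizeof(ReferenceCount);
  unsigned countsPerBlock  = countsPerSector * sectorsPerBlock;

  /* Each sector carries its own commit point ahead of its counters, which
   * is what lets unpackReferenceBlock() earlier in this patch detect a
   * torn write sector by sector. */
  printf("sectors per block:   %u\n", sectorsPerBlock);
  printf("counters per sector: %u\n", countsPerSector);
  printf("counters per block:  %u\n", countsPerBlock);
  return 0;
}

Whatever the real constant values are, the layout's point is the same: every sector is self-describing, recording the journal point up to which its counters are valid, so packReferenceBlock() and unpackReferenceBlock() can work one sector at a time.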
- **/ -typedef struct { - /** This block waits on the refCounts to tell it to write */ - Waiter waiter; - /** The parent RefCount structure */ - RefCounts *refCounts; - /** The number of references in this block that represent allocations */ - BlockSize allocatedCount; - /** The slab journal block on which this block must hold a lock */ - SequenceNumber slabJournalLock; - /** - * The slab journal block which should be released when this block - * is committed - **/ - SequenceNumber slabJournalLockToRelease; - /** The point up to which each sector is accurate on disk */ - JournalPoint commitPoints[SECTORS_PER_BLOCK]; - /** Whether this block has been modified since it was written to disk */ - bool isDirty; - /** Whether this block is currently writing */ - bool isWriting; -} ReferenceBlock; - -#endif // REFERENCE_BLOCK_H diff --git a/vdo/base/referenceCountRebuild.c b/vdo/base/referenceCountRebuild.c deleted file mode 100644 index a3d91ac..0000000 --- a/vdo/base/referenceCountRebuild.c +++ /dev/null @@ -1,491 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/referenceCountRebuild.c#6 $ - */ - -#include "referenceCountRebuild.h" - -#include "logger.h" -#include "memoryAlloc.h" - -#include "blockMap.h" -#include "blockMapInternals.h" -#include "blockMapPage.h" -#include "forest.h" -#include "constants.h" -#include "numUtils.h" -#include "refCounts.h" -#include "slabDepot.h" -#include "vdoInternal.h" -#include "vdoPageCache.h" - -/** - * A reference count rebuild completion. - * Note that the page completions kept in this structure are not immediately - * freed, so the corresponding pages will be locked down in the page cache - * until the rebuild frees them. 
- **/ -typedef struct { - /** completion header */ - VDOCompletion completion; - /** the completion for flushing the block map */ - VDOCompletion subTaskCompletion; - /** the thread on which all block map operations must be done */ - ThreadID logicalThreadID; - /** the admin thread */ - ThreadID adminThreadID; - /** the block map */ - BlockMap *blockMap; - /** the slab depot */ - SlabDepot *depot; - /** whether this recovery has been aborted */ - bool aborted; - /** whether we are currently launching the initial round of requests */ - bool launching; - /** The number of logical blocks observed used */ - BlockCount *logicalBlocksUsed; - /** The number of block map data blocks */ - BlockCount *blockMapDataBlocks; - /** the next page to fetch */ - PageCount pageToFetch; - /** the number of leaf pages in the block map */ - PageCount leafPages; - /** the last slot of the block map */ - BlockMapSlot lastSlot; - /** number of pending (non-ready) requests*/ - PageCount outstanding; - /** number of page completions */ - PageCount pageCount; - /** array of requested, potentially ready page completions */ - VDOPageCompletion pageCompletions[]; -} RebuildCompletion; - -/** - * Convert a VDOCompletion to a RebuildCompletion. - * - * @param completion The completion to convert - * - * @return The completion as a RebuildCompletion - **/ -__attribute__((warn_unused_result)) -static inline RebuildCompletion *asRebuildCompletion(VDOCompletion *completion) -{ - STATIC_ASSERT(offsetof(RebuildCompletion, completion) == 0); - assertCompletionType(completion->type, REFERENCE_COUNT_REBUILD_COMPLETION); - return (RebuildCompletion *) completion; -} - -/** - * Free a RebuildCompletion and null out the reference to it. - * - * @param completionPtr a pointer to the completion to free - **/ -static void freeRebuildCompletion(VDOCompletion **completionPtr) -{ - VDOCompletion *completion = *completionPtr; - if (completion == NULL) { - return; - } - - RebuildCompletion *rebuild = asRebuildCompletion(completion); - destroyEnqueueable(&rebuild->subTaskCompletion); - destroyEnqueueable(completion); - FREE(rebuild); - *completionPtr = NULL; -} - -/** - * Free the RebuildCompletion and notify the parent that the block map - * rebuild is done. This callback is registered in rebuildBlockMap(). - * - * @param completion The RebuildCompletion - **/ -static void finishRebuild(VDOCompletion *completion) -{ - int result = completion->result; - VDOCompletion *parent = completion->parent; - freeRebuildCompletion(&completion); - finishCompletion(parent, result); -} - -/** - * Make a new rebuild completion. 
- * - * @param [in] vdo The VDO - * @param [in] logicalBlocksUsed A pointer to hold the logical blocks used - * @param [in] blockMapDataBlocks A pointer to hold the number of block map - * data blocks - * @param [in] parent The parent of the rebuild completion - * @param [out] rebuildPtr The new block map rebuild completion - * - * @return a success or error code - **/ -static int makeRebuildCompletion(VDO *vdo, - BlockCount *logicalBlocksUsed, - BlockCount *blockMapDataBlocks, - VDOCompletion *parent, - RebuildCompletion **rebuildPtr) -{ - BlockMap *blockMap = getBlockMap(vdo); - PageCount pageCount - = minPageCount(getConfiguredCacheSize(vdo) >> 1, - MAXIMUM_SIMULTANEOUS_BLOCK_MAP_RESTORATION_READS); - - RebuildCompletion *rebuild; - int result = ALLOCATE_EXTENDED(RebuildCompletion, pageCount, - VDOPageCompletion, __func__, &rebuild); - if (result != UDS_SUCCESS) { - return result; - } - - result = initializeEnqueueableCompletion(&rebuild->completion, - REFERENCE_COUNT_REBUILD_COMPLETION, - vdo->layer); - if (result != VDO_SUCCESS) { - VDOCompletion *completion = &rebuild->completion; - freeRebuildCompletion(&completion); - return result; - } - - result = initializeEnqueueableCompletion(&rebuild->subTaskCompletion, - SUB_TASK_COMPLETION, vdo->layer); - if (result != VDO_SUCCESS) { - VDOCompletion *completion = &rebuild->completion; - freeRebuildCompletion(&completion); - return result; - } - - rebuild->blockMap = blockMap; - rebuild->depot = vdo->depot; - rebuild->logicalBlocksUsed = logicalBlocksUsed; - rebuild->blockMapDataBlocks = blockMapDataBlocks; - rebuild->pageCount = pageCount; - rebuild->leafPages = computeBlockMapPageCount(blockMap->entryCount); - - const ThreadConfig *threadConfig = getThreadConfig(vdo); - rebuild->logicalThreadID = getLogicalZoneThread(threadConfig, 0); - rebuild->adminThreadID = getAdminThread(threadConfig); - - ASSERT_LOG_ONLY((getCallbackThreadID() == rebuild->logicalThreadID), - "%s must be called on logical thread %u (not %u)", __func__, - rebuild->logicalThreadID, getCallbackThreadID()); - prepareCompletion(&rebuild->completion, finishRebuild, finishRebuild, - rebuild->logicalThreadID, parent); - - *rebuildPtr = rebuild; - return VDO_SUCCESS; -} - -/** - * Flush the block map now that all the reference counts are rebuilt. This - * callback is registered in finishIfDone(). - * - * @param completion The sub-task completion - **/ -static void flushBlockMapUpdates(VDOCompletion *completion) -{ - logInfo("Flushing block map changes"); - prepareToFinishParent(completion, completion->parent); - drainBlockMap(asRebuildCompletion(completion->parent)->blockMap, - ADMIN_STATE_RECOVERING, completion); -} - -/** - * Check whether the rebuild is done. If it succeeded, continue by flushing the - * block map. - * - * @param rebuild The rebuild completion - * - * @return true if the rebuild is complete - **/ -static bool finishIfDone(RebuildCompletion *rebuild) -{ - if (rebuild->launching || (rebuild->outstanding > 0)) { - return false; - } - - if (rebuild->aborted) { - completeCompletion(&rebuild->completion); - return true; - } - - if (rebuild->pageToFetch < rebuild->leafPages) { - return false; - } - - prepareCompletion(&rebuild->subTaskCompletion, flushBlockMapUpdates, - finishParentCallback, rebuild->adminThreadID, rebuild); - invokeCallback(&rebuild->subTaskCompletion); - return true; -} - -/** - * Record that there has been an error during the rebuild. 
- * - * @param rebuild The rebuild completion - * @param result The error result to use, if one is not already saved - **/ -static void abortRebuild(RebuildCompletion *rebuild, int result) -{ - rebuild->aborted = true; - setCompletionResult(&rebuild->completion, result); -} - -/** - * Handle an error loading a page. - * - * @param completion The VDOPageCompletion - **/ -static void handlePageLoadError(VDOCompletion *completion) -{ - RebuildCompletion *rebuild = asRebuildCompletion(completion->parent); - rebuild->outstanding--; - abortRebuild(rebuild, completion->result); - releaseVDOPageCompletion(completion); - finishIfDone(rebuild); -} - -/** - * Rebuild reference counts from a block map page. - * - * @param rebuild The rebuild completion - * @param completion The page completion holding the page - * - * @return VDO_SUCCESS or an error - **/ -static int rebuildReferenceCountsFromPage(RebuildCompletion *rebuild, - VDOCompletion *completion) -{ - BlockMapPage *page = dereferenceWritableVDOPage(completion); - int result = ASSERT(page != NULL, "page available"); - if (result != VDO_SUCCESS) { - return result; - } - - if (!isBlockMapPageInitialized(page)) { - return VDO_SUCCESS; - } - - // Remove any bogus entries which exist beyond the end of the logical space. - if (getBlockMapPagePBN(page) == rebuild->lastSlot.pbn) { - for (SlotNumber slot = rebuild->lastSlot.slot; - slot < BLOCK_MAP_ENTRIES_PER_PAGE; slot++) { - DataLocation mapping = unpackBlockMapEntry(&page->entries[slot]); - if (isMappedLocation(&mapping)) { - page->entries[slot] = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); - requestVDOPageWrite(completion); - } - } - } - - // Inform the slab depot of all entries on this page. - for (SlotNumber slot = 0; slot < BLOCK_MAP_ENTRIES_PER_PAGE; slot++) { - DataLocation mapping = unpackBlockMapEntry(&page->entries[slot]); - if (!isValidLocation(&mapping)) { - // This entry is invalid, so remove it from the page. - page->entries[slot] = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); - requestVDOPageWrite(completion); - continue; - } - - if (!isMappedLocation(&mapping)) { - continue; - } - - (*rebuild->logicalBlocksUsed)++; - if (mapping.pbn == ZERO_BLOCK) { - continue; - } - - if (!isPhysicalDataBlock(rebuild->depot, mapping.pbn)) { - // This is a nonsense mapping. Remove it from the map so we're at least - // consistent and mark the page dirty. - page->entries[slot] = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); - requestVDOPageWrite(completion); - continue; - } - - Slab *slab = getSlab(rebuild->depot, mapping.pbn); - int result = adjustReferenceCountForRebuild(slab->referenceCounts, - mapping.pbn, DATA_INCREMENT); - if (result != VDO_SUCCESS) { - logErrorWithStringError(result, - "Could not adjust reference count for PBN" - " %llu, slot %u mapped to PBN %llu", - getBlockMapPagePBN(page), slot, mapping.pbn); - page->entries[slot] = packPBN(ZERO_BLOCK, MAPPING_STATE_UNMAPPED); - requestVDOPageWrite(completion); - } - } - return VDO_SUCCESS; -} - -/**********************************************************************/ -static void fetchPage(RebuildCompletion *rebuild, VDOCompletion *completion); - -/** - * Process a page which has just been loaded. This callback is registered by - * fetchPage(). 
- * - * @param completion The VDOPageCompletion for the fetched page - **/ -static void pageLoaded(VDOCompletion *completion) -{ - RebuildCompletion *rebuild = asRebuildCompletion(completion->parent); - rebuild->outstanding--; - - int result = rebuildReferenceCountsFromPage(rebuild, completion); - if (result != VDO_SUCCESS) { - abortRebuild(rebuild, result); - } - - releaseVDOPageCompletion(completion); - if (finishIfDone(rebuild)) { - return; - } - - // Advance progress to the next page, and fetch the next page we - // haven't yet requested. - fetchPage(rebuild, completion); -} - -/** - * Fetch a page from the block map. - * - * @param rebuild the RebuildCompletion - * @param completion the page completion to use - **/ -static void fetchPage(RebuildCompletion *rebuild, VDOCompletion *completion) -{ - while (rebuild->pageToFetch < rebuild->leafPages) { - PhysicalBlockNumber pbn = findBlockMapPagePBN(rebuild->blockMap, - rebuild->pageToFetch++); - if (pbn == ZERO_BLOCK) { - continue; - } - - if (!isPhysicalDataBlock(rebuild->depot, pbn)) { - abortRebuild(rebuild, VDO_BAD_MAPPING); - if (finishIfDone(rebuild)) { - return; - } - continue; - } - - initVDOPageCompletion(((VDOPageCompletion *) completion), - rebuild->blockMap->zones[0].pageCache, - pbn, true, &rebuild->completion, - pageLoaded, handlePageLoadError); - rebuild->outstanding++; - getVDOPageAsync(completion); - return; - } -} - -/** - * Rebuild reference counts from the leaf block map pages now that reference - * counts have been rebuilt from the interior tree pages (which have been - * loaded in the process). This callback is registered in - * rebuildReferenceCounts(). - * - * @param completion The sub-task completion - **/ -static void rebuildFromLeaves(VDOCompletion *completion) -{ - RebuildCompletion *rebuild = asRebuildCompletion(completion->parent); - *rebuild->logicalBlocksUsed = 0; - - // The PBN calculation doesn't work until the tree pages have been loaded, - // so we can't set this value at the start of rebuild. - rebuild->lastSlot = (BlockMapSlot) { - .slot = rebuild->blockMap->entryCount % BLOCK_MAP_ENTRIES_PER_PAGE, - .pbn = findBlockMapPagePBN(rebuild->blockMap, rebuild->leafPages - 1), - }; - - // Prevent any page from being processed until all pages have been launched. - rebuild->launching = true; - for (PageCount i = 0; i < rebuild->pageCount; i++) { - fetchPage(rebuild, &rebuild->pageCompletions[i].completion); - } - rebuild->launching = false; - finishIfDone(rebuild); -} - -/** - * Process a single entry from the block map tree. - * - *

Implements EntryCallback. - * - * @param pbn A pbn which holds a block map tree page - * @param completion The parent completion of the traversal - * - * @return VDO_SUCCESS or an error - **/ -static int processEntry(PhysicalBlockNumber pbn, VDOCompletion *completion) -{ - RebuildCompletion *rebuild = asRebuildCompletion(completion->parent); - if ((pbn == ZERO_BLOCK) || !isPhysicalDataBlock(rebuild->depot, pbn)) { - return logErrorWithStringError(VDO_BAD_CONFIGURATION, - "PBN %llu out of range", - pbn); - } - - Slab *slab = getSlab(rebuild->depot, pbn); - int result = adjustReferenceCountForRebuild(slab->referenceCounts, pbn, - BLOCK_MAP_INCREMENT); - if (result != VDO_SUCCESS) { - return logErrorWithStringError(result, - "Could not adjust reference count for " - "block map tree PBN %llu", - pbn); - } - - (*rebuild->blockMapDataBlocks)++; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void rebuildReferenceCounts(VDO *vdo, - VDOCompletion *parent, - BlockCount *logicalBlocksUsed, - BlockCount *blockMapDataBlocks) -{ - RebuildCompletion *rebuild; - int result = makeRebuildCompletion(vdo, logicalBlocksUsed, - blockMapDataBlocks, parent, &rebuild); - if (result != VDO_SUCCESS) { - finishCompletion(parent, result); - return; - } - - // Completion chaining from page cache hits can lead to stack overflow - // during the rebuild, so clear out the cache before this rebuild phase. - result = invalidateVDOPageCache(rebuild->blockMap->zones[0].pageCache); - if (result != VDO_SUCCESS) { - finishCompletion(parent, result); - return; - } - - // First traverse the block map trees. - *rebuild->blockMapDataBlocks = 0; - VDOCompletion *completion = &rebuild->subTaskCompletion; - prepareCompletion(completion, rebuildFromLeaves, finishParentCallback, - rebuild->logicalThreadID, rebuild); - traverseForest(rebuild->blockMap, processEntry, completion); -} diff --git a/vdo/base/referenceCountRebuild.h b/vdo/base/referenceCountRebuild.h deleted file mode 100644 index 59363ac..0000000 --- a/vdo/base/referenceCountRebuild.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/referenceCountRebuild.h#1 $ - */ - -#ifndef REFERENCE_COUNT_REBUILD_H -#define REFERENCE_COUNT_REBUILD_H - -#include "types.h" - -/** - * Rebuild the reference counts from the block map (read-only rebuild). 
- * - * @param [in] vdo The VDO - * @param [in] parent The completion to notify when the rebuild is - * complete - * @param [out] logicalBlocksUsed A pointer to hold the logical blocks used - * @param [out] blockMapDataBlocks A pointer to hold the number of block map - * data blocks - **/ -void rebuildReferenceCounts(VDO *vdo, - VDOCompletion *parent, - BlockCount *logicalBlocksUsed, - BlockCount *blockMapDataBlocks); - -#endif // REFERENCE_COUNT_REBUILD_H diff --git a/vdo/base/referenceOperation.c b/vdo/base/referenceOperation.c deleted file mode 100644 index a8ea9a0..0000000 --- a/vdo/base/referenceOperation.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/referenceOperation.c#1 $ - */ - -#include "referenceOperation.h" - -#include "physicalZone.h" -#include "types.h" - -/**********************************************************************/ -static PBNLock *returnPBNLock(ReferenceOperation operation) -{ - return (PBNLock *) operation.context; -} - -/**********************************************************************/ -void setUpReferenceOperationWithLock(JournalOperation type, - PhysicalBlockNumber pbn, - BlockMappingState state, - PBNLock *lock, - ReferenceOperation *operation) -{ - *operation = (ReferenceOperation) { - .type = type, - .pbn = pbn, - .state = state, - .lockGetter = returnPBNLock, - .context = lock, - }; -} - -/**********************************************************************/ -static PBNLock *lookUpPBNLock(ReferenceOperation operation) -{ - return ((operation.context == NULL) - ? NULL : getPBNLock(operation.context, operation.pbn)); -} - -/**********************************************************************/ -void setUpReferenceOperationWithZone(JournalOperation type, - PhysicalBlockNumber pbn, - BlockMappingState state, - PhysicalZone *zone, - ReferenceOperation *operation) -{ - *operation = (ReferenceOperation) { - .type = type, - .pbn = pbn, - .state = state, - .lockGetter = lookUpPBNLock, - .context = zone, - }; -} diff --git a/vdo/base/referenceOperation.h b/vdo/base/referenceOperation.h deleted file mode 100644 index c846ec6..0000000 --- a/vdo/base/referenceOperation.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/referenceOperation.h#1 $ - */ - -#ifndef REFERENCE_OPERATION_H -#define REFERENCE_OPERATION_H - -#include "types.h" - -typedef struct referenceOperation ReferenceOperation; - -/** - * Get the PBNLock associated with a ReferenceOperation. - * - * @param operation The ReferenceOperation - * - * @return The PBNLock on the block of a ReferenceOperation or NULL if there - * isn't one - **/ -typedef PBNLock *PBNLockGetter(ReferenceOperation operation); - -/** - * The current operation on a physical block (from the point of view of the - * DataVIO doing the operation) - **/ -struct referenceOperation { - /** The operation being performed */ - JournalOperation type; - /** The PBN of the block being operated on */ - PhysicalBlockNumber pbn; - /** The mapping state of the block being operated on */ - BlockMappingState state; - /** A function to use to get any PBNLock associated with this operation */ - PBNLockGetter *lockGetter; - /** The context to pass to the PBNLockGetter */ - void *context; -}; - -/** - * Get the PBNLock associated with the current ReferenceOperation. - * - * @param operation The reference operation - * - * @return The PBNLock on the block of the current operation or NULL if there - * isn't one - **/ -__attribute__((warn_unused_result)) -static inline -PBNLock *getReferenceOperationPBNLock(ReferenceOperation operation) -{ - return ((operation.lockGetter == NULL) - ? NULL : operation.lockGetter(operation)); -} - -/** - * Set up a ReferenceOperation for which we already have the lock. - * - * @param type The type of operation - * @param pbn The PBN of the block on which to operate - * @param state The mapping state of the block on which to operate - * @param lock The PBNLock to associate with the operation - * @param operation The ReferenceOperation to set up - **/ -void setUpReferenceOperationWithLock(JournalOperation type, - PhysicalBlockNumber pbn, - BlockMappingState state, - PBNLock *lock, - ReferenceOperation *operation); - -/** - * Set up a ReferenceOperation for which we will need to look up the lock later. - * - * @param type The type of operation - * @param pbn The PBN of the block on which to operate - * @param state The mapping state of the block on which to operate - * @param zone The PhysicalZone from which the PBNLock can be retrieved - * when needed - * @param operation The ReferenceOperation to set up - **/ -void setUpReferenceOperationWithZone(JournalOperation type, - PhysicalBlockNumber pbn, - BlockMappingState state, - PhysicalZone *zone, - ReferenceOperation *operation); - -#endif // REFERENCE_OPERATION_H diff --git a/vdo/base/releaseVersions.h b/vdo/base/releaseVersions.h deleted file mode 100644 index 7620f17..0000000 --- a/vdo/base/releaseVersions.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
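The referenceOperation.c helpers shown a little earlier and the struct above implement a small deferred-lookup pattern: an operation records either the PBNLock itself or the PhysicalZone to ask later, and getReferenceOperationPBNLock() dispatches through the stored lockGetter. A hedged, self-contained sketch of that pattern using stand-in types; the real PhysicalZone, PBNLock, and getPBNLock() are declared outside this hunk:

#include <stddef.h>
#include <stdio.h>

/* Stand-ins for types declared outside this hunk (assumptions for the
 * sketch only). */
typedef unsigned long PhysicalBlockNumber;
typedef struct { PhysicalBlockNumber pbn; } PBNLock;
typedef struct { PBNLock lockForPBN; } PhysicalZone;

typedef struct referenceOperation ReferenceOperation;
typedef PBNLock *PBNLockGetter(ReferenceOperation operation);

struct referenceOperation {
  PhysicalBlockNumber pbn;
  PBNLockGetter *lockGetter;
  void *context;
};

/* Mirrors returnPBNLock(): the context already is the lock. */
static PBNLock *returnLock(ReferenceOperation operation)
{
  return (PBNLock *) operation.context;
}

/* Mirrors lookUpPBNLock(): the context is a zone consulted on demand.
 * (The real code calls getPBNLock(zone, operation.pbn); this stand-in
 * just returns the zone's only lock.) */
static PBNLock *lookUpLock(ReferenceOperation operation)
{
  PhysicalZone *zone = operation.context;
  return (zone == NULL) ? NULL : &zone->lockForPBN;
}

static PBNLock *getLock(ReferenceOperation operation)
{
  return (operation.lockGetter == NULL) ? NULL
                                        : operation.lockGetter(operation);
}

int main(void)
{
  PhysicalZone zone = { .lockForPBN = { .pbn = 42 } };

  ReferenceOperation eager = { .pbn = 42, .lockGetter = returnLock,
                               .context = &zone.lockForPBN };
  ReferenceOperation lazy  = { .pbn = 42, .lockGetter = lookUpLock,
                               .context = &zone };

  printf("eager lock pbn: %lu\n", getLock(eager)->pbn);
  printf("lazy lock pbn:  %lu\n", getLock(lazy)->pbn);
  return 0;
}

As the two setup functions' comments say, callers which already hold the lock pay nothing extra, while callers which may or may not hold it defer the zone lookup until someone actually asks for the lock.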
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - */ - -#ifndef RELEASE_VERSIONS_H -#define RELEASE_VERSIONS_H - -enum { - OXYGEN_RELEASE_VERSION_NUMBER = 109583, - FLUORINE_RELEASE_VERSION_NUMBER = 115838, - NEON_RELEASE_VERSION_NUMBER = 120965, - SODIUM_RELEASE_VERSION_NUMBER = 127441, - MAGNESIUM_RELEASE_VERSION_NUMBER = 131337, - ALUMINUM_RELEASE_VERSION_NUMBER = 133524, - HEAD_RELEASE_VERSION_NUMBER = 0, - CURRENT_RELEASE_VERSION_NUMBER = ALUMINUM_RELEASE_VERSION_NUMBER, -}; - -#endif /* not RELEASE_VERSIONS_H */ diff --git a/vdo/base/ringNode.h b/vdo/base/ringNode.h deleted file mode 100644 index 5f389f4..0000000 --- a/vdo/base/ringNode.h +++ /dev/null @@ -1,202 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/ringNode.h#1 $ - */ - -#ifndef RING_NODE_H -#define RING_NODE_H - -#include "types.h" - -/** - * A ring node is a member of a doubly-linked circular list. - * - * Each node is usually embedded within a data structure that contains the - * relevant payload. In addition the ring head is also represented by a - * node where the next field designates the first element of the ring and the - * prev field designates the last. - * - * An empty ring contains next and prev fields that point back to the ring - * head itself. - * - * Typical iteration over a ring, from the front and back: - * - * for (RingNode *n = head->next; n != head; n = n->next) { ... } - * for (RingNode *p = head->prev; p != head; p = p->prev) { ... } - **/ -typedef struct ringNode RingNode; - -struct ringNode { - RingNode *next; - RingNode *prev; -}; - -/** - * Initialize a ring to be empty. - * - * @param head The head of the ring - **/ -static inline void initializeRing(RingNode *head) -{ - head->next = head->prev = head; -} - -/** - * Check whether a ring is empty. - * - * @param head The head of the ring - * - * @return true if the ring is empty - **/ -static inline bool isRingEmpty(const RingNode *head) -{ - return (head->next == head); -} - -/** - * Check whether a ring contains exactly one node. - * - * @param head The head of the ring - * - * @return true if the ring contains exactly one member - **/ -static inline bool isRingSingleton(const RingNode *head) -{ - return (!isRingEmpty(head) && (head->prev == head->next)); -} - -/** - * Unsplice a contiguous chain of at least one node from its ring. 
- * - * @param first the first entry in the ring to unsplice - * @param last the last entry in the ring to unsplice, - * may be the same as ``first`` - * - * The effect of this is to create two rings, the one designated - * by first through last, and the other consisting of anything remaining. - **/ -static inline void unspliceRingChain(RingNode *first, - RingNode *last) -{ - first->prev->next = last->next; - last->next->prev = first->prev; - first->prev = last; - last->next = first; -} - -/** - * Remove a ring node from its ring. - * - * @param node the ring node - * - * @return the removed node, for convenience - **/ -static inline RingNode *unspliceRingNode(RingNode *node) -{ - unspliceRingChain(node, node); - return node; -} - -/** - * Splice a contiguous chain of at least one node after the specified entry, - * which may be the head of a ring. - * - * @param first the first entry in a contiguous span of nodes - * @param last the last entry in a contiguous span of nodes, - * may be the same as ``first`` - * @param where the entry after which ``first`` through ``last`` - * shall appear - * - * The effect of this is to unsplice first through last (if necessary) and - * insert them after ``where`` so that the previous nodes after ``where`` - * now appear after ``last``. - **/ -static inline void spliceRingChainAfter(RingNode *first, - RingNode *last, - RingNode *where) -{ - if (last->next != first) { - unspliceRingChain(first, last); - } - last->next = where->next; - first->prev = where; - where->next->prev = last; - where->next = first; -} - -/** - * Splice a contiguous chain of at least one node before the specified entry, - * which may be the tail of a list. - * - * @param first the first entry in a contiguous span of nodes - * @param last the last entry in a contiguous span of nodes, - * may be the same as ``first`` - * @param where the entry before which ``first`` through ``last`` - * shall appear - * - * The effect of this is to unsplice first through last (if necessary) and - * insert them before ``where`` so that the previous nodes before ``where`` - * now appear before ``first``. - **/ -static inline void spliceRingChainBefore(RingNode *first, - RingNode *last, - RingNode *where) -{ - if (last->next != first) { - unspliceRingChain(first, last); - } - first->prev = where->prev; - last->next = where; - where->prev->next = first; - where->prev = last; -} - -/** - * Push a single node on the end of a ring. - * - * @param head The ring head - * @param node The node to push - **/ -static inline void pushRingNode(RingNode *head, RingNode *node) -{ - spliceRingChainBefore(node, node, head); -} - -/** - * Pop a single node off the end of a ring. - * - * @param head The ring head - * - * @return NULL if the ring was empty, otherwise the node that was - * removed from the ring (``head->prev``) - **/ -static inline RingNode *popRingNode(RingNode *head) -{ - return (isRingEmpty(head) ? NULL : unspliceRingNode(head->prev)); -} - -/** - * Remove a single node off the front of the list - **/ -static inline RingNode *chopRingNode(RingNode *head) -{ - return (isRingEmpty(head) ? NULL : unspliceRingNode(head->next)); -} - -#endif // RING_NODE_H diff --git a/vdo/base/slab.c b/vdo/base/slab.c deleted file mode 100644 index f2903d6..0000000 --- a/vdo/base/slab.c +++ /dev/null @@ -1,468 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
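The ring operations above are meant to be used with a RingNode embedded in a payload structure, exactly as makeSlab() does below with slab->ringNode. A short usage sketch, assuming ringNode.h (and the types.h it includes) is on the include path; the Item type and asItem() helper are invented for the example:

#include <stddef.h>
#include <stdio.h>

#include "ringNode.h" /* assumes the header above is on the include path */

/* A payload type embedding a RingNode, the usual pattern in this code base. */
typedef struct {
  RingNode node;
  int      value;
} Item;

/* Recover the Item from its embedded node (classic container-of). */
static Item *asItem(RingNode *node)
{
  return (Item *) ((char *) node - offsetof(Item, node));
}

int main(void)
{
  RingNode head;
  initializeRing(&head);

  Item items[3] = { { .value = 1 }, { .value = 2 }, { .value = 3 } };
  for (int i = 0; i < 3; i++) {
    initializeRing(&items[i].node);      /* each node starts as a singleton */
    pushRingNode(&head, &items[i].node); /* append at the tail */
  }

  /* Forward iteration, exactly as the header comment suggests. */
  for (RingNode *n = head.next; n != &head; n = n->next) {
    printf("forward: %d\n", asItem(n)->value);
  }

  /* chopRingNode() removes from the front; popRingNode() from the back. */
  printf("chopped: %d\n", asItem(chopRingNode(&head))->value); /* 1 */
  printf("popped:  %d\n", asItem(popRingNode(&head))->value);  /* 3 */
  return 0;
}

The forward and reverse loops from the header comment work because the head node is itself part of the ring, so iteration stops when the cursor comes back around to &head.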
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slab.c#9 $ - */ - -#include "slab.h" - -#include "logger.h" -#include "memoryAlloc.h" - -#include "adminState.h" -#include "blockAllocatorInternals.h" -#include "completion.h" -#include "constants.h" -#include "numUtils.h" -#include "pbnLock.h" -#include "recoveryJournal.h" -#include "refCounts.h" -#include "slabDepot.h" -#include "slabJournal.h" -#include "slabJournalInternals.h" -#include "slabSummary.h" - -/**********************************************************************/ -int configureSlab(BlockCount slabSize, - BlockCount slabJournalBlocks, - SlabConfig *slabConfig) -{ - if (slabJournalBlocks >= slabSize) { - return VDO_BAD_CONFIGURATION; - } - - /* - * This calculation should technically be a recurrence, but the total number - * of metadata blocks is currently less than a single block of refCounts, so - * we'd gain at most one data block in each slab with more iteration. - */ - BlockCount refBlocks - = getSavedReferenceCountSize(slabSize - slabJournalBlocks); - BlockCount metaBlocks = (refBlocks + slabJournalBlocks); - - // Make sure test code hasn't configured slabs to be too small. - if (metaBlocks >= slabSize) { - return VDO_BAD_CONFIGURATION; - } - - /* - * If the slab size is very small, assume this must be a unit test and - * override the number of data blocks to be a power of two (wasting blocks - * in the slab). Many tests need their dataBlocks fields to be the exact - * capacity of the configured volume, and that used to fall out since they - * use a power of two for the number of data blocks, the slab size was a - * power of two, and every block in a slab was a data block. - * - * XXX Try to figure out some way of structuring testParameters and unit - * tests so this hack isn't needed without having to edit several unit tests - * every time the metadata size changes by one block. - */ - BlockCount dataBlocks = slabSize - metaBlocks; - if ((slabSize < 1024) && !isPowerOfTwo(dataBlocks)) { - dataBlocks = ((BlockCount) 1 << logBaseTwo(dataBlocks)); - } - - /* - * Configure the slab journal thresholds. The flush threshold is 168 of 224 - * blocks in production, or 3/4ths, so we use this ratio for all sizes. - */ - BlockCount flushingThreshold = ((slabJournalBlocks * 3) + 3) / 4; - /* - * The blocking threshold should be far enough from the the flushing - * threshold to not produce delays, but far enough from the end of the - * journal to allow multiple successive recovery failures. - */ - BlockCount remaining = slabJournalBlocks - flushingThreshold; - BlockCount blockingThreshold = flushingThreshold + ((remaining * 5) / 7); - /* - * The scrubbing threshold should be at least 2048 entries before the end of - * the journal. 
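 *
 * Worked example (editorial, not part of the original): with the
 * production slab journal of 224 blocks mentioned above,
 *
 *   flushingThreshold = ((224 * 3) + 3) / 4   = 168
 *   remaining         = 224 - 168             = 56
 *   blockingThreshold = 168 + ((56 * 5) / 7)  = 208
 *
 * so flushing starts at 3/4 of the journal and blocking at 13/14 of it.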
- */ - BlockCount minimalExtraSpace - = 1 + (MAXIMUM_USER_VIOS / SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK); - BlockCount scrubbingThreshold = blockingThreshold; - if (slabJournalBlocks > minimalExtraSpace) { - scrubbingThreshold = slabJournalBlocks - minimalExtraSpace; - } - if (blockingThreshold > scrubbingThreshold) { - blockingThreshold = scrubbingThreshold; - } - - *slabConfig = (SlabConfig) { - .slabBlocks = slabSize, - .dataBlocks = dataBlocks, - .referenceCountBlocks = refBlocks, - .slabJournalBlocks = slabJournalBlocks, - .slabJournalFlushingThreshold = flushingThreshold, - .slabJournalBlockingThreshold = blockingThreshold, - .slabJournalScrubbingThreshold = scrubbingThreshold - }; - return VDO_SUCCESS; -} - -/**********************************************************************/ -PhysicalBlockNumber getSlabJournalStartBlock(const SlabConfig *slabConfig, - PhysicalBlockNumber origin) -{ - return origin + slabConfig->dataBlocks + slabConfig->referenceCountBlocks; -} - -/**********************************************************************/ -int makeSlab(PhysicalBlockNumber slabOrigin, - BlockAllocator *allocator, - PhysicalBlockNumber translation, - RecoveryJournal *recoveryJournal, - SlabCount slabNumber, - bool isNew, - Slab **slabPtr) -{ - Slab *slab; - int result = ALLOCATE(1, Slab, __func__, &slab); - if (result != VDO_SUCCESS) { - return result; - } - - const SlabConfig *slabConfig = getSlabConfig(allocator->depot); - - slab->allocator = allocator; - slab->start = slabOrigin; - slab->end = slab->start + slabConfig->slabBlocks; - slab->slabNumber = slabNumber; - initializeRing(&slab->ringNode); - - slab->refCountsOrigin = slabOrigin + slabConfig->dataBlocks + translation; - slab->journalOrigin = (getSlabJournalStartBlock(slabConfig, slabOrigin) - + translation); - - result = makeSlabJournal(allocator, slab, recoveryJournal, &slab->journal); - if (result != VDO_SUCCESS) { - freeSlab(&slab); - return result; - } - - if (isNew) { - slab->state.state = ADMIN_STATE_NEW; - result = allocateRefCountsForSlab(slab); - if (result != VDO_SUCCESS) { - freeSlab(&slab); - return result; - } - } - - *slabPtr = slab; - return VDO_SUCCESS; -} - -/**********************************************************************/ -int allocateRefCountsForSlab(Slab *slab) -{ - BlockAllocator *allocator = slab->allocator; - const SlabConfig *slabConfig = getSlabConfig(allocator->depot); - - int result = ASSERT(slab->referenceCounts == NULL, - "Slab %u doesn't allocate refcounts twice", - slab->slabNumber); - if (result != VDO_SUCCESS) { - return result; - } - - return makeRefCounts(slabConfig->dataBlocks, slab, slab->refCountsOrigin, - allocator->readOnlyNotifier, &slab->referenceCounts); -} - -/**********************************************************************/ -void freeSlab(Slab **slabPtr) -{ - Slab *slab = *slabPtr; - if (slab == NULL) { - return; - } - - unspliceRingNode(&slab->ringNode); - freeSlabJournal(&slab->journal); - freeRefCounts(&slab->referenceCounts); - FREE(slab); - *slabPtr = NULL; -} - -/**********************************************************************/ -ZoneCount getSlabZoneNumber(Slab *slab) -{ - return slab->allocator->zoneNumber; -} - -/**********************************************************************/ -void markSlabReplaying(Slab *slab) -{ - if (slab->status == SLAB_REBUILT) { - slab->status = SLAB_REPLAYING; - } -} - -/**********************************************************************/ -void markSlabUnrecovered(Slab *slab) -{ - slab->status = SLAB_REQUIRES_SCRUBBING; 
-} - -/**********************************************************************/ -BlockCount getSlabFreeBlockCount(const Slab *slab) -{ - return getUnreferencedBlockCount(slab->referenceCounts); -} - -/**********************************************************************/ -int modifySlabReferenceCount(Slab *slab, - const JournalPoint *journalPoint, - ReferenceOperation operation) -{ - if (slab == NULL) { - return VDO_SUCCESS; - } - - /* - * If the slab is unrecovered, preserve the refCount state and let scrubbing - * correct the refCount. Note that the slab journal has already captured all - * refCount updates. - */ - if (isUnrecoveredSlab(slab)) { - SequenceNumber entryLock = journalPoint->sequenceNumber; - adjustSlabJournalBlockReference(slab->journal, entryLock, -1); - return VDO_SUCCESS; - } - - bool freeStatusChanged; - int result = adjustReferenceCount(slab->referenceCounts, operation, - journalPoint, &freeStatusChanged); - if (result != VDO_SUCCESS) { - return result; - } - - if (freeStatusChanged) { - adjustFreeBlockCount(slab, !isIncrementOperation(operation.type)); - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -int acquireProvisionalReference(Slab *slab, - PhysicalBlockNumber pbn, - PBNLock *lock) -{ - if (hasProvisionalReference(lock)) { - return VDO_SUCCESS; - } - - int result = provisionallyReferenceBlock(slab->referenceCounts, pbn, lock); - if (result != VDO_SUCCESS) { - return result; - } - - if (hasProvisionalReference(lock)) { - adjustFreeBlockCount(slab, false); - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -int slabBlockNumberFromPBN(Slab *slab, - PhysicalBlockNumber physicalBlockNumber, - SlabBlockNumber *slabBlockNumberPtr) -{ - if (physicalBlockNumber < slab->start) { - return VDO_OUT_OF_RANGE; - } - - uint64_t slabBlockNumber = physicalBlockNumber - slab->start; - if (slabBlockNumber >= getSlabConfig(slab->allocator->depot)->dataBlocks) { - return VDO_OUT_OF_RANGE; - } - - *slabBlockNumberPtr = slabBlockNumber; - return VDO_SUCCESS; -} - -/**********************************************************************/ -bool shouldSaveFullyBuiltSlab(const Slab *slab) -{ - // Write out the refCounts if the slab has written them before, or it has - // any non-zero reference counts, or there are any slab journal blocks. - BlockCount dataBlocks = getSlabConfig(slab->allocator->depot)->dataBlocks; - return (mustLoadRefCounts(slab->allocator->summary, slab->slabNumber) - || (getSlabFreeBlockCount(slab) != dataBlocks) - || !isSlabJournalBlank(slab->journal)); -} - -/** - * Initiate a slab action. - * - * Implements AdminInitiator. 
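 *
 * Editorial sketch (not part of the original): callers do not invoke this
 * directly; startSlabAction() below registers it as the initiator, e.g.
 *
 *   startSlabAction(slab, ADMIN_STATE_SCRUBBING, parent);
 *
 * after which the draining, loading, or resuming branch below is chosen
 * from the slab's resulting admin state.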
- **/ -static void initiateSlabAction(AdminState *state) -{ - Slab *slab = container_of(state, Slab, state); - if (isDraining(state)) { - if (state->state == ADMIN_STATE_SCRUBBING) { - slab->status = SLAB_REBUILDING; - } - - drainSlabJournal(slab->journal); - - if (slab->referenceCounts != NULL) { - drainRefCounts(slab->referenceCounts); - } - - checkIfSlabDrained(slab); - return; - } - - if (isLoading(state)) { - decodeSlabJournal(slab->journal); - return; - } - - if (isResuming(state)) { - queueSlab(slab); - finishResuming(state); - return; - } - - finishOperationWithResult(state, VDO_INVALID_ADMIN_STATE); -} - -/**********************************************************************/ -void startSlabAction(Slab *slab, - AdminStateCode operation, - VDOCompletion *parent) -{ - startOperationWithWaiter(&slab->state, operation, parent, - initiateSlabAction); -} - -/**********************************************************************/ -void notifySlabJournalIsLoaded(Slab *slab, int result) -{ - if ((result == VDO_SUCCESS) && isCleanLoad(&slab->state)) { - // Since this is a normal or new load, we don't need the memory to read and - // process the recovery journal, so we can allocate reference counts now. - result = allocateRefCountsForSlab(slab); - } - - finishLoadingWithResult(&slab->state, result); -} - -/**********************************************************************/ -bool isSlabOpen(Slab *slab) -{ - return (!isQuiescing(&slab->state) && !isQuiescent(&slab->state)); -} - -/**********************************************************************/ -bool isSlabDraining(Slab *slab) -{ - return isDraining(&slab->state); -} - -/**********************************************************************/ -void checkIfSlabDrained(Slab *slab) -{ - if (isDraining(&slab->state) - && !isSlabJournalActive(slab->journal) - && ((slab->referenceCounts == NULL) - || !areRefCountsActive(slab->referenceCounts))) { - finishDrainingWithResult(&slab->state, - (isReadOnly(slab->allocator->readOnlyNotifier) - ? VDO_READ_ONLY : VDO_SUCCESS)); - } -} - -/**********************************************************************/ -void notifySlabJournalIsDrained(Slab *slab, int result) -{ - if (slab->referenceCounts == NULL) { - // This can happen when shutting down a VDO that was in read-only mode when - // loaded. 
- notifyRefCountsAreDrained(slab, result); - return; - } - - setOperationResult(&slab->state, result); - drainRefCounts(slab->referenceCounts); -} - -/**********************************************************************/ -void notifyRefCountsAreDrained(Slab *slab, int result) -{ - finishDrainingWithResult(&slab->state, result); -} - -/**********************************************************************/ -bool isSlabResuming(Slab *slab) -{ - return isResuming(&slab->state); -} - -/**********************************************************************/ -void finishScrubbingSlab(Slab *slab) -{ - slab->status = SLAB_REBUILT; - queueSlab(slab); - reopenSlabJournal(slab->journal); -} - -/**********************************************************************/ -static const char *statusToString(SlabRebuildStatus status) -{ - switch (status) { - case SLAB_REBUILT: - return "REBUILT"; - case SLAB_REQUIRES_SCRUBBING: - return "SCRUBBING"; - case SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING: - return "PRIORITY_SCRUBBING"; - case SLAB_REBUILDING: - return "REBUILDING"; - case SLAB_REPLAYING: - return "REPLAYING"; - default: - return "UNKNOWN"; - } -} - -/**********************************************************************/ -void dumpSlab(const Slab *slab) -{ - if (slab->referenceCounts != NULL) { - // Terse because there are a lot of slabs to dump and syslog is lossy. - logInfo("slab %u: P%u, %llu free", - slab->slabNumber, slab->priority, getSlabFreeBlockCount(slab)); - } else { - logInfo("slab %u: status %s", slab->slabNumber, - statusToString(slab->status)); - } - - dumpSlabJournal(slab->journal); - - if (slab->referenceCounts != NULL) { - dumpRefCounts(slab->referenceCounts); - } else { - logInfo("refCounts is null"); - } -} diff --git a/vdo/base/slab.h b/vdo/base/slab.h deleted file mode 100644 index c7f204b..0000000 --- a/vdo/base/slab.h +++ /dev/null @@ -1,379 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slab.h#8 $ - */ - -#ifndef VDO_SLAB_H -#define VDO_SLAB_H - -#include "permassert.h" - -#include "adminState.h" -#include "fixedLayout.h" -#include "journalPoint.h" -#include "referenceOperation.h" -#include "ringNode.h" -#include "types.h" - -typedef uint32_t SlabBlockNumber; - -typedef enum { - SLAB_REBUILT = 0, - SLAB_REPLAYING, - SLAB_REQUIRES_SCRUBBING, - SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING, - SLAB_REBUILDING, -} SlabRebuildStatus; - -/** - * This is the type declaration for the Slab type. (The struct tag is named - * vdoSlab to avoid a conflict with the linux kernel type). A Slab currently - * consists of a run of 2^23 data blocks, but that will soon change to - * dedicate a small number of those blocks for metadata storage for the - * reference counts and slab journal for the slab. 
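 *
 * Editorial sketch (not part of the original) of the layout implied by
 * the origin fields below and by getSlabJournalStartBlock():
 *
 *   start                                                           end
 *     | data blocks | reference count blocks | slab journal blocks |
 *
 * so refCountsOrigin = start + dataBlocks, and journalOrigin follows the
 * reference count blocks (both offset by the depot's translation).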
- **/ -struct vdoSlab { - /** A RingNode to queue this slab in a BlockAllocator ring */ - RingNode ringNode; - - /** The BlockAllocator that owns this slab */ - BlockAllocator *allocator; - - /** The reference counts for the data blocks in this slab */ - RefCounts *referenceCounts; - /** The journal for this slab */ - SlabJournal *journal; - - /** The slab number of this slab */ - SlabCount slabNumber; - /** The offset in the allocator partition of the first block in this slab */ - PhysicalBlockNumber start; - /** The offset of the first block past the end of this slab */ - PhysicalBlockNumber end; - /** The starting translated PBN of the slab journal */ - PhysicalBlockNumber journalOrigin; - /** The starting translated PBN of the reference counts */ - PhysicalBlockNumber refCountsOrigin; - - /** The administrative state of the slab */ - AdminState state; - /** The status of the slab */ - SlabRebuildStatus status; - /** Whether the slab was ever queued for scrubbing */ - bool wasQueuedForScrubbing; - - /** The priority at which this slab has been queued for allocation */ - uint8_t priority; -}; - -/** - * Measure and initialize the configuration to use for each slab. - * - * @param [in] slabSize The number of blocks per slab - * @param [in] slabJournalBlocks The number of blocks for the slab journal - * @param [out] slabConfig The slab configuration to initialize - * - * @return VDO_SUCCESS or an error code - **/ -int configureSlab(BlockCount slabSize, - BlockCount slabJournalBlocks, - SlabConfig *slabConfig) - __attribute__((warn_unused_result)); - -/** - * Convert a Slab's RingNode back to the Slab. - * - * @param ringNode The RingNode to convert - * - * @return The RingNode as a Slab - **/ -static inline Slab *slabFromRingNode(RingNode *ringNode) -{ - STATIC_ASSERT(offsetof(Slab, ringNode) == 0); - return (Slab *) ringNode; -} - -/** - * Get the physical block number of the start of the slab journal - * relative to the start block allocator partition. - * - * @param slabConfig The slab configuration of the VDO - * @param origin The first block of the slab - **/ -__attribute__((warn_unused_result)) -PhysicalBlockNumber getSlabJournalStartBlock(const SlabConfig *slabConfig, - PhysicalBlockNumber origin); - -/** - * Construct a new, empty slab. - * - * @param [in] slabOrigin The physical block number within the block - * allocator partition of the first block in the - * slab - * @param [in] allocator The block allocator to which the slab belongs - * @param [in] translation The translation from the depot's partition to - * the physical storage - * @param [in] recoveryJournal The recovery journal of the VDO - * @param [in] slabNumber The slab number of the slab - * @param [in] isNew true if this slab is being - * allocated as part of a resize - * @param [out] slabPtr A pointer to receive the new slab - * - * @return VDO_SUCCESS or an error code - **/ -int makeSlab(PhysicalBlockNumber slabOrigin, - BlockAllocator *allocator, - PhysicalBlockNumber translation, - RecoveryJournal *recoveryJournal, - SlabCount slabNumber, - bool isNew, - Slab **slabPtr) - __attribute__((warn_unused_result)); - -/** - * Allocate the reference counts for a slab. - * - * @param slab The slab whose reference counts need allocation. - * - * @return VDO_SUCCESS or an error code - **/ -int allocateRefCountsForSlab(Slab *slab) - __attribute__((warn_unused_result)); - -/** - * Destroy a slab and null out the reference to it. 
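 *
 * Editorial note on slabFromRingNode() above (not part of the original):
 * the cast is only valid because ringNode is the first member of struct
 * vdoSlab, which the STATIC_ASSERT on its offset enforces; it acts as a
 * zero-offset container_of(). A hypothetical caller walking an allocator
 * ring might therefore write:
 *
 *   Slab *slab = slabFromRingNode(popRingNode(&allocatorRing));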
- * - * @param slabPtr The reference to the slab to destroy - **/ -void freeSlab(Slab **slabPtr); - -/** - * Get the physical zone number of a slab. - * - * @param slab The slab - * - * @return The number of the slab's physical zone - **/ -ZoneCount getSlabZoneNumber(Slab *slab) - __attribute__((warn_unused_result)); - -/** - * Check whether a slab is unrecovered. - * - * @param slab The slab to check - * - * @return true if the slab is unrecovered - **/ -static inline bool isUnrecoveredSlab(const Slab *slab) -{ - return (slab->status != SLAB_REBUILT); -} - -/** - * Check whether a slab is being replayed into. - * - * @param slab The slab to check - * - * @return true if the slab is replaying - **/ -static inline bool isReplayingSlab(const Slab *slab) -{ - return (slab->status == SLAB_REPLAYING); -} - -/** - * Check whether a slab is being rebuilt. - * - * @param slab The slab to check - * - * @return true if the slab is being rebuilt - **/ -static inline bool slabIsRebuilding(const Slab *slab) -{ - return (slab->status == SLAB_REBUILDING); -} - -/** - * Mark a slab as replaying, during offline recovery. - * - * @param slab The slab to mark - **/ -void markSlabReplaying(Slab *slab); - -/** - * Mark a slab as unrecovered, for online recovery. - * - * @param slab The slab to mark - **/ -void markSlabUnrecovered(Slab *slab); - -/** - * Get the current number of free blocks in a slab. - * - * @param slab The slab to query - * - * @return the number of free blocks in the slab - **/ -BlockCount getSlabFreeBlockCount(const Slab *slab) - __attribute__((warn_unused_result)); - -/** - * Increment or decrement the reference count of a block in a slab. - * - * @param slab The slab containing the block (may be NULL when - * referencing the zero block) - * @param journalPoint The slab journal entry corresponding to this change - * @param operation The operation to perform on the reference count - * - * @return VDO_SUCCESS or an error - **/ -int modifySlabReferenceCount(Slab *slab, - const JournalPoint *journalPoint, - ReferenceOperation operation) - __attribute__((warn_unused_result)); - -/** - * Acquire a provisional reference on behalf of a PBN lock if the block it - * locks is unreferenced. - * - * @param slab The slab which contains the block - * @param pbn The physical block to reference - * @param lock The lock - * - * @return VDO_SUCCESS or an error - **/ -int acquireProvisionalReference(Slab *slab, - PhysicalBlockNumber pbn, - PBNLock *lock) - __attribute__((warn_unused_result)); - -/** - * Determine the index within the slab of a particular physical block number. - * - * @param [in] slab The slab - * @param [in] physicalBlockNumber The physical block number - * @param [out] slabBlockNumberPtr A pointer to the slab block number - * - * @return VDO_SUCCESS or an error code - **/ -int slabBlockNumberFromPBN(Slab *slab, - PhysicalBlockNumber physicalBlockNumber, - SlabBlockNumber *slabBlockNumberPtr) - __attribute__((warn_unused_result)); - -/** - * Check whether the reference counts for a given rebuilt slab should be saved. - * Implements SlabStatusChecker. - * - * @param slab The slab to check - * - * @return true if the slab should be saved - **/ -bool shouldSaveFullyBuiltSlab(const Slab *slab) - __attribute__((warn_unused_result)); - -/** - * Start an administrative operation on a slab. 
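 *
 * Editorial example for slabBlockNumberFromPBN() declared above (not part
 * of the original), using hypothetical values start = 4096 and
 * dataBlocks = 3584:
 *
 *   pbn 4100  ->  slab block number 4    (4100 - 4096 < 3584)
 *   pbn 8000  ->  VDO_OUT_OF_RANGE       (8000 - 4096 >= 3584)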
- * - * @param slab The slab to load - * @param operation The type of load to perform - * @param parent The object to notify when the operation is complete - **/ -void startSlabAction(Slab *slab, - AdminStateCode operation, - VDOCompletion *parent); - -/** - * Inform a slab that its journal has been loaded. - * - * @param slab The slab whose journal has been loaded - * @param result The result of the load operation - **/ -void notifySlabJournalIsLoaded(Slab *slab, int result); - -/** - * Check whether a slab is open, i.e. is neither quiescent nor quiescing. - * - * @param slab The slab to check - * - * @return true if the slab is open - **/ -bool isSlabOpen(Slab *slab) - __attribute__((warn_unused_result)); - -/** - * Check whether a slab is currently draining. - * - * @param slab The slab to check - * - * @return true if the slab is performing a drain operation - **/ -bool isSlabDraining(Slab *slab) - __attribute__((warn_unused_result)); - -/** - * Check whether a slab has drained, and if so, send a notification thereof. - * - * @param slab The slab to check - **/ -void checkIfSlabDrained(Slab *slab); - -/** - * Inform a slab that its journal has finished draining. - * - * @param slab The slab whose journal has been drained - * @param result The result of the drain operation - **/ -void notifySlabJournalIsDrained(Slab *slab, int result); - -/** - * Inform a slab that its RefCounts have finished draining. - * - * @param slab The slab whose RefCounts has been drained - * @param result The result of the drain operation - **/ -void notifyRefCountsAreDrained(Slab *slab, int result); - -/** - * Check whether a slab is currently resuming. - * - * @param slab The slab to check - * - * @return true if the slab is performing a resume operation - **/ -bool isSlabResuming(Slab *slab) - __attribute__((warn_unused_result)); - -/** - * Finish scrubbing a slab now that it has been rebuilt by updating its status, - * queueing it for allocation, and reopening its journal. - * - * @param slab The slab whose reference counts have been rebuilt from its - * journal - **/ -void finishScrubbingSlab(Slab *slab); - -/** - * Dump information about a slab to the log for debugging. - * - * @param slab The slab to dump - **/ -void dumpSlab(const Slab *slab); - -#endif // VDO_SLAB_H diff --git a/vdo/base/slabDepot.c b/vdo/base/slabDepot.c deleted file mode 100644 index 6c10c29..0000000 --- a/vdo/base/slabDepot.c +++ /dev/null @@ -1,1145 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabDepot.c#23 $ - */ - -#include "slabDepot.h" - -#include "logger.h" -#include "memoryAlloc.h" - -#include "actionManager.h" -#include "adminState.h" -#include "blockAllocatorInternals.h" -#include "constants.h" -#include "header.h" -#include "numUtils.h" -#include "readOnlyNotifier.h" -#include "refCounts.h" -#include "slab.h" -#include "slabDepotInternals.h" -#include "slabJournal.h" -#include "slabIterator.h" -#include "slabSummary.h" -#include "threadConfig.h" -#include "types.h" - -typedef struct { - SlabConfig slabConfig; - PhysicalBlockNumber firstBlock; - PhysicalBlockNumber lastBlock; - ZoneCount zoneCount; -} __attribute__((packed)) SlabDepotState2_0; - -static const Header SLAB_DEPOT_HEADER_2_0 = { - .id = SLAB_DEPOT, - .version = { - .majorVersion = 2, - .minorVersion = 0, - }, - .size = sizeof(SlabDepotState2_0), -}; - -/** - * Compute the number of slabs a depot with given parameters would have. - * - * @param firstBlock PBN of the first data block - * @param lastBlock PBN of the last data block - * @param slabSizeShift Exponent for the number of blocks per slab - * - * @return The number of slabs - **/ -__attribute__((warn_unused_result)) -static SlabCount computeSlabCount(PhysicalBlockNumber firstBlock, - PhysicalBlockNumber lastBlock, - unsigned int slabSizeShift) -{ - BlockCount dataBlocks = lastBlock - firstBlock; - return (SlabCount) (dataBlocks >> slabSizeShift); -} - -/**********************************************************************/ -SlabCount calculateSlabCount(SlabDepot *depot) -{ - return computeSlabCount(depot->firstBlock, depot->lastBlock, - depot->slabSizeShift); -} - -/** - * Get an iterator over all the slabs in the depot. - * - * @param depot The depot - * - * @return An iterator over the depot's slabs - **/ -static SlabIterator getSlabIterator(SlabDepot *depot) -{ - return iterateSlabs(depot->slabs, depot->slabCount - 1, 0, 1); -} - -/** - * Allocate a new slab pointer array. Any existing slab pointers will be - * copied into the new array, and slabs will be allocated as needed. The - * newly allocated slabs will not be distributed for use by the block - * allocators. - * - * @param depot The depot - * @param slabCount The number of slabs the depot should have in the new - * array - * - * @return VDO_SUCCESS or an error code - **/ -static int allocateSlabs(SlabDepot *depot, SlabCount slabCount) -{ - int result = ALLOCATE(slabCount, Slab *, "slab pointer array", - &depot->newSlabs); - if (result != VDO_SUCCESS) { - return result; - } - - bool resizing = false; - if (depot->slabs != NULL) { - memcpy(depot->newSlabs, depot->slabs, depot->slabCount * sizeof(Slab *)); - resizing = true; - } - - BlockCount slabSize = getSlabConfig(depot)->slabBlocks; - PhysicalBlockNumber slabOrigin - = depot->firstBlock + (depot->slabCount * slabSize); - - // The translation between allocator partition PBNs and layer PBNs. - BlockCount translation = depot->origin - depot->firstBlock; - depot->newSlabCount = depot->slabCount; - while (depot->newSlabCount < slabCount) { - BlockAllocator *allocator - = depot->allocators[depot->newSlabCount % depot->zoneCount]; - Slab **slabPtr = &depot->newSlabs[depot->newSlabCount]; - result = makeSlab(slabOrigin, allocator, translation, depot->journal, - depot->newSlabCount, resizing, slabPtr); - if (result != VDO_SUCCESS) { - return result; - } - // Increment here to ensure that abandonNewSlabs will clean up correctly. 
- depot->newSlabCount++; - - slabOrigin += slabSize; - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -void abandonNewSlabs(SlabDepot *depot) -{ - if (depot->newSlabs == NULL) { - return; - } - for (SlabCount i = depot->slabCount; i < depot->newSlabCount; i++) { - freeSlab(&depot->newSlabs[i]); - } - depot->newSlabCount = 0; - FREE(depot->newSlabs); - depot->newSlabs = NULL; - depot->newSize = 0; -} - -/** - * Get the ID of the thread on which a given allocator operates. - * - *
- * Implements ZoneThreadGetter.
- **/
-static ThreadID getAllocatorThreadID(void *context, ZoneCount zoneNumber)
-{
-  return getBlockAllocatorForZone(context, zoneNumber)->threadID;
-}
-
-/**
- * Prepare to commit oldest tail blocks.
- *
- * Implements ActionPreamble.
- **/
-static void prepareForTailBlockCommit(void *context, VDOCompletion *parent)
-{
-  SlabDepot *depot = context;
-  depot->activeReleaseRequest = depot->newReleaseRequest;
-  completeCompletion(parent);
-}
-
-/**
- * Schedule a tail block commit if necessary. This method should not be called
- * directly. Rather, call scheduleDefaultAction() on the depot's action
- * manager.
- *

Implements ActionScheduler, - **/ -static bool scheduleTailBlockCommit(void *context) -{ - SlabDepot *depot = context; - if (depot->newReleaseRequest == depot->activeReleaseRequest) { - return false; - } - - return scheduleAction(depot->actionManager, prepareForTailBlockCommit, - releaseTailBlockLocks, NULL, NULL); -} - -/** - * Allocate those components of the slab depot which are needed only at load - * time, not at format time. - * - * @param depot The depot - * @param nonce The nonce of the VDO - * @param threadConfig The thread config of the VDO - * @param vioPoolSize The size of the VIO pool - * @param layer The physical layer below this depot - * @param summaryPartition The partition which holds the slab summary - * - * @return VDO_SUCCESS or an error - **/ -static int allocateComponents(SlabDepot *depot, - Nonce nonce, - const ThreadConfig *threadConfig, - BlockCount vioPoolSize, - PhysicalLayer *layer, - Partition *summaryPartition) -{ - /* - * If createVIO is NULL, the slab depot is only being used to format - * or audit the VDO. These only require the SuperBlock component, so we can - * just skip allocating all the memory needed for runtime components. - */ - if (layer->createMetadataVIO == NULL) { - return VDO_SUCCESS; - } - - int result = initializeEnqueueableCompletion(&depot->scrubbingCompletion, - SUB_TASK_COMPLETION, layer); - if (result != VDO_SUCCESS) { - return result; - } - - result = makeActionManager(depot->zoneCount, getAllocatorThreadID, - getJournalZoneThread(threadConfig), depot, - scheduleTailBlockCommit, layer, - &depot->actionManager); - if (result != VDO_SUCCESS) { - return result; - } - - depot->origin = depot->firstBlock; - - result = makeSlabSummary(layer, summaryPartition, threadConfig, - depot->slabSizeShift, depot->slabConfig.dataBlocks, - depot->readOnlyNotifier, &depot->slabSummary); - if (result != VDO_SUCCESS) { - return result; - } - - SlabCount slabCount = calculateSlabCount(depot); - if (threadConfig->physicalZoneCount > slabCount) { - return logErrorWithStringError(VDO_BAD_CONFIGURATION, - "%u physical zones exceeds slab count %u", - threadConfig->physicalZoneCount, slabCount); - } - - // Allocate the block allocators. - for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { - ThreadID threadID = getPhysicalZoneThread(threadConfig, zone); - result = makeBlockAllocator(depot, zone, threadID, nonce, vioPoolSize, - layer, depot->readOnlyNotifier, - &depot->allocators[zone]); - if (result != VDO_SUCCESS) { - return result; - } - } - - // Allocate slabs. - result = allocateSlabs(depot, slabCount); - if (result != VDO_SUCCESS) { - return result; - } - - // Use the new slabs. - for (SlabCount i = depot->slabCount; i < depot->newSlabCount; i++) { - Slab *slab = depot->newSlabs[i]; - registerSlabWithAllocator(slab->allocator, slab); - depot->slabCount++; - } - - depot->slabs = depot->newSlabs; - depot->newSlabs = NULL; - depot->newSlabCount = 0; - - return VDO_SUCCESS; -} - -/** - * Allocate a slab depot. 
- * - * @param [in] state The parameters for the new depot - * @param [in] threadConfig The thread config of the VDO - * @param [in] nonce The nonce of the VDO - * @param [in] vioPoolSize The size of the VIO pool - * @param [in] layer The physical layer below this depot - * @param [in] summaryPartition The partition which holds the slab summary - * (if NULL, the depot is format-only) - * @param [in] readOnlyNotifier The context for entering read-only mode - * @param [in] recoveryJournal The recovery journal of the VDO - * @param [out] depotPtr A pointer to hold the depot - * - * @return A success or error code - **/ -__attribute__((warn_unused_result)) -static int allocateDepot(const SlabDepotState2_0 *state, - const ThreadConfig *threadConfig, - Nonce nonce, - BlockCount vioPoolSize, - PhysicalLayer *layer, - Partition *summaryPartition, - ReadOnlyNotifier *readOnlyNotifier, - RecoveryJournal *recoveryJournal, - SlabDepot **depotPtr) -{ - // Calculate the bit shift for efficiently mapping block numbers to slabs. - // Using a shift requires that the slab size be a power of two. - BlockCount slabSize = state->slabConfig.slabBlocks; - if (!isPowerOfTwo(slabSize)) { - return logErrorWithStringError(UDS_INVALID_ARGUMENT, - "slab size must be a power of two"); - } - unsigned int slabSizeShift = logBaseTwo(slabSize); - - SlabDepot *depot; - int result = ALLOCATE_EXTENDED(SlabDepot, threadConfig->physicalZoneCount, - BlockAllocator *, __func__, &depot); - if (result != VDO_SUCCESS) { - return result; - } - - depot->oldZoneCount = state->zoneCount; - depot->zoneCount = threadConfig->physicalZoneCount; - depot->slabConfig = state->slabConfig; - depot->readOnlyNotifier = readOnlyNotifier; - depot->firstBlock = state->firstBlock; - depot->lastBlock = state->lastBlock; - depot->slabSizeShift = slabSizeShift; - depot->journal = recoveryJournal; - - result = allocateComponents(depot, nonce, threadConfig, vioPoolSize, - layer, summaryPartition); - if (result != VDO_SUCCESS) { - freeSlabDepot(&depot); - return result; - } - - *depotPtr = depot; - return VDO_SUCCESS; -} - -/** - * Configure the SlabDepot for the specified storage capacity, finding the - * number of data blocks that will fit and still leave room for the depot - * metadata, then return the saved state for that configuration. - * - * @param [in] blockCount The number of blocks in the underlying storage - * @param [in] firstBlock The number of the first block that may be allocated - * @param [in] slabConfig The configuration of a single slab - * @param [in] zoneCount The number of zones the depot will use - * @param [out] state The state structure to be configured - * - * @return VDO_SUCCESS or an error code - **/ -static int configureState(BlockCount blockCount, - PhysicalBlockNumber firstBlock, - SlabConfig slabConfig, - ZoneCount zoneCount, - SlabDepotState2_0 *state) -{ - BlockCount slabSize = slabConfig.slabBlocks; - logDebug("slabDepot configureState(blockCount=%" PRIu64 - ", firstBlock=%llu, slabSize=%llu, zoneCount=%u)", - blockCount, firstBlock, slabSize, zoneCount); - - // We do not allow runt slabs, so we waste up to a slab's worth. 
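  /*
   * Editorial worked example (not part of the original), with purely
   * hypothetical numbers blockCount = 100000, slabSize = 8192, and
   * firstBlock = 1024:
   *
   *   slabCount       = 100000 / 8192  = 12
   *   totalSlabBlocks = 12 * 8192      = 98304
   *   lastBlock       = 1024 + 98304   = 99328
   *   leftOver        = 100000 - 98304 = 1696   (wasted; no runt slab)
   */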
- size_t slabCount = (blockCount / slabSize); - if (slabCount == 0) { - return VDO_NO_SPACE; - } - - if (slabCount > MAX_SLABS) { - return VDO_TOO_MANY_SLABS; - } - - BlockCount totalSlabBlocks = slabCount * slabConfig.slabBlocks; - BlockCount totalDataBlocks = slabCount * slabConfig.dataBlocks; - PhysicalBlockNumber lastBlock = firstBlock + totalSlabBlocks; - - *state = (SlabDepotState2_0) { - .slabConfig = slabConfig, - .firstBlock = firstBlock, - .lastBlock = lastBlock, - .zoneCount = zoneCount, - }; - - logDebug("slabDepot lastBlock=%llu, totalDataBlocks=%" PRIu64 - ", slabCount=%zu, leftOver=%llu", - lastBlock, totalDataBlocks, slabCount, - blockCount - (lastBlock - firstBlock)); - - return VDO_SUCCESS; -} - -/**********************************************************************/ -int makeSlabDepot(BlockCount blockCount, - PhysicalBlockNumber firstBlock, - SlabConfig slabConfig, - const ThreadConfig *threadConfig, - Nonce nonce, - BlockCount vioPoolSize, - PhysicalLayer *layer, - Partition *summaryPartition, - ReadOnlyNotifier *readOnlyNotifier, - RecoveryJournal *recoveryJournal, - SlabDepot **depotPtr) -{ - SlabDepotState2_0 state; - int result = configureState(blockCount, firstBlock, slabConfig, 0, &state); - if (result != VDO_SUCCESS) { - return result; - } - - SlabDepot *depot = NULL; - result = allocateDepot(&state, threadConfig, nonce, vioPoolSize, layer, - summaryPartition, readOnlyNotifier, recoveryJournal, - &depot); - if (result != VDO_SUCCESS) { - return result; - } - - *depotPtr = depot; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freeSlabDepot(SlabDepot **depotPtr) -{ - SlabDepot *depot = *depotPtr; - if (depot == NULL) { - return; - } - - abandonNewSlabs(depot); - - for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { - freeBlockAllocator(&depot->allocators[zone]); - } - - if (depot->slabs != NULL) { - for (SlabCount i = 0; i < depot->slabCount; i++) { - freeSlab(&depot->slabs[i]); - } - } - - FREE(depot->slabs); - freeActionManager(&depot->actionManager); - freeSlabSummary(&depot->slabSummary); - destroyEnqueueable(&depot->scrubbingCompletion); - FREE(depot); - *depotPtr = NULL; -} - -/**********************************************************************/ -size_t getSlabDepotEncodedSize(void) -{ - return ENCODED_HEADER_SIZE + sizeof(SlabDepotState2_0); -} - -/** - * Decode a slab config from a buffer. 
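 *
 * Editorial note (not part of the original): the seven 64-bit fields are
 * read in exactly the order encodeSlabConfig() writes them: slabBlocks,
 * dataBlocks, referenceCountBlocks, slabJournalBlocks, and then the
 * flushing, blocking, and scrubbing thresholds. An encoded SlabConfig
 * therefore occupies 7 * 8 = 56 bytes of the buffer.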
- * - * @param buffer A buffer positioned at the start of the encoding - * @param config The config structure to receive the decoded values - * - * @return UDS_SUCCESS or an error code - **/ -static int decodeSlabConfig(Buffer *buffer, SlabConfig *config) -{ - BlockCount count; - int result = getUInt64LEFromBuffer(buffer, &count); - if (result != UDS_SUCCESS) { - return result; - } - config->slabBlocks = count; - - result = getUInt64LEFromBuffer(buffer, &count); - if (result != UDS_SUCCESS) { - return result; - } - config->dataBlocks = count; - - result = getUInt64LEFromBuffer(buffer, &count); - if (result != UDS_SUCCESS) { - return result; - } - config->referenceCountBlocks = count; - - result = getUInt64LEFromBuffer(buffer, &count); - if (result != UDS_SUCCESS) { - return result; - } - config->slabJournalBlocks = count; - - result = getUInt64LEFromBuffer(buffer, &count); - if (result != UDS_SUCCESS) { - return result; - } - config->slabJournalFlushingThreshold = count; - - result = getUInt64LEFromBuffer(buffer, &count); - if (result != UDS_SUCCESS) { - return result; - } - config->slabJournalBlockingThreshold = count; - - result = getUInt64LEFromBuffer(buffer, &count); - if (result != UDS_SUCCESS) { - return result; - } - config->slabJournalScrubbingThreshold = count; - - return UDS_SUCCESS; -} - -/** - * Encode a slab config into a buffer. - * - * @param config The config structure to encode - * @param buffer A buffer positioned at the start of the encoding - * - * @return UDS_SUCCESS or an error code - **/ -static int encodeSlabConfig(const SlabConfig *config, Buffer *buffer) -{ - int result = putUInt64LEIntoBuffer(buffer, config->slabBlocks); - if (result != UDS_SUCCESS) { - return result; - } - - result = putUInt64LEIntoBuffer(buffer, config->dataBlocks); - if (result != UDS_SUCCESS) { - return result; - } - - result = putUInt64LEIntoBuffer(buffer, config->referenceCountBlocks); - if (result != UDS_SUCCESS) { - return result; - } - - result = putUInt64LEIntoBuffer(buffer, config->slabJournalBlocks); - if (result != UDS_SUCCESS) { - return result; - } - - result = putUInt64LEIntoBuffer(buffer, config->slabJournalFlushingThreshold); - if (result != UDS_SUCCESS) { - return result; - } - - result = putUInt64LEIntoBuffer(buffer, config->slabJournalBlockingThreshold); - if (result != UDS_SUCCESS) { - return result; - } - - return putUInt64LEIntoBuffer(buffer, config->slabJournalScrubbingThreshold); -} - -/**********************************************************************/ -int encodeSlabDepot(const SlabDepot *depot, Buffer *buffer) -{ - int result = encodeHeader(&SLAB_DEPOT_HEADER_2_0, buffer); - if (result != UDS_SUCCESS) { - return result; - } - - size_t initialLength = contentLength(buffer); - - result = encodeSlabConfig(&depot->slabConfig, buffer); - if (result != UDS_SUCCESS) { - return result; - } - - result = putUInt64LEIntoBuffer(buffer, depot->firstBlock); - if (result != UDS_SUCCESS) { - return result; - } - - result = putUInt64LEIntoBuffer(buffer, depot->lastBlock); - if (result != UDS_SUCCESS) { - return result; - } - - /* - * If this depot is currently using 0 zones, it must have been - * synchronously loaded by a tool and is now being saved. We - * did not load and combine the slab summary, so we still need - * to do that next time we load with the old zone count rather - * than 0. 
- */ - ZoneCount zonesToRecord = depot->zoneCount; - if (depot->zoneCount == 0) { - zonesToRecord = depot->oldZoneCount; - } - result = putByte(buffer, zonesToRecord); - if (result != UDS_SUCCESS) { - return result; - } - - size_t encodedSize = contentLength(buffer) - initialLength; - return ASSERT(SLAB_DEPOT_HEADER_2_0.size == encodedSize, - "encoded block map component size must match header size"); -} - -/** - * Decode slab depot component state version 2.0 from a buffer. - * - * @param buffer A buffer positioned at the start of the encoding - * @param state The state structure to receive the decoded values - * - * @return UDS_SUCCESS or an error code - **/ -static int decodeSlabDepotState_2_0(Buffer *buffer, SlabDepotState2_0 *state) -{ - size_t initialLength = contentLength(buffer); - - int result = decodeSlabConfig(buffer, &state->slabConfig); - if (result != UDS_SUCCESS) { - return result; - } - - PhysicalBlockNumber firstBlock; - result = getUInt64LEFromBuffer(buffer, &firstBlock); - if (result != UDS_SUCCESS) { - return result; - } - state->firstBlock = firstBlock; - - PhysicalBlockNumber lastBlock; - result = getUInt64LEFromBuffer(buffer, &lastBlock); - if (result != UDS_SUCCESS) { - return result; - } - state->lastBlock = lastBlock; - - result = getByte(buffer, &state->zoneCount); - if (result != UDS_SUCCESS) { - return result; - } - - size_t decodedSize = initialLength - contentLength(buffer); - return ASSERT(SLAB_DEPOT_HEADER_2_0.size == decodedSize, - "decoded slab depot component size must match header size"); -} - -/**********************************************************************/ -int decodeSlabDepot(Buffer *buffer, - const ThreadConfig *threadConfig, - Nonce nonce, - PhysicalLayer *layer, - Partition *summaryPartition, - ReadOnlyNotifier *readOnlyNotifier, - RecoveryJournal *recoveryJournal, - SlabDepot **depotPtr) -{ - Header header; - int result = decodeHeader(buffer, &header); - if (result != VDO_SUCCESS) { - return result; - } - - result = validateHeader(&SLAB_DEPOT_HEADER_2_0, &header, true, __func__); - if (result != VDO_SUCCESS) { - return result; - } - - SlabDepotState2_0 state; - result = decodeSlabDepotState_2_0(buffer, &state); - if (result != UDS_SUCCESS) { - return result; - } - - return allocateDepot(&state, threadConfig, nonce, VIO_POOL_SIZE, layer, - summaryPartition, readOnlyNotifier, recoveryJournal, - depotPtr); -} - -/**********************************************************************/ -int decodeSodiumSlabDepot(Buffer *buffer, - const ThreadConfig *threadConfig, - Nonce nonce, - PhysicalLayer *layer, - Partition *summaryPartition, - ReadOnlyNotifier *readOnlyNotifier, - RecoveryJournal *recoveryJournal, - SlabDepot **depotPtr) -{ - // Sodium uses version 2.0 of the slab depot state. 
- return decodeSlabDepot(buffer, threadConfig, nonce, layer, summaryPartition, - readOnlyNotifier, recoveryJournal, depotPtr); -} - -/**********************************************************************/ -int allocateSlabRefCounts(SlabDepot *depot) -{ - SlabIterator iterator = getSlabIterator(depot); - while (hasNextSlab(&iterator)) { - int result = allocateRefCountsForSlab(nextSlab(&iterator)); - if (result != VDO_SUCCESS) { - return result; - } - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -BlockAllocator *getBlockAllocatorForZone(SlabDepot *depot, - ZoneCount zoneNumber) -{ - return depot->allocators[zoneNumber]; -} - -/**********************************************************************/ -int getSlabNumber(const SlabDepot *depot, - PhysicalBlockNumber pbn, - SlabCount *slabNumberPtr) -{ - if (pbn < depot->firstBlock) { - return VDO_OUT_OF_RANGE; - } - - SlabCount slabNumber = (pbn - depot->firstBlock) >> depot->slabSizeShift; - if (slabNumber >= depot->slabCount) { - return VDO_OUT_OF_RANGE; - } - - *slabNumberPtr = slabNumber; - return VDO_SUCCESS; -} - -/**********************************************************************/ -Slab *getSlab(const SlabDepot *depot, PhysicalBlockNumber pbn) -{ - if (pbn == ZERO_BLOCK) { - return NULL; - } - - SlabCount slabNumber; - int result = getSlabNumber(depot, pbn, &slabNumber); - if (result != VDO_SUCCESS) { - enterReadOnlyMode(depot->readOnlyNotifier, result); - return NULL; - } - - return depot->slabs[slabNumber]; - -} - -/**********************************************************************/ -SlabJournal *getSlabJournal(const SlabDepot *depot, PhysicalBlockNumber pbn) -{ - Slab *slab = getSlab(depot, pbn); - return ((slab != NULL) ? slab->journal : NULL); -} - -/**********************************************************************/ -uint8_t getIncrementLimit(SlabDepot *depot, PhysicalBlockNumber pbn) -{ - Slab *slab = getSlab(depot, pbn); - if ((slab == NULL) || isUnrecoveredSlab(slab)) { - return 0; - } - - return getAvailableReferences(slab->referenceCounts, pbn); -} - -/**********************************************************************/ -bool isPhysicalDataBlock(const SlabDepot *depot, PhysicalBlockNumber pbn) -{ - if (pbn == ZERO_BLOCK) { - return true; - } - - SlabCount slabNumber; - if (getSlabNumber(depot, pbn, &slabNumber) != VDO_SUCCESS) { - return false; - } - - SlabBlockNumber sbn; - int result = slabBlockNumberFromPBN(depot->slabs[slabNumber], pbn, &sbn); - return (result == VDO_SUCCESS); -} - -/**********************************************************************/ -BlockCount getDepotAllocatedBlocks(const SlabDepot *depot) -{ - BlockCount total = 0; - for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { - // The allocators are responsible for thread safety. - total += getAllocatedBlocks(depot->allocators[zone]); - } - return total; -} - -/**********************************************************************/ -BlockCount getDepotDataBlocks(const SlabDepot *depot) -{ - // XXX This needs to be thread safe, but resize changes the slab count. It - // does so on the admin thread (our usual caller), so it's usually safe. 
- return (depot->slabCount * depot->slabConfig.dataBlocks); -} - -/**********************************************************************/ -BlockCount getDepotFreeBlocks(const SlabDepot *depot) -{ - /* - * We can't ever shrink a volume except when resize fails, and we can't - * allocate from the new slabs until after the resize succeeds, so by - * getting the number of allocated blocks first, we ensure the allocated - * count is always less than the capacity. Doing it in the other order on a - * full volume could lose a race with a sucessful resize, resulting in a - * nonsensical negative/underflow result. - */ - BlockCount allocated = getDepotAllocatedBlocks(depot); - memoryFence(); - return (getDepotDataBlocks(depot) - allocated); -} - -/**********************************************************************/ -SlabCount getDepotSlabCount(const SlabDepot *depot) -{ - return depot->slabCount; -} - -/**********************************************************************/ -SlabCount getDepotUnrecoveredSlabCount(const SlabDepot *depot) -{ - SlabCount total = 0; - for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { - // The allocators are responsible for thread safety. - total += getUnrecoveredSlabCount(depot->allocators[zone]); - } - return total; -} - -/** - * The preamble of a load operation which loads the slab summary. - * - *

Implements ActionPreamble. - **/ -static void startDepotLoad(void *context, VDOCompletion *parent) -{ - SlabDepot *depot = context; - loadSlabSummary(depot->slabSummary, - getCurrentManagerOperation(depot->actionManager), - depot->oldZoneCount, parent); -} - -/**********************************************************************/ -void loadSlabDepot(SlabDepot *depot, - AdminStateCode operation, - VDOCompletion *parent, - void *context) -{ - if (assertLoadOperation(operation, parent)) { - scheduleOperationWithContext(depot->actionManager, operation, - startDepotLoad, loadBlockAllocator, NULL, - context, parent); - } -} - -/**********************************************************************/ -void prepareToAllocate(SlabDepot *depot, - SlabDepotLoadType loadType, - VDOCompletion *parent) -{ - depot->loadType = loadType; - atomicStore32(&depot->zonesToScrub, depot->zoneCount); - scheduleAction(depot->actionManager, NULL, prepareAllocatorToAllocate, - NULL, parent); -} - -/**********************************************************************/ -void updateSlabDepotSize(SlabDepot *depot) -{ - depot->lastBlock = depot->newLastBlock; -} - -/**********************************************************************/ -int prepareToGrowSlabDepot(SlabDepot *depot, BlockCount newSize) -{ - if ((newSize >> depot->slabSizeShift) <= depot->slabCount) { - return VDO_INCREMENT_TOO_SMALL; - } - - // Generate the depot configuration for the new block count. - SlabDepotState2_0 newState; - int result = configureState(newSize, depot->firstBlock, depot->slabConfig, - depot->zoneCount, &newState); - if (result != VDO_SUCCESS) { - return result; - } - - SlabCount newSlabCount = computeSlabCount(depot->firstBlock, - newState.lastBlock, - depot->slabSizeShift); - if (newSlabCount <= depot->slabCount) { - return logErrorWithStringError(VDO_INCREMENT_TOO_SMALL, - "Depot can only grow"); - } - if (newSlabCount == depot->newSlabCount) { - // Check it out, we've already got all the new slabs allocated! - return VDO_SUCCESS; - } - - abandonNewSlabs(depot); - result = allocateSlabs(depot, newSlabCount); - if (result != VDO_SUCCESS) { - abandonNewSlabs(depot); - return result; - } - - depot->newSize = newSize; - depot->oldLastBlock = depot->lastBlock; - depot->newLastBlock = newState.lastBlock; - - return VDO_SUCCESS; -} - -/** - * Finish registering new slabs now that all of the allocators have received - * their new slabs. - * - *
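 *
 * Editorial sketch (not part of the original) of the grow sequence these
 * helpers support, with error handling omitted:
 *
 *   prepareToGrowSlabDepot(depot, newSize);  // allocate the new slabs
 *   useNewSlabs(depot, parent);              // register them per zone,
 *                                            // then finishRegistration()
 *   updateSlabDepotSize(depot);              // adopt the new lastBlock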

Implements ActionConclusion. - **/ -static int finishRegistration(void *context) -{ - SlabDepot *depot = context; - depot->slabCount = depot->newSlabCount; - FREE(depot->slabs); - depot->slabs = depot->newSlabs; - depot->newSlabs = NULL; - depot->newSlabCount = 0; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void useNewSlabs(SlabDepot *depot, VDOCompletion *parent) -{ - ASSERT_LOG_ONLY(depot->newSlabs != NULL, "Must have new slabs to use"); - scheduleOperation(depot->actionManager, ADMIN_STATE_SUSPENDED_OPERATION, - NULL, registerNewSlabsForAllocator, finishRegistration, - parent); -} - -/**********************************************************************/ -void drainSlabDepot(SlabDepot *depot, - AdminStateCode operation, - VDOCompletion *parent) -{ - scheduleOperation(depot->actionManager, operation, NULL, drainBlockAllocator, - NULL, parent); -} - -/**********************************************************************/ -void resumeSlabDepot(SlabDepot *depot, VDOCompletion *parent) -{ - if (isReadOnly(depot->readOnlyNotifier)) { - finishCompletion(parent, VDO_READ_ONLY); - return; - } - - scheduleOperation(depot->actionManager, ADMIN_STATE_RESUMING, NULL, - resumeBlockAllocator, NULL, parent); -} - -/**********************************************************************/ -void commitOldestSlabJournalTailBlocks(SlabDepot *depot, - SequenceNumber recoveryBlockNumber) -{ - if (depot == NULL) { - return; - } - - depot->newReleaseRequest = recoveryBlockNumber; - scheduleDefaultAction(depot->actionManager); -} - -/**********************************************************************/ -const SlabConfig *getSlabConfig(const SlabDepot *depot) -{ - return &depot->slabConfig; -} - -/**********************************************************************/ -SlabSummary *getSlabSummary(const SlabDepot *depot) -{ - return depot->slabSummary; -} - -/**********************************************************************/ -SlabSummaryZone *getSlabSummaryForZone(const SlabDepot *depot, ZoneCount zone) -{ - if (depot->slabSummary == NULL) { - return NULL; - } - return getSummaryForZone(depot->slabSummary, zone); -} - -/**********************************************************************/ -void scrubAllUnrecoveredSlabs(SlabDepot *depot, - void *parent, - VDOAction *callback, - VDOAction *errorHandler, - ThreadID threadID, - VDOCompletion *launchParent) -{ - prepareCompletion(&depot->scrubbingCompletion, callback, errorHandler, - threadID, parent); - scheduleAction(depot->actionManager, NULL, scrubAllUnrecoveredSlabsInZone, - NULL, launchParent); -} - -/**********************************************************************/ -void notifyZoneFinishedScrubbing(VDOCompletion *completion) -{ - SlabDepot *depot = completion->parent; - if (atomicAdd32(&depot->zonesToScrub, -1) == 0) { - // We're the last! - completeCompletion(&depot->scrubbingCompletion); - } -} - -/**********************************************************************/ -bool hasUnrecoveredSlabs(SlabDepot *depot) -{ - return (atomicLoad32(&depot->zonesToScrub) > 0); -} - -/**********************************************************************/ -BlockCount getNewDepotSize(const SlabDepot *depot) -{ - return (depot->newSlabs == NULL) ? 
0 : depot->newSize; -} - -/**********************************************************************/ -bool areEquivalentDepots(SlabDepot *depotA, SlabDepot *depotB) -{ - if ((depotA->firstBlock != depotB->firstBlock) - || (depotA->lastBlock != depotB->lastBlock) - || (depotA->slabCount != depotB->slabCount) - || (depotA->slabSizeShift != depotB->slabSizeShift) - || (getDepotAllocatedBlocks(depotA) - != getDepotAllocatedBlocks(depotB))) { - return false; - } - - for (size_t i = 0; i < depotA->slabCount; i++) { - Slab *slabA = depotA->slabs[i]; - Slab *slabB = depotB->slabs[i]; - if ((slabA->start != slabB->start) - || (slabA->end != slabB->end) - || !areEquivalentReferenceCounters(slabA->referenceCounts, - slabB->referenceCounts)) { - return false; - } - } - - return true; -} - -/**********************************************************************/ -void allocateFromLastSlab(SlabDepot *depot) -{ - for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { - allocateFromAllocatorLastSlab(depot->allocators[zone]); - } -} - -/**********************************************************************/ -BlockAllocatorStatistics -getDepotBlockAllocatorStatistics(const SlabDepot *depot) -{ - BlockAllocatorStatistics totals; - memset(&totals, 0, sizeof(totals)); - - for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { - BlockAllocator *allocator = depot->allocators[zone]; - BlockAllocatorStatistics stats = getBlockAllocatorStatistics(allocator); - totals.slabCount += stats.slabCount; - totals.slabsOpened += stats.slabsOpened; - totals.slabsReopened += stats.slabsReopened; - } - - return totals; -} - -/**********************************************************************/ -RefCountsStatistics getDepotRefCountsStatistics(const SlabDepot *depot) -{ - RefCountsStatistics depotStats; - memset(&depotStats, 0, sizeof(depotStats)); - - for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { - BlockAllocator *allocator = depot->allocators[zone]; - RefCountsStatistics stats = getRefCountsStatistics(allocator); - depotStats.blocksWritten += stats.blocksWritten; - } - - return depotStats; -} - -/**********************************************************************/ -SlabJournalStatistics getDepotSlabJournalStatistics(const SlabDepot *depot) -{ - SlabJournalStatistics depotStats; - memset(&depotStats, 0, sizeof(depotStats)); - - for (ZoneCount zone = 0; zone < depot->zoneCount; zone++) { - BlockAllocator *allocator = depot->allocators[zone]; - SlabJournalStatistics stats = getSlabJournalStatistics(allocator); - depotStats.diskFullCount += stats.diskFullCount; - depotStats.flushCount += stats.flushCount; - depotStats.blockedCount += stats.blockedCount; - depotStats.blocksWritten += stats.blocksWritten; - depotStats.tailBusyCount += stats.tailBusyCount; - } - - return depotStats; -} - -/**********************************************************************/ -void dumpSlabDepot(const SlabDepot *depot) -{ - logInfo("Slab Depot"); - logInfo(" zoneCount=%u oldZoneCount=%u slabCount=%" PRIu32 - " activeReleaseRequest=%llu newReleaseRequest=%llu", - (unsigned int) depot->zoneCount, (unsigned int) depot->oldZoneCount, - depot->slabCount, depot->activeReleaseRequest, - depot->newReleaseRequest); -} diff --git a/vdo/base/slabDepot.h b/vdo/base/slabDepot.h deleted file mode 100644 index b439470..0000000 --- a/vdo/base/slabDepot.h +++ /dev/null @@ -1,515 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabDepot.h#12 $ - */ - -#ifndef SLAB_DEPOT_H -#define SLAB_DEPOT_H - -#include "buffer.h" - -#include "adminState.h" -#include "completion.h" -#include "fixedLayout.h" -#include "journalPoint.h" -#include "statistics.h" -#include "types.h" -#include "waitQueue.h" - -/** - * A SlabDepot is responsible for managing all of the slabs and block - * allocators of a VDO. It has a single array of slabs in order to eliminate - * the need for additional math in order to compute which physical zone a PBN - * is in. It also has a BlockAllocator per zone. - * - * Load operations are required to be performed on a single thread. Normal - * operations are assumed to be performed in the appropriate zone. Allocations - * and reference count updates must be done from the thread of their physical - * zone. Requests to commit slab journal tail blocks from the recovery journal - * must be done on the journal zone thread. Save operations are required to be - * launched from the same thread as the original load operation. - **/ - -typedef enum { - NORMAL_LOAD, - RECOVERY_LOAD, - REBUILD_LOAD -} SlabDepotLoadType; - -/** - * Calculate the number of slabs a depot would have. - * - * @param depot The depot - * - * @return The number of slabs - **/ -SlabCount calculateSlabCount(SlabDepot *depot) - __attribute__((warn_unused_result)); - -/** - * Create a slab depot. - * - * @param [in] blockCount The number of blocks initially available - * @param [in] firstBlock The number of the first block which may be - * allocated - * @param [in] slabConfig The slab configuration - * @param [in] threadConfig The thread configuration of the VDO - * @param [in] nonce The nonce of the VDO - * @param [in] vioPoolSize The size of the VIO pool - * @param [in] layer The physical layer below this depot - * @param [in] summaryPartition The partition which holds the slab summary - * @param [in] readOnlyNotifier The context for entering read-only mode - * @param [in] recoveryJournal The recovery journal of the VDO - * @param [out] depotPtr A pointer to hold the depot - * - * @return A success or error code - **/ -int makeSlabDepot(BlockCount blockCount, - PhysicalBlockNumber firstBlock, - SlabConfig slabConfig, - const ThreadConfig *threadConfig, - Nonce nonce, - BlockCount vioPoolSize, - PhysicalLayer *layer, - Partition *summaryPartition, - ReadOnlyNotifier *readOnlyNotifier, - RecoveryJournal *recoveryJournal, - SlabDepot **depotPtr) - __attribute__((warn_unused_result)); - -/** - * Destroy a slab depot and null out the reference to it. - * - * @param depotPtr The reference to the depot to destroy - **/ -void freeSlabDepot(SlabDepot **depotPtr); - -/** - * Get the size of the encoded state of a slab depot. 
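A minimal usage sketch of how the two encoding declarations in this header might be paired when saving depot state; makeDepotStateBuffer() is a hypothetical stand-in for whatever buffer allocation the caller already uses, and is not declared in this header:

    size_t encodedSize = getSlabDepotEncodedSize();
    Buffer *buffer = NULL;
    int result = makeDepotStateBuffer(encodedSize, &buffer);  // hypothetical helper
    if (result == VDO_SUCCESS) {
      // encodeSlabDepot() is declared later in this header and returns
      // UDS_SUCCESS or an error.
      result = encodeSlabDepot(depot, buffer);
    }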
- * - * @return The encoded size of the depot's state - **/ -size_t getSlabDepotEncodedSize(void) - __attribute__((warn_unused_result)); - -/** - * Encode the state of a slab depot into a buffer. - * - * @param depot The depot to encode - * @param buffer The buffer to encode into - * - * @return UDS_SUCCESS or an error - **/ -int encodeSlabDepot(const SlabDepot *depot, Buffer *buffer) - __attribute__((warn_unused_result)); - -/** - * Decode the state of a slab depot saved in a buffer. - * - * @param [in] buffer The buffer containing the saved state - * @param [in] threadConfig The thread config of the VDO - * @param [in] nonce The nonce of the VDO - * @param [in] layer The physical layer below this depot - * @param [in] summaryPartition The partition which holds the slab summary - * @param [in] readOnlyNotifier The context for entering read-only mode - * @param [in] recoveryJournal The recovery journal of the VDO - * @param [out] depotPtr A pointer to hold the depot - * - * @return A success or error code - **/ -int decodeSodiumSlabDepot(Buffer *buffer, - const ThreadConfig *threadConfig, - Nonce nonce, - PhysicalLayer *layer, - Partition *summaryPartition, - ReadOnlyNotifier *readOnlyNotifier, - RecoveryJournal *recoveryJournal, - SlabDepot **depotPtr) - __attribute__((warn_unused_result)); - -/** - * Decode the state of a slab depot saved in a buffer. - * - * @param [in] buffer The buffer containing the saved state - * @param [in] threadConfig The thread config of the VDO - * @param [in] nonce The nonce of the VDO - * @param [in] layer The physical layer below this depot - * @param [in] summaryPartition The partition which holds the slab summary - * @param [in] readOnlyNotifier The context for entering read-only mode - * @param [in] recoveryJournal The recovery journal of the VDO - * @param [out] depotPtr A pointer to hold the depot - * - * @return A success or error code - **/ -int decodeSlabDepot(Buffer *buffer, - const ThreadConfig *threadConfig, - Nonce nonce, - PhysicalLayer *layer, - Partition *summaryPartition, - ReadOnlyNotifier *readOnlyNotifier, - RecoveryJournal *recoveryJournal, - SlabDepot **depotPtr) - __attribute__((warn_unused_result)); - -/** - * Allocate the RefCounts for all slabs in the depot. This method may be called - * only before entering normal operation from the load thread. - * - * @param depot The depot whose RefCounts need allocation - * - * @return VDO_SUCCESS or an error - **/ -int allocateSlabRefCounts(SlabDepot *depot) - __attribute__((warn_unused_result)); - -/** - * Get the block allocator for a specified physical zone from a depot. - * - * @param depot The depot - * @param zoneNumber The physical zone - * - * @return The block allocator for the specified zone - **/ -BlockAllocator *getBlockAllocatorForZone(SlabDepot *depot, - ZoneCount zoneNumber) - __attribute__((warn_unused_result)); - -/** - * Get the number of the slab that contains a specified block. - * - * @param depot The slab depot - * @param pbn The physical block number - * @param slabNumberPtr A pointer to hold the slab number - * - * @return VDO_SUCCESS or an error - **/ -int getSlabNumber(const SlabDepot *depot, - PhysicalBlockNumber pbn, - SlabCount *slabNumberPtr) - __attribute__((warn_unused_result)); - -/** - * Get the slab object for the slab that contains a specified block. Will put - * the VDO in read-only mode if the PBN is not a valid data block nor the zero - * block. 
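A sketch of the typical PBN-to-slab lookup using only declarations from this header; how an out-of-range PBN is handled is a placeholder here, since this header leaves that choice to the caller:

    if (!isPhysicalDataBlock(depot, pbn)) {
      // The PBN does not refer to a data block: handle as the caller sees fit.
      return;
    }

    Slab        *slab    = getSlab(depot, pbn);
    SlabJournal *journal = getSlabJournal(depot, pbn);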
- * - * @param depot The slab depot - * @param pbn The physical block number - * - * @return The slab containing the block, or NULL if the block number is the - * zero block or otherwise out of range - **/ -Slab *getSlab(const SlabDepot *depot, PhysicalBlockNumber pbn) - __attribute__((warn_unused_result)); - -/** - * Get the slab journal for the slab that contains a specified block. - * - * @param depot The slab depot - * @param pbn The physical block number within the block depot partition - * of any block in the slab - * - * @return The slab journal of the slab containing the block, or NULL if the - * block number is for the zero block or otherwise out of range - **/ -SlabJournal *getSlabJournal(const SlabDepot *depot, PhysicalBlockNumber pbn) - __attribute__((warn_unused_result)); - -/** - * Determine how many new references a block can acquire. This method must be - * called from the the physical zone thread of the PBN. - * - * @param depot The slab depot - * @param pbn The physical block number that is being queried - * - * @return the number of available references - **/ -uint8_t getIncrementLimit(SlabDepot *depot, PhysicalBlockNumber pbn) - __attribute__((warn_unused_result)); - -/** - * Determine whether the given PBN refers to a data block. - * - * @param depot The depot - * @param pbn The physical block number to ask about - * - * @return True if the PBN corresponds to a data block - **/ -bool isPhysicalDataBlock(const SlabDepot *depot, PhysicalBlockNumber pbn) - __attribute__((warn_unused_result)); - -/** - * Get the total number of data blocks allocated across all the slabs in the - * depot, which is the total number of blocks with a non-zero reference count. - * This may be called from any thread. - * - * @param depot The slab depot - * - * @return The total number of blocks with a non-zero reference count - **/ -BlockCount getDepotAllocatedBlocks(const SlabDepot *depot) - __attribute__((warn_unused_result)); - -/** - * Get the total of the statistics from all the block allocators in the depot. - * - * @param depot The slab depot - * - * @return The statistics from all block allocators in the depot - **/ -BlockAllocatorStatistics -getDepotBlockAllocatorStatistics(const SlabDepot *depot) - __attribute__((warn_unused_result)); - -/** - * Get the total number of data blocks in all the slabs in the depot. This may - * be called from any thread. - * - * @param depot The slab depot - * - * @return The total number of data blocks in all slabs - **/ -BlockCount getDepotDataBlocks(const SlabDepot *depot) - __attribute__((warn_unused_result)); - -/** - * Get the total number of free blocks remaining in all the slabs in the - * depot, which is the total number of blocks that have a zero reference - * count. This may be called from any thread. - * - * @param depot The slab depot - * - * @return The total number of blocks with a zero reference count - **/ -BlockCount getDepotFreeBlocks(const SlabDepot *depot) - __attribute__((warn_unused_result)); - -/** - * Get the total number of slabs in the depot - * - * @param depot The slab depot - * - * @return The total number of slabs - **/ -SlabCount getDepotSlabCount(const SlabDepot *depot) - __attribute__((warn_unused_result)); - -/** - * Get the total number of unrecovered slabs in the depot, which is the total - * number of unrecovered slabs from all zones. This may be called from any - * thread. 
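An illustrative aside on the block-count getters declared above: reading their descriptions together, the allocated and free counts partition the data blocks, so (absent concurrent updates) one would expect the following relationship:

    BlockCount data      = getDepotDataBlocks(depot);
    BlockCount allocated = getDepotAllocatedBlocks(depot);  // non-zero reference count
    BlockCount free      = getDepotFreeBlocks(depot);       // zero reference count
    // Expected under that reading: data == allocated + free.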
- * - * @param depot The slab depot - * - * @return The total number of slabs that are unrecovered - **/ -SlabCount getDepotUnrecoveredSlabCount(const SlabDepot *depot) - __attribute__((warn_unused_result)); - -/** - * Get the aggregated slab journal statistics for the depot. - * - * @param depot The slab depot - * - * @return The aggregated statistics for all slab journals in the depot - **/ -SlabJournalStatistics getDepotSlabJournalStatistics(const SlabDepot *depot) - __attribute__((warn_unused_result)); - -/** - * Get the cumulative RefCounts statistics for the depot. - * - * @param depot The slab depot - * - * @return The cumulative statistics for all RefCounts in the depot - **/ -RefCountsStatistics getDepotRefCountsStatistics(const SlabDepot *depot) - __attribute__((warn_unused_result)); - -/** - * Asynchronously load any slab depot state that isn't included in the - * SuperBlock component. This method may be called only before entering normal - * operation from the load thread. - * - * @param depot The depot to load - * @param operation The type of load to perform - * @param parent The completion to finish when the load is complete - * @param context Additional context for the load operation; may be NULL - **/ -void loadSlabDepot(SlabDepot *depot, - AdminStateCode operation, - VDOCompletion *parent, - void *context); - -/** - * Prepare the slab depot to come online and start allocating blocks. This - * method may be called only before entering normal operation from the load - * thread. It must be called before allocation may proceed. - * - * @param depot The depot to prepare - * @param loadType The load type - * @param parent The completion to finish when the operation is complete - **/ -void prepareToAllocate(SlabDepot *depot, - SlabDepotLoadType loadType, - VDOCompletion *parent); - -/** - * Update the slab depot to reflect its new size in memory. This size is saved - * to disk as part of the super block. - * - * @param depot The depot to update - **/ -void updateSlabDepotSize(SlabDepot *depot); - -/** - * Allocate new memory needed for a resize of a slab depot to the given size. - * - * @param depot The depot to prepare to resize - * @param newSize The number of blocks in the new depot - * - * @return VDO_SUCCESS or an error - **/ -int prepareToGrowSlabDepot(SlabDepot *depot, BlockCount newSize) - __attribute__((warn_unused_result)); - -/** - * Use the new slabs allocated for resize. - * - * @param depot The depot - * @param parent The object to notify when complete - **/ -void useNewSlabs(SlabDepot *depot, VDOCompletion *parent); - -/** - * Abandon any new slabs in this depot, freeing them as needed. - * - * @param depot The depot - **/ -void abandonNewSlabs(SlabDepot *depot); - -/** - * Drain all slab depot I/O. If saving, or flushing, all dirty depot metadata - * will be written out. If saving or suspending, the depot will be left in a - * suspended state. - * - * @param depot The depot to drain - * @param operation The drain operation (flush, rebuild, suspend, or save) - * @param parent The completion to finish when the drain is complete - **/ -void drainSlabDepot(SlabDepot *depot, - AdminStateCode operation, - VDOCompletion *parent); - -/** - * Resume a suspended slab depot. - * - * @param depot The depot to resume - * @param parent The completion to finish when the depot has resumed - **/ -void resumeSlabDepot(SlabDepot *depot, VDOCompletion *parent); - -/** - * Commit all dirty tail blocks which are locking a given recovery journal - * block. 
This method must be called from the journal zone thread. - * - * @param depot The depot - * @param recoveryBlockNumber The sequence number of the recovery journal - * block whose locks should be released - **/ -void commitOldestSlabJournalTailBlocks(SlabDepot *depot, - SequenceNumber recoveryBlockNumber); - -/** - * Get the SlabConfig of a depot. - * - * @param depot The slab depot - * - * @return The slab configuration of the specified depot - **/ -const SlabConfig *getSlabConfig(const SlabDepot *depot) - __attribute__((warn_unused_result)); - -/** - * Get the slab summary. - * - * @param depot The slab depot - * - * @return The slab summary - **/ -SlabSummary *getSlabSummary(const SlabDepot *depot) - __attribute__((warn_unused_result)); - -/** - * Get the portion of the slab summary for a given physical zone. - * - * @param depot The slab depot - * @param zone The zone - * - * @return The portion of the slab summary for the specified zone - **/ -SlabSummaryZone *getSlabSummaryForZone(const SlabDepot *depot, ZoneCount zone) - __attribute__((warn_unused_result)); - -/** - * Scrub all unrecovered slabs. - * - * @param depot The depot to scrub - * @param parent The object to notify when scrubbing is complete - * @param callback The function to call when scrubbing is complete - * @param errorHandler The handler for scrubbing errors - * @param threadID The thread on which to run the callback - * @param launchParent The object to notify when scrubbing has been launched - * for all zones - **/ -void scrubAllUnrecoveredSlabs(SlabDepot *depot, - void *parent, - VDOAction *callback, - VDOAction *errorHandler, - ThreadID threadID, - VDOCompletion *launchParent); - -/** - * Check whether there are outstanding unrecovered slabs. - * - * @param depot The slab depot - * - * @return Whether there are outstanding unrecovered slabs - **/ -bool hasUnrecoveredSlabs(SlabDepot *depot); - -/** - * Get the physical size to which this depot is prepared to grow. - * - * @param depot The slab depot - * - * @return The new number of blocks the depot will be grown to, or 0 if the - * depot is not prepared to grow - **/ -BlockCount getNewDepotSize(const SlabDepot *depot) - __attribute__((warn_unused_result)); - -/** - * Dump the slab depot, in a thread-unsafe fashion. - * - * @param depot The slab depot - **/ -void dumpSlabDepot(const SlabDepot *depot); - -#endif // SLAB_DEPOT_H diff --git a/vdo/base/slabDepotInternals.h b/vdo/base/slabDepotInternals.h deleted file mode 100644 index 7dfe57b..0000000 --- a/vdo/base/slabDepotInternals.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabDepotInternals.h#13 $ - */ - -#ifndef SLAB_DEPOT_INTERNALS_H -#define SLAB_DEPOT_INTERNALS_H - -#include "slabDepot.h" - -#include "atomic.h" - -#include "actionManager.h" - -struct slabDepot { - ZoneCount zoneCount; - ZoneCount oldZoneCount; - SlabConfig slabConfig; - SlabSummary *slabSummary; - ReadOnlyNotifier *readOnlyNotifier; - ActionManager *actionManager; - - PhysicalBlockNumber firstBlock; - PhysicalBlockNumber lastBlock; - PhysicalBlockNumber origin; - - /** slabSize == (1 << slabSizeShift) */ - unsigned int slabSizeShift; - - /** Determines how slabs should be queued during load */ - SlabDepotLoadType loadType; - - /** The state for notifying slab journals to release recovery journal */ - SequenceNumber activeReleaseRequest; - SequenceNumber newReleaseRequest; - - /** The completion for scrubbing */ - VDOCompletion scrubbingCompletion; - Atomic32 zonesToScrub; - - /** Cached journal pointer for slab creation */ - RecoveryJournal *journal; - - /** Array of pointers to individually allocated slabs */ - Slab **slabs; - /** The number of slabs currently allocated and stored in 'slabs' */ - SlabCount slabCount; - - /** Array of pointers to a larger set of slabs (used during resize) */ - Slab **newSlabs; - /** The number of slabs currently allocated and stored in 'newSlabs' */ - SlabCount newSlabCount; - /** The size that 'newSlabs' was allocated for */ - BlockCount newSize; - - /** The last block before resize, for rollback */ - PhysicalBlockNumber oldLastBlock; - /** The last block after resize, for resize */ - PhysicalBlockNumber newLastBlock; - - /** The block allocators for this depot */ - BlockAllocator *allocators[]; -}; - -/** - * Destroy a slab. - * - * @param slab The slab to destroy - **/ -void destroySlab(Slab *slab); - -/** - * Inform a slab's depot that the slab has been created. - * - * @param slab The slab to register - **/ -void registerSlabWithDepot(Slab *slab); - -/** - * Notify a slab depot that one of its allocators has finished scrubbing slabs. - * This method should only be called if the scrubbing was successful. This - * callback is registered by each block allocator in - * scrubAllUnrecoveredSlabsInZone(). - * - * @param completion A completion whose parent must be a slab depot - **/ -void notifyZoneFinishedScrubbing(VDOCompletion *completion); - -/** - * Check whether two depots are equivalent (i.e. represent the same - * state and have the same reference counter). This method is used for unit - * testing. - * - * @param depotA The first depot to compare - * @param depotB The second depot to compare - * - * @return true if the two depots are equivalent - **/ -bool areEquivalentDepots(SlabDepot *depotA, SlabDepot *depotB) - __attribute__((warn_unused_result)); - -/** - * Start allocating from the highest numbered slab in each zone. - * - * @param depot The depot - **/ -void allocateFromLastSlab(SlabDepot *depot); - -#endif /* SLAB_DEPOT_INTERNALS_H */ diff --git a/vdo/base/slabIterator.h b/vdo/base/slabIterator.h deleted file mode 100644 index e977c2d..0000000 --- a/vdo/base/slabIterator.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabIterator.h#1 $ - */ - -#ifndef SLAB_ITERATOR_H -#define SLAB_ITERATOR_H - -#include "slab.h" -#include "types.h" - -/** - * SlabIterator is a structure for iterating over a set of slabs. - **/ -typedef struct { - Slab **slabs; - Slab *next; - SlabCount end; - SlabCount stride; -} SlabIterator; - -/** - * Return a SlabIterator initialized to iterate over an array of slabs - * with a given stride. Iteration always occurs from higher to lower numbered - * slabs. - * - * @param slabs The array of slabs - * @param start The number of the slab to start iterating from - * @param end The number of the last slab which may be returned - * @param stride The difference in slab number between successive slabs - * - * @return an initialized iterator structure - **/ -static inline SlabIterator iterateSlabs(Slab **slabs, - SlabCount start, - SlabCount end, - SlabCount stride) -{ - return (SlabIterator) { - .slabs = slabs, - .next = (((slabs == NULL) || (start < end)) ? NULL : slabs[start]), - .end = end, - .stride = stride, - }; -} - -/** - * Check whether another Slab would be returned by the iterator. - * - * @param iterator The iterator to poll - * - * @return true if the next call to nextSlab - * will return a Slab - **/ -static inline bool hasNextSlab(const SlabIterator *iterator) -{ - return (iterator->next != NULL); -} - -/** - * Get the next Slab, advancing the iterator. - * - * @param iterator The iterator over the Slab chain - * - * @return the next Slab or NULL if the array of slabs is empty - * or if all the appropriate Slabs have been returned - **/ -static inline Slab *nextSlab(SlabIterator *iterator) -{ - Slab *slab = iterator->next; - if ((slab == NULL) - || (slab->slabNumber < iterator->end + iterator->stride)) { - iterator->next = NULL; - } else { - iterator->next = iterator->slabs[slab->slabNumber - iterator->stride]; - } - return slab; -} - -#endif // SLAB_ITERATOR_H diff --git a/vdo/base/slabJournal.c b/vdo/base/slabJournal.c deleted file mode 100644 index 1895f80..0000000 --- a/vdo/base/slabJournal.c +++ /dev/null @@ -1,1321 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabJournal.c#18 $ - */ - -#include "slabJournalInternals.h" - -#include "logger.h" -#include "memoryAlloc.h" -#include "stringUtils.h" - -#include "adminState.h" -#include "blockAllocatorInternals.h" -#include "dataVIO.h" -#include "recoveryJournal.h" -#include "refCounts.h" -#include "slabDepot.h" -#include "slabSummary.h" - -/** - * Return the slab journal from the resource waiter. - * - * @param waiter The waiter - * - * @return The slab journal - **/ -__attribute__((warn_unused_result)) -static inline SlabJournal *slabJournalFromResourceWaiter(Waiter *waiter) -{ - STATIC_ASSERT(offsetof(SlabJournal, resourceWaiter) == 0); - return (SlabJournal *) waiter; -} - -/** - * Return the slab journal from the flush waiter. - * - * @param waiter The waiter - * - * @return The slab journal - **/ -__attribute__((warn_unused_result)) -static inline SlabJournal *slabJournalFromFlushWaiter(Waiter *waiter) -{ - if (waiter == NULL) { - return NULL; - } - return (SlabJournal *) - ((uintptr_t) waiter - offsetof(SlabJournal, flushWaiter)); -} - -/**********************************************************************/ -SlabJournal *slabJournalFromDirtyNode(RingNode *node) -{ - if (node == NULL) { - return NULL; - } - return (SlabJournal *) ((uintptr_t) node - offsetof(SlabJournal, dirtyNode)); -} - -/** - * Return the slab journal from the slab summary waiter. - * - * @param waiter The waiter - * - * @return The slab journal - **/ -__attribute__((warn_unused_result)) -static inline SlabJournal *slabJournalFromSlabSummaryWaiter(Waiter *waiter) -{ - if (waiter == NULL) { - return NULL; - } - return (SlabJournal *) - ((uintptr_t) waiter - offsetof(SlabJournal, slabSummaryWaiter)); -} - -/** - * Get the physical block number for a given sequence number. - * - * @param journal The journal - * @param sequence The sequence number of the desired block - * - * @return the block number corresponding to the sequence number - **/ -__attribute__((warn_unused_result)) -static inline PhysicalBlockNumber getBlockNumber(SlabJournal *journal, - SequenceNumber sequence) -{ - TailBlockOffset offset = getSlabJournalBlockOffset(journal, sequence); - return (journal->slab->journalOrigin + offset); -} - -/** - * Get the lock object for a slab journal block by sequence number. - * - * @param journal Slab journal to retrieve from - * @param sequenceNumber Sequence number of the block - * - * @return the lock object for the given sequence number - **/ -__attribute__((warn_unused_result)) -static inline JournalLock *getLock(SlabJournal *journal, - SequenceNumber sequenceNumber) -{ - TailBlockOffset offset = getSlabJournalBlockOffset(journal, sequenceNumber); - return &journal->locks[offset]; -} - -/** - * Check whether the VDO is in read-only mode. - * - * @param journal The journal whose owning VDO should be checked - * - * @return true if the VDO is in read-only mode - **/ -__attribute__((warn_unused_result)) -static inline bool isVDOReadOnly(SlabJournal *journal) -{ - return isReadOnly(journal->slab->allocator->readOnlyNotifier); -} - -/** - * Check whether there are entry waiters which should delay a flush. 
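(Per the implementation that follows, this returns true when the slab is not rebuilding and there are VIOs queued on entryWaiters, in which case the flush should wait until those entries have been made.)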
- * - * @param journal The journal to check - * - * @return true if there are no entry waiters, or if the slab - * is unrecovered - **/ -__attribute__((warn_unused_result)) -static inline bool mustMakeEntriesToFlush(SlabJournal *journal) -{ - return (!slabIsRebuilding(journal->slab) - && hasWaiters(&journal->entryWaiters)); -} - -/** - * Check whether a reap is currently in progress. - * - * @param journal The journal which may be reaping - * - * @return true if the journal is reaping - **/ -__attribute__((warn_unused_result)) -static inline bool isReaping(SlabJournal *journal) -{ - return (journal->head != journal->unreapable); -} - -/**********************************************************************/ -bool isSlabJournalActive(SlabJournal *journal) -{ - return (mustMakeEntriesToFlush(journal) - || isReaping(journal) - || journal->waitingToCommit - || !isRingEmpty(&journal->uncommittedBlocks) - || journal->updatingSlabSummary); -} - -/** - * Initialize tail block as a new block. - * - * @param journal The journal whose tail block is being initialized - **/ -static void initializeTailBlock(SlabJournal *journal) -{ - SlabJournalBlockHeader *header = &journal->tailHeader; - header->sequenceNumber = journal->tail; - header->entryCount = 0; - header->hasBlockMapIncrements = false; -} - -/** - * Set all journal fields appropriately to start journaling. - * - * @param journal The journal to be reset, based on its tail sequence number - **/ -static void initializeJournalState(SlabJournal *journal) -{ - journal->unreapable = journal->head; - journal->reapLock = getLock(journal, journal->unreapable); - journal->nextCommit = journal->tail; - journal->summarized = journal->lastSummarized = journal->tail; - initializeTailBlock(journal); -} - -/** - * Check whether a journal block is full. - * - * @param journal The slab journal for the block - * - * @return true if the tail block is full - **/ -__attribute__((warn_unused_result)) -static bool blockIsFull(SlabJournal *journal) -{ - JournalEntryCount count = journal->tailHeader.entryCount; - return (journal->tailHeader.hasBlockMapIncrements - ? 
(journal->fullEntriesPerBlock == count) - : (journal->entriesPerBlock == count)); -} - -/**********************************************************************/ -static void addEntries(SlabJournal *journal); -static void updateTailBlockLocation(SlabJournal *journal); -static void releaseJournalLocks(Waiter *waiter, void *context); - -/**********************************************************************/ -int makeSlabJournal(BlockAllocator *allocator, - Slab *slab, - RecoveryJournal *recoveryJournal, - SlabJournal **journalPtr) -{ - SlabJournal *journal; - const SlabConfig *slabConfig = getSlabConfig(allocator->depot); - int result = ALLOCATE_EXTENDED(SlabJournal, slabConfig->slabJournalBlocks, - JournalLock, __func__, &journal); - if (result != VDO_SUCCESS) { - return result; - } - - journal->slab = slab; - journal->size = slabConfig->slabJournalBlocks; - journal->flushingThreshold = slabConfig->slabJournalFlushingThreshold; - journal->blockingThreshold = slabConfig->slabJournalBlockingThreshold; - journal->scrubbingThreshold = slabConfig->slabJournalScrubbingThreshold; - journal->entriesPerBlock = SLAB_JOURNAL_ENTRIES_PER_BLOCK; - journal->fullEntriesPerBlock = SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK; - journal->events = &allocator->slabJournalStatistics; - journal->recoveryJournal = recoveryJournal; - journal->summary = getSlabSummaryZone(allocator); - journal->tail = 1; - journal->head = 1; - - journal->flushingDeadline = journal->flushingThreshold; - // Set there to be some time between the deadline and the blocking threshold, - // so that hopefully all are done before blocking. - if ((journal->blockingThreshold - journal->flushingThreshold) > 5) { - journal->flushingDeadline = journal->blockingThreshold - 5; - } - - journal->slabSummaryWaiter.callback = releaseJournalLocks; - - result = ALLOCATE(VDO_BLOCK_SIZE, char, "PackedSlabJournalBlock", - (char **) &journal->block); - if (result != VDO_SUCCESS) { - freeSlabJournal(&journal); - return result; - } - - initializeRing(&journal->dirtyNode); - initializeRing(&journal->uncommittedBlocks); - - journal->tailHeader.nonce = slab->allocator->nonce; - journal->tailHeader.metadataType = VDO_METADATA_SLAB_JOURNAL; - initializeJournalState(journal); - - *journalPtr = journal; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freeSlabJournal(SlabJournal **journalPtr) -{ - SlabJournal *journal = *journalPtr; - if (journal == NULL) { - return; - } - - FREE(journal->block); - FREE(journal); - *journalPtr = NULL; -} - -/**********************************************************************/ -bool isSlabJournalBlank(const SlabJournal *journal) -{ - return ((journal != NULL) - && (journal->tail == 1) - && (journal->tailHeader.entryCount == 0)); -} - -/**********************************************************************/ -bool isSlabJournalDirty(const SlabJournal *journal) -{ - return (journal->recoveryLock != 0); -} - -/** - * Put a slab journal on the dirty ring of its allocator in the correct order. 
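(A hedged reading of the loop below: it keeps the allocator's dirtySlabJournals ring sorted by recoveryLock, so journals holding older recovery journal locks sit nearer the head of the ring and can be committed first.)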
- * - * @param journal The journal to be marked dirty - * @param lock The recovery journal lock held by the slab journal - **/ -static void markSlabJournalDirty(SlabJournal *journal, SequenceNumber lock) -{ - ASSERT_LOG_ONLY(!isSlabJournalDirty(journal), "slab journal was clean"); - - journal->recoveryLock = lock; - RingNode *dirtyRing = &journal->slab->allocator->dirtySlabJournals; - RingNode *node = dirtyRing->prev; - while (node != dirtyRing) { - SlabJournal *dirtyJournal = slabJournalFromDirtyNode(node); - if (dirtyJournal->recoveryLock <= journal->recoveryLock) { - break; - } - - node = node->prev; - } - - pushRingNode(node->next, &journal->dirtyNode); -} - -/**********************************************************************/ -static void markSlabJournalClean(SlabJournal *journal) -{ - journal->recoveryLock = 0; - unspliceRingNode(&journal->dirtyNode); -} - -/** - * Implements WaiterCallback. This callback is invoked on all VIOs waiting - * to make slab journal entries after the VDO has gone into read-only mode. - **/ -static void abortWaiter(Waiter *waiter, - void *context __attribute__((unused))) -{ - continueDataVIO(waiterAsDataVIO(waiter), VDO_READ_ONLY); -} - -/**********************************************************************/ -void abortSlabJournalWaiters(SlabJournal *journal) -{ - ASSERT_LOG_ONLY((getCallbackThreadID() - == journal->slab->allocator->threadID), - "abortSlabJournalWaiters() called on correct thread"); - notifyAllWaiters(&journal->entryWaiters, abortWaiter, journal); - checkIfSlabDrained(journal->slab); -} - -/** - * Put the journal in read-only mode. All attempts to add entries after - * this function is called will fail. All VIOs waiting for to make entries - * will be awakened with an error. All flushes will complete as soon as all - * pending IO is done. - * - * @param journal The journal which has failed - * @param errorCode The error result triggering this call - **/ -static void enterJournalReadOnlyMode(SlabJournal *journal, int errorCode) -{ - enterReadOnlyMode(journal->slab->allocator->readOnlyNotifier, errorCode); - abortSlabJournalWaiters(journal); -} - -/** - * Actually advance the head of the journal now that any necessary flushes - * are complete. - * - * @param journal The journal to be reaped - **/ -static void finishReaping(SlabJournal *journal) -{ - journal->head = journal->unreapable; - addEntries(journal); - checkIfSlabDrained(journal->slab); -} - -/**********************************************************************/ -static void reapSlabJournal(SlabJournal *journal); - -/** - * Finish reaping now that we have flushed the lower layer and then try - * reaping again in case we deferred reaping due to an outstanding VIO. - * - * @param completion The flush VIO - **/ -static void completeReaping(VDOCompletion *completion) -{ - VIOPoolEntry *entry = completion->parent; - SlabJournal *journal = entry->parent; - returnVIO(journal->slab->allocator, entry); - finishReaping(journal); - reapSlabJournal(journal); -} - -/** - * Handle an error flushing the lower layer. - * - * @param completion The flush VIO - **/ -static void handleFlushError(VDOCompletion *completion) -{ - SlabJournal *journal = ((VIOPoolEntry *) completion->parent)->parent; - enterJournalReadOnlyMode(journal, completion->result); - completeReaping(completion); -} - -/** - * Waiter callback for getting a VIO with which to flush the lower layer prior - * to reaping. 
- * - * @param waiter The journal as a flush waiter - * @param vioContext The newly acquired flush VIO - **/ -static void flushForReaping(Waiter *waiter, void *vioContext) -{ - SlabJournal *journal = slabJournalFromFlushWaiter(waiter); - VIOPoolEntry *entry = vioContext; - VIO *vio = entry->vio; - - entry->parent = journal; - vio->completion.callbackThreadID = journal->slab->allocator->threadID; - launchFlush(vio, completeReaping, handleFlushError); -} - -/** - * Conduct a reap on a slab journal to reclaim unreferenced blocks. - * - * @param journal The slab journal - **/ -static void reapSlabJournal(SlabJournal *journal) -{ - if (isReaping(journal)) { - // We already have a reap in progress so wait for it to finish. - return; - } - - if (isUnrecoveredSlab(journal->slab) || !isNormal(&journal->slab->state) - || isVDOReadOnly(journal)) { - // We must not reap in the first two cases, and there's no point in - // read-only mode. - return; - } - - /* - * Start reclaiming blocks only when the journal head has no references. Then - * stop when a block is referenced or reap reaches the most recently written - * block, referenced by the slab summary, which has the sequence number just - * before the tail. - */ - bool reaped = false; - while ((journal->unreapable < journal->tail) - && (journal->reapLock->count == 0)) { - reaped = true; - journal->unreapable++; - journal->reapLock++; - if (journal->reapLock == &journal->locks[journal->size]) { - journal->reapLock = &journal->locks[0]; - } - } - - if (!reaped) { - return; - } - - PhysicalLayer *layer = journal->slab->allocator->completion.layer; - if (layer->getWritePolicy(layer) == WRITE_POLICY_SYNC) { - finishReaping(journal); - return; - } - - /* - * In async mode, it is never safe to reap a slab journal block without first - * issuing a flush, regardless of whether a user flush has been received or - * not. In the absence of the flush, the reference block write which released - * the locks allowing the slab journal to reap may not be persisted. Although - * slab summary writes will eventually issue flushes, multiple slab journal - * block writes can be issued while previous slab summary updates have not - * yet been made. Even though those slab journal block writes will be ignored - * if the slab summary update is not persisted, they may still overwrite the - * to-be-reaped slab journal block resulting in a loss of reference count - * updates (VDO-2912). - * - * In sync mode, it is similarly unsafe. However, we cannot possibly make - * those additional slab journal block writes due to the blocking threshold - * and the recovery journal's flush policy of flushing before every block. - * We may make no more than (number of VIOs) entries in slab journals since - * the last recovery journal flush; thus, due to the size of the slab - * journal blocks, the RJ must have flushed the storage no more than one - * slab journal block ago. So we could only overwrite the to-be-reaped block - * if we wrote and flushed the last block in the journal. But the blocking - * threshold prevents that. - */ - journal->flushWaiter.callback = flushForReaping; - int result = acquireVIO(journal->slab->allocator, &journal->flushWaiter); - if (result != VDO_SUCCESS) { - enterJournalReadOnlyMode(journal, result); - return; - } -} - -/** - * This is the callback invoked after a slab summary update completes. It - * is registered in the constructor on behalf of updateTailBlockLocation(). - * - * Implements WaiterCallback. 
- * - * @param waiter The slab summary waiter that has just been notified - * @param context The result code of the update - **/ -static void releaseJournalLocks(Waiter *waiter, void *context) -{ - SlabJournal *journal = slabJournalFromSlabSummaryWaiter(waiter); - int result = *((int *) context); - if (result != VDO_SUCCESS) { - if (result != VDO_READ_ONLY) { - // Don't bother logging what might be lots of errors if we are already - // in read-only mode. - logErrorWithStringError(result, "failed slab summary update %llu", - journal->summarized); - } - - journal->updatingSlabSummary = false; - enterJournalReadOnlyMode(journal, result); - return; - } - - if (journal->partialWriteInProgress - && (journal->summarized == journal->tail)) { - journal->partialWriteInProgress = false; - addEntries(journal); - } - - SequenceNumber first = journal->lastSummarized; - journal->lastSummarized = journal->summarized; - for (SequenceNumber i = journal->summarized - 1; i >= first; i--) { - // Release the lock the summarized block held on the recovery journal. - // (During replay, recoveryStart will always be 0.) - if (journal->recoveryJournal != NULL) { - ZoneCount zoneNumber = journal->slab->allocator->zoneNumber; - releaseRecoveryJournalBlockReference(journal->recoveryJournal, - getLock(journal, i)->recoveryStart, - ZONE_TYPE_PHYSICAL, - zoneNumber); - - } - - // Release our own lock against reaping for blocks that are committed. - // (This function will not change locks during replay.) - adjustSlabJournalBlockReference(journal, i, -1); - } - - journal->updatingSlabSummary = false; - - reapSlabJournal(journal); - - // Check if the slab summary needs to be updated again. - updateTailBlockLocation(journal); -} - -/** - * Update the tail block location in the slab summary, if necessary. - * - * @param journal The slab journal that is updating its tail block location - **/ -static void updateTailBlockLocation(SlabJournal *journal) -{ - if (journal->updatingSlabSummary || isVDOReadOnly(journal) - || (journal->lastSummarized >= journal->nextCommit)) { - checkIfSlabDrained(journal->slab); - return; - } - - BlockCount freeBlockCount; - if (isUnrecoveredSlab(journal->slab)) { - freeBlockCount = getSummarizedFreeBlockCount(journal->summary, - journal->slab->slabNumber); - } else { - freeBlockCount = getSlabFreeBlockCount(journal->slab); - } - - journal->summarized = journal->nextCommit; - journal->updatingSlabSummary = true; - - /* - * Update slab summary as dirty. - * Slab journal can only reap past sequence number 1 when all the refCounts - * for this slab have been written to the layer. Therefore, indicate that the - * refCounts must be loaded when the journal head has reaped past sequence - * number 1. - */ - TailBlockOffset blockOffset - = getSlabJournalBlockOffset(journal, journal->summarized); - updateSlabSummaryEntry(journal->summary, &journal->slabSummaryWaiter, - journal->slab->slabNumber, blockOffset, - (journal->head > 1), false, freeBlockCount); -} - -/**********************************************************************/ -void reopenSlabJournal(SlabJournal *journal) -{ - ASSERT_LOG_ONLY(journal->tailHeader.entryCount == 0, - "Slab journal's active block empty before reopening"); - journal->head = journal->tail; - initializeJournalState(journal); - - // Ensure no locks are spuriously held on an empty journal. 
- for (SequenceNumber block = 1; block <= journal->size; block++) { - ASSERT_LOG_ONLY((getLock(journal, block)->count == 0), - "Scrubbed journal's block %llu is not locked", - block); - } - - addEntries(journal); -} - -/**********************************************************************/ -static SequenceNumber getCommittingSequenceNumber(const VIOPoolEntry *entry) -{ - const PackedSlabJournalBlock *block = entry->buffer; - return getUInt64LE(block->header.fields.sequenceNumber); -} - -/** - * Handle post-commit processing. This is the callback registered by - * writeSlabJournalBlock(). - * - * @param completion The write VIO as a completion - **/ -static void completeWrite(VDOCompletion *completion) -{ - int writeResult = completion->result; - VIOPoolEntry *entry = completion->parent; - SlabJournal *journal = entry->parent; - - SequenceNumber committed = getCommittingSequenceNumber(entry); - unspliceRingNode(&entry->node); - returnVIO(journal->slab->allocator, entry); - - if (writeResult != VDO_SUCCESS) { - logErrorWithStringError(writeResult, - "cannot write slab journal block %llu", - committed); - enterJournalReadOnlyMode(journal, writeResult); - return; - } - - relaxedAdd64(&journal->events->blocksWritten, 1); - - if (isRingEmpty(&journal->uncommittedBlocks)) { - // If no blocks are outstanding, then the commit point is at the tail. - journal->nextCommit = journal->tail; - } else { - // The commit point is always the beginning of the oldest incomplete block. - VIOPoolEntry *oldest = asVIOPoolEntry(journal->uncommittedBlocks.next); - journal->nextCommit = getCommittingSequenceNumber(oldest); - } - - updateTailBlockLocation(journal); -} - -/** - * Callback from acquireVIO() registered in commitSlabJournalTail(). - * - * @param waiter The VIO pool waiter which was just notified - * @param vioContext The VIO pool entry for the write - **/ -static void writeSlabJournalBlock(Waiter *waiter, void *vioContext) -{ - SlabJournal *journal = slabJournalFromResourceWaiter(waiter); - VIOPoolEntry *entry = vioContext; - SlabJournalBlockHeader *header = &journal->tailHeader; - - header->head = journal->head; - pushRingNode(&journal->uncommittedBlocks, &entry->node); - packSlabJournalBlockHeader(header, &journal->block->header); - - // Copy the tail block into the VIO. - memcpy(entry->buffer, journal->block, VDO_BLOCK_SIZE); - - int unusedEntries = journal->entriesPerBlock - header->entryCount; - ASSERT_LOG_ONLY(unusedEntries >= 0, "Slab journal block is not overfull"); - if (unusedEntries > 0) { - // Release the per-entry locks for any unused entries in the block we are - // about to write. - adjustSlabJournalBlockReference(journal, header->sequenceNumber, - -unusedEntries); - journal->partialWriteInProgress = !blockIsFull(journal); - } - - PhysicalBlockNumber blockNumber - = getBlockNumber(journal, header->sequenceNumber); - - entry->parent = journal; - entry->vio->completion.callbackThreadID = journal->slab->allocator->threadID; - /* - * This block won't be read in recovery until the slab summary is updated - * to refer to it. The slab summary update does a flush which is sufficient - * to protect us from VDO-2331. - */ - launchWriteMetadataVIO(entry->vio, blockNumber, completeWrite, - completeWrite); - - // Since the write is submitted, the tail block structure can be reused. 
- journal->tail++; - initializeTailBlock(journal); - journal->waitingToCommit = false; - if (journal->slab->state.state == ADMIN_STATE_WAITING_FOR_RECOVERY) { - finishOperationWithResult(&journal->slab->state, - (isVDOReadOnly(journal) - ? VDO_READ_ONLY : VDO_SUCCESS)); - return; - } - - addEntries(journal); -} - -/**********************************************************************/ -void commitSlabJournalTail(SlabJournal *journal) -{ - if ((journal->tailHeader.entryCount == 0) - && mustMakeEntriesToFlush(journal)) { - // There are no entries at the moment, but there are some waiters, so defer - // initiating the flush until those entries are ready to write. - return; - } - - if (isVDOReadOnly(journal) - || journal->waitingToCommit - || (journal->tailHeader.entryCount == 0)) { - // There is nothing to do since the tail block is empty, or writing, or - // the journal is in read-only mode. - return; - } - - /* - * Since we are about to commit the tail block, this journal no longer - * needs to be on the ring of journals which the recovery journal might - * ask to commit. - */ - markSlabJournalClean(journal); - - journal->waitingToCommit = true; - - journal->resourceWaiter.callback = writeSlabJournalBlock; - int result = acquireVIO(journal->slab->allocator, &journal->resourceWaiter); - if (result != VDO_SUCCESS) { - journal->waitingToCommit = false; - enterJournalReadOnlyMode(journal, result); - return; - } -} - -/**********************************************************************/ -void encodeSlabJournalEntry(SlabJournalBlockHeader *tailHeader, - SlabJournalPayload *payload, - SlabBlockNumber sbn, - JournalOperation operation) -{ - JournalEntryCount entryNumber = tailHeader->entryCount++; - if (operation == BLOCK_MAP_INCREMENT) { - if (!tailHeader->hasBlockMapIncrements) { - memset(payload->fullEntries.entryTypes, 0, - SLAB_JOURNAL_ENTRY_TYPES_SIZE); - tailHeader->hasBlockMapIncrements = true; - } - - payload->fullEntries.entryTypes[entryNumber / 8] - |= ((byte) 1 << (entryNumber % 8)); - } - - packSlabJournalEntry(&payload->entries[entryNumber], sbn, - isIncrementOperation(operation)); -} - -/**********************************************************************/ -SlabJournalEntry decodeSlabJournalEntry(PackedSlabJournalBlock *block, - JournalEntryCount entryCount) -{ - SlabJournalEntry entry - = unpackSlabJournalEntry(&block->payload.entries[entryCount]); - if (block->header.fields.hasBlockMapIncrements - && ((block->payload.fullEntries.entryTypes[entryCount / 8] - & ((byte) 1 << (entryCount % 8))) != 0)) { - entry.operation = BLOCK_MAP_INCREMENT; - } - return entry; -} - -/** - * Actually add an entry to the slab journal, potentially firing off a write - * if a block becomes full. This function is synchronous. 
- * - * @param journal The slab journal to append to - * @param pbn The pbn being adjusted - * @param operation The type of entry to make - * @param recoveryPoint The recovery journal point for this entry - **/ -static void addEntry(SlabJournal *journal, - PhysicalBlockNumber pbn, - JournalOperation operation, - const JournalPoint *recoveryPoint) -{ - int result = ASSERT(beforeJournalPoint(&journal->tailHeader.recoveryPoint, - recoveryPoint), - "recovery journal point is monotonically increasing, " - "recovery point: %llu.%u, " - "block recovery point: %llu.%u", - recoveryPoint->sequenceNumber, recoveryPoint->entryCount, - journal->tailHeader.recoveryPoint.sequenceNumber, - journal->tailHeader.recoveryPoint.entryCount); - if (result != VDO_SUCCESS) { - enterJournalReadOnlyMode(journal, result); - return; - } - - PackedSlabJournalBlock *block = journal->block; - if (operation == BLOCK_MAP_INCREMENT) { - result = ASSERT_LOG_ONLY((journal->tailHeader.entryCount - < journal->fullEntriesPerBlock), - "block has room for full entries"); - if (result != VDO_SUCCESS) { - enterJournalReadOnlyMode(journal, result); - return; - } - } - - encodeSlabJournalEntry(&journal->tailHeader, &block->payload, - pbn - journal->slab->start, operation); - journal->tailHeader.recoveryPoint = *recoveryPoint; - if (blockIsFull(journal)) { - commitSlabJournalTail(journal); - } -} - -/**********************************************************************/ -bool attemptReplayIntoSlabJournal(SlabJournal *journal, - PhysicalBlockNumber pbn, - JournalOperation operation, - JournalPoint *recoveryPoint, - VDOCompletion *parent) -{ - // Only accept entries after the current recovery point. - if (!beforeJournalPoint(&journal->tailHeader.recoveryPoint, recoveryPoint)) { - return true; - } - - SlabJournalBlockHeader *header = &journal->tailHeader; - if ((header->entryCount >= journal->fullEntriesPerBlock) - && (header->hasBlockMapIncrements || - (operation == BLOCK_MAP_INCREMENT))) { - // The tail block does not have room for the entry we are attempting - // to add so commit the tail block now. - commitSlabJournalTail(journal); - } - - if (journal->waitingToCommit) { - startOperationWithWaiter(&journal->slab->state, - ADMIN_STATE_WAITING_FOR_RECOVERY, parent, NULL); - return false; - } - - if ((journal->tail - journal->head) >= journal->size) { - /* - * We must have reaped the current head before the crash, since - * the blocked threshold keeps us from having more entries than - * fit in a slab journal; hence we can just advance the head - * (and unreapable block), as needed. - */ - journal->head++; - journal->unreapable++; - } - - markSlabReplaying(journal->slab); - addEntry(journal, pbn, operation, recoveryPoint); - return true; -} - -/** - * Check whether the journal should be saving reference blocks out. - * - * @param journal The journal to check - * - * @return true if the journal should be requesting reference block writes - **/ -static bool requiresFlushing(const SlabJournal *journal) -{ - BlockCount journalLength = (journal->tail - journal->head); - return (journalLength >= journal->flushingThreshold); -} - -/** - * Check whether the journal must be reaped before adding new entries. 
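A worked illustration with made-up threshold values, not defaults from any SlabConfig:

    // Suppose flushingThreshold = 60 and blockingThreshold = 100 blocks.
    // With tail - head = 70, requiresFlushing() (defined above) is true, so
    // reference block writes are requested, but this blocking check is still
    // false and new entries may be added. Once tail - head reaches 100, this
    // check becomes true and entry waiters are made to wait until the
    // journal has been reaped.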
- * - * @param journal The journal to check - * - * @return true if the journal must be reaped - **/ -static bool requiresReaping(const SlabJournal *journal) -{ - BlockCount journalLength = (journal->tail - journal->head); - return (journalLength >= journal->blockingThreshold); -} - -/**********************************************************************/ -bool requiresScrubbing(const SlabJournal *journal) -{ - BlockCount journalLength = (journal->tail - journal->head); - return (journalLength >= journal->scrubbingThreshold); -} - -/** - * Implements WaiterCallback. This callback is invoked by addEntries() once - * it has determined that we are ready to make another entry in the slab - * journal. - * - * @param waiter The VIO which should make an entry now - * @param context The slab journal to make an entry in - **/ -static void addEntryFromWaiter(Waiter *waiter, void *context) -{ - DataVIO *dataVIO = waiterAsDataVIO(waiter); - SlabJournal *journal = (SlabJournal *) context; - SlabJournalBlockHeader *header = &journal->tailHeader; - SequenceNumber recoveryBlock = dataVIO->recoveryJournalPoint.sequenceNumber; - - if (header->entryCount == 0) { - /* - * This is the first entry in the current tail block, so get a lock - * on the recovery journal which we will hold until this tail block is - * committed. - */ - getLock(journal, header->sequenceNumber)->recoveryStart = recoveryBlock; - if (journal->recoveryJournal != NULL) { - ZoneCount zoneNumber = journal->slab->allocator->zoneNumber; - acquireRecoveryJournalBlockReference(journal->recoveryJournal, - recoveryBlock, ZONE_TYPE_PHYSICAL, - zoneNumber); - } - markSlabJournalDirty(journal, recoveryBlock); - - // If the slab journal is over the first threshold, tell the refCounts to - // write some reference blocks, but proceed apace. - if (requiresFlushing(journal)) { - relaxedAdd64(&journal->events->flushCount, 1); - BlockCount journalLength = (journal->tail - journal->head); - BlockCount blocksToDeadline = 0; - if (journalLength <= journal->flushingDeadline) { - blocksToDeadline = journal->flushingDeadline - journalLength; - } - saveSeveralReferenceBlocks(journal->slab->referenceCounts, - blocksToDeadline + 1); - } - } - - JournalPoint slabJournalPoint = { - .sequenceNumber = header->sequenceNumber, - .entryCount = header->entryCount, - }; - - addEntry(journal, dataVIO->operation.pbn, dataVIO->operation.type, - &dataVIO->recoveryJournalPoint); - - // Now that an entry has been made in the slab journal, update the - // reference counts. - int result = modifySlabReferenceCount(journal->slab, &slabJournalPoint, - dataVIO->operation); - continueDataVIO(dataVIO, result); -} - -/** - * Check whether the next entry to be made is a block map increment. - * - * @param journal The journal - * - * @return true if the first entry waiter's operation is a block - * map increment - **/ -static inline bool isNextEntryABlockMapIncrement(SlabJournal *journal) -{ - DataVIO *dataVIO = waiterAsDataVIO(getFirstWaiter(&journal->entryWaiters)); - return (dataVIO->operation.type == BLOCK_MAP_INCREMENT); -} - -/** - * Add as many entries as possible from the queue of VIOs waiting to make - * entries. By processing the queue in order, we ensure that slab journal - * entries are made in the same order as recovery journal entries for the - * same increment or decrement. - * - * @param journal The journal to which entries may be added - **/ -static void addEntries(SlabJournal *journal) -{ - if (journal->addingEntries) { - // Protect against re-entrancy. 
- return; - } - - journal->addingEntries = true; - while (hasWaiters(&journal->entryWaiters)) { - if (journal->partialWriteInProgress || slabIsRebuilding(journal->slab)) { - // Don't add entries while rebuilding or while a partial write is - // outstanding (VDO-2399). - break; - } - - SlabJournalBlockHeader *header = &journal->tailHeader; - if (journal->waitingToCommit) { - // If we are waiting for resources to write the tail block, and the - // tail block is full, we can't make another entry. - relaxedAdd64(&journal->events->tailBusyCount, 1); - break; - } else if (isNextEntryABlockMapIncrement(journal) - && (header->entryCount >= journal->fullEntriesPerBlock)) { - // The tail block does not have room for a block map increment, so - // commit it now. - commitSlabJournalTail(journal); - if (journal->waitingToCommit) { - relaxedAdd64(&journal->events->tailBusyCount, 1); - break; - } - } - - // If the slab is over the blocking threshold, make the VIO wait. - if (requiresReaping(journal)) { - relaxedAdd64(&journal->events->blockedCount, 1); - saveDirtyReferenceBlocks(journal->slab->referenceCounts); - break; - } - - if (header->entryCount == 0) { - JournalLock *lock = getLock(journal, header->sequenceNumber); - // Check if the on disk slab journal is full. Because of the - // blocking and scrubbing thresholds, this should never happen. - if (lock->count > 0) { - ASSERT_LOG_ONLY((journal->head + journal->size) == journal->tail, - "New block has locks, but journal is not full"); - - /* - * The blocking threshold must let the journal fill up if the new - * block has locks; if the blocking threshold is smaller than the - * journal size, the new block cannot possibly have locks already. - */ - ASSERT_LOG_ONLY((journal->blockingThreshold >= journal->size), - "New block can have locks already iff blocking" - "threshold is at the end of the journal"); - - relaxedAdd64(&journal->events->diskFullCount, 1); - saveDirtyReferenceBlocks(journal->slab->referenceCounts); - break; - } - - /* - * Don't allow the new block to be reaped until all of the reference - * count blocks are written and the journal block has been - * fully committed as well. - */ - lock->count = journal->entriesPerBlock + 1; - - if (header->sequenceNumber == 1) { - /* - * This is the first entry in this slab journal, ever. Dirty all of - * the reference count blocks. Each will acquire a lock on the - * tail block so that the journal won't be reaped until the - * reference counts are initialized. The lock acquisition must - * be done by the RefCounts since here we don't know how many - * reference blocks the RefCounts has. - */ - acquireDirtyBlockLocks(journal->slab->referenceCounts); - } - } - - notifyNextWaiter(&journal->entryWaiters, addEntryFromWaiter, journal); - } - - journal->addingEntries = false; - - // If there are no waiters, and we are flushing or saving, commit the - // tail block. 
- if (isSlabDraining(journal->slab) && !isSuspending(&journal->slab->state) - && !hasWaiters(&journal->entryWaiters)) { - commitSlabJournalTail(journal); - } -} - -/**********************************************************************/ -void addSlabJournalEntry(SlabJournal *journal, DataVIO *dataVIO) -{ - if (!isSlabOpen(journal->slab)) { - continueDataVIO(dataVIO, VDO_INVALID_ADMIN_STATE); - return; - } - - if (isVDOReadOnly(journal)) { - continueDataVIO(dataVIO, VDO_READ_ONLY); - return; - } - - int result = enqueueDataVIO(&journal->entryWaiters, dataVIO, - THIS_LOCATION("$F($j-$js)")); - if (result != VDO_SUCCESS) { - continueDataVIO(dataVIO, result); - return; - } - - if (isUnrecoveredSlab(journal->slab) && requiresReaping(journal)) { - increaseScrubbingPriority(journal->slab); - } - - addEntries(journal); -} - -/**********************************************************************/ -void adjustSlabJournalBlockReference(SlabJournal *journal, - SequenceNumber sequenceNumber, - int adjustment) -{ - if (sequenceNumber == 0) { - return; - } - - if (isReplayingSlab(journal->slab)) { - // Locks should not be used during offline replay. - return; - } - - ASSERT_LOG_ONLY((adjustment != 0), "adjustment must be non-zero"); - JournalLock *lock = getLock(journal, sequenceNumber); - if (adjustment < 0) { - ASSERT_LOG_ONLY((-adjustment <= lock->count), - "adjustment %d of lock count %u for slab journal block %" - PRIu64 " must not underflow", adjustment, lock->count, - sequenceNumber); - } - - lock->count += adjustment; - if (lock->count == 0) { - reapSlabJournal(journal); - } -} - -/**********************************************************************/ -bool releaseRecoveryJournalLock(SlabJournal *journal, - SequenceNumber recoveryLock) -{ - if (recoveryLock > journal->recoveryLock) { - ASSERT_LOG_ONLY((recoveryLock < journal->recoveryLock), - "slab journal recovery lock is not older than the recovery" - " journal head"); - return false; - } - - if ((recoveryLock < journal->recoveryLock) || isVDOReadOnly(journal)) { - return false; - } - - // All locks are held by the block which is in progress; write it. - commitSlabJournalTail(journal); - return true; -} - -/**********************************************************************/ -void drainSlabJournal(SlabJournal *journal) -{ - ASSERT_LOG_ONLY((getCallbackThreadID() - == journal->slab->allocator->threadID), - "drainSlabJournal() called on correct thread"); - if (isQuiescing(&journal->slab->state)) { - // XXX: we should revisit this assertion since it is no longer clear what - // it is for. - ASSERT_LOG_ONLY((!(slabIsRebuilding(journal->slab) - && hasWaiters(&journal->entryWaiters))), - "slab is recovered or has no waiters"); - } - - switch (journal->slab->state.state) { - case ADMIN_STATE_REBUILDING: - case ADMIN_STATE_SUSPENDING: - case ADMIN_STATE_SAVE_FOR_SCRUBBING: - break; - - default: - commitSlabJournalTail(journal); - } -} - -/** - * Finish the decode process by returning the VIO and notifying the slab that - * we're done. - * - * @param completion The VIO as a completion - **/ -static void finishDecodingJournal(VDOCompletion *completion) -{ - int result = completion->result; - VIOPoolEntry *entry = completion->parent; - SlabJournal *journal = entry->parent; - returnVIO(journal->slab->allocator, entry); - notifySlabJournalIsLoaded(journal->slab, result); -} - -/** - * Set up the in-memory journal state to the state which was written to disk. - * This is the callback registered in readSlabJournalTail(). 
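 * (Concretely, per the body below: tail becomes the decoded header's
 * sequence number plus one, and head is either set equal to tail when the
 * slab summary reports the slab clean, or taken from the header's head
 * field otherwise.)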
- * - * @param completion The VIO which was used to read the journal tail - **/ -static void setDecodedState(VDOCompletion *completion) -{ - VIOPoolEntry *entry = completion->parent; - SlabJournal *journal = entry->parent; - PackedSlabJournalBlock *block = entry->buffer; - - SlabJournalBlockHeader header; - unpackSlabJournalBlockHeader(&block->header, &header); - - if ((header.metadataType != VDO_METADATA_SLAB_JOURNAL) - || (header.nonce != journal->slab->allocator->nonce)) { - finishDecodingJournal(completion); - return; - } - - journal->tail = header.sequenceNumber + 1; - - // If the slab is clean, this implies the slab journal is empty, so advance - // the head appropriately. - if (getSummarizedCleanliness(journal->summary, journal->slab->slabNumber)) { - journal->head = journal->tail; - } else { - journal->head = header.head; - } - - journal->tailHeader = header; - initializeJournalState(journal); - finishDecodingJournal(completion); -} - -/** - * This reads the slab journal tail block by using a VIO acquired from the VIO - * pool. This is the success callback from acquireVIOFromPool() when decoding - * the slab journal. - * - * @param waiter The VIO pool waiter which has just been notified - * @param vioContext The VIO pool entry given to the waiter - **/ -static void readSlabJournalTail(Waiter *waiter, void *vioContext) -{ - SlabJournal *journal = slabJournalFromResourceWaiter(waiter); - Slab *slab = journal->slab; - VIOPoolEntry *entry = vioContext; - TailBlockOffset lastCommitPoint - = getSummarizedTailBlockOffset(journal->summary, slab->slabNumber); - entry->parent = journal; - - - // Slab summary keeps the commit point offset, so the tail block is the - // block before that. Calculation supports small journals in unit tests. - TailBlockOffset tailBlock = ((lastCommitPoint == 0) - ? (TailBlockOffset) (journal->size - 1) - : (lastCommitPoint - 1)); - entry->vio->completion.callbackThreadID = slab->allocator->threadID; - launchReadMetadataVIO(entry->vio, slab->journalOrigin + tailBlock, - setDecodedState, finishDecodingJournal); -} - -/**********************************************************************/ -void decodeSlabJournal(SlabJournal *journal) -{ - ASSERT_LOG_ONLY((getCallbackThreadID() - == journal->slab->allocator->threadID), - "decodeSlabJournal() called on correct thread"); - Slab *slab = journal->slab; - TailBlockOffset lastCommitPoint - = getSummarizedTailBlockOffset(journal->summary, slab->slabNumber); - if ((lastCommitPoint == 0) - && !mustLoadRefCounts(journal->summary, slab->slabNumber)) { - /* - * This slab claims that it has a tail block at (journal->size - 1), but - * a head of 1. This is impossible, due to the scrubbing threshold, on - * a real system, so don't bother reading the (bogus) data off disk. 
-     */
-    ASSERT_LOG_ONLY(((journal->size < 16)
-                     || (journal->scrubbingThreshold < (journal->size - 1))),
-                    "Scrubbing threshold protects against reads of unwritten"
-                    " slab journal blocks");
-    notifySlabJournalIsLoaded(slab, VDO_SUCCESS);
-    return;
-  }
-
-  journal->resourceWaiter.callback = readSlabJournalTail;
-  int result = acquireVIO(slab->allocator, &journal->resourceWaiter);
-  if (result != VDO_SUCCESS) {
-    notifySlabJournalIsLoaded(slab, result);
-  }
-}
-
-/**********************************************************************/
-void dumpSlabJournal(const SlabJournal *journal)
-{
-  logInfo("  slab journal: entryWaiters=%zu waitingToCommit=%s"
-          " updatingSlabSummary=%s head=%llu unreapable=%" PRIu64
-          " tail=%llu nextCommit=%llu summarized=%" PRIu64
-          " lastSummarized=%llu recoveryJournalLock=%" PRIu64
-          " dirty=%s", countWaiters(&journal->entryWaiters),
-          boolToString(journal->waitingToCommit),
-          boolToString(journal->updatingSlabSummary),
-          journal->head, journal->unreapable, journal->tail,
-          journal->nextCommit, journal->summarized, journal->lastSummarized,
-          journal->recoveryLock,
-          boolToString(isSlabJournalDirty(journal)));
-  // Given the frequency with which the locks are just a tiny bit off, it
-  // might be worth dumping all the locks, but that might be too much logging.
-}
diff --git a/vdo/base/slabJournal.h b/vdo/base/slabJournal.h
deleted file mode 100644
index a411711..0000000
--- a/vdo/base/slabJournal.h
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2020 Red Hat, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
- *
- * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabJournal.h#8 $
- */
-
-#ifndef SLAB_JOURNAL_H
-#define SLAB_JOURNAL_H
-
-#include "completion.h"
-#include "journalPoint.h"
-#include "ringNode.h"
-#include "types.h"
-
-/**
- * Convert a completion to a SlabJournal.
- *
- * @param completion  The completion to convert
- *
- * @return The completion as a SlabJournal
- **/
-SlabJournal *asSlabJournal(VDOCompletion *completion)
-  __attribute__((warn_unused_result));
-
-/**
- * Calculate the number of slab journal entries per block.
- *
- * @return The number of slab journal entries per block
- **/
-size_t getSlabJournalEntriesPerBlock(void)
-  __attribute__((warn_unused_result));
-
-/**
- * Obtain a pointer to a SlabJournal structure from a pointer to the
- * dirtyRingNode field within it.
- *
- * @param node  The RingNode to convert
- *
- * @return The RingNode as a SlabJournal
- **/
-SlabJournal *slabJournalFromDirtyNode(RingNode *node)
-  __attribute__((warn_unused_result));
-
-/**
- * Create a slab journal.
- * - * @param [in] allocator The block allocator which owns this journal - * @param [in] slab The parent slab of the journal - * @param [in] recoveryJournal The recovery journal of the VDO - * @param [out] journalPtr The pointer to hold the new slab journal - * - * @return VDO_SUCCESS or error code - **/ -int makeSlabJournal(BlockAllocator *allocator, - Slab *slab, - RecoveryJournal *recoveryJournal, - SlabJournal **journalPtr) - __attribute__((warn_unused_result)); - -/** - * Free a slab journal and null out the reference to it. - * - * @param journalPtr The reference to the slab journal to free - **/ -void freeSlabJournal(SlabJournal **journalPtr); - -/** - * Check whether a slab journal is blank, meaning it has never had any entries - * recorded in it. - * - * @param journal The journal to query - * - * @return true if the slab journal has never been modified - **/ -bool isSlabJournalBlank(const SlabJournal *journal) - __attribute__((warn_unused_result)); - -/** - * Check whether the slab journal is on the block allocator's ring of dirty - * journals. - * - * @param journal The journal to query - * - * @return true if the journal has been added to the dirty ring - **/ -bool isSlabJournalDirty(const SlabJournal *journal) - __attribute__((warn_unused_result)); - -/** - * Check whether a slab journal is active. - * - * @param journal The slab journal to check - * - * @return true if the journal is active - **/ -bool isSlabJournalActive(SlabJournal *journal) - __attribute__((warn_unused_result)); - -/** - * Abort any VIOs waiting to make slab journal entries. - * - * @param journal The journal to abort - **/ -void abortSlabJournalWaiters(SlabJournal *journal); - -/** - * Reopen a slab journal by emptying it and then adding any pending entries. - * - * @param journal The journal to reopen - **/ -void reopenSlabJournal(SlabJournal *journal); - -/** - * Attempt to replay a recovery journal entry into a slab journal. - * - * @param journal The slab journal to use - * @param pbn The PBN for the entry - * @param operation The type of entry to add - * @param recoveryPoint The recovery journal point corresponding to this entry - * @param parent The completion to notify when there is space to add - * the entry if the entry could not be added immediately - * - * @return true if the entry was added immediately - **/ -bool attemptReplayIntoSlabJournal(SlabJournal *journal, - PhysicalBlockNumber pbn, - JournalOperation operation, - JournalPoint *recoveryPoint, - VDOCompletion *parent) - __attribute__((warn_unused_result)); - -/** - * Add an entry to a slab journal. - * - * @param journal The slab journal to use - * @param dataVIO The DataVIO for which to add the entry - **/ -void addSlabJournalEntry(SlabJournal *journal, DataVIO *dataVIO); - -/** - * Adjust the reference count for a slab journal block. Note that when the - * adjustment is negative, the slab journal will be reaped. - * - * @param journal The slab journal - * @param sequenceNumber The journal sequence number of the referenced block - * @param adjustment Amount to adjust the reference counter - **/ -void adjustSlabJournalBlockReference(SlabJournal *journal, - SequenceNumber sequenceNumber, - int adjustment); - -/** - * Request the slab journal to release the recovery journal lock it may hold on - * a specified recovery journal block. 
- * - * @param journal The slab journal - * @param recoveryLock The sequence number of the recovery journal block - * whose locks should be released - * - * @return true if the journal does hold a lock on the specified - * block (which it will release) - **/ -bool releaseRecoveryJournalLock(SlabJournal *journal, - SequenceNumber recoveryLock) - __attribute__((warn_unused_result)); - -/** - * Commit the tail block of a slab journal. - * - * @param journal The journal whose tail block should be committed - **/ -void commitSlabJournalTail(SlabJournal *journal); - -/** - * Drain slab journal I/O. Depending upon the type of drain (as recorded in - * the journal's slab), any dirty journal blocks may be written out. - * - * @param journal The journal to drain - **/ -void drainSlabJournal(SlabJournal *journal); - -/** - * Decode the slab journal by reading its tail. - * - * @param journal The journal to decode - **/ -void decodeSlabJournal(SlabJournal *journal); - -/** - * Check to see if the journal should be scrubbed. - * - * @param journal The slab journal - * - * @return true if the journal requires scrubbing - **/ -bool requiresScrubbing(const SlabJournal *journal) - __attribute__((warn_unused_result)); - -/** - * Dump the slab journal. - * - * @param journal The slab journal to dump - **/ -void dumpSlabJournal(const SlabJournal *journal); - -#endif // SLAB_JOURNAL_H diff --git a/vdo/base/slabJournalEraser.c b/vdo/base/slabJournalEraser.c deleted file mode 100644 index 7cd6a81..0000000 --- a/vdo/base/slabJournalEraser.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabJournalEraser.c#1 $ - */ - -#include "slabJournalEraser.h" - -#include "memoryAlloc.h" - -#include "completion.h" -#include "constants.h" -#include "extent.h" -#include "slab.h" -#include "slabDepot.h" - -typedef struct { - VDOCompletion *parent; - VDOExtent *extent; - char *zeroBuffer; - SlabIterator slabs; -} SlabJournalEraser; - -/** - * Free the eraser and finish the parent. - * - * @param eraser The eraser that is done - * @param result The result to return to the parent - **/ -static void finishErasing(SlabJournalEraser *eraser, int result) -{ - VDOCompletion *parent = eraser->parent; - freeExtent(&eraser->extent); - FREE(eraser->zeroBuffer); - FREE(eraser); - finishCompletion(parent, result); -} - -/** - * Finish erasing slab journals with an error. - * - * @param completion A completion whose parent is the eraser - **/ -static void handleErasingError(VDOCompletion *completion) -{ - SlabJournalEraser *eraser = completion->parent; - finishErasing(eraser, eraser->extent->completion.result); -} - -/** - * Erase the next slab journal. 
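 * (Each pass writes the shared zero-filled extent over the next slab's
 * journal blocks at its journalOrigin; the extent's completion re-invokes
 * this callback, so the loop continues until the slab iterator is exhausted
 * and finishErasing() runs.)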
- * - * @param extentCompletion A completion whose parent is the eraser - **/ -static void eraseNextSlabJournal(VDOCompletion *extentCompletion) -{ - SlabJournalEraser *eraser = extentCompletion->parent; - - if (!hasNextSlab(&eraser->slabs)) { - finishErasing(eraser, VDO_SUCCESS); - return; - } - - Slab *slab = nextSlab(&eraser->slabs); - writeMetadataExtent(eraser->extent, slab->journalOrigin); -} - -/**********************************************************************/ -void eraseSlabJournals(SlabDepot *depot, - SlabIterator slabs, - VDOCompletion *parent) -{ - SlabJournalEraser *eraser; - int result = ALLOCATE(1, SlabJournalEraser, __func__, &eraser); - if (result != VDO_SUCCESS) { - finishCompletion(parent, result); - return; - } - - eraser->parent = parent; - eraser->slabs = slabs; - - BlockCount journalSize = getSlabConfig(depot)->slabJournalBlocks; - result = ALLOCATE(journalSize * VDO_BLOCK_SIZE, char, __func__, - &eraser->zeroBuffer); - if (result != VDO_SUCCESS) { - finishErasing(eraser, result); - return; - } - - result = createExtent(parent->layer, VIO_TYPE_SLAB_JOURNAL, - VIO_PRIORITY_METADATA, journalSize, eraser->zeroBuffer, - &eraser->extent); - if (result != VDO_SUCCESS) { - finishErasing(eraser, result); - return; - } - - VDOCompletion *extentCompletion = &eraser->extent->completion; - prepareCompletion(extentCompletion, eraseNextSlabJournal, - handleErasingError, getCallbackThreadID(), eraser); - eraseNextSlabJournal(extentCompletion); -} diff --git a/vdo/base/slabJournalEraser.h b/vdo/base/slabJournalEraser.h deleted file mode 100644 index 215d86f..0000000 --- a/vdo/base/slabJournalEraser.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabJournalEraser.h#1 $ - */ - -#ifndef SLAB_JOURNAL_ERASER_H -#define SLAB_JOURNAL_ERASER_H - -#include "slabIterator.h" -#include "types.h" - -/** - * Begin erasing slab journals, one at a time. - * - * @param depot The depot from which to erase - * @param slabs The slabs whose journals need erasing - * @param parent The object to notify when complete - **/ -void eraseSlabJournals(SlabDepot *depot, - SlabIterator slabs, - VDOCompletion *parent); - -#endif // SLAB_JOURNAL_ERASER_H diff --git a/vdo/base/slabJournalInternals.h b/vdo/base/slabJournalInternals.h deleted file mode 100644 index ce7eafb..0000000 --- a/vdo/base/slabJournalInternals.h +++ /dev/null @@ -1,381 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabJournalInternals.h#8 $ - */ - -#ifndef SLAB_JOURNAL_INTERNALS_H -#define SLAB_JOURNAL_INTERNALS_H - -#include "slabJournal.h" - -#include "numeric.h" - -#include "blockAllocatorInternals.h" -#include "blockMapEntry.h" -#include "journalPoint.h" -#include "slab.h" -#include "slabSummary.h" -#include "statistics.h" -#include "waitQueue.h" - -/** - * Slab journal blocks may have one of two formats, depending upon whether or - * not any of the entries in the block are block map increments. Since the - * steady state for a VDO is that all of the necessary block map pages will - * be allocated, most slab journal blocks will have only data entries. Such - * blocks can hold more entries, hence the two formats. - **/ - -/** A single slab journal entry */ -struct slabJournalEntry { - SlabBlockNumber sbn; - JournalOperation operation; -}; - -/** A single slab journal entry in its on-disk form */ -typedef union { - struct __attribute__((packed)) { - uint8_t offsetLow8; - uint8_t offsetMid8; - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - unsigned offsetHigh7 : 7; - unsigned increment : 1; -#else - unsigned increment : 1; - unsigned offsetHigh7 : 7; -#endif - } fields; - - // A raw view of the packed encoding. - uint8_t raw[3]; - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - // This view is only valid on little-endian machines and is only present for - // ease of directly examining packed entries in GDB. - struct __attribute__((packed)) { - unsigned offset : 23; - unsigned increment : 1; - } littleEndian; -#endif -} __attribute__((packed)) PackedSlabJournalEntry; - -/** The unpacked representation of the header of a slab journal block */ -typedef struct { - /** Sequence number for head of journal */ - SequenceNumber head; - /** Sequence number for this block */ - SequenceNumber sequenceNumber; - /** The nonce for a given VDO instance */ - Nonce nonce; - /** Recovery journal point for last entry */ - JournalPoint recoveryPoint; - /** Metadata type */ - VDOMetadataType metadataType; - /** Whether this block contains block map increments */ - bool hasBlockMapIncrements; - /** The number of entries in the block */ - JournalEntryCount entryCount; -} SlabJournalBlockHeader; - -/** - * The packed, on-disk representation of a slab journal block header. - * All fields are kept in little-endian byte order. 
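 *
 * (With the usual 4 KB VDO_BLOCK_SIZE, this packed header occupies 36 bytes,
 * leaving a 4060-byte payload: 1353 three-byte entries in a data-only block,
 * or 1299 entries plus a 163-byte type bitmap when block map increments are
 * present; see the enum below.)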
- **/ -typedef union __attribute__((packed)) { - struct __attribute__((packed)) { - /** 64-bit sequence number for head of journal */ - byte head[8]; - /** 64-bit sequence number for this block */ - byte sequenceNumber[8]; - /** Recovery journal point for last entry, packed into 64 bits */ - PackedJournalPoint recoveryPoint; - /** The 64-bit nonce for a given VDO instance */ - byte nonce[8]; - /** 8-bit metadata type (should always be two, for the slab journal) */ - uint8_t metadataType; - /** Whether this block contains block map increments */ - bool hasBlockMapIncrements; - /** 16-bit count of the entries encoded in the block */ - byte entryCount[2]; - } fields; - - // A raw view of the packed encoding. - uint8_t raw[8 + 8 + 8 + 8 + 1 + 1 + 2]; - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - // This view is only valid on little-endian machines and is only present for - // ease of directly examining packed entries in GDB. - struct __attribute__((packed)) { - SequenceNumber head; - SequenceNumber sequenceNumber; - PackedJournalPoint recoveryPoint; - Nonce nonce; - VDOMetadataType metadataType; - bool hasBlockMapIncrements; - JournalEntryCount entryCount; - } littleEndian; -#endif -} PackedSlabJournalBlockHeader; - -enum { - SLAB_JOURNAL_PAYLOAD_SIZE - = VDO_BLOCK_SIZE - sizeof(PackedSlabJournalBlockHeader), - SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK = (SLAB_JOURNAL_PAYLOAD_SIZE * 8) / 25, - SLAB_JOURNAL_ENTRY_TYPES_SIZE = ((SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK - 1) - / 8) + 1, - SLAB_JOURNAL_ENTRIES_PER_BLOCK = (SLAB_JOURNAL_PAYLOAD_SIZE - / sizeof(PackedSlabJournalEntry)), -}; - -/** The payload of a slab journal block which has block map increments */ -typedef struct { - /* The entries themselves */ - PackedSlabJournalEntry entries[SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK]; - /* The bit map indicating which entries are block map increments */ - byte entryTypes[SLAB_JOURNAL_ENTRY_TYPES_SIZE]; -} __attribute__((packed)) FullSlabJournalEntries; - -typedef union { - /* Entries which include block map increments */ - FullSlabJournalEntries fullEntries; - /* Entries which are only data updates */ - PackedSlabJournalEntry entries[SLAB_JOURNAL_ENTRIES_PER_BLOCK]; - /* Ensure the payload fills to the end of the block */ - byte space[SLAB_JOURNAL_PAYLOAD_SIZE]; -} __attribute__((packed)) SlabJournalPayload; - -typedef struct { - PackedSlabJournalBlockHeader header; - SlabJournalPayload payload; -} __attribute__((packed)) PackedSlabJournalBlock; - -typedef struct { - uint16_t count; - SequenceNumber recoveryStart; -} JournalLock; - -struct slabJournal { - /** A waiter object for getting a VIO pool entry */ - Waiter resourceWaiter; - /** A waiter object for updating the slab summary */ - Waiter slabSummaryWaiter; - /** A waiter object for getting an extent with which to flush */ - Waiter flushWaiter; - /** The queue of VIOs waiting to make an entry */ - WaitQueue entryWaiters; - /** The parent slab reference of this journal */ - Slab *slab; - - /** Whether a tail block commit is pending */ - bool waitingToCommit; - /** Whether the journal is updating the slab summary */ - bool updatingSlabSummary; - /** Whether the journal is adding entries from the entryWaiters queue */ - bool addingEntries; - /** Whether a partial write is in progress */ - bool partialWriteInProgress; - - /** The oldest block in the journal on disk */ - SequenceNumber head; - /** The oldest block in the journal which may not be reaped */ - SequenceNumber unreapable; - /** The end of the half-open interval of the active journal */ - 
SequenceNumber tail; - /** The next journal block to be committed */ - SequenceNumber nextCommit; - /** The tail sequence number that is written in the slab summary */ - SequenceNumber summarized; - /** The tail sequence number that was last summarized in slab summary */ - SequenceNumber lastSummarized; - - /** The sequence number of the recovery journal lock */ - SequenceNumber recoveryLock; - - /** - * The number of entries which fit in a single block. Can't use the constant - * because unit tests change this number. - **/ - JournalEntryCount entriesPerBlock; - /** - * The number of full entries which fit in a single block. Can't use the - * constant because unit tests change this number. - **/ - JournalEntryCount fullEntriesPerBlock; - - /** The recovery journal of the VDO (slab journal holds locks on it) */ - RecoveryJournal *recoveryJournal; - - /** The slab summary to update tail block location */ - SlabSummaryZone *summary; - /** The statistics shared by all slab journals in our physical zone */ - AtomicSlabJournalStatistics *events; - /** A ring of the VIO pool entries for outstanding journal block writes */ - RingNode uncommittedBlocks; - - /** - * The current tail block header state. This will be packed into - * the block just before it is written. - **/ - SlabJournalBlockHeader tailHeader; - /** A pointer to a block-sized buffer holding the packed block data */ - PackedSlabJournalBlock *block; - - /** The number of blocks in the on-disk journal */ - BlockCount size; - /** The number of blocks at which to start pushing reference blocks */ - BlockCount flushingThreshold; - /** The number of blocks at which all reference blocks should be writing */ - BlockCount flushingDeadline; - /** The number of blocks at which to wait for reference blocks to write */ - BlockCount blockingThreshold; - /** The number of blocks at which to scrub the slab before coming online */ - BlockCount scrubbingThreshold; - - /** This node is for BlockAllocator to keep a queue of dirty journals */ - RingNode dirtyNode; - - /** The lock for the oldest unreaped block of the journal */ - JournalLock *reapLock; - /** The locks for each on disk block */ - JournalLock locks[]; -}; - -/** - * Get the slab journal block offset of the given sequence number. - * - * @param journal The slab journal - * @param sequence The sequence number - * - * @return the offset corresponding to the sequence number - **/ -__attribute__((warn_unused_result)) -static inline TailBlockOffset -getSlabJournalBlockOffset(SlabJournal *journal, SequenceNumber sequence) -{ - return (sequence % journal->size); -} - -/** - * Encode a slab journal entry (exposed for unit tests). - * - * @param tailHeader The unpacked header for the block - * @param payload The journal block payload to hold the entry - * @param sbn The slab block number of the entry to encode - * @param operation The type of the entry - **/ -void encodeSlabJournalEntry(SlabJournalBlockHeader *tailHeader, - SlabJournalPayload *payload, - SlabBlockNumber sbn, - JournalOperation operation); - -/** - * Decode a slab journal entry. - * - * @param block The journal block holding the entry - * @param entryCount The number of the entry - * - * @return The decoded entry - **/ -SlabJournalEntry decodeSlabJournalEntry(PackedSlabJournalBlock *block, - JournalEntryCount entryCount) - __attribute__((warn_unused_result)); - -/** - * Generate the packed encoding of a slab journal entry. 
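 * (For example, on a little-endian build, sbn 0x0ABCDE with isIncrement set
 * packs to the three bytes 0xDE, 0xBC, 0x8A: the low and middle offset
 * bytes, then the top seven offset bits with the increment flag in the
 * high bit.)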
- * - * @param packed The entry into which to pack the values - * @param sbn The slab block number of the entry to encode - * @param isIncrement The increment flag - **/ -static inline void packSlabJournalEntry(PackedSlabJournalEntry *packed, - SlabBlockNumber sbn, - bool isIncrement) -{ - packed->fields.offsetLow8 = (sbn & 0x0000FF); - packed->fields.offsetMid8 = (sbn & 0x00FF00) >> 8; - packed->fields.offsetHigh7 = (sbn & 0x7F0000) >> 16; - packed->fields.increment = isIncrement ? 1 : 0; -} - -/** - * Decode the packed representation of a slab journal entry. - * - * @param packed The packed entry to decode - * - * @return The decoded slab journal entry - **/ -__attribute__((warn_unused_result)) -static inline -SlabJournalEntry unpackSlabJournalEntry(const PackedSlabJournalEntry *packed) -{ - SlabJournalEntry entry; - entry.sbn = packed->fields.offsetHigh7; - entry.sbn <<= 8; - entry.sbn |= packed->fields.offsetMid8; - entry.sbn <<= 8; - entry.sbn |= packed->fields.offsetLow8; - entry.operation - = (packed->fields.increment ? DATA_INCREMENT : DATA_DECREMENT); - return entry; -} - -/** - * Generate the packed representation of a slab block header. - * - * @param header The header containing the values to encode - * @param packed The header into which to pack the values - **/ -static inline -void packSlabJournalBlockHeader(const SlabJournalBlockHeader *header, - PackedSlabJournalBlockHeader *packed) -{ - storeUInt64LE(packed->fields.head, header->head); - storeUInt64LE(packed->fields.sequenceNumber, header->sequenceNumber); - storeUInt64LE(packed->fields.nonce, header->nonce); - storeUInt16LE(packed->fields.entryCount, header->entryCount); - - packed->fields.metadataType = header->metadataType; - packed->fields.hasBlockMapIncrements = header->hasBlockMapIncrements; - - packJournalPoint(&header->recoveryPoint, &packed->fields.recoveryPoint); -} - -/** - * Decode the packed representation of a slab block header. - * - * @param packed The packed header to decode - * @param header The header into which to unpack the values - **/ -static inline -void unpackSlabJournalBlockHeader(const PackedSlabJournalBlockHeader *packed, - SlabJournalBlockHeader *header) -{ - *header = (SlabJournalBlockHeader) { - .head = getUInt64LE(packed->fields.head), - .sequenceNumber = getUInt64LE(packed->fields.sequenceNumber), - .nonce = getUInt64LE(packed->fields.nonce), - .entryCount = getUInt16LE(packed->fields.entryCount), - .metadataType = packed->fields.metadataType, - .hasBlockMapIncrements = packed->fields.hasBlockMapIncrements, - }; - unpackJournalPoint(&packed->fields.recoveryPoint, &header->recoveryPoint); -} - -#endif // SLAB_JOURNAL_INTERNALS_H diff --git a/vdo/base/slabScrubber.c b/vdo/base/slabScrubber.c deleted file mode 100644 index e37e9c8..0000000 --- a/vdo/base/slabScrubber.c +++ /dev/null @@ -1,516 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabScrubber.c#6 $ - */ - -#include "slabScrubberInternals.h" - -#include "logger.h" -#include "memoryAlloc.h" - -#include "adminState.h" -#include "blockAllocator.h" -#include "constants.h" -#include "readOnlyNotifier.h" -#include "recoveryJournal.h" -#include "refCounts.h" -#include "refCountsInternals.h" -#include "slab.h" -#include "slabJournalInternals.h" - -/** - * Allocate the buffer and extent used for reading the slab journal when - * scrubbing a slab. - * - * @param scrubber The slab scrubber for which to allocate - * @param layer The physical layer on which the scrubber resides - * @param slabJournalSize The size of a slab journal - * - * @return VDO_SUCCESS or an error - **/ -__attribute__((warn_unused_result)) -static int allocateExtentAndBuffer(SlabScrubber *scrubber, - PhysicalLayer *layer, - BlockCount slabJournalSize) -{ - size_t bufferSize = VDO_BLOCK_SIZE * slabJournalSize; - int result = ALLOCATE(bufferSize, char, __func__, &scrubber->journalData); - if (result != VDO_SUCCESS) { - return result; - } - - return createExtent(layer, VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA, - slabJournalSize, scrubber->journalData, - &scrubber->extent); -} - -/**********************************************************************/ -int makeSlabScrubber(PhysicalLayer *layer, - BlockCount slabJournalSize, - ReadOnlyNotifier *readOnlyNotifier, - SlabScrubber **scrubberPtr) -{ - SlabScrubber *scrubber; - int result = ALLOCATE(1, SlabScrubber, __func__, &scrubber); - if (result != VDO_SUCCESS) { - return result; - } - - result = allocateExtentAndBuffer(scrubber, layer, slabJournalSize); - if (result != VDO_SUCCESS) { - freeSlabScrubber(&scrubber); - return result; - } - - initializeCompletion(&scrubber->completion, SLAB_SCRUBBER_COMPLETION, layer); - initializeRing(&scrubber->highPrioritySlabs); - initializeRing(&scrubber->slabs); - scrubber->readOnlyNotifier = readOnlyNotifier; - scrubber->adminState.state = ADMIN_STATE_SUSPENDED; - *scrubberPtr = scrubber; - return VDO_SUCCESS; -} - -/** - * Free the extent and buffer used for reading slab journals. - * - * @param scrubber The scrubber - **/ -static void freeExtentAndBuffer(SlabScrubber *scrubber) -{ - freeExtent(&scrubber->extent); - if (scrubber->journalData != NULL) { - FREE(scrubber->journalData); - scrubber->journalData = NULL; - } -} - -/**********************************************************************/ -void freeSlabScrubber(SlabScrubber **scrubberPtr) -{ - if (*scrubberPtr == NULL) { - return; - } - - SlabScrubber *scrubber = *scrubberPtr; - freeExtentAndBuffer(scrubber); - FREE(scrubber); - *scrubberPtr = NULL; -} - -/** - * Get the next slab to scrub. 
- * - * @param scrubber The slab scrubber - * - * @return The next slab to scrub or NULL if there are none - **/ -static Slab *getNextSlab(SlabScrubber *scrubber) -{ - if (!isRingEmpty(&scrubber->highPrioritySlabs)) { - return slabFromRingNode(scrubber->highPrioritySlabs.next); - } - - if (!isRingEmpty(&scrubber->slabs)) { - return slabFromRingNode(scrubber->slabs.next); - } - - return NULL; -} - -/**********************************************************************/ -bool hasSlabsToScrub(SlabScrubber *scrubber) -{ - return (getNextSlab(scrubber) != NULL); -} - -/**********************************************************************/ -SlabCount getScrubberSlabCount(const SlabScrubber *scrubber) -{ - return relaxedLoad64(&scrubber->slabCount); -} - -/**********************************************************************/ -void registerSlabForScrubbing(SlabScrubber *scrubber, - Slab *slab, - bool highPriority) -{ - ASSERT_LOG_ONLY((slab->status != SLAB_REBUILT), - "slab to be scrubbed is unrecovered"); - - if (slab->status != SLAB_REQUIRES_SCRUBBING) { - return; - } - - unspliceRingNode(&slab->ringNode); - if (!slab->wasQueuedForScrubbing) { - relaxedAdd64(&scrubber->slabCount, 1); - slab->wasQueuedForScrubbing = true; - } - - if (highPriority) { - slab->status = SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING; - pushRingNode(&scrubber->highPrioritySlabs, &slab->ringNode); - return; - } - - pushRingNode(&scrubber->slabs, &slab->ringNode); -} - -/** - * Stop scrubbing, either because there are no more slabs to scrub or because - * there's been an error. - * - * @param scrubber The scrubber - **/ -static void finishScrubbing(SlabScrubber *scrubber) -{ - if (!hasSlabsToScrub(scrubber)) { - freeExtentAndBuffer(scrubber); - } - - // Inform whoever is waiting that scrubbing has completed. - completeCompletion(&scrubber->completion); - - bool notify = hasWaiters(&scrubber->waiters); - - // Note that the scrubber has stopped, and inform anyone who might be waiting - // for that to happen. - if (!finishDraining(&scrubber->adminState)) { - scrubber->adminState.state = ADMIN_STATE_SUSPENDED; - } - - /* - * We can't notify waiters until after we've finished draining or they'll - * just requeue. Fortunately if there were waiters, we can't have been freed - * yet. - */ - if (notify) { - notifyAllWaiters(&scrubber->waiters, NULL, NULL); - } -} - -/**********************************************************************/ -static void scrubNextSlab(SlabScrubber *scrubber); - -/** - * Notify the scrubber that a slab has been scrubbed. This callback is - * registered in applyJournalEntries(). - * - * @param completion The slab rebuild completion - **/ -static void slabScrubbed(VDOCompletion *completion) -{ - SlabScrubber *scrubber = completion->parent; - finishScrubbingSlab(scrubber->slab); - relaxedAdd64(&scrubber->slabCount, -1); - scrubNextSlab(scrubber); -} - -/** - * Abort scrubbing due to an error. - * - * @param scrubber The slab scrubber - * @param result The error - **/ -static void abortScrubbing(SlabScrubber *scrubber, int result) -{ - enterReadOnlyMode(scrubber->readOnlyNotifier, result); - setCompletionResult(&scrubber->completion, result); - scrubNextSlab(scrubber); -} - -/** - * Handle errors while rebuilding a slab. - * - * @param completion The slab rebuild completion - **/ -static void handleScrubberError(VDOCompletion *completion) -{ - abortScrubbing(completion->parent, completion->result); -} - -/** - * Apply all the entries in a block to the reference counts. 
- * - * @param block A block with entries to apply - * @param entryCount The number of entries to apply - * @param blockNumber The sequence number of the block - * @param slab The slab to apply the entries to - * - * @return VDO_SUCCESS or an error code - **/ -static int applyBlockEntries(PackedSlabJournalBlock *block, - JournalEntryCount entryCount, - SequenceNumber blockNumber, - Slab *slab) -{ - JournalPoint entryPoint = { - .sequenceNumber = blockNumber, - .entryCount = 0, - }; - - SlabBlockNumber maxSBN = slab->end - slab->start; - while (entryPoint.entryCount < entryCount) { - SlabJournalEntry entry = decodeSlabJournalEntry(block, - entryPoint.entryCount); - if (entry.sbn > maxSBN) { - // This entry is out of bounds. - return logErrorWithStringError(VDO_CORRUPT_JOURNAL, "Slab journal entry" - " (%llu, %u) had invalid offset" - " %u in slab (size %u blocks)", - blockNumber, entryPoint.entryCount, - entry.sbn, maxSBN); - } - - int result = replayReferenceCountChange(slab->referenceCounts, &entryPoint, - entry); - if (result != VDO_SUCCESS) { - logErrorWithStringError(result, "Slab journal entry (%llu, %u)" - " (%s of offset %" PRIu32 ") could not be" - " applied in slab %u", - blockNumber, entryPoint.entryCount, - getJournalOperationName(entry.operation), - entry.sbn, slab->slabNumber); - return result; - } - entryPoint.entryCount++; - } - - return VDO_SUCCESS; -} - -/** - * Find the relevant extent of the slab journal and apply all valid entries. - * This is a callback registered in startScrubbing(). - * - * @param completion The metadata read extent completion - **/ -static void applyJournalEntries(VDOCompletion *completion) -{ - SlabScrubber *scrubber = completion->parent; - Slab *slab = scrubber->slab; - SlabJournal *journal = slab->journal; - RefCounts *referenceCounts = slab->referenceCounts; - - // Find the boundaries of the useful part of the journal. - SequenceNumber tail = journal->tail; - TailBlockOffset endIndex = getSlabJournalBlockOffset(journal, tail - 1); - char *endData = scrubber->journalData + (endIndex * VDO_BLOCK_SIZE); - PackedSlabJournalBlock *endBlock = (PackedSlabJournalBlock *) endData; - - SequenceNumber head = getUInt64LE(endBlock->header.fields.head); - TailBlockOffset headIndex = getSlabJournalBlockOffset(journal, head); - BlockCount index = headIndex; - - JournalPoint refCountsPoint = referenceCounts->slabJournalPoint; - JournalPoint lastEntryApplied = refCountsPoint; - for (SequenceNumber sequence = head; sequence < tail; sequence++) { - char *blockData = scrubber->journalData + (index * VDO_BLOCK_SIZE); - PackedSlabJournalBlock *block = (PackedSlabJournalBlock *) blockData; - SlabJournalBlockHeader header; - unpackSlabJournalBlockHeader(&block->header, &header); - - if ((header.nonce != slab->allocator->nonce) - || (header.metadataType != VDO_METADATA_SLAB_JOURNAL) - || (header.sequenceNumber != sequence) - || (header.entryCount > journal->entriesPerBlock) - || (header.hasBlockMapIncrements - && (header.entryCount > journal->fullEntriesPerBlock))) { - // The block is not what we expect it to be. 
- logError("Slab journal block for slab %u was invalid", - slab->slabNumber); - abortScrubbing(scrubber, VDO_CORRUPT_JOURNAL); - return; - } - - int result = applyBlockEntries(block, header.entryCount, sequence, slab); - if (result != VDO_SUCCESS) { - abortScrubbing(scrubber, result); - return; - } - - lastEntryApplied.sequenceNumber = sequence; - lastEntryApplied.entryCount = header.entryCount - 1; - index++; - if (index == journal->size) { - index = 0; - } - } - - // At the end of rebuild, the refCounts should be accurate to the end - // of the journal we just applied. - int result = ASSERT(!beforeJournalPoint(&lastEntryApplied, &refCountsPoint), - "Refcounts are not more accurate than the slab journal"); - if (result != VDO_SUCCESS) { - abortScrubbing(scrubber, result); - return; - } - - // Save out the rebuilt reference blocks. - prepareCompletion(completion, slabScrubbed, handleScrubberError, - completion->callbackThreadID, scrubber); - startSlabAction(slab, ADMIN_STATE_SAVE_FOR_SCRUBBING, completion); -} - -/** - * Read the current slab's journal from disk now that it has been flushed. - * This callback is registered in scrubNextSlab(). - * - * @param completion The scrubber's extent completion - **/ -static void startScrubbing(VDOCompletion *completion) -{ - SlabScrubber *scrubber = completion->parent; - Slab *slab = scrubber->slab; - if (getSummarizedCleanliness(slab->allocator->summary, slab->slabNumber)) { - slabScrubbed(completion); - return; - } - - prepareCompletion(&scrubber->extent->completion, applyJournalEntries, - handleScrubberError, completion->callbackThreadID, - completion->parent); - readMetadataExtent(scrubber->extent, slab->journalOrigin); -} - -/** - * Scrub the next slab if there is one. - * - * @param scrubber The scrubber - **/ -static void scrubNextSlab(SlabScrubber *scrubber) -{ - // Note: this notify call is always safe only because scrubbing can only - // be started when the VDO is quiescent. 
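  // (These waiters were queued via enqueueCleanSlabWaiter(); waking them
  //  each time the scrubber moves on to the next slab lets whatever was
  //  waiting for a clean slab proceed.)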
- notifyAllWaiters(&scrubber->waiters, NULL, NULL); - if (isReadOnly(scrubber->readOnlyNotifier)) { - setCompletionResult(&scrubber->completion, VDO_READ_ONLY); - finishScrubbing(scrubber); - return; - } - - Slab *slab = getNextSlab(scrubber); - if ((slab == NULL) - || (scrubber->highPriorityOnly - && isRingEmpty(&scrubber->highPrioritySlabs))) { - scrubber->highPriorityOnly = false; - finishScrubbing(scrubber); - return; - } - - if (finishDraining(&scrubber->adminState)) { - return; - } - - unspliceRingNode(&slab->ringNode); - scrubber->slab = slab; - VDOCompletion *completion = extentAsCompletion(scrubber->extent); - prepareCompletion(completion, startScrubbing, - handleScrubberError, scrubber->completion.callbackThreadID, - scrubber); - startSlabAction(slab, ADMIN_STATE_SCRUBBING, completion); -} - -/**********************************************************************/ -void scrubSlabs(SlabScrubber *scrubber, - void *parent, - VDOAction *callback, - VDOAction *errorHandler) -{ - resumeIfQuiescent(&scrubber->adminState); - ThreadID threadID = getCallbackThreadID(); - prepareCompletion(&scrubber->completion, callback, errorHandler, threadID, - parent); - if (!hasSlabsToScrub(scrubber)) { - finishScrubbing(scrubber); - return; - } - - scrubNextSlab(scrubber); -} - -/**********************************************************************/ -void scrubHighPrioritySlabs(SlabScrubber *scrubber, - bool scrubAtLeastOne, - VDOCompletion *parent, - VDOAction *callback, - VDOAction *errorHandler) -{ - if (scrubAtLeastOne && isRingEmpty(&scrubber->highPrioritySlabs)) { - Slab *slab = getNextSlab(scrubber); - if (slab != NULL) { - registerSlabForScrubbing(scrubber, slab, true); - } - } - scrubber->highPriorityOnly = true; - scrubSlabs(scrubber, parent, callback, errorHandler); -} - -/**********************************************************************/ -void stopScrubbing(SlabScrubber *scrubber, VDOCompletion *parent) -{ - if (isQuiescent(&scrubber->adminState)) { - completeCompletion(parent); - } else { - startDraining(&scrubber->adminState, ADMIN_STATE_SUSPENDING, parent, NULL); - } -} - -/**********************************************************************/ -void resumeScrubbing(SlabScrubber *scrubber, VDOCompletion *parent) -{ - if (!hasSlabsToScrub(scrubber)) { - completeCompletion(parent); - return; - } - - int result = resumeIfQuiescent(&scrubber->adminState); - if (result != VDO_SUCCESS) { - finishCompletion(parent, result); - return; - } - - scrubNextSlab(scrubber); - completeCompletion(parent); -} - -/**********************************************************************/ -int enqueueCleanSlabWaiter(SlabScrubber *scrubber, Waiter *waiter) -{ - if (isReadOnly(scrubber->readOnlyNotifier)) { - return VDO_READ_ONLY; - } - - if (isQuiescent(&scrubber->adminState)) { - return VDO_NO_SPACE; - } - - return enqueueWaiter(&scrubber->waiters, waiter); -} - -/**********************************************************************/ -void dumpSlabScrubber(const SlabScrubber *scrubber) -{ - logInfo("slabScrubber slabCount %u waiters %zu %s%s", - getScrubberSlabCount(scrubber), - countWaiters(&scrubber->waiters), - getAdminStateName(&scrubber->adminState), - scrubber->highPriorityOnly ? ", highPriorityOnly " : ""); -} diff --git a/vdo/base/slabScrubber.h b/vdo/base/slabScrubber.h deleted file mode 100644 index ca13e63..0000000 --- a/vdo/base/slabScrubber.h +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabScrubber.h#4 $ - */ - -#ifndef SLAB_SCRUBBER_H -#define SLAB_SCRUBBER_H - -#include "completion.h" -#include "types.h" -#include "waitQueue.h" - -/** - * Create a slab scrubber - * - * @param layer The physical layer of the VDO - * @param slabJournalSize The size of a slab journal in blocks - * @param readOnlyNotifier The context for entering read-only mode - * @param scrubberPtr A pointer to hold the scrubber - * - * @return VDO_SUCCESS or an error - **/ -int makeSlabScrubber(PhysicalLayer *layer, - BlockCount slabJournalSize, - ReadOnlyNotifier *readOnlyNotifier, - SlabScrubber **scrubberPtr) - __attribute__((warn_unused_result)); - -/** - * Free a slab scrubber and null out the reference to it. - * - * @param scrubberPtr A pointer to the scrubber to destroy - **/ -void freeSlabScrubber(SlabScrubber **scrubberPtr); - -/** - * Check whether a scrubber has slabs to scrub. - * - * @param scrubber The scrubber to check - * - * @return true if the scrubber has slabs to scrub - **/ -bool hasSlabsToScrub(SlabScrubber *scrubber) - __attribute__((warn_unused_result)); - -/** - * Register a slab with a scrubber. - * - * @param scrubber The scrubber - * @param slab The slab to scrub - * @param highPriority true if the slab should be put on the - * high-priority queue - **/ -void registerSlabForScrubbing(SlabScrubber *scrubber, - Slab *slab, - bool highPriority); - -/** - * Scrub all the slabs which have been registered with a slab scrubber. - * - * @param scrubber The scrubber - * @param parent The object to notify when scrubbing is complete - * @param callback The function to run when scrubbing is complete - * @param errorHandler The handler for scrubbing errors - **/ -void scrubSlabs(SlabScrubber *scrubber, - void *parent, - VDOAction *callback, - VDOAction *errorHandler); - -/** - * Scrub any slabs which have been registered at high priority with a slab - * scrubber. - * - * @param scrubber The scrubber - * @param scrubAtLeastOne true if one slab should always be - * scrubbed, even if there are no high-priority slabs - * (and there is at least one low priority slab) - * @param parent The completion to notify when scrubbing is complete - * @param callback The function to run when scrubbing is complete - * @param errorHandler The handler for scrubbing errors - **/ -void scrubHighPrioritySlabs(SlabScrubber *scrubber, - bool scrubAtLeastOne, - VDOCompletion *parent, - VDOAction *callback, - VDOAction *errorHandler); - -/** - * Tell the scrubber to stop scrubbing after it finishes the slab it is - * currently working on. 
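 *
 * (A hypothetical caller might drive the API declared above roughly as
 * follows, with scrubberDone() and scrubberFailed() as caller-supplied
 * VDOActions; this is an illustrative sketch only:
 *
 *   registerSlabForScrubbing(scrubber, slab, true);
 *   scrubSlabs(scrubber, parent, scrubberDone, scrubberFailed);
 *   // ... later, to quiesce the scrubber:
 *   stopScrubbing(scrubber, parent);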
- * - * @param scrubber The scrubber to stop - * @param parent The completion to notify when scrubbing has stopped - **/ -void stopScrubbing(SlabScrubber *scrubber, VDOCompletion *parent); - -/** - * Tell the scrubber to resume scrubbing if it has been stopped. - * - * @param scrubber The scrubber to resume - * @param parent The object to notify once scrubbing has resumed - **/ -void resumeScrubbing(SlabScrubber *scrubber, VDOCompletion *parent); - -/** - * Wait for a clean slab. - * - * @param scrubber The scrubber on which to wait - * @param waiter The waiter - * - * @return VDO_SUCCESS if the waiter was queued, VDO_NO_SPACE if there are no - * slabs to scrub, and some other error otherwise - **/ -int enqueueCleanSlabWaiter(SlabScrubber *scrubber, Waiter *waiter); - -/** - * Get the number of slabs that are unrecovered or being scrubbed. - * - * @param scrubber The scrubber to query - * - * @return the number of slabs that are unrecovered or being scrubbed - **/ -SlabCount getScrubberSlabCount(const SlabScrubber *scrubber) - __attribute__((warn_unused_result)); - -/** - * Dump information about a slab scrubber to the log for debugging. - * - * @param scrubber The scrubber to dump - **/ -void dumpSlabScrubber(const SlabScrubber *scrubber); - -#endif /* SLAB_SCRUBBER_H */ diff --git a/vdo/base/slabScrubberInternals.h b/vdo/base/slabScrubberInternals.h deleted file mode 100644 index 3d3e8cd..0000000 --- a/vdo/base/slabScrubberInternals.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabScrubberInternals.h#5 $ - */ - -#ifndef SLAB_SCRUBBER_INTERNALS_H -#define SLAB_SCRUBBER_INTERNALS_H - -#include "slabScrubber.h" - -#include "adminState.h" -#include "atomic.h" -#include "extent.h" -#include "ringNode.h" - -struct slabScrubber { - VDOCompletion completion; - /** The queue of slabs to scrub first */ - RingNode highPrioritySlabs; - /** The queue of slabs to scrub once there are no highPrioritySlabs */ - RingNode slabs; - /** The queue of VIOs waiting for a slab to be scrubbed */ - WaitQueue waiters; - - // The number of slabs that are unrecovered or being scrubbed. This field is - // modified by the physical zone thread, but is queried by other threads. 
- Atomic64 slabCount; - - /** The administrative state of the scrubber */ - AdminState adminState; - /** Whether to only scrub high-priority slabs */ - bool highPriorityOnly; - /** The context for entering read-only mode */ - ReadOnlyNotifier *readOnlyNotifier; - /** The slab currently being scrubbed */ - Slab *slab; - /** The extent for loading slab journal blocks */ - VDOExtent *extent; - /** A buffer to store the slab journal blocks */ - char *journalData; -}; - -#endif // SLAB_SCRUBBER_INTERNALS_H diff --git a/vdo/base/slabSummary.c b/vdo/base/slabSummary.c deleted file mode 100644 index 7021c67..0000000 --- a/vdo/base/slabSummary.c +++ /dev/null @@ -1,651 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabSummary.c#7 $ - */ - -#include "slabSummary.h" - -#include "memoryAlloc.h" - -#include "adminState.h" -#include "constants.h" -#include "extent.h" -#include "readOnlyNotifier.h" -#include "slabSummaryInternals.h" -#include "threadConfig.h" -#include "types.h" - -// SIZING - -/**********************************************************************/ -static BlockCount getSlabSummaryZoneSize(BlockSize blockSize) -{ - SlabCount entriesPerBlock = blockSize / sizeof(SlabSummaryEntry); - BlockCount blocksNeeded = MAX_SLABS / entriesPerBlock; - return blocksNeeded; -} - -/**********************************************************************/ -BlockCount getSlabSummarySize(BlockSize blockSize) -{ - return getSlabSummaryZoneSize(blockSize) * MAX_PHYSICAL_ZONES; -} - -// FULLNESS HINT COMPUTATION - -/** - * Translate a slab's free block count into a 'fullness hint' that can be - * stored in a SlabSummaryEntry's 7 bits that are dedicated to its free count. - * - * Note: the number of free blocks must be strictly less than 2^23 blocks, - * even though theoretically slabs could contain precisely 2^23 blocks; there - * is an assumption that at least one block is used by metadata. This - * assumption is necessary; otherwise, the fullness hint might overflow. - * The fullness hint formula is roughly (fullness >> 16) & 0x7f, but - * ((1 << 23) >> 16) & 0x7f is the same as (0 >> 16) & 0x7f, namely 0, which - * is clearly a bad hint if it could indicate both 2^23 free blocks or 0 free - * blocks. - * - * @param summary The summary which is being updated - * @param freeBlocks The number of free blocks - * - * @return A fullness hint, which can be stored in 7 bits. - **/ -__attribute__((warn_unused_result)) -static uint8_t computeFullnessHint(SlabSummary *summary, BlockCount freeBlocks) -{ - ASSERT_LOG_ONLY((freeBlocks < (1 << 23)), - "free blocks must be less than 2^23"); - - if (freeBlocks == 0) { - return 0; - } - - BlockCount hint = freeBlocks >> summary->hintShift; - return ((hint == 0) ? 
1 : hint); -} - -/** - * Translate a slab's free block hint into an approximate count, such that - * computeFullnessHint() is the inverse function of getApproximateFreeBlocks() - * (i.e. computeFullnessHint(getApproximateFreeBlocks(x)) == x). - * - * @param summary The summary from which the hint was obtained - * @param freeBlockHint The hint read from the summary - * - * @return An approximation to the free block count - **/ -__attribute__((warn_unused_result)) -static BlockCount getApproximateFreeBlocks(SlabSummary *summary, - uint8_t freeBlockHint) -{ - return ((BlockCount) freeBlockHint) << summary->hintShift; -} - -// MAKE/FREE FUNCTIONS - -/**********************************************************************/ -static void launchWrite(SlabSummaryBlock *summaryBlock); - -/** - * Initialize a SlabSummaryBlock. - * - * @param layer The backing layer - * @param summaryZone The parent SlabSummaryZone - * @param threadID The ID of the thread of physical zone of this block - * @param entries The entries this block manages - * @param index The index of this block in its zone's summary - * @param slabSummaryBlock The block to intialize - * - * @return VDO_SUCCESS or an error - **/ -static int initializeSlabSummaryBlock(PhysicalLayer *layer, - SlabSummaryZone *summaryZone, - ThreadID threadID, - SlabSummaryEntry *entries, - BlockCount index, - SlabSummaryBlock *slabSummaryBlock) -{ - int result = ALLOCATE(VDO_BLOCK_SIZE, char, __func__, - &slabSummaryBlock->outgoingEntries); - if (result != VDO_SUCCESS) { - return result; - } - - result = createVIO(layer, VIO_TYPE_SLAB_SUMMARY, VIO_PRIORITY_METADATA, - slabSummaryBlock, slabSummaryBlock->outgoingEntries, - &slabSummaryBlock->vio); - if (result != VDO_SUCCESS) { - return result; - } - - slabSummaryBlock->vio->completion.callbackThreadID = threadID; - slabSummaryBlock->zone = summaryZone; - slabSummaryBlock->entries = entries; - slabSummaryBlock->index = index; - return VDO_SUCCESS; -} - -/** - * Create a new, empty SlabSummaryZone object. - * - * @param summary The summary to which the new zone will belong - * @param layer The layer - * @param zoneNumber The zone this is - * @param threadID The ID of the thread for this zone - * @param entries The buffer to hold the entries in this zone - * - * @return VDO_SUCCESS or an error - **/ -static int makeSlabSummaryZone(SlabSummary *summary, - PhysicalLayer *layer, - ZoneCount zoneNumber, - ThreadID threadID, - SlabSummaryEntry *entries) -{ - int result = ALLOCATE_EXTENDED(SlabSummaryZone, summary->blocksPerZone, - SlabSummaryBlock, __func__, - &summary->zones[zoneNumber]); - if (result != VDO_SUCCESS) { - return result; - } - - SlabSummaryZone *summaryZone = summary->zones[zoneNumber]; - summaryZone->summary = summary; - summaryZone->zoneNumber = zoneNumber; - summaryZone->entries = entries; - - if (layer->createMetadataVIO == NULL) { - // Blocks are only used for writing, and without a createVIO() call, - // we'll never be writing anything. - return VDO_SUCCESS; - } - - // Initialize each block. 
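The two hint helpers above are easiest to see with concrete numbers. The standalone sketch below is not part of the original sources; it replays the round trip of computeFullnessHint() and getApproximateFreeBlocks() with a hint shift of 16, which is an assumed example value rather than one taken from any particular VDO configuration.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
  const unsigned int hintShift = 16;   /* assumed for the example */
  const uint64_t counts[] = { 0, 1, 65535, 65536, (1 << 23) - 1 };

  for (unsigned int i = 0; i < sizeof(counts) / sizeof(counts[0]); i++) {
    uint64_t freeBlocks = counts[i];
    uint64_t shifted    = freeBlocks >> hintShift;
    /* Non-zero counts never encode to 0, so hint 0 always means no free
     * blocks. */
    uint8_t hint = (freeBlocks == 0) ? 0 : ((shifted == 0) ? 1 : (uint8_t) shifted);
    /* Decoding recovers the count only to hint granularity (1 << hintShift). */
    uint64_t approx = ((uint64_t) hint) << hintShift;
    printf("free=%8llu  hint=%3u  approx=%8llu\n",
           (unsigned long long) freeBlocks, (unsigned int) hint,
           (unsigned long long) approx);
  }
  return 0;
}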
- for (BlockCount i = 0; i < summary->blocksPerZone; i++) { - result = initializeSlabSummaryBlock(layer, summaryZone, threadID, entries, - i, &summaryZone->summaryBlocks[i]); - if (result != VDO_SUCCESS) { - return result; - } - entries += summary->entriesPerBlock; - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -int makeSlabSummary(PhysicalLayer *layer, - Partition *partition, - const ThreadConfig *threadConfig, - unsigned int slabSizeShift, - BlockCount maximumFreeBlocksPerSlab, - ReadOnlyNotifier *readOnlyNotifier, - SlabSummary **slabSummaryPtr) -{ - BlockCount blocksPerZone = getSlabSummaryZoneSize(VDO_BLOCK_SIZE); - SlabCount entriesPerBlock = MAX_SLABS / blocksPerZone; - int result = ASSERT((entriesPerBlock * blocksPerZone) == MAX_SLABS, - "block size must be a multiple of entry size"); - if (result != VDO_SUCCESS) { - return result; - } - - if (partition == NULL) { - // Don't make a slab summary for the formatter since it doesn't need it. - return VDO_SUCCESS; - } - - SlabSummary *summary; - result = ALLOCATE_EXTENDED(SlabSummary, threadConfig->physicalZoneCount, - SlabSummaryZone *, __func__, &summary); - if (result != VDO_SUCCESS) { - return result; - } - - summary->zoneCount = threadConfig->physicalZoneCount; - summary->readOnlyNotifier = readOnlyNotifier; - summary->hintShift = (slabSizeShift > 6) ? (slabSizeShift - 6) : 0; - summary->blocksPerZone = blocksPerZone; - summary->entriesPerBlock = entriesPerBlock; - - size_t totalEntries = MAX_SLABS * MAX_PHYSICAL_ZONES; - size_t entryBytes = totalEntries * sizeof(SlabSummaryEntry); - result = layer->allocateIOBuffer(layer, entryBytes, "summary entries", - (char **) &summary->entries); - if (result != VDO_SUCCESS) { - freeSlabSummary(&summary); - return result; - } - - // Initialize all the entries. - uint8_t hint = computeFullnessHint(summary, maximumFreeBlocksPerSlab); - for (size_t i = 0; i < totalEntries; i++) { - // This default tail block offset must be reflected in - // slabJournal.c::readSlabJournalTail(). - summary->entries[i] = (SlabSummaryEntry) { - .tailBlockOffset = 0, - .fullnessHint = hint, - .loadRefCounts = false, - .isDirty = false, - }; - } - - setSlabSummaryOrigin(summary, partition); - for (ZoneCount zone = 0; zone < summary->zoneCount; zone++) { - result = makeSlabSummaryZone(summary, layer, zone, - getPhysicalZoneThread(threadConfig, zone), - summary->entries + (MAX_SLABS * zone)); - if (result != VDO_SUCCESS) { - freeSlabSummary(&summary); - return result; - } - } - - *slabSummaryPtr = summary; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freeSlabSummary(SlabSummary **slabSummaryPtr) -{ - if (*slabSummaryPtr == NULL) { - return; - } - - SlabSummary *summary = *slabSummaryPtr; - for (ZoneCount zone = 0; zone < summary->zoneCount; zone++) { - SlabSummaryZone *summaryZone = summary->zones[zone]; - if (summaryZone != NULL) { - for (BlockCount i = 0; i < summary->blocksPerZone; i++) { - freeVIO(&summaryZone->summaryBlocks[i].vio); - FREE(summaryZone->summaryBlocks[i].outgoingEntries); - } - FREE(summaryZone); - } - } - FREE(summary->entries); - FREE(summary); - *slabSummaryPtr = NULL; -} - -/**********************************************************************/ -SlabSummaryZone *getSummaryForZone(SlabSummary *summary, ZoneCount zone) -{ - return summary->zones[zone]; -} - -// WRITING FUNCTIONALITY - -/** - * Check whether a summary zone has finished draining. 
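For a sense of scale, here is the sizing arithmetic from getSlabSummaryZoneSize() and getSlabSummarySize() above worked through with concrete numbers. All four constants are assumptions made for this sketch (a 4096-byte block, the packed two-byte entry defined in slabSummaryInternals.h later in this patch, 16 physical zones, and 8192 as the maximum slab count); the real values come from the VDO headers.

#include <stdio.h>

int main(void)
{
  const unsigned long blockSize        = 4096; /* assumed VDO_BLOCK_SIZE      */
  const unsigned long entrySize        = 2;    /* packed SlabSummaryEntry     */
  const unsigned long maxSlabs         = 8192; /* assumed MAX_SLABS           */
  const unsigned long maxPhysicalZones = 16;   /* assumed MAX_PHYSICAL_ZONES  */

  unsigned long entriesPerBlock = blockSize / entrySize;            /* 2048 */
  unsigned long blocksPerZone   = maxSlabs / entriesPerBlock;       /*    4 */
  unsigned long partitionBlocks = blocksPerZone * maxPhysicalZones; /*   64 */

  printf("entries/block=%lu, blocks/zone=%lu, summary partition=%lu blocks\n",
         entriesPerBlock, blocksPerZone, partitionBlocks);
  return 0;
}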
- * - * @param summaryZone The zone to check - **/ -static void checkForDrainComplete(SlabSummaryZone *summaryZone) -{ - if (!isDraining(&summaryZone->state) || (summaryZone->writeCount > 0)) { - return; - } - - finishOperationWithResult(&summaryZone->state, - (isReadOnly(summaryZone->summary->readOnlyNotifier) - ? VDO_READ_ONLY : VDO_SUCCESS)); -} - -/** - * Wake all the waiters in a given queue. If the VDO is in read-only mode they - * will be given a VDO_READ_ONLY error code as their context, otherwise they - * will be given VDO_SUCCESS. - * - * @param summaryZone The slab summary which owns the queue - * @param queue The queue to notify - **/ -static void notifyWaiters(SlabSummaryZone *summaryZone, WaitQueue *queue) -{ - int result = (isReadOnly(summaryZone->summary->readOnlyNotifier) - ? VDO_READ_ONLY : VDO_SUCCESS); - notifyAllWaiters(queue, NULL, &result); -} - -/** - * Finish processing a block which attempted to write, whether or not the - * attempt succeeded. - * - * @param block The block - **/ -static void finishUpdatingSlabSummaryBlock(SlabSummaryBlock *block) -{ - notifyWaiters(block->zone, &block->currentUpdateWaiters); - block->writing = false; - block->zone->writeCount--; - if (hasWaiters(&block->nextUpdateWaiters)) { - launchWrite(block); - } else { - checkForDrainComplete(block->zone); - } -} - -/** - * This is the callback for a successful block write. - * - * @param completion The write VIO - **/ -static void finishUpdate(VDOCompletion *completion) -{ - SlabSummaryBlock *block = completion->parent; - atomicAdd64(&block->zone->summary->statistics.blocksWritten, 1); - finishUpdatingSlabSummaryBlock(block); -} - -/** - * Handle an error writing a slab summary block. - * - * @param completion The write VIO - **/ -static void handleWriteError(VDOCompletion *completion) -{ - SlabSummaryBlock *block = completion->parent; - enterReadOnlyMode(block->zone->summary->readOnlyNotifier, - completion->result); - finishUpdatingSlabSummaryBlock(block); -} - -/** - * Write a slab summary block unless it is currently out for writing. - * - * @param [in] block The block that needs to be committed - **/ -static void launchWrite(SlabSummaryBlock *block) -{ - if (block->writing) { - return; - } - - SlabSummaryZone *zone = block->zone; - zone->writeCount++; - transferAllWaiters(&block->nextUpdateWaiters, &block->currentUpdateWaiters); - block->writing = true; - - SlabSummary *summary = zone->summary; - if (isReadOnly(summary->readOnlyNotifier)) { - finishUpdatingSlabSummaryBlock(block); - return; - } - - memcpy(block->outgoingEntries, block->entries, - sizeof(SlabSummaryEntry) * summary->entriesPerBlock); - - // Flush before writing to ensure that the slab journal tail blocks and - // reference updates covered by this summary update are stable (VDO-2332). - PhysicalBlockNumber pbn = (summary->origin - + (summary->blocksPerZone * zone->zoneNumber) - + block->index); - launchWriteMetadataVIOWithFlush(block->vio, pbn, finishUpdate, - handleWriteError, true, false); -} - -/** - * Initiate a drain. - * - * Implements AdminInitiator. 
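The interplay of launchWrite(), the two waiter queues, and finishUpdatingSlabSummaryBlock() above amounts to a simple batching scheme: updates that arrive while a block write is in flight park on nextUpdateWaiters and are absorbed by a single follow-up write. The single-threaded model below is illustrative only; every name is invented, and plain counters stand in for wait queues and VIOs.

#include <stdbool.h>
#include <stdio.h>

typedef struct {
  bool writing;         /* stands in for SlabSummaryBlock.writing */
  int  currentWaiters;  /* stands in for currentUpdateWaiters     */
  int  nextWaiters;     /* stands in for nextUpdateWaiters        */
  int  blocksWritten;
} ModelBlock;

static void modelLaunchWrite(ModelBlock *block)
{
  if (block->writing) {
    return;                      /* a write is already outstanding */
  }
  /* Move the pending updates onto the in-flight write. */
  block->currentWaiters = block->nextWaiters;
  block->nextWaiters    = 0;
  block->writing        = true;
}

static void modelWriteDone(ModelBlock *block)
{
  block->blocksWritten++;
  printf("write #%d covered %d update(s)\n",
         block->blocksWritten, block->currentWaiters);
  block->currentWaiters = 0;
  block->writing        = false;
  if (block->nextWaiters > 0) {
    /* Everything that queued during the write shares the next one. */
    modelLaunchWrite(block);
  }
}

int main(void)
{
  ModelBlock block = { false, 0, 0, 0 };
  block.nextWaiters = 1;
  modelLaunchWrite(&block);   /* the first update starts a write           */
  block.nextWaiters += 3;     /* three more updates arrive while in flight */
  modelWriteDone(&block);     /* write #1 covers 1 update                  */
  modelWriteDone(&block);     /* write #2 covers the 3 batched updates     */
  return 0;
}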
- **/ -static void initiateDrain(AdminState *state) -{ - checkForDrainComplete(container_of(state, SlabSummaryZone, state)); -} - -/**********************************************************************/ -void drainSlabSummaryZone(SlabSummaryZone *summaryZone, - AdminStateCode operation, - VDOCompletion *parent) -{ - startDraining(&summaryZone->state, operation, parent, initiateDrain); -} - -/**********************************************************************/ -void resumeSlabSummaryZone(SlabSummaryZone *summaryZone, VDOCompletion *parent) -{ - finishCompletion(parent, resumeIfQuiescent(&summaryZone->state)); -} - -// READ/UPDATE FUNCTIONS - -/** - * Get the summary block, and offset into it, for storing the summary for a - * slab. - * - * @param summaryZone The SlabSummaryZone being queried - * @param slabNumber The slab whose summary location is sought - * - * @return A pointer to the SlabSummaryEntryBlock containing this - * SlabSummaryEntry - **/ -static SlabSummaryBlock *getSummaryBlockForSlab(SlabSummaryZone *summaryZone, - SlabCount slabNumber) -{ - SlabCount entriesPerBlock = summaryZone->summary->entriesPerBlock; - return &summaryZone->summaryBlocks[slabNumber / entriesPerBlock]; -} - -/**********************************************************************/ -void updateSlabSummaryEntry(SlabSummaryZone *summaryZone, - Waiter *waiter, - SlabCount slabNumber, - TailBlockOffset tailBlockOffset, - bool loadRefCounts, - bool isClean, - BlockCount freeBlocks) -{ - SlabSummaryBlock *block = getSummaryBlockForSlab(summaryZone, slabNumber); - int result; - if (isReadOnly(summaryZone->summary->readOnlyNotifier)) { - result = VDO_READ_ONLY; - } else if (isDraining(&summaryZone->state) - || isQuiescent(&summaryZone->state)) { - result = VDO_INVALID_ADMIN_STATE; - } else { - uint8_t hint = computeFullnessHint(summaryZone->summary, freeBlocks); - SlabSummaryEntry *entry = &summaryZone->entries[slabNumber]; - *entry = (SlabSummaryEntry) { - .tailBlockOffset = tailBlockOffset, - .loadRefCounts = (entry->loadRefCounts || loadRefCounts), - .isDirty = !isClean, - .fullnessHint = hint, - }; - result = enqueueWaiter(&block->nextUpdateWaiters, waiter); - } - - if (result != VDO_SUCCESS) { - waiter->callback(waiter, &result); - return; - } - - launchWrite(block); -} - -/**********************************************************************/ -TailBlockOffset getSummarizedTailBlockOffset(SlabSummaryZone *summaryZone, - SlabCount slabNumber) -{ - return summaryZone->entries[slabNumber].tailBlockOffset; -} - -/**********************************************************************/ -bool mustLoadRefCounts(SlabSummaryZone *summaryZone, SlabCount slabNumber) -{ - return summaryZone->entries[slabNumber].loadRefCounts; -} - -/**********************************************************************/ -bool getSummarizedCleanliness(SlabSummaryZone *summaryZone, - SlabCount slabNumber) -{ - return !summaryZone->entries[slabNumber].isDirty; -} - -/**********************************************************************/ -BlockCount getSummarizedFreeBlockCount(SlabSummaryZone *summaryZone, - SlabCount slabNumber) -{ - SlabSummaryEntry *entry = &summaryZone->entries[slabNumber]; - return getApproximateFreeBlocks(summaryZone->summary, entry->fullnessHint); -} - -/**********************************************************************/ -void getSummarizedRefCountsState(SlabSummaryZone *summaryZone, - SlabCount slabNumber, - size_t *freeBlockHint, - bool *isClean) -{ - SlabSummaryEntry *entry = 
&summaryZone->entries[slabNumber]; - *freeBlockHint = entry->fullnessHint; - *isClean = !entry->isDirty; -} - -/**********************************************************************/ -void getSummarizedSlabStatuses(SlabSummaryZone *summaryZone, - SlabCount slabCount, - SlabStatus *statuses) -{ - for (SlabCount i = 0; i < slabCount; i++) { - statuses[i] = (SlabStatus) { - .slabNumber = i, - .isClean = !summaryZone->entries[i].isDirty, - .emptiness = summaryZone->entries[i].fullnessHint - }; - } -} - -// RESIZE FUNCTIONS - -/**********************************************************************/ -void setSlabSummaryOrigin(SlabSummary *summary, Partition *partition) -{ - summary->origin = getFixedLayoutPartitionOffset(partition); -} - -// COMBINING FUNCTIONS (LOAD) - -/** - * Clean up after saving out the combined slab summary. This callback is - * registered in finishLoadingSummary() and loadSlabSummary(). - * - * @param completion The extent which was used to write the summary data - **/ -static void finishCombiningZones(VDOCompletion *completion) -{ - SlabSummary *summary = completion->parent; - int result = completion->result; - VDOExtent *extent = asVDOExtent(completion); - freeExtent(&extent); - finishLoadingWithResult(&summary->zones[0]->state, result); -} - -/**********************************************************************/ -void combineZones(SlabSummary *summary) -{ - // Combine all the old summary data into the portion of the buffer - // corresponding to the first zone. - ZoneCount zone = 0; - if (summary->zonesToCombine > 1) { - for (SlabCount entryNumber = 0; entryNumber < MAX_SLABS; entryNumber++) { - if (zone != 0) { - memcpy(summary->entries + entryNumber, - summary->entries + (zone * MAX_SLABS) + entryNumber, - sizeof(SlabSummaryEntry)); - } - zone++; - if (zone == summary->zonesToCombine) { - zone = 0; - } - } - } - - // Copy the combined data to each zones's region of the buffer. - for (zone = 1; zone < MAX_PHYSICAL_ZONES; zone++) { - memcpy(summary->entries + (zone * MAX_SLABS), summary->entries, - MAX_SLABS * sizeof(SlabSummaryEntry)); - } -} - -/** - * Combine the slab summary data from all the previously written zones - * and copy the combined summary to each partition's data region. Then write - * the combined summary back out to disk. This callback is registered in - * loadSlabSummary(). - * - * @param completion The extent which was used to read the summary data - **/ -static void finishLoadingSummary(VDOCompletion *completion) -{ - SlabSummary *summary = completion->parent; - VDOExtent *extent = asVDOExtent(completion); - - // Combine the zones so each zone is correct for all slabs. - combineZones(summary); - - // Write the combined summary back out. 
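combineZones() above relies on the entries buffer holding one full copy of the summary per zone, and walks the slabs round-robin across the previously active zones, taking slab n's entry from zone (n mod zonesToCombine). The scaled-down model below (8 slabs, 3 previously active zones, integer markers instead of SlabSummaryEntry) is illustrative only and shows which zone's copy wins for each slab; the real function then replicates the combined zone-0 region to every other zone's region before it is written back out.

#include <stdio.h>

enum { SLABS = 8, ZONES = 3, TOTAL_ZONES = 4 };

int main(void)
{
  /* One flat buffer of TOTAL_ZONES regions, like SlabSummary.entries. */
  int entries[TOTAL_ZONES * SLABS];
  for (int zone = 0; zone < TOTAL_ZONES; zone++) {
    for (int slab = 0; slab < SLABS; slab++) {
      entries[(zone * SLABS) + slab] = (zone * 100) + slab; /* tag the writer */
    }
  }

  /* Combine: take each slab's entry from the zone that last managed it. */
  int zone = 0;
  for (int slab = 0; slab < SLABS; slab++) {
    if (zone != 0) {
      entries[slab] = entries[(zone * SLABS) + slab];
    }
    zone = (zone + 1) % ZONES;
  }

  for (int slab = 0; slab < SLABS; slab++) {
    printf("slab %d took its entry from zone %d\n", slab, entries[slab] / 100);
  }
  return 0;
}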
- extent->completion.callback = finishCombiningZones; - writeMetadataExtent(extent, summary->origin); -} - -/**********************************************************************/ -void loadSlabSummary(SlabSummary *summary, - AdminStateCode operation, - ZoneCount zonesToCombine, - VDOCompletion *parent) -{ - SlabSummaryZone *zone = summary->zones[0]; - if (!startLoading(&zone->state, operation, parent, NULL)) { - return; - } - - VDOExtent *extent; - BlockCount blocks = summary->blocksPerZone * MAX_PHYSICAL_ZONES; - int result = createExtent(parent->layer, VIO_TYPE_SLAB_SUMMARY, - VIO_PRIORITY_METADATA, blocks, - (char *) summary->entries, &extent); - if (result != VDO_SUCCESS) { - finishLoadingWithResult(&zone->state, result); - return; - } - - if ((operation == ADMIN_STATE_FORMATTING) - || (operation == ADMIN_STATE_LOADING_FOR_REBUILD)) { - prepareCompletion(&extent->completion, finishCombiningZones, - finishCombiningZones, 0, summary); - writeMetadataExtent(extent, summary->origin); - return; - } - - summary->zonesToCombine = zonesToCombine; - prepareCompletion(&extent->completion, finishLoadingSummary, - finishCombiningZones, 0, summary); - readMetadataExtent(extent, summary->origin); -} - -/**********************************************************************/ -SlabSummaryStatistics getSlabSummaryStatistics(const SlabSummary *summary) -{ - const AtomicSlabSummaryStatistics *atoms = &summary->statistics; - return (SlabSummaryStatistics) { - .blocksWritten = atomicLoad64(&atoms->blocksWritten), - }; -} diff --git a/vdo/base/slabSummary.h b/vdo/base/slabSummary.h deleted file mode 100644 index 4ce32cb..0000000 --- a/vdo/base/slabSummary.h +++ /dev/null @@ -1,268 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabSummary.h#5 $ - */ - -#ifndef SLAB_SUMMARY_H -#define SLAB_SUMMARY_H - -#include "completion.h" -#include "fixedLayout.h" -#include "slab.h" -#include "statistics.h" -#include "types.h" -#include "waitQueue.h" - -/** - * The SlabSummary provides hints during load and recovery about the state - * of the slabs in order to avoid the need to read the slab journals in their - * entirety before a VDO can come online. - * - * The information in the summary for each slab includes the rough number of - * free blocks (which is used to prioritize scrubbing), the cleanliness of a - * slab (so that clean slabs containing free space will be used on restart), - * and the location of the tail block of the slab's journal. - * - * The SlabSummary has its own partition at the end of the volume which is - * sized to allow for a complete copy of the summary for each of up to 16 - * physical zones. 
- * - * During resize, the SlabSummary moves its backing partition and is saved once - * moved; the SlabSummary is not permitted to overwrite the previous recovery - * journal space. - * - * The SlabSummary does not have its own version information, but relies on the - * master version number. - **/ - -/** - * The offset of a slab journal tail block. - **/ -typedef uint8_t TailBlockOffset; - -/** - * A slab status is a very small structure for use in determining the ordering - * of slabs in the scrubbing process. - **/ -typedef struct slabStatus { - SlabCount slabNumber; - bool isClean; - uint8_t emptiness; -} SlabStatus; - -/** - * Returns the size on disk of the SlabSummary structure. - * - * @param blockSize The block size of the physical layer - * - * @return the blocks required to store the SlabSummary on disk - **/ -BlockCount getSlabSummarySize(BlockSize blockSize) -__attribute__((warn_unused_result)); - -/** - * Create a slab summary. - * - * @param [in] layer The layer - * @param [in] partition The partition to hold the summary - * @param [in] threadConfig The thread config of the VDO - * @param [in] slabSizeShift The number of bits in the slab size - * @param [in] maximumFreeBlocksPerSlab The maximum number of free blocks a - * slab can have - * @param [in] readOnlyNotifier The context for entering read-only - * mode - * @param [out] slabSummaryPtr A pointer to hold the summary - * - * @return VDO_SUCCESS or an error - **/ -int makeSlabSummary(PhysicalLayer *layer, - Partition *partition, - const ThreadConfig *threadConfig, - unsigned int slabSizeShift, - BlockCount maximumFreeBlocksPerSlab, - ReadOnlyNotifier *readOnlyNotifier, - SlabSummary **slabSummaryPtr) - __attribute__((warn_unused_result)); - -/** - * Destroy a SlabSummary and NULL out the reference to it. - * - * @param [in,out] slabSummaryPtr A pointer to the SlabSummary to free - **/ -void freeSlabSummary(SlabSummary **slabSummaryPtr); - -/** - * Get the portion of the slab summary for a specified zone. - * - * @param summary The slab summary - * @param zone The zone - * - * @return The portion of the slab summary for the specified zone - **/ -SlabSummaryZone *getSummaryForZone(SlabSummary *summary, ZoneCount zone) - __attribute__((warn_unused_result)); - -/** - * Drain a zone of the slab summary. - * - * @param summaryZone The zone to drain - * @param operation The type of drain to perform - * @param parent The object to notify when the suspend is complete - **/ -void drainSlabSummaryZone(SlabSummaryZone *summaryZone, - AdminStateCode operation, - VDOCompletion *parent); - -/** - * Resume a zone of the slab summary. - * - * @param summaryZone The zone to resume - * @param parent The object to notify when the zone is resumed - **/ -void resumeSlabSummaryZone(SlabSummaryZone *summaryZone, - VDOCompletion *parent); - -/** - * Update the entry for a slab. 
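A hedged sketch of how the creation and teardown functions declared above fit together. The helper name, the slab size shift, the free-block maximum, and the error handling are all invented for the example; note the convention, used throughout these files, that the free functions take a pointer-to-pointer and null it out, so cleanup is safe even after a partial failure.

static int makeAndDiscardSummary(PhysicalLayer      *layer,
                                 Partition          *partition,
                                 const ThreadConfig *threadConfig,
                                 ReadOnlyNotifier   *readOnlyNotifier)
{
  SlabSummary *summary = NULL;
  int result = makeSlabSummary(layer, partition, threadConfig,
                               17,               /* example slab size shift  */
                               (1 << 17) - 1,    /* example free-block limit */
                               readOnlyNotifier, &summary);
  if (result != VDO_SUCCESS) {
    return result;
  }

  /* Each physical zone thread works against its own SlabSummaryZone. */
  SlabSummaryZone *zone = getSummaryForZone(summary, 0);
  (void) zone;

  freeSlabSummary(&summary);     /* summary is NULL again afterwards */
  return VDO_SUCCESS;
}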
- * - * @param summaryZone The SlabSummaryZone for the zone of the slab - * @param waiter The waiter that is updating the summary - * @param slabNumber The slab number to update - * @param tailBlockOffset The offset of slab journal's tail block - * @param loadRefCounts Whether the refCounts must be loaded from the layer - * on the next load - * @param isClean Whether the slab is clean - * @param freeBlocks The number of free blocks - **/ -void updateSlabSummaryEntry(SlabSummaryZone *summaryZone, - Waiter *waiter, - SlabCount slabNumber, - TailBlockOffset tailBlockOffset, - bool loadRefCounts, - bool isClean, - BlockCount freeBlocks); - -/** - * Get the stored tail block offset for a slab. - * - * @param summaryZone The SlabSummaryZone to use - * @param slabNumber The slab number to get the offset for - * - * @return The tail block offset for the slab - **/ -TailBlockOffset getSummarizedTailBlockOffset(SlabSummaryZone *summaryZone, - SlabCount slabNumber) - __attribute__((warn_unused_result)); - -/** - * Whether refCounts must be loaded from the layer. - * - * @param summaryZone The SlabSummaryZone to use - * @param slabNumber The slab number to get information for - * - * @return Whether refCounts must be loaded - **/ -bool mustLoadRefCounts(SlabSummaryZone *summaryZone, SlabCount slabNumber) - __attribute__((warn_unused_result)); - -/** - * Get the stored cleanliness information for a single slab. - * - * @param summaryZone The SlabSummaryZone to use - * @param slabNumber The slab number to get information for - * - * @return Whether the slab is clean - **/ -bool getSummarizedCleanliness(SlabSummaryZone *summaryZone, - SlabCount slabNumber) - __attribute__((warn_unused_result)); - -/** - * Get the stored emptiness information for a single slab. - * - * @param summaryZone The SlabSummaryZone to use - * @param slabNumber The slab number to get information for - * - * @return An approximation to the free blocks in the slab - **/ -BlockCount getSummarizedFreeBlockCount(SlabSummaryZone *summaryZone, - SlabCount slabNumber) - __attribute__((warn_unused_result)); - -/** - * Get the stored RefCounts state information for a single slab. Used - * in testing only. - * - * @param [in] summaryZone The SlabSummaryZone to use - * @param [in] slabNumber The slab number to get information for - * @param [out] freeBlockHint The approximate number of free blocks - * @param [out] isClean Whether the slab is clean - **/ -void getSummarizedRefCountsState(SlabSummaryZone *summaryZone, - SlabCount slabNumber, - size_t *freeBlockHint, - bool *isClean); - -/** - * Get the stored slab statuses for all slabs in a zone. - * - * @param [in] summaryZone The SlabSummaryZone to use - * @param [in] slabCount The number of slabs to fetch - * @param [in,out] statuses An array of SlabStatuses to populate - **/ -void getSummarizedSlabStatuses(SlabSummaryZone *summaryZone, - SlabCount slabCount, - SlabStatus *statuses); - -/** - * Set the origin of the slab summary relative to the physical layer. - * - * @param summary The SlabSummary to update - * @param partition The slab summary partition - **/ -void setSlabSummaryOrigin(SlabSummary *summary, Partition *partition); - -/** - * Read in all the slab summary data from the slab summary partition, - * combine all the previously used zones into a single zone, and then - * write the combined summary back out to each possible zones' summary - * region. 
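The per-slab accessors declared above are typically consulted together when a slab is brought back online. The sketch below is illustrative only: the helper name and the decision logic are invented, the real consumers are the slab depot and scrubber (not part of this hunk), and logInfo() from the project's logger is assumed to be available.

static void describeSlabOnLoad(SlabSummaryZone *summaryZone,
                               SlabCount        slabNumber)
{
  TailBlockOffset tail       = getSummarizedTailBlockOffset(summaryZone, slabNumber);
  bool            clean      = getSummarizedCleanliness(summaryZone, slabNumber);
  BlockCount      freeBlocks = getSummarizedFreeBlockCount(summaryZone, slabNumber);

  if (!clean) {
    /* A dirty slab must be scrubbed; its slab journal is read starting from
     * the summarized tail block offset. */
    logInfo("slab %u is dirty, journal tail at offset %u",
            (unsigned int) slabNumber, (unsigned int) tail);
    return;
  }

  /* Clean slabs with free space can be used immediately on restart; the
   * count is only the approximation described above. */
  logInfo("slab %u is clean, about %llu free blocks",
          (unsigned int) slabNumber, (unsigned long long) freeBlocks);
}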
- * - * @param summary The summary to load - * @param operation The type of load to perform - * @param zonesToCombine The number of zones to be combined; if set to 0, - * all of the summary will be initialized as new. - * @param parent The parent of this operation - **/ -void loadSlabSummary(SlabSummary *summary, - AdminStateCode operation, - ZoneCount zonesToCombine, - VDOCompletion *parent); - -/** - * Fetch the cumulative statistics for all slab summary zones in a summary. - * - * @param summary The summary in question - * - * @return the cumulative slab summary statistics for the summary - **/ -SlabSummaryStatistics getSlabSummaryStatistics(const SlabSummary *summary) - __attribute__((warn_unused_result)); - -#endif // SLAB_SUMMARY_H diff --git a/vdo/base/slabSummaryInternals.h b/vdo/base/slabSummaryInternals.h deleted file mode 100644 index 8ac071c..0000000 --- a/vdo/base/slabSummaryInternals.h +++ /dev/null @@ -1,126 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/slabSummaryInternals.h#7 $ - */ - -#ifndef SLAB_SUMMARY_INTERNALS_H -#define SLAB_SUMMARY_INTERNALS_H - -#include "slabSummary.h" - -#include "adminState.h" -#include "atomic.h" - -typedef struct slabSummaryEntry { - /** Bits 7..0: The offset of the tail block within the slab journal */ - TailBlockOffset tailBlockOffset; - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - /** Bits 13..8: A hint about the fullness of the slab */ - unsigned int fullnessHint : 6; - /** Bit 14: Whether the refCounts must be loaded from the layer */ - unsigned int loadRefCounts : 1; - /** Bit 15: The believed cleanliness of this slab */ - unsigned int isDirty : 1; -#else - /** Bit 15: The believed cleanliness of this slab */ - unsigned int isDirty : 1; - /** Bit 14: Whether the refCounts must be loaded from the layer */ - unsigned int loadRefCounts : 1; - /** Bits 13..8: A hint about the fullness of the slab */ - unsigned int fullnessHint : 6; -#endif -} __attribute__((packed)) SlabSummaryEntry; - -typedef struct slabSummaryBlock { - /** The zone to which this block belongs */ - SlabSummaryZone *zone; - /** The index of this block in its zone's summary */ - BlockCount index; - /** Whether this block has a write outstanding */ - bool writing; - /** Ring of updates waiting on the outstanding write */ - WaitQueue currentUpdateWaiters; - /** Ring of updates waiting on the next write */ - WaitQueue nextUpdateWaiters; - /** The active SlabSummaryEntry array for this block */ - SlabSummaryEntry *entries; - /** The VIO used to write this block */ - VIO *vio; - /** The packed entries, one block long, backing the VIO */ - char *outgoingEntries; -} SlabSummaryBlock; - -/** - * The statistics for all the slab summary zones owned by this slab summary. 
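Since the whole design depends on each entry packing into two bytes, a small self-contained check is useful. The struct below mirrors the little-endian SlabSummaryEntry defined above for illustration only; it is not the original definition, and the example field values are arbitrary.

#include <stdint.h>
#include <stdio.h>

typedef struct {
  uint8_t tailBlockOffset;            /* bits 7..0  */
  unsigned int fullnessHint  : 6;     /* bits 13..8 */
  unsigned int loadRefCounts : 1;     /* bit 14     */
  unsigned int isDirty       : 1;     /* bit 15     */
} __attribute__((packed)) ExampleEntry;

_Static_assert(sizeof(ExampleEntry) == 2, "one slab costs two bytes on disk");

int main(void)
{
  ExampleEntry entry = {
    .tailBlockOffset = 3,
    .fullnessHint    = 42,
    .loadRefCounts   = 1,
    .isDirty         = 0,
  };
  const uint8_t *bytes = (const uint8_t *) &entry;
  printf("on-disk bytes: 0x%02x 0x%02x\n", bytes[0], bytes[1]);
  return 0;
}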
- * These fields are all mutated only by their physical zone threads, but are - * read by other threads when gathering statistics for the entire depot. - **/ -typedef struct atomicSlabSummaryStatistics { - /** Number of blocks written */ - Atomic64 blocksWritten; -} AtomicSlabSummaryStatistics; - -struct slabSummaryZone { - /** The summary of which this is a zone */ - SlabSummary *summary; - /** The number of this zone */ - ZoneCount zoneNumber; - /** Count of the number of blocks currently out for writing */ - BlockCount writeCount; - /** The state of this zone */ - AdminState state; - /** The array (owned by the blocks) of all entries */ - SlabSummaryEntry *entries; - /** The array of SlabSummaryEntryBlocks */ - SlabSummaryBlock summaryBlocks[]; -}; - -struct slabSummary { - /** The context for entering read-only mode */ - ReadOnlyNotifier *readOnlyNotifier; - /** The statistics for this slab summary */ - AtomicSlabSummaryStatistics statistics; - /** The start of the slab summary partition relative to the layer */ - PhysicalBlockNumber origin; - /** The number of bits to shift to get a 7-bit fullness hint */ - unsigned int hintShift; - /** The number of blocks (calculated based on MAX_SLABS) */ - BlockCount blocksPerZone; - /** The number of slabs per block (calculated from block size) */ - SlabCount entriesPerBlock; - /** The entries for all of the zones the partition can hold */ - SlabSummaryEntry *entries; - /** The number of zones which were active at the time of the last update */ - ZoneCount zonesToCombine; - /** The current number of active zones */ - ZoneCount zoneCount; - /** The currently active zones */ - SlabSummaryZone *zones[]; -}; - -/** - * Treating the current entries buffer as the on-disk value of all zones, - * update every zone to the correct values for every slab. - * - * @param summary The summary whose entries should be combined - **/ -void combineZones(SlabSummary *summary); - -#endif // SLAB_SUMMARY_INTERNALS_H diff --git a/vdo/base/statistics.h b/vdo/base/statistics.h deleted file mode 100644 index 2511076..0000000 --- a/vdo/base/statistics.h +++ /dev/null @@ -1,228 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - */ - -#ifndef STATISTICS_H -#define STATISTICS_H - -#include "header.h" -#include "types.h" - -enum { - STATISTICS_VERSION = 31, -}; - -typedef struct { - /** The total number of slabs from which blocks may be allocated */ - uint64_t slabCount; - /** The total number of slabs from which blocks have ever been allocated */ - uint64_t slabsOpened; - /** The number of times since loading that a slab has been re-opened */ - uint64_t slabsReopened; -} BlockAllocatorStatistics; - -/** - * Counters for tracking the number of items written (blocks, requests, etc.) - * that keep track of totals at steps in the write pipeline. 
Three counters - * allow the number of buffered, in-memory items and the number of in-flight, - * unacknowledged writes to be derived, while still tracking totals for - * reporting purposes - **/ -typedef struct { - /** The total number of items on which processing has started */ - uint64_t started; - /** The total number of items for which a write operation has been issued */ - uint64_t written; - /** The total number of items for which a write operation has completed */ - uint64_t committed; -} CommitStatistics; - -/** Counters for events in the recovery journal */ -typedef struct { - /** Number of times the on-disk journal was full */ - uint64_t diskFull; - /** Number of times the recovery journal requested slab journal commits. */ - uint64_t slabJournalCommitsRequested; - /** Write/Commit totals for individual journal entries */ - CommitStatistics entries; - /** Write/Commit totals for journal blocks */ - CommitStatistics blocks; -} RecoveryJournalStatistics; - -/** The statistics for the compressed block packer. */ -typedef struct { - /** Number of compressed data items written since startup */ - uint64_t compressedFragmentsWritten; - /** Number of blocks containing compressed items written since startup */ - uint64_t compressedBlocksWritten; - /** Number of VIOs that are pending in the packer */ - uint64_t compressedFragmentsInPacker; -} PackerStatistics; - -/** The statistics for the slab journals. */ -typedef struct { - /** Number of times the on-disk journal was full */ - uint64_t diskFullCount; - /** Number of times an entry was added over the flush threshold */ - uint64_t flushCount; - /** Number of times an entry was added over the block threshold */ - uint64_t blockedCount; - /** Number of times a tail block was written */ - uint64_t blocksWritten; - /** Number of times we had to wait for the tail to write */ - uint64_t tailBusyCount; -} SlabJournalStatistics; - -/** The statistics for the slab summary. */ -typedef struct { - /** Number of blocks written */ - uint64_t blocksWritten; -} SlabSummaryStatistics; - -/** The statistics for the reference counts. */ -typedef struct { - /** Number of reference blocks written */ - uint64_t blocksWritten; -} RefCountsStatistics; - -/** The statistics for the block map. 
*/ -typedef struct { - /** number of dirty (resident) pages */ - uint32_t dirtyPages; - /** number of clean (resident) pages */ - uint32_t cleanPages; - /** number of free pages */ - uint32_t freePages; - /** number of pages in failed state */ - uint32_t failedPages; - /** number of pages incoming */ - uint32_t incomingPages; - /** number of pages outgoing */ - uint32_t outgoingPages; - /** how many times free page not avail */ - uint32_t cachePressure; - /** number of getVDOPageAsync() for read */ - uint64_t readCount; - /** number or getVDOPageAsync() for write */ - uint64_t writeCount; - /** number of times pages failed to read */ - uint64_t failedReads; - /** number of times pages failed to write */ - uint64_t failedWrites; - /** number of gets that are reclaimed */ - uint64_t reclaimed; - /** number of gets for outgoing pages */ - uint64_t readOutgoing; - /** number of gets that were already there */ - uint64_t foundInCache; - /** number of gets requiring discard */ - uint64_t discardRequired; - /** number of gets enqueued for their page */ - uint64_t waitForPage; - /** number of gets that have to fetch */ - uint64_t fetchRequired; - /** number of page fetches */ - uint64_t pagesLoaded; - /** number of page saves */ - uint64_t pagesSaved; - /** the number of flushes issued */ - uint64_t flushCount; -} BlockMapStatistics; - -/** The dedupe statistics from hash locks */ -typedef struct { - /** Number of times the UDS advice proved correct */ - uint64_t dedupeAdviceValid; - /** Number of times the UDS advice proved incorrect */ - uint64_t dedupeAdviceStale; - /** Number of writes with the same data as another in-flight write */ - uint64_t concurrentDataMatches; - /** Number of writes whose hash collided with an in-flight write */ - uint64_t concurrentHashCollisions; -} HashLockStatistics; - -/** Counts of error conditions in VDO. */ -typedef struct { - /** number of times VDO got an invalid dedupe advice PBN from UDS */ - uint64_t invalidAdvicePBNCount; - /** number of times a VIO completed with a VDO_NO_SPACE error */ - uint64_t noSpaceErrorCount; - /** number of times a VIO completed with a VDO_READ_ONLY error */ - uint64_t readOnlyErrorCount; -} ErrorStatistics; - -/** The statistics of the vdo service. 
*/ -struct vdoStatistics { - uint32_t version; - uint32_t releaseVersion; - /** Number of blocks used for data */ - uint64_t dataBlocksUsed; - /** Number of blocks used for VDO metadata */ - uint64_t overheadBlocksUsed; - /** Number of logical blocks that are currently mapped to physical blocks */ - uint64_t logicalBlocksUsed; - /** number of physical blocks */ - BlockCount physicalBlocks; - /** number of logical blocks */ - BlockCount logicalBlocks; - /** Size of the block map page cache, in bytes */ - uint64_t blockMapCacheSize; - /** String describing the active write policy of the VDO */ - char writePolicy[15]; - /** The physical block size */ - uint64_t blockSize; - /** Number of times the VDO has successfully recovered */ - uint64_t completeRecoveries; - /** Number of times the VDO has recovered from read-only mode */ - uint64_t readOnlyRecoveries; - /** String describing the operating mode of the VDO */ - char mode[15]; - /** Whether the VDO is in recovery mode */ - bool inRecoveryMode; - /** What percentage of recovery mode work has been completed */ - uint8_t recoveryPercentage; - /** The statistics for the compressed block packer */ - PackerStatistics packer; - /** Counters for events in the block allocator */ - BlockAllocatorStatistics allocator; - /** Counters for events in the recovery journal */ - RecoveryJournalStatistics journal; - /** The statistics for the slab journals */ - SlabJournalStatistics slabJournal; - /** The statistics for the slab summary */ - SlabSummaryStatistics slabSummary; - /** The statistics for the reference counts */ - RefCountsStatistics refCounts; - /** The statistics for the block map */ - BlockMapStatistics blockMap; - /** The dedupe statistics from hash locks */ - HashLockStatistics hashLock; - /** Counts of error conditions */ - ErrorStatistics errors; -}; - -/** - * Get the proc file path for reading VDOStatistics. - * - * @return The proc file path - **/ -static inline const char *getVDOStatisticsProcFile(void) { - return "dedupe_stats"; -} - -#endif /* not STATISTICS_H */ diff --git a/vdo/base/statusCodes.c b/vdo/base/statusCodes.c deleted file mode 100644 index 40be3fd..0000000 --- a/vdo/base/statusCodes.c +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
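The three CommitStatistics counters in statistics.h above are cumulative precisely so that the transient quantities can be derived by subtraction, as the comment there notes. A hypothetical pair of helpers (illustrative only) makes the relationship explicit:

/* Items whose processing has started but whose write has not been issued:
 * still buffered in memory. */
static uint64_t getBufferedCount(const CommitStatistics *stats)
{
  return stats->started - stats->written;
}

/* Items whose write has been issued but not yet completed: in flight and
 * unacknowledged. */
static uint64_t getInFlightCount(const CommitStatistics *stats)
{
  return stats->written - stats->committed;
}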
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/statusCodes.c#3 $ - */ - -#include "statusCodes.h" - -#include "errors.h" -#include "permassert.h" -#include "threadOnce.h" - -const struct errorInfo vdoStatusList[] = { - { "VDO_NOT_IMPLEMENTED", "Not implemented" }, - { "VDO_OUT_OF_RANGE", "Out of range" }, - { "VDO_REF_COUNT_INVALID", "Reference count would become invalid" }, - { "VDO_NO_SPACE", "Out of space" }, - { "VDO_UNEXPECTED_EOF", "Unexpected EOF on block read" }, - { "VDO_BAD_CONFIGURATION", "Bad configuration option" }, - { "VDO_SOCKET_ERROR", "Socket error" }, - { "VDO_BAD_ALIGNMENT", "Mis-aligned block reference" }, - { "VDO_COMPONENT_BUSY", "Prior operation still in progress" }, - { "VDO_BAD_PAGE", "Corrupt or incorrect page" }, - { "VDO_UNSUPPORTED_VERSION", "Unsupported component version" }, - { "VDO_INCORRECT_COMPONENT", "Component id mismatch in decoder" }, - { "VDO_PARAMETER_MISMATCH", "Parameters have conflicting values" }, - { "VDO_BLOCK_SIZE_TOO_SMALL", "The block size is too small" }, - { "VDO_UNKNOWN_PARTITION", "No partition exists with a given id" }, - { "VDO_PARTITION_EXISTS", "A partition already exists with a given id"}, - { "VDO_NOT_READ_ONLY", "The device is not in read-only mode" }, - { "VDO_INCREMENT_TOO_SMALL", "Physical block growth of too few blocks" }, - { "VDO_CHECKSUM_MISMATCH", "Incorrect checksum" }, - { "VDO_RECOVERY_JOURNAL_FULL", "The recovery journal is full" }, - { "VDO_LOCK_ERROR", "A lock is held incorrectly" }, - { "VDO_READ_ONLY", "The device is in read-only mode" }, - { "VDO_SHUTTING_DOWN", "The device is shutting down" }, - { "VDO_CORRUPT_JOURNAL", "Recovery journal entries corrupted" }, - { "VDO_TOO_MANY_SLABS", "Exceeds maximum number of slabs supported" }, - { "VDO_INVALID_FRAGMENT", "Compressed block fragment is invalid" }, - { "VDO_RETRY_AFTER_REBUILD", "Retry operation after rebuilding finishes" }, - { "VDO_UNKNOWN_COMMAND", "The extended command is not known" }, - { "VDO_COMMAND_ERROR", "Bad extended command parameters" }, - { "VDO_CANNOT_DETERMINE_SIZE", "Cannot determine config sizes to fit" }, - { "VDO_BAD_MAPPING", "Invalid page mapping" }, - { "VDO_READ_CACHE_BUSY", "Read cache has no free slots" }, - { "VDO_BIO_CREATION_FAILED", "Bio creation failed" }, - { "VDO_BAD_MAGIC", "Bad magic number" }, - { "VDO_BAD_NONCE", "Bad nonce" }, - { "VDO_JOURNAL_OVERFLOW", "Journal sequence number overflow" }, - { "VDO_INVALID_ADMIN_STATE", "Invalid operation for current state" }, -}; - -#ifndef __KERNEL__ -static OnceState vdoStatusCodesRegistered = ONCE_STATE_INITIALIZER; -static int statusCodeRegistrationResult; - -/**********************************************************************/ -static void doStatusCodeRegistration(void) -{ - STATIC_ASSERT((VDO_STATUS_CODE_LAST - VDO_STATUS_CODE_BASE) - == COUNT_OF(vdoStatusList)); - - int result = registerErrorBlock("VDO Status", - VDO_STATUS_CODE_BASE, - VDO_STATUS_CODE_BLOCK_END, - vdoStatusList, - sizeof(vdoStatusList)); - /* - * The following test handles cases where libvdo is statically linked - * against both the test modules and the test driver (because multiple - * instances of this module call their own copy of this function - * once each, resulting in multiple calls to registerErrorBlock which - * is shared in libuds). - */ - if (result == UDS_DUPLICATE_NAME) { - result = UDS_SUCCESS; - } - - statusCodeRegistrationResult - = (result == UDS_SUCCESS) ? 
VDO_SUCCESS : result; -} -#endif - -/**********************************************************************/ -int registerStatusCodes(void) -{ -#ifdef __KERNEL__ - return VDO_SUCCESS; -#else - performOnce(&vdoStatusCodesRegistered, doStatusCodeRegistration); - return statusCodeRegistrationResult; -#endif -} diff --git a/vdo/base/statusCodes.h b/vdo/base/statusCodes.h deleted file mode 100644 index dd3a3ff..0000000 --- a/vdo/base/statusCodes.h +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/statusCodes.h#2 $ - */ - -#ifndef STATUS_CODES_H -#define STATUS_CODES_H - -#include "errors.h" - -enum { - UDS_BLOCK_SIZE = UDS_ERROR_CODE_BLOCK_END - UDS_ERROR_CODE_BASE, - VDO_BLOCK_START = UDS_ERROR_CODE_BLOCK_END, - VDO_BLOCK_END = VDO_BLOCK_START + UDS_BLOCK_SIZE, - PRP_BLOCK_START = VDO_BLOCK_END, - PRP_BLOCK_END = PRP_BLOCK_START + UDS_BLOCK_SIZE, -}; - -/** - * VDO-specific status codes. - **/ -enum vdoStatusCodes { - /** successful result */ - VDO_SUCCESS = 0, - /** base of all VDO errors */ - VDO_STATUS_CODE_BASE = VDO_BLOCK_START, - /** we haven't written this yet */ - VDO_NOT_IMPLEMENTED = VDO_STATUS_CODE_BASE, - /** input out of range */ - VDO_OUT_OF_RANGE, - /** an invalid reference count would result */ - VDO_REF_COUNT_INVALID, - /** a free block could not be allocated */ - VDO_NO_SPACE, - /** unexpected EOF on block read */ - VDO_UNEXPECTED_EOF, - /** improper or missing configuration option */ - VDO_BAD_CONFIGURATION, - /** socket opening or binding problem */ - VDO_SOCKET_ERROR, - /** read or write on non-aligned offset */ - VDO_BAD_ALIGNMENT, - /** prior operation still in progress */ - VDO_COMPONENT_BUSY, - /** page contents incorrect or corrupt data */ - VDO_BAD_PAGE, - /** unsupported version of some component */ - VDO_UNSUPPORTED_VERSION, - /** component id mismatch in decoder */ - VDO_INCORRECT_COMPONENT, - /** parameters have conflicting values */ - VDO_PARAMETER_MISMATCH, - /** the block size is too small */ - VDO_BLOCK_SIZE_TOO_SMALL, - /** no partition exists with a given id */ - VDO_UNKNOWN_PARTITION, - /** a partition already exists with a given id */ - VDO_PARTITION_EXISTS, - /** the VDO is not in read-only mode */ - VDO_NOT_READ_ONLY, - /** physical block growth of too few blocks */ - VDO_INCREMENT_TOO_SMALL, - /** incorrect checksum */ - VDO_CHECKSUM_MISMATCH, - /** the recovery journal is full */ - VDO_RECOVERY_JOURNAL_FULL, - /** a lock is held incorrectly */ - VDO_LOCK_ERROR, - /** the VDO is in read-only mode */ - VDO_READ_ONLY, - /** the VDO is shutting down */ - VDO_SHUTTING_DOWN, - /** the recovery journal has corrupt entries */ - VDO_CORRUPT_JOURNAL, - /** exceeds maximum number of slabs supported */ - VDO_TOO_MANY_SLABS, - /** a compressed block fragment is 
invalid */ - VDO_INVALID_FRAGMENT, - /** action is unsupported while rebuilding */ - VDO_RETRY_AFTER_REBUILD, - /** the extended command is not known */ - VDO_UNKNOWN_COMMAND, - /** bad extended command parameters */ - VDO_COMMAND_ERROR, - /** cannot determine sizes to fit */ - VDO_CANNOT_DETERMINE_SIZE, - /** a block map entry is invalid */ - VDO_BAD_MAPPING, - /** read cache has no free slots */ - VDO_READ_CACHE_BUSY, - /** bio_add_page failed */ - VDO_BIO_CREATION_FAILED, - /** bad magic number */ - VDO_BAD_MAGIC, - /** bad nonce */ - VDO_BAD_NONCE, - /** sequence number overflow */ - VDO_JOURNAL_OVERFLOW, - /** the VDO is not in a state to perform an admin operation */ - VDO_INVALID_ADMIN_STATE, - /** one more than last error code */ - VDO_STATUS_CODE_LAST, - VDO_STATUS_CODE_BLOCK_END = VDO_BLOCK_END -}; - -extern const struct errorInfo vdoStatusList[]; - -/** - * Register the VDO status codes if needed. - * - * @return a success or error code - **/ -int registerStatusCodes(void); - -#endif // STATUS_CODES_H diff --git a/vdo/base/superBlock.c b/vdo/base/superBlock.c deleted file mode 100644 index a7376e9..0000000 --- a/vdo/base/superBlock.c +++ /dev/null @@ -1,441 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/superBlock.c#5 $ - */ - -#include "superBlock.h" - -#include "buffer.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" - -#include "completion.h" -#include "constants.h" -#include "header.h" -#include "releaseVersions.h" -#include "statusCodes.h" -#include "types.h" -#include "vio.h" - -struct superBlock { - /** The parent for asynchronous load and save operations */ - VDOCompletion *parent; - /** The VIO for reading and writing the super block to disk */ - VIO *vio; - /** The buffer for encoding and decoding component data */ - Buffer *componentBuffer; - /** - * A sector-sized buffer wrapping the first sector of encodedSuperBlock, for - * encoding and decoding the entire super block. - **/ - Buffer *blockBuffer; - /** A 1-block buffer holding the encoded on-disk super block */ - byte *encodedSuperBlock; - /** The release version number loaded from the volume */ - ReleaseVersionNumber loadedReleaseVersion; - /** Whether this super block may not be written */ - bool unwriteable; -}; - -enum { - SUPER_BLOCK_FIXED_SIZE - = ENCODED_HEADER_SIZE + sizeof(ReleaseVersionNumber) + CHECKSUM_SIZE, - MAX_COMPONENT_DATA_SIZE = VDO_SECTOR_SIZE - SUPER_BLOCK_FIXED_SIZE, -}; - -static const Header SUPER_BLOCK_HEADER_12_0 = { - .id = SUPER_BLOCK, - .version = { - .majorVersion = 12, - .minorVersion = 0, - }, - - // This is the minimum size, if the super block contains no components. - .size = SUPER_BLOCK_FIXED_SIZE - ENCODED_HEADER_SIZE, -}; - -/** - * Allocate a super block. 
Callers must free the allocated super block even - * on error. - * - * @param layer The physical layer which holds the super block on disk - * @param superBlockPtr A pointer to hold the new super block - * - * @return VDO_SUCCESS or an error - **/ -__attribute__((warn_unused_result)) -static int allocateSuperBlock(PhysicalLayer *layer, SuperBlock **superBlockPtr) -{ - int result = ALLOCATE(1, SuperBlock, __func__, superBlockPtr); - if (result != UDS_SUCCESS) { - return result; - } - - SuperBlock *superBlock = *superBlockPtr; - result = makeBuffer(MAX_COMPONENT_DATA_SIZE, &superBlock->componentBuffer); - if (result != UDS_SUCCESS) { - return result; - } - - result = layer->allocateIOBuffer(layer, VDO_BLOCK_SIZE, - "encoded super block", - (char **) &superBlock->encodedSuperBlock); - if (result != UDS_SUCCESS) { - return result; - } - - // Even though the buffer is a full block, to avoid the potential corruption - // from a torn write, the entire encoding must fit in the first sector. - result = wrapBuffer(superBlock->encodedSuperBlock, VDO_SECTOR_SIZE, 0, - &superBlock->blockBuffer); - if (result != UDS_SUCCESS) { - return result; - } - - if (layer->createMetadataVIO == NULL) { - return VDO_SUCCESS; - } - - return createVIO(layer, VIO_TYPE_SUPER_BLOCK, VIO_PRIORITY_METADATA, - superBlock, (char *) superBlock->encodedSuperBlock, - &superBlock->vio); -} - -/**********************************************************************/ -int makeSuperBlock(PhysicalLayer *layer, SuperBlock **superBlockPtr) -{ - SuperBlock *superBlock; - int result = allocateSuperBlock(layer, &superBlock); - if (result != VDO_SUCCESS) { - freeSuperBlock(&superBlock); - return result; - } - - // For a new super block, use the current release. - superBlock->loadedReleaseVersion = CURRENT_RELEASE_VERSION_NUMBER; - *superBlockPtr = superBlock; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freeSuperBlock(SuperBlock **superBlockPtr) -{ - if (*superBlockPtr == NULL) { - return; - } - - SuperBlock *superBlock = *superBlockPtr; - freeBuffer(&superBlock->blockBuffer); - freeBuffer(&superBlock->componentBuffer); - freeVIO(&superBlock->vio); - FREE(superBlock->encodedSuperBlock); - FREE(superBlock); - *superBlockPtr = NULL; -} - -/** - * Encode a super block into its on-disk representation. - * - * @param layer The physical layer which implements the checksum - * @param superBlock The super block to encode - * - * @return VDO_SUCCESS or an error - **/ -__attribute__((warn_unused_result)) -static int encodeSuperBlock(PhysicalLayer *layer, SuperBlock *superBlock) -{ - Buffer *buffer = superBlock->blockBuffer; - int result = resetBufferEnd(buffer, 0); - if (result != VDO_SUCCESS) { - return result; - } - - size_t componentDataSize = contentLength(superBlock->componentBuffer); - - // Encode the header. - Header header = SUPER_BLOCK_HEADER_12_0; - header.size += componentDataSize; - result = encodeHeader(&header, buffer); - if (result != UDS_SUCCESS) { - return result; - } - - // Encode the loaded release version. - result = putUInt32LEIntoBuffer(buffer, superBlock->loadedReleaseVersion); - if (result != UDS_SUCCESS) { - return result; - } - - // Copy the already-encoded component data. - result = putBytes(buffer, componentDataSize, - getBufferContents(superBlock->componentBuffer)); - if (result != UDS_SUCCESS) { - return result; - } - - // Compute and encode the checksum. 
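  // The checksum covers everything encoded so far (the header, the loaded
  // release version, and the copied component data), which is the first
  // contentLength(buffer) bytes of encodedSuperBlock.  It is then appended
  // as a little-endian 32-bit value, and decodeSuperBlock() below verifies
  // the contents by recomputing the CRC over the same span and comparing it
  // against the saved value.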
- CRC32Checksum checksum = layer->updateCRC32(INITIAL_CHECKSUM, - superBlock->encodedSuperBlock, - contentLength(buffer)); - result = putUInt32LEIntoBuffer(buffer, checksum); - if (result != UDS_SUCCESS) { - return result; - } - - return UDS_SUCCESS; -} - -/**********************************************************************/ -int saveSuperBlock(PhysicalLayer *layer, - SuperBlock *superBlock, - PhysicalBlockNumber superBlockOffset) -{ - int result = encodeSuperBlock(layer, superBlock); - if (result != VDO_SUCCESS) { - return result; - } - - return layer->writer(layer, superBlockOffset, 1, - (char *) superBlock->encodedSuperBlock, NULL); -} - -/** - * Finish the parent of a super block load or save operation. This - * callback is registered in saveSuperBlockAsync() and loadSuperBlockAsync. - * - * @param completion The super block VIO - **/ -static void finishSuperBlockParent(VDOCompletion *completion) -{ - SuperBlock *superBlock = completion->parent; - VDOCompletion *parent = superBlock->parent; - superBlock->parent = NULL; - finishCompletion(parent, completion->result); -} - -/** - * Log a super block save error. This error handler is registered in - * saveSuperBlockAsync(). - * - * @param completion The super block VIO - **/ -static void handleSaveError(VDOCompletion *completion) -{ - logErrorWithStringError(completion->result, "super block save failed"); - /* - * Mark the super block as unwritable so that we won't attempt to write it - * again. This avoids the case where a growth attempt fails writing the - * super block with the new size, but the subsequent attempt to write out - * the read-only state succeeds. In this case, writes which happened just - * before the suspend would not be visible if the VDO is restarted without - * rebuilding, but, after a read-only rebuild, the effects of those writes - * would reappear. - */ - ((SuperBlock *) completion->parent)->unwriteable = true; - completion->callback(completion); -} - -/**********************************************************************/ -void saveSuperBlockAsync(SuperBlock *superBlock, - PhysicalBlockNumber superBlockOffset, - VDOCompletion *parent) -{ - if (superBlock->unwriteable) { - finishCompletion(parent, VDO_READ_ONLY); - return; - } - - if (superBlock->parent != NULL) { - finishCompletion(parent, VDO_COMPONENT_BUSY); - return; - } - - PhysicalLayer *layer = parent->layer; - int result = encodeSuperBlock(layer, superBlock); - if (result != VDO_SUCCESS) { - finishCompletion(parent, result); - return; - } - - superBlock->parent = parent; - superBlock->vio->completion.callbackThreadID = parent->callbackThreadID; - launchWriteMetadataVIOWithFlush(superBlock->vio, superBlockOffset, - finishSuperBlockParent, handleSaveError, - true, true); -} - -/** - * Decode a super block from its on-disk representation. - * - * @param layer The physical layer which implements the checksum - * @param superBlock The super block to decode - * - * @return VDO_SUCCESS or an error - **/ -__attribute__((warn_unused_result)) -static int decodeSuperBlock(PhysicalLayer *layer, SuperBlock *superBlock) -{ - // Reset the block buffer to start decoding the entire first sector. - Buffer *buffer = superBlock->blockBuffer; - clearBuffer(buffer); - - // Decode and validate the header. 
- Header header; - int result = decodeHeader(buffer, &header); - if (result != VDO_SUCCESS) { - return result; - } - - result = validateHeader(&SUPER_BLOCK_HEADER_12_0, &header, false, __func__); - if (result != VDO_SUCCESS) { - return result; - } - - if (header.size > contentLength(buffer)) { - // We can't check release version or checksum until we know the content - // size, so we have to assume a version mismatch on unexpected values. - return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, - "super block contents too large: %zu", - header.size); - } - - // Restrict the buffer to the actual payload bytes that remain. - result = resetBufferEnd(buffer, uncompactedAmount(buffer) + header.size); - if (result != VDO_SUCCESS) { - return result; - } - - // Decode and store the release version number. It will be checked when the - // VDO master version is decoded and validated. - result = getUInt32LEFromBuffer(buffer, &superBlock->loadedReleaseVersion); - if (result != VDO_SUCCESS) { - return result; - } - - // The component data is all the rest, except for the checksum. - size_t componentDataSize = contentLength(buffer) - sizeof(CRC32Checksum); - result = putBuffer(superBlock->componentBuffer, buffer, componentDataSize); - if (result != VDO_SUCCESS) { - return result; - } - - // Checksum everything up to but not including the saved checksum itself. - CRC32Checksum checksum = layer->updateCRC32(INITIAL_CHECKSUM, - superBlock->encodedSuperBlock, - uncompactedAmount(buffer)); - - // Decode and verify the saved checksum. - CRC32Checksum savedChecksum; - result = getUInt32LEFromBuffer(buffer, &savedChecksum); - if (result != VDO_SUCCESS) { - return result; - } - - result = ASSERT(contentLength(buffer) == 0, - "must have decoded entire superblock payload"); - if (result != VDO_SUCCESS) { - return result; - } - - return ((checksum != savedChecksum) ? VDO_CHECKSUM_MISMATCH : VDO_SUCCESS); -} - -/**********************************************************************/ -int loadSuperBlock(PhysicalLayer *layer, - PhysicalBlockNumber superBlockOffset, - SuperBlock **superBlockPtr) -{ - SuperBlock *superBlock = NULL; - int result = allocateSuperBlock(layer, &superBlock); - if (result != VDO_SUCCESS) { - freeSuperBlock(&superBlock); - return result; - } - - result = layer->reader(layer, superBlockOffset, 1, - (char *) superBlock->encodedSuperBlock, NULL); - if (result != VDO_SUCCESS) { - freeSuperBlock(&superBlock); - return result; - } - - result = decodeSuperBlock(layer, superBlock); - if (result != VDO_SUCCESS) { - freeSuperBlock(&superBlock); - return result; - } - - *superBlockPtr = superBlock; - return result; -} - -/** - * Continue after loading the super block. This callback is registered - * in loadSuperBlockAsync(). 
- * - * @param completion The super block VIO - **/ -static void finishReadingSuperBlock(VDOCompletion *completion) -{ - SuperBlock *superBlock = completion->parent; - VDOCompletion *parent = superBlock->parent; - superBlock->parent = NULL; - finishCompletion(parent, decodeSuperBlock(completion->layer, superBlock)); -} - -/**********************************************************************/ -void loadSuperBlockAsync(VDOCompletion *parent, - PhysicalBlockNumber superBlockOffset, - SuperBlock **superBlockPtr) -{ - PhysicalLayer *layer = parent->layer; - SuperBlock *superBlock = NULL; - int result = allocateSuperBlock(layer, &superBlock); - if (result != VDO_SUCCESS) { - freeSuperBlock(&superBlock); - finishCompletion(parent, result); - return; - } - - *superBlockPtr = superBlock; - - superBlock->parent = parent; - superBlock->vio->completion.callbackThreadID = parent->callbackThreadID; - launchReadMetadataVIO(superBlock->vio, superBlockOffset, - finishReadingSuperBlock, finishSuperBlockParent); -} - -/**********************************************************************/ -Buffer *getComponentBuffer(SuperBlock *superBlock) -{ - return superBlock->componentBuffer; -} - -/**********************************************************************/ -ReleaseVersionNumber getLoadedReleaseVersion(const SuperBlock *superBlock) -{ - return superBlock->loadedReleaseVersion; -} - -/**********************************************************************/ -size_t getFixedSuperBlockSize(void) -{ - return SUPER_BLOCK_FIXED_SIZE; -} diff --git a/vdo/base/superBlock.h b/vdo/base/superBlock.h deleted file mode 100644 index bfed7c6..0000000 --- a/vdo/base/superBlock.h +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/superBlock.h#2 $ - */ - -#ifndef SUPER_BLOCK_H -#define SUPER_BLOCK_H - -#include "buffer.h" - -#include "completion.h" -#include "types.h" - -typedef struct superBlock SuperBlock; - -/** - * Make a new super block. - * - * @param [in] layer The layer on which to write this super block - * @param [out] superBlockPtr A pointer to hold the new super block - * - * @return VDO_SUCCESS or an error - **/ -int makeSuperBlock(PhysicalLayer *layer, SuperBlock **superBlockPtr) - __attribute__((warn_unused_result)); - -/** - * Free a super block and null out the reference to it. - * - * @param superBlockPtr the reference to the super block to free - **/ -void freeSuperBlock(SuperBlock **superBlockPtr); - -/** - * Save a super block. 
- * - * @param layer The physical layer on which to save the super block - * @param superBlock The super block to save - * @param superBlockOffset The location of the super block - * - * @return VDO_SUCCESS or an error - **/ -int saveSuperBlock(PhysicalLayer *layer, - SuperBlock *superBlock, - PhysicalBlockNumber superBlockOffset) - __attribute__((warn_unused_result)); - -/** - * Save a super block asynchronously. - * - * @param superBlock The super block to save - * @param superBlockOffset The location at which to write the super block - * @param parent The object to notify when the save is complete - **/ -void saveSuperBlockAsync(SuperBlock *superBlock, - PhysicalBlockNumber superBlockOffset, - VDOCompletion *parent); - -/** - * Allocate a super block and read its contents from storage. - * - * @param [in] layer The layer from which to load the super block - * @param [in] superBlockOffset The location from which to read the super - * block - * @param [out] superBlockPtr A pointer to hold the loaded super block - * - * @return VDO_SUCCESS or an error - **/ -int loadSuperBlock(PhysicalLayer *layer, - PhysicalBlockNumber superBlockOffset, - SuperBlock **superBlockPtr) - __attribute__((warn_unused_result)); - -/** - * Allocate a super block and read its contents from storage asynchronously. If - * a load error occurs before the super block's own completion can be allocated, - * the parent will be finished with the error. - * - * @param [in] parent The completion to finish after loading the - * super block - * @param [in] superBlockOffset The location from which to read the super - * block - * @param [out] superBlockPtr A pointer to hold the super block - **/ -void loadSuperBlockAsync(VDOCompletion *parent, - PhysicalBlockNumber superBlockOffset, - SuperBlock **superBlockPtr); - -/** - * Get a buffer which contains the component data from a super block. - * - * @param superBlock The super block from which to get the component data - * - * @return the component data in a buffer - **/ -Buffer *getComponentBuffer(SuperBlock *superBlock) - __attribute__((warn_unused_result)); - -/** - * Get the release version number that was loaded from the volume when the - * SuperBlock was decoded. - * - * @param superBlock The super block to query - * - * @return the release version number that was decoded from the volume - **/ -ReleaseVersionNumber getLoadedReleaseVersion(const SuperBlock *superBlock) - __attribute__((warn_unused_result)); - -/** - * Get the encoded size of the fixed (non-component data) portion of a super - * block (this is for unit testing). - * - * @return The encoded size of the fixed portion of the super block - **/ -size_t getFixedSuperBlockSize(void) - __attribute__((warn_unused_result)); - -#endif /* SUPER_BLOCK_H */ diff --git a/vdo/base/threadConfig.c b/vdo/base/threadConfig.c deleted file mode 100644 index b671b73..0000000 --- a/vdo/base/threadConfig.c +++ /dev/null @@ -1,268 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/threadConfig.c#2 $ - */ - -#include "threadConfig.h" - -#include "logger.h" -#include "memoryAlloc.h" - -#include "constants.h" -#include "types.h" - -/**********************************************************************/ -static int allocateThreadConfig(ZoneCount logicalZoneCount, - ZoneCount physicalZoneCount, - ZoneCount hashZoneCount, - ZoneCount baseThreadCount, - ThreadConfig **configPtr) -{ - ThreadConfig *config; - int result = ALLOCATE(1, ThreadConfig, "thread config", &config); - if (result != VDO_SUCCESS) { - return result; - } - - result = ALLOCATE(logicalZoneCount, ThreadID, "logical thread array", - &config->logicalThreads); - if (result != VDO_SUCCESS) { - freeThreadConfig(&config); - return result; - } - - result = ALLOCATE(physicalZoneCount, ThreadID, "physical thread array", - &config->physicalThreads); - if (result != VDO_SUCCESS) { - freeThreadConfig(&config); - return result; - } - - result = ALLOCATE(hashZoneCount, ThreadID, "hash thread array", - &config->hashZoneThreads); - if (result != VDO_SUCCESS) { - freeThreadConfig(&config); - return result; - } - - config->logicalZoneCount = logicalZoneCount; - config->physicalZoneCount = physicalZoneCount; - config->hashZoneCount = hashZoneCount; - config->baseThreadCount = baseThreadCount; - - *configPtr = config; - return VDO_SUCCESS; -} - -/**********************************************************************/ -static void assignThreadIDs(ThreadID threadIDs[], - ZoneCount count, - ThreadID *idPtr) -{ - for (ZoneCount zone = 0; zone < count; zone++) { - threadIDs[zone] = (*idPtr)++; - } -} - -/**********************************************************************/ -int makeThreadConfig(ZoneCount logicalZoneCount, - ZoneCount physicalZoneCount, - ZoneCount hashZoneCount, - ThreadConfig **configPtr) -{ - if ((logicalZoneCount == 0) - && (physicalZoneCount == 0) - && (hashZoneCount == 0)) { - return makeOneThreadConfig(configPtr); - } - - if (physicalZoneCount > MAX_PHYSICAL_ZONES) { - return logErrorWithStringError(VDO_BAD_CONFIGURATION, - "Physical zone count %u exceeds maximum " - "(%u)", - physicalZoneCount, MAX_PHYSICAL_ZONES); - } - - if (logicalZoneCount > MAX_LOGICAL_ZONES) { - return logErrorWithStringError(VDO_BAD_CONFIGURATION, - "Logical zone count %u exceeds maximum " - "(%u)", - logicalZoneCount, MAX_LOGICAL_ZONES); - } - - ThreadConfig *config; - ThreadCount total = logicalZoneCount + physicalZoneCount + hashZoneCount + 2; - int result = allocateThreadConfig(logicalZoneCount, physicalZoneCount, - hashZoneCount, total, &config); - if (result != VDO_SUCCESS) { - return result; - } - - ThreadID id = 0; - config->adminThread = id; - config->journalThread = id++; - config->packerThread = id++; - assignThreadIDs(config->logicalThreads, logicalZoneCount, &id); - assignThreadIDs(config->physicalThreads, physicalZoneCount, &id); - assignThreadIDs(config->hashZoneThreads, hashZoneCount, &id); - - ASSERT_LOG_ONLY(id == total, "correct number of thread IDs assigned"); - - *configPtr = config; - return VDO_SUCCESS; -} - -/**********************************************************************/ -int makeZeroThreadConfig(ThreadConfig **configPtr) -{ - ThreadConfig *config; - int result = ALLOCATE(1, ThreadConfig, __func__, &config); - if (result 
!= VDO_SUCCESS) { - return result; - } - - config->logicalZoneCount = 0; - config->physicalZoneCount = 0; - config->hashZoneCount = 0; - config->baseThreadCount = 0; - *configPtr = config; - return VDO_SUCCESS; -} - -/**********************************************************************/ -int makeOneThreadConfig(ThreadConfig **configPtr) -{ - ThreadConfig *config; - int result = allocateThreadConfig(1, 1, 1, 1, &config); - if (result != VDO_SUCCESS) { - return result; - } - - config->logicalThreads[0] = 0; - config->physicalThreads[0] = 0; - config->hashZoneThreads[0] = 0; - *configPtr = config; - return VDO_SUCCESS; -} - -/**********************************************************************/ -int copyThreadConfig(const ThreadConfig *oldConfig, ThreadConfig **configPtr) -{ - ThreadConfig *config; - int result = allocateThreadConfig(oldConfig->logicalZoneCount, - oldConfig->physicalZoneCount, - oldConfig->hashZoneCount, - oldConfig->baseThreadCount, - &config); - if (result != VDO_SUCCESS) { - return result; - } - - config->adminThread = oldConfig->adminThread; - config->journalThread = oldConfig->journalThread; - config->packerThread = oldConfig->packerThread; - for (ZoneCount i = 0; i < config->logicalZoneCount; i++) { - config->logicalThreads[i] = oldConfig->logicalThreads[i]; - } - for (ZoneCount i = 0; i < config->physicalZoneCount; i++) { - config->physicalThreads[i] = oldConfig->physicalThreads[i]; - } - for (ZoneCount i = 0; i < config->hashZoneCount; i++) { - config->hashZoneThreads[i] = oldConfig->hashZoneThreads[i]; - } - - *configPtr = config; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freeThreadConfig(ThreadConfig **configPtr) -{ - if (*configPtr == NULL) { - return; - } - - ThreadConfig *config = *configPtr; - *configPtr = NULL; - - FREE(config->logicalThreads); - FREE(config->physicalThreads); - FREE(config->hashZoneThreads); - FREE(config); -} - -/**********************************************************************/ -static bool getZoneThreadName(const ThreadID threadIDs[], - ZoneCount count, - ThreadID id, - const char *prefix, - char *buffer, - size_t bufferLength) -{ - if (id >= threadIDs[0]) { - ThreadID index = id - threadIDs[0]; - if (index < count) { - snprintf(buffer, bufferLength, "%s%d", prefix, index); - return true; - } - } - return false; -} - -/**********************************************************************/ -void getVDOThreadName(const ThreadConfig *threadConfig, - ThreadID threadID, - char *buffer, - size_t bufferLength) -{ - if (threadConfig->baseThreadCount == 1) { - // Historically this was the "request queue" thread. - snprintf(buffer, bufferLength, "reqQ"); - return; - } - if (threadID == threadConfig->journalThread) { - snprintf(buffer, bufferLength, "journalQ"); - return; - } else if (threadID == threadConfig->adminThread) { - // Theoretically this could be different from the journal thread. 
- snprintf(buffer, bufferLength, "adminQ"); - return; - } else if (threadID == threadConfig->packerThread) { - snprintf(buffer, bufferLength, "packerQ"); - return; - } - if (getZoneThreadName(threadConfig->logicalThreads, - threadConfig->logicalZoneCount, - threadID, "logQ", buffer, bufferLength)) { - return; - } - if (getZoneThreadName(threadConfig->physicalThreads, - threadConfig->physicalZoneCount, - threadID, "physQ", buffer, bufferLength)) { - return; - } - if (getZoneThreadName(threadConfig->hashZoneThreads, - threadConfig->hashZoneCount, - threadID, "hashQ", buffer, bufferLength)) { - return; - } - - // Some sort of misconfiguration? - snprintf(buffer, bufferLength, "reqQ%d", threadID); -} diff --git a/vdo/base/threadConfig.h b/vdo/base/threadConfig.h deleted file mode 100644 index 6401651..0000000 --- a/vdo/base/threadConfig.h +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/threadConfig.h#1 $ - */ - -#ifndef THREAD_CONFIG_H -#define THREAD_CONFIG_H - -#include "permassert.h" - -#include "types.h" - -struct threadConfig { - ZoneCount logicalZoneCount; - ZoneCount physicalZoneCount; - ZoneCount hashZoneCount; - ThreadCount baseThreadCount; - ThreadID adminThread; - ThreadID journalThread; - ThreadID packerThread; - ThreadID *logicalThreads; - ThreadID *physicalThreads; - ThreadID *hashZoneThreads; -}; - -/** - * Make a thread configuration. If both the logical zone count and the - * physical zone count are set to 0, a one thread configuration will be - * made. - * - * @param [in] logicalZoneCount The number of logical zones - * @param [in] physicalZoneCount The number of physical zones - * @param [in] hashZoneCount The number of hash zones - * @param [out] configPtr A pointer to hold the new thread - * configuration - * - * @return VDO_SUCCESS or an error - **/ -int makeThreadConfig(ZoneCount logicalZoneCount, - ZoneCount physicalZoneCount, - ZoneCount hashZoneCount, - ThreadConfig **configPtr) - __attribute__((warn_unused_result)); - -/** - * Make a thread configuration that uses no threads. This is the configuration - * for VDOs which are constructed from user mode that have only a synchronous - * layer. - * - * @param [out] configPtr A pointer to hold the new thread configuration - * - * @return VDO_SUCCESS or an error - **/ -int makeZeroThreadConfig(ThreadConfig **configPtr); - -/** - * Make a thread configuration that uses only one thread. - * - * @param [out] configPtr A pointer to hold the new thread configuration - * - * @return VDO_SUCCESS or an error - **/ -int makeOneThreadConfig(ThreadConfig **configPtr) - __attribute__((warn_unused_result)); - -/** - * Make a new thread config which is a copy of an existing one. 
- * - * @param [in] oldConfig The thread configuration to copy - * @param [out] configPtr A pointer to hold the new thread configuration - * - * @return VDO_SUCCESS or an error - **/ -int copyThreadConfig(const ThreadConfig *oldConfig, ThreadConfig **configPtr) - __attribute__((warn_unused_result)); - -/** - * Destroy a thread configuration and null out the reference to it. - * - * @param configPtr The reference to the thread configuration to destroy - **/ -void freeThreadConfig(ThreadConfig **configPtr); - -/** - * Get the thread id for a given logical zone. - * - * @param threadConfig the thread config - * @param logicalZone the number of the logical zone - * - * @return the thread id for the given zone - **/ -__attribute__((warn_unused_result)) -static inline ThreadID getLogicalZoneThread(const ThreadConfig *threadConfig, - ZoneCount logicalZone) -{ - ASSERT_LOG_ONLY((logicalZone <= threadConfig->logicalZoneCount), - "logical zone valid"); - return threadConfig->logicalThreads[logicalZone]; -} - -/** - * Get the thread id for a given physical zone. - * - * @param threadConfig the thread config - * @param physicalZone the number of the physical zone - * - * @return the thread id for the given zone - **/ -__attribute__((warn_unused_result)) -static inline ThreadID getPhysicalZoneThread(const ThreadConfig *threadConfig, - ZoneCount physicalZone) -{ - ASSERT_LOG_ONLY((physicalZone <= threadConfig->physicalZoneCount), - "physical zone valid"); - return threadConfig->physicalThreads[physicalZone]; -} - -/** - * Get the thread id for a given hash zone. - * - * @param threadConfig the thread config - * @param hashZone the number of the hash zone - * - * @return the thread id for the given zone - **/ -__attribute__((warn_unused_result)) -static inline ThreadID getHashZoneThread(const ThreadConfig *threadConfig, - ZoneCount hashZone) -{ - ASSERT_LOG_ONLY((hashZone <= threadConfig->hashZoneCount), - "hash zone valid"); - return threadConfig->hashZoneThreads[hashZone]; -} - -/** - * Get the thread id for the journal zone. - * - * @param threadConfig the thread config - * - * @return the thread id for the journal zone - **/ -__attribute__((warn_unused_result)) -static inline ThreadID getJournalZoneThread(const ThreadConfig *threadConfig) -{ - return threadConfig->journalThread; -} - -/** - * Get the thread id for the packer zone. - * - * @param threadConfig the thread config - * - * @return the thread id for the packer zone - **/ -__attribute__((warn_unused_result)) -static inline ThreadID getPackerZoneThread(const ThreadConfig *threadConfig) -{ - return threadConfig->packerThread; -} - -/** - * Get the thread ID for admin requests. - * - * @param threadConfig The thread config - * - * @return the thread id to use for admin requests - **/ -__attribute__((warn_unused_result)) -static inline ThreadID getAdminThread(const ThreadConfig *threadConfig) -{ - return threadConfig->adminThread; -} - -/** - * Format the name of the worker thread desired to support a given - * work queue. The physical layer may add a prefix identifying the - * product; the output from this function should just identify the - * thread. 
- * - * @param threadConfig The thread configuration - * @param threadID The thread id - * @param buffer Where to put the formatted name - * @param bufferLength Size of the output buffer - **/ -void getVDOThreadName(const ThreadConfig *threadConfig, - ThreadID threadID, - char *buffer, - size_t bufferLength); - -#endif /* THREAD_CONFIG_H */ diff --git a/vdo/base/trace.c b/vdo/base/trace.c deleted file mode 100644 index 7b4e33f..0000000 --- a/vdo/base/trace.c +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/trace.c#1 $ - */ - -#include "trace.h" - -#include "logger.h" -#include "stringUtils.h" -#include "timeUtils.h" - -TRACE_LOCATION_SECTION TraceLocationRecord baseTraceLocation[] = { - { - .function = "", - .line = 0, - }, -}; - -/**********************************************************************/ -void addTraceRecord(Trace *trace, TraceLocation location) -{ - if (trace->used < NUM_TRACE_RECORDS) { - TraceRecord *record = &trace->records[trace->used]; - trace->used++; - - record->when = nowUsec(); - record->tid = getThreadId(); - record->location = location - baseTraceLocation; - } -} - -/* - * The record display format used is a comma-separated list, each item - * containing: optional function name; "@" + timestamp with seconds - * and microseconds for the first record; if not the first record, "+" - * and offset in microseconds from previous timestamp. - * - * If the buffer's too small, it'll end with an ellipsis. - */ -void formatTrace(Trace *trace, - char *buffer, - size_t bufferLength, - size_t *msgLen) -{ - if (trace == NULL) { - return; - } - memset(buffer, 0, bufferLength); - char *buf = buffer; - char *bufferEnd = buffer + bufferLength - 1; - if (trace->used > 0) { - TraceRecord *record = &trace->records[0]; - TraceLocationRecord *location = baseTraceLocation + record->location; - snprintf(buf, bufferEnd - buf, "Trace[%s@%llu.%06llu", - location->function, record->when / 1000000, - record->when % 1000000); - buf += strlen(buf); - - for (unsigned int i = 1; i < trace->used; i++) { - TraceRecord *prev = record; - record++; - - snprintf(buf, bufferEnd - buf, ","); - buf += strlen(buf); - - location = baseTraceLocation + record->location; - unsigned long timeDiff = record->when - prev->when; - snprintf(buf, bufferEnd - buf, "%s+%lu", - location->function, timeDiff); - buf += strlen(buf); - } - if (bufferLength > 7) { - if (buffer[bufferLength-5] != '\0') { - // too long - strcpy(buffer+bufferLength-5, "...]"); - } else { - strcpy(buf, "]"); - } - } - } - *msgLen = (buf - buffer); -} diff --git a/vdo/base/trace.h b/vdo/base/trace.h deleted file mode 100644 index 59dabf9..0000000 --- a/vdo/base/trace.h +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/trace.h#1 $ - */ - -#ifndef TRACE_H -#define TRACE_H - -#ifndef __KERNEL__ -#include "cpu.h" -#endif - -#include "threads.h" - -/* - * We need these records to be glued together with no intervening - * bytes. That makes it rather sensitive to how the compiler, - * assembler, and linker may add padding. Force extra alignment to - * make it more reliable. - * - * Trace point descriptor language: - * - * The descriptor string provided at a trace point can have one or - * more components, separated by ";". The first (or only) component is - * a string to be formatted and shown in the flowchart graph. The - * remaining components must be of the form "var=string", and assign - * string values to "variables" that last through the processing of - * the remainder of the current trace being read. - * - * The string displayed has variable substitutions done for any - * occurrences of "$var" in the string. - * - * So, the descriptor sequence: - * kvdoWriteVIO;io=writeData;j=normal - * submitBio($io) - * writeJournalBlock($j) - * would cause the graph generator to show the strings: - * kvdoWriteVIO - * submitBio(writeData) - * writeJournalBlock(normal) - * - * Substitutions are done in the variable assignment strings when - * they're processed, so "foo=x($bar)" sets "foo" using the current - * value of "bar"; it doesn't cause "bar" to be looked up when "$foo" - * is seen later. - * - * The variable named "F" is automatically updated with the name of - * the function associated with the descriptor, so you don't have to - * explicitly repeat the name of the function if you just want to - * augment it with more information. This may be desirable if a trace - * point is expected to be reached more than once at different stages - * of processing, or in a static function with a generic-sounding name - * that needs disambiguation for graphing. - * - * If no descriptor string is provided, the - * function:lineNumber:threadName string reported via systemtap will - * be used in the graph. - * - * Current variable names used: - * cb=(various) random info to log when enqueueing VIO callback - * dup=post,update deduplication operation - * io=(various) kind of I/O and data it's being done on - * j=normal,dedupe kind of journal update being done - * js=mapWrite,writeZero,unmap which step of journaling we're doing - */ -typedef const struct __attribute__((aligned(16))) traceLocationRecord { - const char *function; - int line; - const char *description; -} TraceLocationRecord; - -/* - * With well under 100 locations defined at the moment, even with no - * idea where &baseTraceLocation will fall relative to the others, we - * only need to support a range of -100..+100. 
- */ -typedef int32_t TraceLocationNumber; - -/* The type to pass around */ -typedef TraceLocationRecord *TraceLocation; - -/* - * N.B.: This code uses GCC extensions to create static, initialized - * objects inline, describing the current function and line number. - * The objects are collected into a table we can index with small - * signed integers relative to &baseTraceLocation. - * - * We need baseTraceLocation because there's no standard way to get - * the address of the start of this array we're defining. And because - * we're not playing any (additional) special linker tricks to ensure - * ordering of the object files, the offsets may be signed, and we - * don't know the range beyond the fact that we don't have hundreds of - * these records lying around. - * - * By specifying a name that starts with neither .data nor .rodata, we - * leave it to the toolchain to pick a location for us, based on - * things like whether the section needs write access, which it does - * for a PIC library but not for a kernel module. - */ - -#define TRACE_LOCATION_SECTION \ - __attribute__((section(".kvdo_trace_locations"))) - -extern TRACE_LOCATION_SECTION TraceLocationRecord baseTraceLocation[]; - -#define TRACE_JOIN2(a,b) a##b -#define TRACE_JOIN(a,b) TRACE_JOIN2(a,b) -#define THIS_LOCATION(DESCRIPTION) \ - __extension__ \ - ({ \ - static TRACE_LOCATION_SECTION \ - TraceLocationRecord TRACE_JOIN(loc,__LINE__) = { \ - .function = __func__, \ - .line = __LINE__, \ - .description = DESCRIPTION, \ - }; \ - &TRACE_JOIN(loc,__LINE__); \ - }) - -typedef struct traceRecord { - uint64_t when; // counted in usec - pid_t tid; - TraceLocationNumber location; -} TraceRecord; - -enum { NUM_TRACE_RECORDS = 71 }; - -typedef struct trace { - unsigned int used; - TraceRecord records[NUM_TRACE_RECORDS]; -} Trace; - -/** - * Store a new record in the trace data. - * - * @param trace The trace data to be updated - * @param location The source-location descriptor to be recorded - **/ -void addTraceRecord(Trace *trace, TraceLocation location); - -/** - * Format trace data into a string for logging. - * - * @param [in] trace The trace data to be logged - * @param [in] buffer The buffer in which to store the string - * @param [in] bufferLength Length of the buffer - * @param [out] msgLen Length of the formatted string - **/ -void formatTrace(Trace *trace, - char *buffer, - size_t bufferLength, - size_t *msgLen); - -#endif /* TRACE_H */ diff --git a/vdo/base/types.h b/vdo/base/types.h deleted file mode 100644 index d820da6..0000000 --- a/vdo/base/types.h +++ /dev/null @@ -1,445 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/types.h#14 $ - */ - -#ifndef TYPES_H -#define TYPES_H - -#include "blockMappingState.h" -#include "common.h" -#include "statusCodes.h" - -/** - * A size type in blocks. - **/ -typedef uint64_t BlockCount; - -/** - * The size of a block. - **/ -typedef uint16_t BlockSize; - -/** - * A count of compressed fragments - **/ -typedef uint8_t CompressedFragmentCount; - -/** - * A CRC-32 checksum - **/ -typedef uint32_t CRC32Checksum; - -/** - * A height within a tree. - **/ -typedef uint8_t Height; - -/** - * The logical block number as used by the consumer. - **/ -typedef uint64_t LogicalBlockNumber; - -/** - * The type of the nonce used to identify instances of VDO. - **/ -typedef uint64_t Nonce; - -/** - * A size in pages. - **/ -typedef uint32_t PageCount; - -/** - * A page number. - **/ -typedef uint32_t PageNumber; - -/** - * The size of a page. Must be evenly divisible by block size. - **/ -typedef uint32_t PageSize; - -/** - * The physical (well, less logical) block number at which the block is found - * on the underlying device. - **/ -typedef uint64_t PhysicalBlockNumber; - -/** - * A release version number. These numbers are used to make the numbering - * space for component versions independent across release branches. - * - * Really an enum, but we have to specify the size for encoding; see - * releaseVersions.h for the enumeration values. - **/ -typedef uint32_t ReleaseVersionNumber; - -/** - * A count of tree roots. - **/ -typedef uint8_t RootCount; - -/** - * A number of sectors. - **/ -typedef uint8_t SectorCount; - -/** - * A sequence number. - **/ -typedef uint64_t SequenceNumber; - -/** - * A size type in slabs. - **/ -typedef uint16_t SlabCount; - -/** - * A slot in a bin or block map page. - **/ -typedef uint16_t SlotNumber; - -/** - * A number of VIOs. - **/ -typedef uint16_t VIOCount; - -/** - * A VDO thread configuration. - **/ -typedef struct threadConfig ThreadConfig; - -/** - * A thread counter - **/ -typedef uint8_t ThreadCount; - -/** - * A thread ID - * - * Base-code threads are numbered sequentially starting from 0. - **/ -typedef uint8_t ThreadID; - -/** - * The thread ID returned when the current base code thread ID cannot be found - * or is otherwise undefined. - **/ -static const ThreadID INVALID_THREAD_ID = (ThreadID) -1; - -/** - * A zone counter - **/ -typedef uint8_t ZoneCount; - -/** - * The type of request a VIO is performing - **/ -typedef enum __attribute__((packed)) vioOperation { - VIO_UNSPECIFIED_OPERATION = 0, - VIO_READ = 1, - VIO_WRITE = 2, - VIO_READ_MODIFY_WRITE = VIO_READ | VIO_WRITE, - VIO_READ_WRITE_MASK = VIO_READ_MODIFY_WRITE, - VIO_FLUSH_BEFORE = 4, - VIO_FLUSH_AFTER = 8, -} VIOOperation; - -/** - * VIO types for statistics and instrumentation. - **/ -typedef enum __attribute__((packed)) { - VIO_TYPE_UNINITIALIZED = 0, - VIO_TYPE_DATA, - VIO_TYPE_BLOCK_ALLOCATOR, - VIO_TYPE_BLOCK_MAP, - VIO_TYPE_BLOCK_MAP_INTERIOR, - VIO_TYPE_COMPRESSED_BLOCK, - VIO_TYPE_PARTITION_COPY, - VIO_TYPE_RECOVERY_JOURNAL, - VIO_TYPE_SLAB_JOURNAL, - VIO_TYPE_SLAB_SUMMARY, - VIO_TYPE_SUPER_BLOCK, - VIO_TYPE_TEST, -} VIOType; - -/** - * The current operation on a physical block (from the point of view of the - * recovery journal, slab journals, and reference counts. - **/ -typedef enum __attribute__((packed)) { - DATA_DECREMENT = 0, - DATA_INCREMENT = 1, - BLOCK_MAP_DECREMENT = 2, - BLOCK_MAP_INCREMENT = 3, -} JournalOperation; - -/** - * Partition IDs are encoded in the volume layout in the super block. 
- **/ -typedef enum __attribute__((packed)) { - BLOCK_MAP_PARTITION = 0, - BLOCK_ALLOCATOR_PARTITION = 1, - RECOVERY_JOURNAL_PARTITION = 2, - SLAB_SUMMARY_PARTITION = 3, -} PartitionID; - -/** - * Check whether a VIOType is for servicing an external data request. - * - * @param vioType The VIOType to check - **/ -static inline bool isDataVIOType(VIOType vioType) -{ - return (vioType == VIO_TYPE_DATA); -} - -/** - * Check whether a VIOType is for compressed block writes - * - * @param vioType The VIOType to check - **/ -static inline bool isCompressedWriteVIOType(VIOType vioType) -{ - return (vioType == VIO_TYPE_COMPRESSED_BLOCK); -} - -/** - * Check whether a VIOType is for metadata - * - * @param vioType The VIOType to check - **/ -static inline bool isMetadataVIOType(VIOType vioType) -{ - return ((vioType != VIO_TYPE_UNINITIALIZED) - && !isDataVIOType(vioType) - && !isCompressedWriteVIOType(vioType)); -} - -/** - * Priority levels for asynchronous I/O operations performed on a VIO. - **/ -typedef enum __attribute__((packed)) vioPriority { - VIO_PRIORITY_LOW = 0, - VIO_PRIORITY_DATA = VIO_PRIORITY_LOW, - VIO_PRIORITY_COMPRESSED_DATA = VIO_PRIORITY_DATA, - VIO_PRIORITY_METADATA, - VIO_PRIORITY_HIGH, -} VIOPriority; - -/** - * Metadata types for the VDO. - **/ -typedef enum __attribute__((packed)) { - VDO_METADATA_RECOVERY_JOURNAL = 1, - VDO_METADATA_SLAB_JOURNAL, -} VDOMetadataType; - -/** - * The possible write policy values. - **/ -typedef enum { - WRITE_POLICY_SYNC, ///< All writes are synchronous, i. e., they - ///< are acknowledged only when the data is - ///< written to stable storage. - WRITE_POLICY_ASYNC, ///< Writes are acknowledged when the data is - ///< cached for writing to stable storage, subject - ///< to resiliency guarantees specified elsewhere. - ///< After a crash, the data will be either old or - ///< new value for unflushed writes, never garbage. - WRITE_POLICY_ASYNC_UNSAFE, ///< Writes are acknowledged when the data is - ///< cached for writing to stable storage, subject - ///< to resiliency guarantees specified elsewhere. - WRITE_POLICY_AUTO, ///< The appropriate policy is chosen based on the - ///< underlying device -} WritePolicy; - -typedef enum { - ZONE_TYPE_ADMIN, - ZONE_TYPE_JOURNAL, - ZONE_TYPE_LOGICAL, - ZONE_TYPE_PHYSICAL, -} ZoneType; - -/** - * A position in the block map where a block map entry is stored. - **/ -typedef struct { - PhysicalBlockNumber pbn; - SlotNumber slot; -} BlockMapSlot; - -/** - * A position in the arboreal block map at a specific level. - **/ -typedef struct { - PageNumber pageIndex; - BlockMapSlot blockMapSlot; -} BlockMapTreeSlot; - -/** - * The configuration of a single slab derived from the configured block size - * and slab size. - **/ -typedef struct slabConfig { - BlockCount slabBlocks; ///< total number of blocks in the slab - BlockCount dataBlocks; ///< number of blocks available for data - BlockCount referenceCountBlocks; ///< number of blocks for refCounts - BlockCount slabJournalBlocks; ///< number of blocks for the slab journal - /** - * Number of blocks after which the slab journal starts pushing out a - * ReferenceBlock for each new entry it receives. - **/ - BlockCount slabJournalFlushingThreshold; - /** - * Number of blocks after which the slab journal pushes out all - * ReferenceBlocks and makes all VIOs wait. - **/ - BlockCount slabJournalBlockingThreshold; - /** - * Number of blocks after which the slab must be scrubbed before coming - * online. 
- **/ - BlockCount slabJournalScrubbingThreshold; -} __attribute__((packed)) SlabConfig; - -/** - * The configuration of the VDO service. - **/ -typedef struct vdoConfig { - BlockCount logicalBlocks; ///< number of logical blocks - BlockCount physicalBlocks; ///< number of physical blocks - BlockCount slabSize; ///< number of blocks in a slab - BlockCount recoveryJournalSize; ///< number of recovery journal blocks - BlockCount slabJournalBlocks; ///< number of slab journal blocks -} __attribute__((packed)) VDOConfig; - -/** - * The configuration parameters of the VDO service specified at load time. - **/ -typedef struct vdoLoadConfig { - /** the offset on the physical layer where the VDO begins */ - PhysicalBlockNumber firstBlockOffset; - /** the expected release version number of the VDO */ - ReleaseVersionNumber releaseVersion; - /** the expected nonce of the VDO */ - Nonce nonce; - /** the thread configuration of the VDO */ - ThreadConfig *threadConfig; - /** the page cache size, in pages */ - PageCount cacheSize; - /** whether writes are synchronous */ - WritePolicy writePolicy; - /** the maximum age of a dirty block map page in recovery journal blocks */ - BlockCount maximumAge; -} VDOLoadConfig; - -/** - * Forward declarations of abstract types - **/ -typedef struct actionManager ActionManager; -typedef struct allocatingVIO AllocatingVIO; -typedef struct allocationSelector AllocationSelector; -typedef struct blockAllocator BlockAllocator; -typedef struct blockMap BlockMap; -typedef struct blockMapTreeZone BlockMapTreeZone; -typedef struct blockMapZone BlockMapZone; -typedef struct dataVIO DataVIO; -typedef struct flusher Flusher; -typedef struct forest Forest; -typedef struct hashLock HashLock; -typedef struct hashZone HashZone; -typedef struct indexConfig IndexConfig; -typedef struct inputBin InputBin; -typedef struct lbnLock LBNLock; -typedef struct lockCounter LockCounter; -typedef struct logicalZone LogicalZone; -typedef struct logicalZones LogicalZones; -typedef struct pbnLock PBNLock; -typedef struct physicalLayer PhysicalLayer; -typedef struct physicalZone PhysicalZone; -typedef struct recoveryJournal RecoveryJournal; -typedef struct readOnlyNotifier ReadOnlyNotifier; -typedef struct refCounts RefCounts; -typedef struct vdoSlab Slab; -typedef struct slabDepot SlabDepot; -typedef struct slabJournal SlabJournal; -typedef struct slabJournalEntry SlabJournalEntry; -typedef struct slabScrubber SlabScrubber; -typedef struct slabSummary SlabSummary; -typedef struct slabSummaryZone SlabSummaryZone; -typedef struct vdo VDO; -typedef struct vdoCompletion VDOCompletion; -typedef struct vdoExtent VDOExtent; -typedef struct vdoFlush VDOFlush; -typedef struct vdoLayout VDOLayout; -typedef struct vdoStatistics VDOStatistics; -typedef struct vio VIO; -typedef struct vioPool VIOPool; - -typedef struct { - PhysicalBlockNumber pbn; - BlockMappingState state; -} DataLocation; - -typedef struct { - PhysicalBlockNumber pbn; - BlockMappingState state; - PhysicalZone *zone; -} ZonedPBN; - -/** - * Callback which will be called by the VDO when all of the VIOs in the - * extent have been processed. - * - * @param extent The extent which is complete - **/ -typedef void VDOExtentCallback(VDOExtent *extent); - -/** - * An asynchronous operation. - * - * @param vio The VIO on which to operate - **/ -typedef void AsyncOperation(VIO *vio); - -/** - * An asynchronous compressed write operation. 
- * - * @param allocatingVIO The AllocatingVIO to write - **/ -typedef void CompressedWriter(AllocatingVIO *allocatingVIO); - -/** - * An asynchronous data operation. - * - * @param dataVIO The DataVIO on which to operate - **/ -typedef void AsyncDataOperation(DataVIO *dataVIO); - -/** - * A reference to a completion which (the reference) can be enqueued - * for completion on a specified thread. - **/ -typedef struct enqueueable { - VDOCompletion *completion; -} Enqueueable; - -#endif // TYPES_H diff --git a/vdo/base/upgrade.c b/vdo/base/upgrade.c deleted file mode 100644 index 4d58d6f..0000000 --- a/vdo/base/upgrade.c +++ /dev/null @@ -1,288 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/upgrade.c#6 $ - */ - -#include "upgrade.h" - -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" - -#include "blockMap.h" -#include "readOnlyNotifier.h" -#include "recoveryJournal.h" -#include "releaseVersions.h" -#include "slabDepot.h" -#include "statusCodes.h" -#include "superBlock.h" -#include "vdoInternal.h" -#include "volumeGeometry.h" - -/* The latest supported Sodium version */ -/* Commented out because not currently used. - * static const VersionNumber SODIUM_MASTER_VERSION_67_0 = { - * .majorVersion = 67, - * .minorVersion = 0, - * }; - */ - -/* The component data version for current Sodium */ -static const VersionNumber SODIUM_COMPONENT_DATA_41_0 = { - .majorVersion = 41, - .minorVersion = 0, -}; - -/** - * Current Sodium's configuration of the VDO component. - **/ -typedef struct { - VDOState state; - uint64_t completeRecoveries; - uint64_t readOnlyRecoveries; - VDOConfig config; - Nonce nonce; -} __attribute__((packed)) SodiumComponent41_0; - -/** - * Checks whether the release version loaded in the superblock is the - * current VDO version. - * - * @param vdo The VDO to validate - * - * @return true if the release version number is the current version - **/ -static bool isCurrentReleaseVersion(VDO *vdo) -{ - ReleaseVersionNumber loadedVersion - = getLoadedReleaseVersion(vdo->superBlock); - - return (loadedVersion == CURRENT_RELEASE_VERSION_NUMBER); -} - -/** - * Loads the VDO master version into the VDO and checks that the version - * can be understood by VDO. 
- * - * @param vdo The VDO to validate - * - * @return VDO_SUCCESS or an error if the loaded version is not supported - **/ -static int validateSodiumVersion(VDO *vdo) -{ - int result = decodeVDOVersion(vdo); - if (result != VDO_SUCCESS) { - return result; - } - - if (isCurrentReleaseVersion(vdo)) { - return VDO_SUCCESS; - } - - ReleaseVersionNumber loadedVersion - = getLoadedReleaseVersion(vdo->superBlock); - return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, - "Release version %d, load version %d.%d" - " cannot be upgraded", loadedVersion, - vdo->loadVersion.majorVersion, - vdo->loadVersion.minorVersion); -} - -/** - * Decode a SodiumComponent41_0. - * - * @param buffer The component data buffer - * @param component The component structure to decode into - * - * @return VDO_SUCCESS or an error code - **/ -static int decodeSodium41_0Component(Buffer *buffer, - SodiumComponent41_0 *component) -{ - return getBytesFromBuffer(buffer, sizeof(*component), component); -} - -/** - * Decode the component data for the VDO itself from the component data - * buffer in the super block. - * - * @param vdo The VDO to decode - * - * @return VDO_SUCCESS or an error - **/ -__attribute__((warn_unused_result)) -static int decodeSodiumComponent(VDO *vdo) -{ - Buffer *buffer = getComponentBuffer(vdo->superBlock); - VersionNumber version; - int result = decodeVersionNumber(buffer, &version); - if (result != VDO_SUCCESS) { - return result; - } - - SodiumComponent41_0 component; - if (areSameVersion(SODIUM_COMPONENT_DATA_41_0, version)) { - result = decodeSodium41_0Component(buffer, &component); - } else { - return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, - "VDO component data version mismatch," - " expected 41.0, got %d.%d", - version.majorVersion, - version.minorVersion); - } - if (result != VDO_SUCCESS) { - return result; - } - - // Copy the decoded component into the VDO structure. 
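  // All five fields of SodiumComponent41_0 (state, completeRecoveries,
  // readOnlyRecoveries, config, and nonce) carry over unchanged; loadState
  // additionally records the state exactly as it was read from the Sodium
  // super block.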
- vdo->state = component.state; - vdo->loadState = component.state; - vdo->completeRecoveries = component.completeRecoveries; - vdo->readOnlyRecoveries = component.readOnlyRecoveries; - vdo->config = component.config; - vdo->nonce = component.nonce; - - logInfo("Converted VDO component data version %d.%d", - version.majorVersion, version.minorVersion); - return VDO_SUCCESS; -} - -/**********************************************************************/ -__attribute__((warn_unused_result)) -static int finishSodiumDecode(VDO *vdo) -{ - Buffer *buffer = getComponentBuffer(vdo->superBlock); - const ThreadConfig *threadConfig = getThreadConfig(vdo); - int result = makeRecoveryJournal(vdo->nonce, vdo->layer, - getVDOPartition(vdo->layout, - RECOVERY_JOURNAL_PARTITION), - vdo->completeRecoveries, - vdo->config.recoveryJournalSize, - RECOVERY_JOURNAL_TAIL_BUFFER_SIZE, - vdo->readOnlyNotifier, threadConfig, - &vdo->recoveryJournal); - if (result != VDO_SUCCESS) { - return result; - } - - result = decodeSodiumRecoveryJournal(vdo->recoveryJournal, buffer); - if (result != VDO_SUCCESS) { - return result; - } - - result = decodeSodiumSlabDepot(buffer, threadConfig, vdo->nonce, vdo->layer, - getVDOPartition(vdo->layout, - SLAB_SUMMARY_PARTITION), - vdo->readOnlyNotifier, vdo->recoveryJournal, - &vdo->depot); - if (result != VDO_SUCCESS) { - return result; - } - - result = decodeSodiumBlockMap(buffer, vdo->config.logicalBlocks, - threadConfig, &vdo->blockMap); - if (result != VDO_SUCCESS) { - return result; - } - - ASSERT_LOG_ONLY((contentLength(buffer) == 0), - "All decoded component data was used"); - return VDO_SUCCESS; -} - -/**********************************************************************/ -int upgradePriorVDO(PhysicalLayer *layer) -{ - VolumeGeometry geometry; - int result = loadVolumeGeometry(layer, &geometry); - if (result != VDO_SUCCESS) { - return result; - } - - VDO *vdo; - result = makeVDO(layer, &vdo); - if (result != VDO_SUCCESS) { - return result; - } - - result = loadSuperBlock(vdo->layer, getDataRegionOffset(geometry), - &vdo->superBlock); - if (result != VDO_SUCCESS) { - freeVDO(&vdo); - return logErrorWithStringError(result, "Could not load VDO super block"); - } - - // Load the necessary pieces to save again. - result = validateSodiumVersion(vdo); - if (result != VDO_SUCCESS) { - freeVDO(&vdo); - return result; - } - - if (isCurrentReleaseVersion(vdo)) { - logInfo("VDO already up-to-date"); - freeVDO(&vdo); - return VDO_SUCCESS; - } - - result = decodeSodiumComponent(vdo); - if (result != VDO_SUCCESS) { - freeVDO(&vdo); - return result; - } - - if (requiresRebuild(vdo)) { - // Do not attempt to upgrade a dirty prior version. - freeVDO(&vdo); - return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, - "Cannot upgrade a dirty VDO."); - } - - result = decodeVDOLayout(getComponentBuffer(vdo->superBlock), &vdo->layout); - if (result != VDO_SUCCESS) { - freeVDO(&vdo); - return result; - } - - const ThreadConfig *threadConfig = getThreadConfig(vdo); - result = makeReadOnlyNotifier(inReadOnlyMode(vdo), threadConfig, vdo->layer, - &vdo->readOnlyNotifier); - if (result != VDO_SUCCESS) { - freeVDO(&vdo); - return result; - } - - result = finishSodiumDecode(vdo); - if (result != VDO_SUCCESS) { - freeVDO(&vdo); - return result; - } - - // Saving will automatically change the release version to current. 
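  // saveVDOComponents() (defined in vdo.c) re-encodes the master version and
  // every component into the super block's component buffer, then writes the
  // super block back synchronously.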
- result = saveVDOComponents(vdo); - if (result != VDO_SUCCESS) { - freeVDO(&vdo); - return result; - } - - logInfo("Successfully saved upgraded VDO"); - freeVDO(&vdo); - - return result; -} diff --git a/vdo/base/upgrade.h b/vdo/base/upgrade.h deleted file mode 100644 index be2bd05..0000000 --- a/vdo/base/upgrade.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/upgrade.h#1 $ - */ - -#ifndef UPGRADE_H -#define UPGRADE_H - -#include "types.h" - -/** - * Reconfigure the superblock of a prior VDO, preparing it for upgrading. - * - * @param layer The layer with a VDO to prepare - * - * @return VDO_SUCCESS or an error - **/ -int upgradePriorVDO(PhysicalLayer *layer) - __attribute__((warn_unused_result)); - -#endif /* UPGRADE_H */ diff --git a/vdo/base/vdo.c b/vdo/base/vdo.c deleted file mode 100644 index b4b9a41..0000000 --- a/vdo/base/vdo.c +++ /dev/null @@ -1,1154 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdo.c#21 $ - */ - -/* - * This file contains the main entry points for normal operations on a VDO as - * well as functions for constructing and destroying VDO instances (in memory). - */ - -#include "vdoInternal.h" - -#include "buffer.h" -#include "logger.h" -#include "memoryAlloc.h" - -#include "adminCompletion.h" -#include "blockMap.h" -#include "extent.h" -#include "hashZone.h" -#include "header.h" -#include "logicalZone.h" -#include "numUtils.h" -#include "packer.h" -#include "physicalZone.h" -#include "readOnlyNotifier.h" -#include "recoveryJournal.h" -#include "releaseVersions.h" -#include "slabDepot.h" -#include "slabSummary.h" -#include "statistics.h" -#include "statusCodes.h" -#include "threadConfig.h" -#include "vdoLayout.h" -#include "vioWrite.h" -#include "volumeGeometry.h" - -/** - * The master version of the on-disk format of a VDO. This should be - * incremented any time the on-disk representation of any VDO structure - * changes. 
Changes which require only online upgrade steps should increment - * the minor version. Changes which require an offline upgrade or which can not - * be upgraded to at all should increment the major version and set the minor - * version to 0. - **/ -static const VersionNumber VDO_MASTER_VERSION_67_0 = { - .majorVersion = 67, - .minorVersion = 0, -}; - -/** - * The current version for the data encoded in the super block. This must - * be changed any time there is a change to encoding of the component data - * of any VDO component. - **/ -static const VersionNumber VDO_COMPONENT_DATA_41_0 = { - .majorVersion = 41, - .minorVersion = 0, -}; - -/** - * This is the structure that captures the VDO fields saved as a SuperBlock - * component. - **/ -typedef struct { - VDOState state; - uint64_t completeRecoveries; - uint64_t readOnlyRecoveries; - VDOConfig config; - Nonce nonce; -} __attribute__((packed)) VDOComponent41_0; - -/**********************************************************************/ -int allocateVDO(PhysicalLayer *layer, VDO **vdoPtr) -{ - int result = registerStatusCodes(); - if (result != VDO_SUCCESS) { - return result; - } - - VDO *vdo; - result = ALLOCATE(1, VDO, __func__, &vdo); - if (result != UDS_SUCCESS) { - return result; - } - - vdo->layer = layer; - if (layer->createEnqueueable != NULL) { - result = initializeAdminCompletion(vdo, &vdo->adminCompletion); - if (result != VDO_SUCCESS) { - freeVDO(&vdo); - return result; - } - } - - *vdoPtr = vdo; - return VDO_SUCCESS; -} - -/**********************************************************************/ -int makeVDO(PhysicalLayer *layer, VDO **vdoPtr) -{ - VDO *vdo; - int result = allocateVDO(layer, &vdo); - if (result != VDO_SUCCESS) { - return result; - } - - result = makeZeroThreadConfig(&vdo->loadConfig.threadConfig); - if (result != VDO_SUCCESS) { - freeVDO(&vdo); - return result; - } - - *vdoPtr = vdo; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void destroyVDO(VDO *vdo) -{ - freeFlusher(&vdo->flusher); - freePacker(&vdo->packer); - freeRecoveryJournal(&vdo->recoveryJournal); - freeSlabDepot(&vdo->depot); - freeVDOLayout(&vdo->layout); - freeSuperBlock(&vdo->superBlock); - freeBlockMap(&vdo->blockMap); - - const ThreadConfig *threadConfig = getThreadConfig(vdo); - if (vdo->hashZones != NULL) { - for (ZoneCount zone = 0; zone < threadConfig->hashZoneCount; zone++) { - freeHashZone(&vdo->hashZones[zone]); - } - } - FREE(vdo->hashZones); - vdo->hashZones = NULL; - - freeLogicalZones(&vdo->logicalZones); - - if (vdo->physicalZones != NULL) { - for (ZoneCount zone = 0; zone < threadConfig->physicalZoneCount; zone++) { - freePhysicalZone(&vdo->physicalZones[zone]); - } - } - FREE(vdo->physicalZones); - vdo->physicalZones = NULL; - - uninitializeAdminCompletion(&vdo->adminCompletion); - freeReadOnlyNotifier(&vdo->readOnlyNotifier); - freeThreadConfig(&vdo->loadConfig.threadConfig); -} - -/**********************************************************************/ -void freeVDO(VDO **vdoPtr) -{ - if (*vdoPtr == NULL) { - return; - } - - destroyVDO(*vdoPtr); - FREE(*vdoPtr); - *vdoPtr = NULL; -} - -/**********************************************************************/ -size_t getComponentDataSize(VDO *vdo) -{ - return (sizeof(VersionNumber) - + sizeof(VersionNumber) - + sizeof(VDOComponent41_0) - + getVDOLayoutEncodedSize(vdo->layout) - + getRecoveryJournalEncodedSize() - + getSlabDepotEncodedSize() - + getBlockMapEncodedSize()); -} - -/** - * Encode the VDO master version. 
- * - * @param buffer The buffer in which to encode the version - * - * @return VDO_SUCCESS or an error - **/ -__attribute__((warn_unused_result)) -static int encodeMasterVersion(Buffer *buffer) -{ - return encodeVersionNumber(VDO_MASTER_VERSION_67_0, buffer); -} - -/** - * Encode a VDOConfig structure into a buffer. - * - * @param config The config structure to encode - * @param buffer A buffer positioned at the start of the encoding - * - * @return VDO_SUCCESS or an error - **/ -__attribute__((warn_unused_result)) -static int encodeVDOConfig(const VDOConfig *config, Buffer *buffer) -{ - int result = putUInt64LEIntoBuffer(buffer, config->logicalBlocks); - if (result != VDO_SUCCESS) { - return result; - } - - result = putUInt64LEIntoBuffer(buffer, config->physicalBlocks); - if (result != VDO_SUCCESS) { - return result; - } - - result = putUInt64LEIntoBuffer(buffer, config->slabSize); - if (result != VDO_SUCCESS) { - return result; - } - - result = putUInt64LEIntoBuffer(buffer, config->recoveryJournalSize); - if (result != VDO_SUCCESS) { - return result; - } - - return putUInt64LEIntoBuffer(buffer, config->slabJournalBlocks); -} - -/** - * Encode the component data for the VDO itself. - * - * @param vdo The vdo to encode - * @param buffer The buffer in which to encode the VDO - * - * @return VDO_SUCCESS or an error - **/ -__attribute__((warn_unused_result)) -static int encodeVDOComponent(const VDO *vdo, Buffer *buffer) -{ - int result = encodeVersionNumber(VDO_COMPONENT_DATA_41_0, buffer); - if (result != VDO_SUCCESS) { - return result; - } - - size_t initialLength = contentLength(buffer); - - result = putUInt32LEIntoBuffer(buffer, vdo->state); - if (result != VDO_SUCCESS) { - return result; - } - - result = putUInt64LEIntoBuffer(buffer, vdo->completeRecoveries); - if (result != VDO_SUCCESS) { - return result; - } - - result = putUInt64LEIntoBuffer(buffer, vdo->readOnlyRecoveries); - if (result != VDO_SUCCESS) { - return result; - } - - result = encodeVDOConfig(&vdo->config, buffer); - if (result != VDO_SUCCESS) { - return result; - } - - result = putUInt64LEIntoBuffer(buffer, vdo->nonce); - if (result != VDO_SUCCESS) { - return result; - } - - size_t encodedSize = contentLength(buffer) - initialLength; - return ASSERT(encodedSize == sizeof(VDOComponent41_0), - "encoded VDO component size must match structure size"); -} - -/**********************************************************************/ -static int encodeVDO(VDO *vdo) -{ - Buffer *buffer = getComponentBuffer(vdo->superBlock); - int result = resetBufferEnd(buffer, 0); - if (result != VDO_SUCCESS) { - return result; - } - - result = encodeMasterVersion(buffer); - if (result != VDO_SUCCESS) { - return result; - } - - result = encodeVDOComponent(vdo, buffer); - if (result != VDO_SUCCESS) { - return result; - } - - result = encodeVDOLayout(vdo->layout, buffer); - if (result != VDO_SUCCESS) { - return result; - } - - result = encodeRecoveryJournal(vdo->recoveryJournal, buffer); - if (result != VDO_SUCCESS) { - return result; - } - - result = encodeSlabDepot(vdo->depot, buffer); - if (result != VDO_SUCCESS) { - return result; - } - - result = encodeBlockMap(vdo->blockMap, buffer); - if (result != VDO_SUCCESS) { - return result; - } - - ASSERT_LOG_ONLY((contentLength(buffer) == getComponentDataSize(vdo)), - "All super block component data was encoded"); - return VDO_SUCCESS; -} - -/**********************************************************************/ -int saveVDOComponents(VDO *vdo) -{ - int result = encodeVDO(vdo); - if (result != 
VDO_SUCCESS) { - return result; - } - - return saveSuperBlock(vdo->layer, vdo->superBlock, getFirstBlockOffset(vdo)); -} - -/**********************************************************************/ -void saveVDOComponentsAsync(VDO *vdo, VDOCompletion *parent) -{ - int result = encodeVDO(vdo); - if (result != VDO_SUCCESS) { - finishCompletion(parent, result); - return; - } - - saveSuperBlockAsync(vdo->superBlock, getFirstBlockOffset(vdo), parent); -} - -/**********************************************************************/ -int saveReconfiguredVDO(VDO *vdo) -{ - Buffer *buffer = getComponentBuffer(vdo->superBlock); - size_t componentsSize = contentLength(buffer); - - byte *components; - int result = copyBytes(buffer, componentsSize, &components); - if (result != VDO_SUCCESS) { - return result; - } - - result = resetBufferEnd(buffer, 0); - if (result != VDO_SUCCESS) { - FREE(components); - return result; - } - - result = encodeMasterVersion(buffer); - if (result != VDO_SUCCESS) { - FREE(components); - return result; - } - - result = encodeVDOComponent(vdo, buffer); - if (result != VDO_SUCCESS) { - FREE(components); - return result; - } - - result = putBytes(buffer, componentsSize, components); - FREE(components); - if (result != VDO_SUCCESS) { - return result; - } - - return saveSuperBlock(vdo->layer, vdo->superBlock, getFirstBlockOffset(vdo)); -} - -/**********************************************************************/ -int decodeVDOVersion(VDO *vdo) -{ - return decodeVersionNumber(getComponentBuffer(vdo->superBlock), - &vdo->loadVersion); -} - -/**********************************************************************/ -int validateVDOVersion(VDO *vdo) -{ - int result = decodeVDOVersion(vdo); - if (result != VDO_SUCCESS) { - return result; - } - - ReleaseVersionNumber loadedReleaseVersion - = getLoadedReleaseVersion(vdo->superBlock); - if (vdo->loadConfig.releaseVersion != loadedReleaseVersion) { - return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, - "Geometry release version %" PRIu32 " does " - "not match super block release version %" - PRIu32, - vdo->loadConfig.releaseVersion, - loadedReleaseVersion); - } - - return validateVersion(VDO_MASTER_VERSION_67_0, vdo->loadVersion, "master"); -} - -/** - * Decode a VDOConfig structure from a buffer. 
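/*
 * Illustrative sketch, not part of the original source: the on-disk layout
 * implied by encodeVDOConfig()/decodeVDOConfig() is five unsigned 64-bit
 * fields in little-endian order (logicalBlocks, physicalBlocks, slabSize,
 * recoveryJournalSize, slabJournalBlocks), 40 bytes in all. This standalone
 * version uses plain stdint types instead of the Buffer API; storeLE64() and
 * sketchEncodeConfig() are hypothetical helpers.
 */
#include <stdint.h>

enum { VDO_CONFIG_ENCODED_SIZE = 5 * 8 };

static void storeLE64(uint8_t *out, uint64_t value)
{
  for (int i = 0; i < 8; i++) {
    out[i] = (uint8_t) (value >> (8 * i));
  }
}

static void sketchEncodeConfig(const uint64_t fields[5],
                               uint8_t out[VDO_CONFIG_ENCODED_SIZE])
{
  // Field order matches encodeVDOConfig() above.
  for (int i = 0; i < 5; i++) {
    storeLE64(out + (8 * i), fields[i]);
  }
}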
- * - * @param buffer A buffer positioned at the start of the encoding - * @param config The config structure to receive the decoded values - * - * @return UDS_SUCCESS or an error code - **/ -__attribute__((warn_unused_result)) -static int decodeVDOConfig(Buffer *buffer, VDOConfig *config) -{ - BlockCount logicalBlocks; - int result = getUInt64LEFromBuffer(buffer, &logicalBlocks); - if (result != VDO_SUCCESS) { - return result; - } - - BlockCount physicalBlocks; - result = getUInt64LEFromBuffer(buffer, &physicalBlocks); - if (result != VDO_SUCCESS) { - return result; - } - - BlockCount slabSize; - result = getUInt64LEFromBuffer(buffer, &slabSize); - if (result != VDO_SUCCESS) { - return result; - } - - BlockCount recoveryJournalSize; - result = getUInt64LEFromBuffer(buffer, &recoveryJournalSize); - if (result != VDO_SUCCESS) { - return result; - } - - BlockCount slabJournalBlocks; - result = getUInt64LEFromBuffer(buffer, &slabJournalBlocks); - if (result != VDO_SUCCESS) { - return result; - } - - *config = (VDOConfig) { - .logicalBlocks = logicalBlocks, - .physicalBlocks = physicalBlocks, - .slabSize = slabSize, - .recoveryJournalSize = recoveryJournalSize, - .slabJournalBlocks = slabJournalBlocks, - }; - return VDO_SUCCESS; -} - -/** - * Decode the version 41.0 component state for the VDO itself from a buffer. - * - * @param buffer A buffer positioned at the start of the encoding - * @param state The state structure to receive the decoded values - * - * @return VDO_SUCCESS or an error - **/ -__attribute__((warn_unused_result)) - static int decodeVDOComponent_41_0(Buffer *buffer, VDOComponent41_0 *state) -{ - size_t initialLength = contentLength(buffer); - - VDOState vdoState; - int result = getUInt32LEFromBuffer(buffer, &vdoState); - if (result != VDO_SUCCESS) { - return result; - } - - uint64_t completeRecoveries; - result = getUInt64LEFromBuffer(buffer, &completeRecoveries); - if (result != VDO_SUCCESS) { - return result; - } - - uint64_t readOnlyRecoveries; - result = getUInt64LEFromBuffer(buffer, &readOnlyRecoveries); - if (result != VDO_SUCCESS) { - return result; - } - - VDOConfig config; - result = decodeVDOConfig(buffer, &config); - if (result != VDO_SUCCESS) { - return result; - } - - Nonce nonce; - result = getUInt64LEFromBuffer(buffer, &nonce); - if (result != VDO_SUCCESS) { - return result; - } - - *state = (VDOComponent41_0) { - .state = vdoState, - .completeRecoveries = completeRecoveries, - .readOnlyRecoveries = readOnlyRecoveries, - .config = config, - .nonce = nonce, - }; - - size_t decodedSize = initialLength - contentLength(buffer); - return ASSERT(decodedSize == sizeof(VDOComponent41_0), - "decoded VDO component size must match structure size"); -} - -/**********************************************************************/ -int decodeVDOComponent(VDO *vdo) -{ - Buffer *buffer = getComponentBuffer(vdo->superBlock); - - VersionNumber version; - int result = decodeVersionNumber(buffer, &version); - if (result != VDO_SUCCESS) { - return result; - } - - result = validateVersion(version, VDO_COMPONENT_DATA_41_0, - "VDO component data"); - if (result != VDO_SUCCESS) { - return result; - } - - VDOComponent41_0 component; - result = decodeVDOComponent_41_0(buffer, &component); - if (result != VDO_SUCCESS) { - return result; - } - - // Copy the decoded component into the VDO structure. 
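/*
 * Illustrative sketch, not part of the original source: the encode and decode
 * paths assert that the encoded component matches sizeof(VDOComponent41_0).
 * With the packed attribute, the fields enumerated above add up to
 * 4 + 8 + 8 + 40 + 8 = 68 bytes, assuming VDOState is a 32-bit enum and Nonce
 * is a 64-bit integer. A C11 static assertion states the arithmetic.
 */
_Static_assert((sizeof(uint32_t) + (2 * sizeof(uint64_t))
                + (5 * sizeof(uint64_t)) + sizeof(uint64_t)) == 68,
               "state + recoveries + config + nonce total 68 bytes");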
- vdo->state = component.state; - vdo->loadState = component.state; - vdo->completeRecoveries = component.completeRecoveries; - vdo->readOnlyRecoveries = component.readOnlyRecoveries; - vdo->config = component.config; - vdo->nonce = component.nonce; - return VDO_SUCCESS; -} - -/**********************************************************************/ -int validateVDOConfig(const VDOConfig *config, - BlockCount blockCount, - bool requireLogical) -{ - int result = ASSERT(config->slabSize > 0, "slab size unspecified"); - if (result != UDS_SUCCESS) { - return result; - } - - result = ASSERT(isPowerOfTwo(config->slabSize), - "slab size must be a power of two"); - if (result != UDS_SUCCESS) { - return result; - } - - result = ASSERT(config->slabSize <= (1 << MAX_SLAB_BITS), - "slab size must be less than or equal to 2^%d", - MAX_SLAB_BITS); - if (result != VDO_SUCCESS) { - return result; - } - - result = ASSERT(config->slabJournalBlocks >= MINIMUM_SLAB_JOURNAL_BLOCKS, - "slab journal size meets minimum size"); - if (result != UDS_SUCCESS) { - return result; - } - - result = ASSERT(config->slabJournalBlocks <= config->slabSize, - "slab journal size is within expected bound"); - if (result != UDS_SUCCESS) { - return result; - } - - SlabConfig slabConfig; - result = configureSlab(config->slabSize, config->slabJournalBlocks, - &slabConfig); - if (result != VDO_SUCCESS) { - return result; - } - - result = ASSERT((slabConfig.dataBlocks >= 1), - "slab must be able to hold at least one block"); - if (result != UDS_SUCCESS) { - return result; - } - - result = ASSERT(config->physicalBlocks > 0, "physical blocks unspecified"); - if (result != UDS_SUCCESS) { - return result; - } - - result = ASSERT(config->physicalBlocks <= MAXIMUM_PHYSICAL_BLOCKS, - "physical block count %llu exceeds maximum %llu", - config->physicalBlocks, MAXIMUM_PHYSICAL_BLOCKS); - if (result != UDS_SUCCESS) { - return VDO_OUT_OF_RANGE; - } - - // This can't check equality because FileLayer et al can only known about - // the storage size, which may not match the super block size. - if (blockCount < config->physicalBlocks) { - logError("A physical size of %llu blocks was specified," - " but that is smaller than the %llu blocks" - " configured in the VDO super block", - blockCount, config->physicalBlocks); - return VDO_PARAMETER_MISMATCH; - } - - result = ASSERT(!requireLogical || (config->logicalBlocks > 0), - "logical blocks unspecified"); - if (result != UDS_SUCCESS) { - return result; - } - - result = ASSERT(config->logicalBlocks <= MAXIMUM_LOGICAL_BLOCKS, - "logical blocks too large"); - if (result != UDS_SUCCESS) { - return result; - } - - result = ASSERT(config->recoveryJournalSize > 0, - "recovery journal size unspecified"); - if (result != UDS_SUCCESS) { - return result; - } - - result = ASSERT(isPowerOfTwo(config->recoveryJournalSize), - "recovery journal size must be a power of two"); - if (result != UDS_SUCCESS) { - return result; - } - - return result; -} - -/** - * Notify a VDO that it is going read-only. This will save the read-only state - * to the super block. - * - *

Implements ReadOnlyNotification. - * - * @param listener The VDO - * @param parent The completion to notify in order to acknowledge the - * notification - **/ -static void notifyVDOOfReadOnlyMode(void *listener, VDOCompletion *parent) -{ - VDO *vdo = listener; - if (inReadOnlyMode(vdo)) { - completeCompletion(parent); - } - - vdo->state = VDO_READ_ONLY_MODE; - saveVDOComponentsAsync(vdo, parent); -} - -/**********************************************************************/ -int enableReadOnlyEntry(VDO *vdo) -{ - return registerReadOnlyListener(vdo->readOnlyNotifier, vdo, - notifyVDOOfReadOnlyMode, - getAdminThread(getThreadConfig(vdo))); -} - -/**********************************************************************/ -bool inReadOnlyMode(const VDO *vdo) -{ - return (vdo->state == VDO_READ_ONLY_MODE); -} - -/**********************************************************************/ -bool isClean(const VDO *vdo) -{ - return ((vdo->state == VDO_CLEAN) || (vdo->state == VDO_NEW)); -} - -/**********************************************************************/ -bool wasClean(const VDO *vdo) -{ - return ((vdo->loadState == VDO_CLEAN) || (vdo->loadState == VDO_NEW)); -} - -/**********************************************************************/ -bool wasNew(const VDO *vdo) -{ - return (vdo->loadState == VDO_NEW); -} - -/**********************************************************************/ -bool requiresReadOnlyRebuild(const VDO *vdo) -{ - return ((vdo->loadState == VDO_FORCE_REBUILD) - || (vdo->loadState == VDO_REBUILD_FOR_UPGRADE)); -} - -/**********************************************************************/ -bool requiresRebuild(const VDO *vdo) -{ - return ((vdo->state == VDO_DIRTY) - || (vdo->state == VDO_FORCE_REBUILD) - || (vdo->state == VDO_REPLAYING) - || (vdo->state == VDO_REBUILD_FOR_UPGRADE)); -} - -/**********************************************************************/ -bool requiresRecovery(const VDO *vdo) -{ - return ((vdo->loadState == VDO_DIRTY) || (vdo->loadState == VDO_REPLAYING) - || (vdo->loadState == VDO_RECOVERING)); -} - -/**********************************************************************/ -bool isReplaying(const VDO *vdo) -{ - return (vdo->state == VDO_REPLAYING); -} - -/**********************************************************************/ -bool inRecoveryMode(const VDO *vdo) -{ - return (vdo->state == VDO_RECOVERING); -} - -/**********************************************************************/ -void enterRecoveryMode(VDO *vdo) -{ - assertOnAdminThread(vdo, __func__); - - if (inReadOnlyMode(vdo)) { - return; - } - - logInfo("Entering recovery mode"); - vdo->state = VDO_RECOVERING; -} - -/**********************************************************************/ -void leaveRecoveryMode(VDO *vdo) -{ - assertOnAdminThread(vdo, __func__); - - /* - * Since scrubbing can be stopped by vdoClose during recovery mode, - * do not change the VDO state if there are outstanding unrecovered slabs. 
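/*
 * Illustrative sketch, not part of the original source: how a loader might use
 * the state predicates defined here to choose a load path. The LoadPath enum
 * and chooseLoadPath() are hypothetical names; the real decision logic lives
 * in VDO's load and rebuild code.
 */
typedef enum {
  LOAD_PATH_NORMAL,             // clean or new: no repair needed
  LOAD_PATH_RECOVERY,           // dirty or replaying: replay the journal
  LOAD_PATH_READ_ONLY_REBUILD,  // forced or upgrade rebuild: full rebuild
} LoadPath;

static LoadPath chooseLoadPath(const VDO *vdo)
{
  if (requiresReadOnlyRebuild(vdo)) {
    return LOAD_PATH_READ_ONLY_REBUILD;
  }
  if (requiresRecovery(vdo)) {
    return LOAD_PATH_RECOVERY;
  }
  return LOAD_PATH_NORMAL;
}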
- */ - if (inReadOnlyMode(vdo)) { - return; - } - - ASSERT_LOG_ONLY(inRecoveryMode(vdo), "VDO is in recovery mode"); - logInfo("Exiting recovery mode"); - vdo->state = VDO_DIRTY; -} - -/**********************************************************************/ -void makeVDOReadOnly(VDO *vdo, int errorCode) -{ - enterReadOnlyMode(vdo->readOnlyNotifier, errorCode); -} - -/**********************************************************************/ -bool setVDOCompressing(VDO *vdo, bool enableCompression) -{ - bool stateChanged = compareAndSwapBool(&vdo->compressing, !enableCompression, - enableCompression); - if (stateChanged && !enableCompression) { - // Flushing the packer is asynchronous, but we don't care when it - // finishes. - flushPacker(vdo->packer); - } - - logInfo("compression is %s", (enableCompression ? "enabled" : "disabled")); - return (stateChanged ? !enableCompression : enableCompression); -} - -/**********************************************************************/ -bool getVDOCompressing(VDO *vdo) -{ - return atomicLoadBool(&vdo->compressing); -} - -/**********************************************************************/ -static size_t getBlockMapCacheSize(const VDO *vdo) -{ - return ((size_t) vdo->loadConfig.cacheSize) * VDO_BLOCK_SIZE; -} - -/** - * Tally the hash lock statistics from all the hash zones. - * - * @param vdo The vdo to query - * - * @return The sum of the hash lock statistics from all hash zones - **/ -static HashLockStatistics getHashLockStatistics(const VDO *vdo) -{ - HashLockStatistics totals; - memset(&totals, 0, sizeof(totals)); - - const ThreadConfig *threadConfig = getThreadConfig(vdo); - for (ZoneCount zone = 0; zone < threadConfig->hashZoneCount; zone++) { - HashLockStatistics stats = getHashZoneStatistics(vdo->hashZones[zone]); - totals.dedupeAdviceValid += stats.dedupeAdviceValid; - totals.dedupeAdviceStale += stats.dedupeAdviceStale; - totals.concurrentDataMatches += stats.concurrentDataMatches; - totals.concurrentHashCollisions += stats.concurrentHashCollisions; - } - - return totals; -} - -/** - * Get the current error statistics from VDO. - * - * @param vdo The vdo to query - * - * @return a copy of the current VDO error counters - **/ -static ErrorStatistics getVDOErrorStatistics(const VDO *vdo) -{ - /* - * The error counts can be incremented from arbitrary threads and so must be - * incremented atomically, but they are just statistics with no semantics - * that could rely on memory order, so unfenced reads are sufficient. - */ - const AtomicErrorStatistics *atoms = &vdo->errorStats; - return (ErrorStatistics) { - .invalidAdvicePBNCount = relaxedLoad64(&atoms->invalidAdvicePBNCount), - .noSpaceErrorCount = relaxedLoad64(&atoms->noSpaceErrorCount), - .readOnlyErrorCount = relaxedLoad64(&atoms->readOnlyErrorCount), - }; -} - -/**********************************************************************/ -static const char *describeWritePolicy(WritePolicy policy) -{ - switch (policy) { - case WRITE_POLICY_ASYNC: - return "async"; - case WRITE_POLICY_ASYNC_UNSAFE: - return "async-unsafe"; - case WRITE_POLICY_SYNC: - return "sync"; - default: - return "unknown"; - } -} - -/**********************************************************************/ -void getVDOStatistics(const VDO *vdo, VDOStatistics *stats) -{ - // These are immutable properties of the VDO object, so it is safe to - // query them from any thread. 
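/*
 * Illustrative sketch, not part of the original source: the compare-and-swap
 * toggle and relaxed statistics reads used above (compareAndSwapBool,
 * relaxedLoad64), expressed with portable C11 atomics.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

// Change the flag only if it currently holds the opposite value, and return
// the previous value, mirroring setVDOCompressing().
static bool toggleFlag(atomic_bool *flag, bool enable)
{
  bool expected = !enable;
  if (atomic_compare_exchange_strong(flag, &expected, enable)) {
    return !enable;   // we flipped it; the previous value was !enable
  }
  return enable;      // it was already in the requested state
}

// Error counters are incremented atomically but carry no ordering semantics,
// so a relaxed load is enough, as noted in getVDOErrorStatistics().
static uint64_t readCounterRelaxed(_Atomic uint64_t *counter)
{
  return atomic_load_explicit(counter, memory_order_relaxed);
}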
- RecoveryJournal *journal = vdo->recoveryJournal; - SlabDepot *depot = vdo->depot; - // XXX config.physicalBlocks is actually mutated during resize and is in a - // packed structure, but resize runs on the admin thread so we're usually OK. - stats->version = STATISTICS_VERSION; - stats->releaseVersion = CURRENT_RELEASE_VERSION_NUMBER; - stats->logicalBlocks = vdo->config.logicalBlocks; - stats->physicalBlocks = vdo->config.physicalBlocks; - stats->blockSize = VDO_BLOCK_SIZE; - stats->completeRecoveries = vdo->completeRecoveries; - stats->readOnlyRecoveries = vdo->readOnlyRecoveries; - stats->blockMapCacheSize = getBlockMapCacheSize(vdo); - snprintf(stats->writePolicy, sizeof(stats->writePolicy), "%s", - describeWritePolicy(getWritePolicy(vdo))); - - // The callees are responsible for thread-safety. - stats->dataBlocksUsed = getPhysicalBlocksAllocated(vdo); - stats->overheadBlocksUsed = getPhysicalBlocksOverhead(vdo); - stats->logicalBlocksUsed = getJournalLogicalBlocksUsed(journal); - stats->allocator = getDepotBlockAllocatorStatistics(depot); - stats->journal = getRecoveryJournalStatistics(journal); - stats->packer = getPackerStatistics(vdo->packer); - stats->slabJournal = getDepotSlabJournalStatistics(depot); - stats->slabSummary = getSlabSummaryStatistics(getSlabSummary(depot)); - stats->refCounts = getDepotRefCountsStatistics(depot); - stats->blockMap = getBlockMapStatistics(vdo->blockMap); - stats->hashLock = getHashLockStatistics(vdo); - stats->errors = getVDOErrorStatistics(vdo); - SlabCount slabTotal = getDepotSlabCount(depot); - stats->recoveryPercentage - = (slabTotal - getDepotUnrecoveredSlabCount(depot)) * 100 / slabTotal; - - // The "state" field is mutable, but we just need a unfenced atomic read. - VDOState state = *((const volatile VDOState *) &vdo->state); - stats->inRecoveryMode = (state == VDO_RECOVERING); - snprintf(stats->mode, sizeof(stats->mode), "%s", describeVDOState(state)); -} - -/**********************************************************************/ -BlockCount getPhysicalBlocksAllocated(const VDO *vdo) -{ - return (getDepotAllocatedBlocks(vdo->depot) - - getJournalBlockMapDataBlocksUsed(vdo->recoveryJournal)); -} - -/**********************************************************************/ -BlockCount getPhysicalBlocksFree(const VDO *vdo) -{ - return getDepotFreeBlocks(vdo->depot); -} - -/**********************************************************************/ -BlockCount getPhysicalBlocksOverhead(const VDO *vdo) -{ - // XXX config.physicalBlocks is actually mutated during resize and is in a - // packed structure, but resize runs on admin thread so we're usually OK. 
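/*
 * Illustrative sketch, not part of the original source: the accounting
 * relationship implied by the physical-block getters defined here. Assuming
 * the depot's data blocks are exactly its allocated blocks plus its free
 * blocks, the user-visible counters partition the physical space:
 *
 *   dataBlocksUsed + overheadBlocksUsed + freeBlocks == config.physicalBlocks
 */
#include <stdbool.h>
#include <stdint.h>

static bool accountingHolds(uint64_t physicalBlocks,
                            uint64_t depotDataBlocks,
                            uint64_t depotAllocated,
                            uint64_t depotFree,
                            uint64_t blockMapDataBlocks)
{
  uint64_t used     = depotAllocated - blockMapDataBlocks;
  uint64_t overhead = physicalBlocks - depotDataBlocks + blockMapDataBlocks;
  // Holds whenever depotDataBlocks == depotAllocated + depotFree.
  return ((used + overhead + depotFree) == physicalBlocks);
}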
- return (vdo->config.physicalBlocks - - getDepotDataBlocks(vdo->depot) - + getJournalBlockMapDataBlocksUsed(vdo->recoveryJournal)); -} - -/**********************************************************************/ -BlockCount getTotalBlockMapBlocks(const VDO *vdo) -{ - return (getNumberOfFixedBlockMapPages(vdo->blockMap) - + getJournalBlockMapDataBlocksUsed(vdo->recoveryJournal)); -} - -/**********************************************************************/ -WritePolicy getWritePolicy(const VDO *vdo) -{ - return vdo->loadConfig.writePolicy; -} - -/**********************************************************************/ -void setWritePolicy(VDO *vdo, WritePolicy new) -{ - vdo->loadConfig.writePolicy = new; -} - -/**********************************************************************/ -const VDOLoadConfig *getVDOLoadConfig(const VDO *vdo) -{ - return &vdo->loadConfig; -} - -/**********************************************************************/ -const ThreadConfig *getThreadConfig(const VDO *vdo) -{ - return vdo->loadConfig.threadConfig; -} - -/**********************************************************************/ -BlockCount getConfiguredBlockMapMaximumAge(const VDO *vdo) -{ - return vdo->loadConfig.maximumAge; -} - -/**********************************************************************/ -PageCount getConfiguredCacheSize(const VDO *vdo) -{ - return vdo->loadConfig.cacheSize; -} - -/**********************************************************************/ -PhysicalBlockNumber getFirstBlockOffset(const VDO *vdo) -{ - return vdo->loadConfig.firstBlockOffset; -} - -/**********************************************************************/ -BlockMap *getBlockMap(const VDO *vdo) -{ - return vdo->blockMap; -} - -/**********************************************************************/ -SlabDepot *getSlabDepot(VDO *vdo) -{ - return vdo->depot; -} - -/**********************************************************************/ -RecoveryJournal *getRecoveryJournal(VDO *vdo) -{ - return vdo->recoveryJournal; -} - -/**********************************************************************/ -void dumpVDOStatus(const VDO *vdo) -{ - dumpFlusher(vdo->flusher); - dumpRecoveryJournalStatistics(vdo->recoveryJournal); - dumpPacker(vdo->packer); - dumpSlabDepot(vdo->depot); - - const ThreadConfig *threadConfig = getThreadConfig(vdo); - for (ZoneCount zone = 0; zone < threadConfig->logicalZoneCount; zone++) { - dumpLogicalZone(getLogicalZone(vdo->logicalZones, zone)); - } - - for (ZoneCount zone = 0; zone < threadConfig->physicalZoneCount; zone++) { - dumpPhysicalZone(vdo->physicalZones[zone]); - } - - for (ZoneCount zone = 0; zone < threadConfig->hashZoneCount; zone++) { - dumpHashZone(vdo->hashZones[zone]); - } -} - -/**********************************************************************/ -void setVDOTracingFlags(VDO *vdo, bool vioTracing) -{ - vdo->vioTraceRecording = vioTracing; -} - -/**********************************************************************/ -bool vdoVIOTracingEnabled(const VDO *vdo) -{ - return ((vdo != NULL) && vdo->vioTraceRecording); -} - -/**********************************************************************/ -void assertOnAdminThread(VDO *vdo, const char *name) -{ - ASSERT_LOG_ONLY((getCallbackThreadID() - == getAdminThread(getThreadConfig(vdo))), - "%s called on admin thread", name); -} - -/**********************************************************************/ -void assertOnLogicalZoneThread(const VDO *vdo, - ZoneCount logicalZone, - const char *name) -{ - ASSERT_LOG_ONLY((getCallbackThreadID() - 
== getLogicalZoneThread(getThreadConfig(vdo), logicalZone)), - "%s called on logical thread", name); -} - -/**********************************************************************/ -void assertOnPhysicalZoneThread(const VDO *vdo, - ZoneCount physicalZone, - const char *name) -{ - ASSERT_LOG_ONLY((getCallbackThreadID() - == getPhysicalZoneThread(getThreadConfig(vdo), - physicalZone)), - "%s called on physical thread", name); -} - -/**********************************************************************/ -HashZone *selectHashZone(const VDO *vdo, const UdsChunkName *name) -{ - /* - * Use a fragment of the chunk name as a hash code. To ensure uniform - * distributions, it must not overlap with fragments used elsewhere. Eight - * bits of hash should suffice since the number of hash zones is small. - */ - // XXX Make a central repository for these offsets ala hashUtils. - // XXX Verify that the first byte is independent enough. - uint32_t hash = name->name[0]; - - /* - * Scale the 8-bit hash fragment to a zone index by treating it as a binary - * fraction and multiplying that by the zone count. If the hash is uniformly - * distributed over [0 .. 2^8-1], then (hash * count / 2^8) should be - * uniformly distributed over [0 .. count-1]. The multiply and shift is much - * faster than a divide (modulus) on X86 CPUs. - */ - return vdo->hashZones[(hash * getThreadConfig(vdo)->hashZoneCount) >> 8]; -} - -/**********************************************************************/ -int getPhysicalZone(const VDO *vdo, - PhysicalBlockNumber pbn, - PhysicalZone **zonePtr) -{ - if (pbn == ZERO_BLOCK) { - *zonePtr = NULL; - return VDO_SUCCESS; - } - - // Used because it does a more restrictive bounds check than getSlab(), and - // done first because it won't trigger read-only mode on an invalid PBN. - if (!isPhysicalDataBlock(vdo->depot, pbn)) { - return VDO_OUT_OF_RANGE; - } - - // With the PBN already checked, we should always succeed in finding a slab. - Slab *slab = getSlab(vdo->depot, pbn); - int result = ASSERT(slab != NULL, "getSlab must succeed on all valid PBNs"); - if (result != VDO_SUCCESS) { - return result; - } - - *zonePtr = vdo->physicalZones[getSlabZoneNumber(slab)]; - return VDO_SUCCESS; -} - -/**********************************************************************/ -ZonedPBN validateDedupeAdvice(VDO *vdo, - const DataLocation *advice, - LogicalBlockNumber lbn) -{ - ZonedPBN noAdvice = { .pbn = ZERO_BLOCK }; - if (advice == NULL) { - return noAdvice; - } - - // Don't use advice that's clearly meaningless. - if ((advice->state == MAPPING_STATE_UNMAPPED) - || (advice->pbn == ZERO_BLOCK)) { - logDebug("Invalid advice from deduplication server: pbn %llu, " - "state %u. Giving up on deduplication of logical block %llu", - advice->pbn, advice->state, lbn); - atomicAdd64(&vdo->errorStats.invalidAdvicePBNCount, 1); - return noAdvice; - } - - PhysicalZone *zone; - int result = getPhysicalZone(vdo, advice->pbn, &zone); - if ((result != VDO_SUCCESS) || (zone == NULL)) { - logDebug("Invalid physical block number from deduplication server: %" - PRIu64 ", giving up on deduplication of logical block %llu", - advice->pbn, lbn); - atomicAdd64(&vdo->errorStats.invalidAdvicePBNCount, 1); - return noAdvice; - } - - return (ZonedPBN) { - .pbn = advice->pbn, - .state = advice->state, - .zone = zone, - }; -} diff --git a/vdo/base/vdo.h b/vdo/base/vdo.h deleted file mode 100644 index 5741112..0000000 --- a/vdo/base/vdo.h +++ /dev/null @@ -1,272 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
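/*
 * Illustrative sketch, not part of the original source: the multiply-and-shift
 * mapping described in selectHashZone() above. An 8-bit hash fragment is
 * treated as a binary fraction and scaled by the zone count, so 0..255 maps
 * uniformly onto 0..zoneCount-1 without a divide. With 4 zones, bytes 0..63
 * land in zone 0, 64..127 in zone 1, 128..191 in zone 2, and 192..255 in
 * zone 3.
 */
#include <stdint.h>

static unsigned int zoneForHashByte(uint8_t hashByte, unsigned int zoneCount)
{
  return ((unsigned int) hashByte * zoneCount) >> 8;
}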
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdo.h#3 $ - */ - -#ifndef VDO_H -#define VDO_H - -#include "types.h" - -/** - * Allocate a VDO and associate it with its physical layer. - * - * @param [in] layer The physical layer the VDO sits on - * @param [out] vdoPtr A pointer to hold the allocated VDO - * - * @return VDO_SUCCESS or an error - **/ -int allocateVDO(PhysicalLayer *layer, VDO **vdoPtr) - __attribute__((warn_unused_result)); - -/** - * Construct a VDO for use in user space with a synchronous layer. - * - * @param [in] layer The physical layer the VDO sits on - * @param [out] vdoPtr A pointer to hold the allocated VDO - * - * @return VDO_SUCCESS or an error - **/ -int makeVDO(PhysicalLayer *layer, VDO **vdoPtr) - __attribute__((warn_unused_result)); - -/** - * Destroy a VDO instance. - * - * @param vdo The VDO to destroy - **/ -void destroyVDO(VDO *vdo); - -/** - * Destroy a VDO instance, free it, and null out the reference to it. - * - * @param vdoPtr A reference to the VDO to free - **/ -void freeVDO(VDO **vdoPtr); - -/** - * Put a VDO into read-only mode and save the read-only state in the super - * block. - * - * @param vdo The VDO to put into read-only mode - * @param errorCode The error which caused the VDO to enter read-only - * mode - **/ -void makeVDOReadOnly(VDO *vdo, int errorCode); - -/** - * Set whether compression is enabled in VDO. - * - * @param vdo The VDO - * @param enableCompression Whether to enable compression in VDO - * - * @return State of compression before new value is set - **/ -bool setVDOCompressing(VDO *vdo, bool enableCompression); - -/** - * Get whether compression is enabled in VDO. - * - * @param vdo The VDO - * - * @return State of compression - **/ -bool getVDOCompressing(VDO *vdo); - -/** - * Get the VDO statistics. - * - * @param [in] vdo The VDO - * @param [out] stats The VDO statistics are returned here - **/ -void getVDOStatistics(const VDO *vdo, VDOStatistics *stats); - -/** - * Get the number of physical blocks in use by user data. - * - * @param vdo The VDO - * - * @return The number of blocks allocated for user data - **/ -BlockCount getPhysicalBlocksAllocated(const VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Get the number of unallocated physical blocks. - * - * @param vdo The VDO - * - * @return The number of free blocks - **/ -BlockCount getPhysicalBlocksFree(const VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Get the number of physical blocks used by VDO metadata. - * - * @param vdo The VDO - * - * @return The number of overhead blocks - **/ -BlockCount getPhysicalBlocksOverhead(const VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Get the total number of blocks used for the block map. 
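/*
 * Illustrative sketch, not part of the original source: a caller-side use of
 * getVDOStatistics() to derive the usual space-savings figure. The field names
 * follow the statistics populated in getVDOStatistics(); the helper itself is
 * hypothetical.
 */
static double computeSavingsPercent(const VDO *vdo)
{
  VDOStatistics stats;
  getVDOStatistics(vdo, &stats);
  if (stats.logicalBlocksUsed == 0) {
    return 0.0;
  }
  double physicalPerLogical
    = ((double) stats.dataBlocksUsed / (double) stats.logicalBlocksUsed);
  return (1.0 - physicalPerLogical) * 100.0;  // percent of user data saved
}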
- * - * @param vdo The VDO - * - * @return The number of block map blocks - **/ -BlockCount getTotalBlockMapBlocks(const VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Get the VDO write policy. - * - * @param vdo The VDO - * - * @return The write policy - **/ -WritePolicy getWritePolicy(const VDO *vdo); - -/** - * Set the VDO write policy. - * - * @param vdo The VDO - * @param new The new write policy - **/ -void setWritePolicy(VDO *vdo, WritePolicy new); - -/** - * Get a copy of the load-time configuration of the VDO. - * - * @param vdo The VDO - * - * @return The load-time configuration of the VDO - **/ -const VDOLoadConfig *getVDOLoadConfig(const VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Get the thread config of the VDO. - * - * @param vdo The VDO - * - * @return The thread config - **/ -const ThreadConfig *getThreadConfig(const VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Get the configured maximum age of a dirty block map page. - * - * @param vdo The VDO - * - * @return The block map era length - **/ -BlockCount getConfiguredBlockMapMaximumAge(const VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Get the configured page cache size of the VDO. - * - * @param vdo The VDO - * - * @return The number of pages for the page cache - **/ -PageCount getConfiguredCacheSize(const VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Get the location of the first block of the VDO. - * - * @param vdo The VDO - * - * @return The location of the first block managed by the VDO - **/ -PhysicalBlockNumber getFirstBlockOffset(const VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Check whether the VDO was new when it was loaded. - * - * @param vdo The VDO to query - * - * @return true if the VDO was new - **/ -bool wasNew(const VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Check whether a DataLocation containing potential dedupe advice is - * well-formed and addresses a data block in one of the configured physical - * zones of the VDO. If it is, return the location and zone as a ZonedPBN; - * otherwise increment statistics tracking invalid advice and return an - * unmapped ZonedPBN. - * - * @param vdo The VDO - * @param advice The advice to validate (NULL indicates no advice) - * @param lbn The logical block number of the write that requested advice, - * which is only used for debug-level logging of invalid advice - * - * @return The ZonedPBN representing the advice, if valid, otherwise an - * unmapped ZonedPBN if the advice was invalid or NULL - **/ -ZonedPBN validateDedupeAdvice(VDO *vdo, - const DataLocation *advice, - LogicalBlockNumber lbn) - __attribute__((warn_unused_result)); - -// TEST SUPPORT ONLY BEYOND THIS POINT - -/** - * Dump status information about VDO to the log for debugging. - * - * @param vdo The vdo to dump - **/ -void dumpVDOStatus(const VDO *vdo); - -/** - * Set the VIO tracing flag. - * - * @param vdo The VDO - * @param vioTracing Whether VIO tracing is enabled for this device - **/ -void setVDOTracingFlags(VDO *vdo, bool vioTracing); - -/** - * Indicate whether VIO tracing is enabled. - * - * @param vdo The VDO - * - * @return Whether VIO tracing is enabled - **/ -bool vdoVIOTracingEnabled(const VDO *vdo); - -/** - * Indicate whether extent tracing is enabled. 
- * - * @param vdo The VDO - * - * @return Whether extent tracing is enabled - **/ -bool vdoExtentTracingEnabled(const VDO *vdo); - -#endif /* VDO_H */ diff --git a/vdo/base/vdoDebug.c b/vdo/base/vdoDebug.c deleted file mode 100644 index 6c03ece..0000000 --- a/vdo/base/vdoDebug.c +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoDebug.c#1 $ - */ - -#include "vdoDebug.h" - -#include "logger.h" -#include "stringUtils.h" -#include "vdoInternal.h" - -static const char xLogDebugMessage[] = "x-log-debug-message"; - -/**********************************************************************/ -int initializeVDOCommandCompletion(VDOCommandCompletion *command, - VDO *vdo, - int argc, - char **argv) -{ - *command = (VDOCommandCompletion) { - .vdo = vdo, - .argc = argc, - .argv = argv, - }; - initializeCompletion(&command->completion, VDO_COMMAND_COMPLETION, - vdo->layer); - return initializeEnqueueableCompletion(&command->subCompletion, - VDO_COMMAND_SUB_COMPLETION, - vdo->layer); -} - -/**********************************************************************/ -int destroyVDOCommandCompletion(VDOCommandCompletion *command) -{ - if (command == NULL) { - return VDO_SUCCESS; - } - - destroyEnqueueable(&command->subCompletion); - return command->completion.result; -} - -/**********************************************************************/ -static inline VDOCommandCompletion * -asVDOCommandCompletion(VDOCompletion *completion) -{ - if (completion->type == VDO_COMMAND_COMPLETION) { - return (VDOCommandCompletion *) - ((uintptr_t) completion - offsetof(VDOCommandCompletion, completion)); - } else if (completion->type == VDO_COMMAND_SUB_COMPLETION) { - return (VDOCommandCompletion *) - ((uintptr_t) completion - offsetof(VDOCommandCompletion, subCompletion)); - } else { - ASSERT_LOG_ONLY(((completion->type == VDO_COMMAND_COMPLETION) || - (completion->type == VDO_COMMAND_SUB_COMPLETION)), - "completion type is %s instead of " - "VDO_COMMAND_COMPLETION or VDO_COMMAND_SUB_COMPLETION", - getCompletionTypeName(completion->type)); - return NULL; - } -} - -/**********************************************************************/ -static void logDebugMessage(VDOCommandCompletion *cmd) -{ - static char buffer[256]; - - char *buf = buffer; - char *end = buffer + sizeof(buffer); - - for (int i = 1; i < cmd->argc; ++i) { - buf = appendToBuffer(buf, end, " %s", cmd->argv[i]); - } - if (buf == end) { - strcpy(buf - 4, "..."); - } - logInfo("debug message:%s", buffer); - finishCompletion(&cmd->completion, VDO_SUCCESS); -} - -/**********************************************************************/ -void executeVDOExtendedCommand(VDOCompletion *completion) -{ - VDOCommandCompletion *cmd = 
asVDOCommandCompletion(completion); - - if ((cmd->vdo == NULL) || (cmd->argc == 0)) { - finishCompletion(&cmd->completion, VDO_COMMAND_ERROR); - return; - } - if (strcmp(cmd->argv[0], xLogDebugMessage) == 0) { - logDebugMessage(cmd); - } else { - finishCompletion(&cmd->completion, VDO_UNKNOWN_COMMAND); - } -} diff --git a/vdo/base/vdoDebug.h b/vdo/base/vdoDebug.h deleted file mode 100644 index c626533..0000000 --- a/vdo/base/vdoDebug.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoDebug.h#1 $ - */ - -#ifndef VDO_DEBUG_H -#define VDO_DEBUG_H - -#include "completion.h" -#include "vdo.h" - -/** - * A completion used to pass information to a potentially asynchronous - * (because it must run in a different zone) extended command. - * - * These commands are dispatched according to argv[0], which is of the form - * "x-some-command-name", and intentionally open ended for debugging. - * - * The command "x-log-debug-message" is currently defined to echo the - * remainder of the arguments into the kernel log via the vdo logger at - * info level. - **/ -typedef struct vdoCommandCompletion { - VDOCompletion completion; - VDOCompletion subCompletion; - VDO *vdo; - int argc; - char **argv; -} VDOCommandCompletion; - -/** - * Initialize a VDO command completion. - * - * @param command The command completion to initialize. - * @param vdo The VDO. - * @param argc An argument count. - * @param argv An argument vector of length argc. - * - * @return VDO_SUCCESS or an error code - **/ -int initializeVDOCommandCompletion(VDOCommandCompletion *command, - VDO *vdo, - int argc, - char **argv); - -/** - * Destroy a VDO command completion. - * - * @param command The command completion. - * - * @return the completion result - **/ -int destroyVDOCommandCompletion(VDOCommandCompletion *command); - -/** - * Perform an asynchronous extended command (usually debugging related). - * - * @param completion The completion embedded in VDOCommandCompletion. - **/ -void executeVDOExtendedCommand(VDOCompletion *completion); - -#endif // VDO_DEBUG_H diff --git a/vdo/base/vdoInternal.h b/vdo/base/vdoInternal.h deleted file mode 100644 index 1337e73..0000000 --- a/vdo/base/vdoInternal.h +++ /dev/null @@ -1,410 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoInternal.h#11 $ - */ - -#ifndef VDO_INTERNAL_H -#define VDO_INTERNAL_H - -#include "vdo.h" - -#include "adminCompletion.h" -#include "adminState.h" -#include "atomic.h" -#include "header.h" -#include "packer.h" -#include "statistics.h" -#include "superBlock.h" -#include "readOnlyNotifier.h" -#include "types.h" -#include "uds.h" -#include "vdoLayout.h" -#include "vdoState.h" - -/** - * Error counters are atomic since updates can arrive concurrently from - * arbitrary threads. - **/ -typedef struct atomicErrorStatistics { - // Dedupe path error stats - Atomic64 invalidAdvicePBNCount; - Atomic64 noSpaceErrorCount; - Atomic64 readOnlyErrorCount; -} AtomicErrorStatistics; - -struct vdo { - /* The state of this VDO */ - VDOState state; - /* The read-only notifier */ - ReadOnlyNotifier *readOnlyNotifier; - /* The number of times this VDO has recovered from a dirty state */ - uint64_t completeRecoveries; - /* The number of times this VDO has recovered from a read-only state */ - uint64_t readOnlyRecoveries; - /* The format-time configuration of this VDO */ - VDOConfig config; - /* The load-time configuration of this VDO */ - VDOLoadConfig loadConfig; - /* The nonce for this VDO */ - Nonce nonce; - - /* The super block */ - SuperBlock *superBlock; - - /* The physical storage below us */ - PhysicalLayer *layer; - - /* Our partitioning of the physical layer's storage */ - VDOLayout *layout; - - /* The block map */ - BlockMap *blockMap; - - /* The journal for block map recovery */ - RecoveryJournal *recoveryJournal; - - /* The slab depot */ - SlabDepot *depot; - - /* The compressed-block packer */ - Packer *packer; - /* Whether incoming data should be compressed */ - AtomicBool compressing; - - /* The handler for flush requests */ - Flusher *flusher; - - /* The master version of the VDO when loaded (for upgrading) */ - VersionNumber loadVersion; - /* The state the VDO was in when loaded (primarily for unit tests) */ - VDOState loadState; - /* Whether VIO tracing is enabled */ - bool vioTraceRecording; - - /* The logical zones of this VDO */ - LogicalZones *logicalZones; - - /* The physical zones of this VDO */ - PhysicalZone **physicalZones; - - /* The hash lock zones of this VDO */ - HashZone **hashZones; - - /* The completion for administrative operations */ - AdminCompletion adminCompletion; - - /* The administrative state of the VDO */ - AdminState adminState; - - /* Whether a close is required */ - bool closeRequired; - - /* Atomic global counts of error events */ - AtomicErrorStatistics errorStats; -}; - -/** - * Get the component data size of a VDO. - * - * @param vdo The VDO whose component data size is desired - * - * @return the component data size of the VDO - **/ -size_t getComponentDataSize(VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Encode the VDO and save the super block synchronously. - * - * @param vdo The VDO whose state is being saved - * - * @return VDO_SUCCESS or an error - **/ -int saveVDOComponents(VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Encode the VDO and save the super block asynchronously. 
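/*
 * Illustrative sketch, not part of the original source: the dispatch-by-argv[0]
 * convention described for extended debug commands, reduced to a standalone
 * helper. It uses stdio instead of the VDO logger and completion machinery,
 * and the return codes only informally mirror VDO_COMMAND_ERROR and
 * VDO_UNKNOWN_COMMAND.
 */
#include <stdio.h>
#include <string.h>

static int dispatchExtendedCommand(int argc, char **argv)
{
  if ((argc == 0) || (argv == NULL)) {
    return -1;                             // no command given
  }
  if (strcmp(argv[0], "x-log-debug-message") == 0) {
    printf("debug message:");
    for (int i = 1; i < argc; i++) {
      printf(" %s", argv[i]);              // echo remaining arguments
    }
    printf("\n");
    return 0;
  }
  return -2;                               // unknown extended command
}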
All non-user mode - * super block savers should use this bottle neck instead of calling - * saveSuperBlockAsync() directly. - * - * @param vdo The VDO whose state is being saved - * @param parent The completion to notify when the save is complete - **/ -void saveVDOComponentsAsync(VDO *vdo, VDOCompletion *parent); - -/** - * Re-encode the VDO component after a reconfiguration and save the super - * block synchronously. This function avoids the need to decode and re-encode - * the other components by simply copying their previous encoding. - * - * @param vdo The VDO which was reconfigured - * - * @return VDO_SUCCESS or an error code - **/ -int saveReconfiguredVDO(VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Decode the VDO master version from the component data buffer in the super - * block and store it in the VDO's loadVersion field. - **/ -int decodeVDOVersion(VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Loads the VDO master version into the VDO and checks that the version - * can be understood by VDO. - * - * @param vdo The VDO to validate - * - * @return VDO_SUCCESS or an error if the loaded version is not supported - **/ -int validateVDOVersion(VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Decode the component data for the VDO itself from the component data buffer - * in the super block. - * - * @param vdo The VDO to decode - * - * @return VDO_SUCCESS or an error - **/ -int decodeVDOComponent(VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Validate constraints on VDO config. - * - * @param config The VDO config - * @param blockCount The block count of the VDO - * @param requireLogical Set to true if the number logical blocks - * must be configured (otherwise, it may be zero) - * - * @return a success or error code - **/ -int validateVDOConfig(const VDOConfig *config, - BlockCount blockCount, - bool requireLogical) - __attribute__((warn_unused_result)); - -/** - * Enable a VDO to enter read-only mode on errors. - * - * @param vdo The VDO to enable - * - * @return VDO_SUCCESS or an error - **/ -int enableReadOnlyEntry(VDO *vdo); - -/** - * Get the block map. - * - * @param vdo The VDO whose block map is desired - * - * @return the block map from the VDO - **/ -BlockMap *getBlockMap(const VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Get the slab depot from a VDO. - * - * @param vdo The VDO whose slab depot is desired - * - * @return the slab depot from the VDO - **/ -SlabDepot *getSlabDepot(VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Get the recovery journal from a VDO. - * - * @param vdo The VDO whose recovery journal is desired - * - * @return the recovery journal from the VDO - **/ -RecoveryJournal *getRecoveryJournal(VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Check whether a VDO is in read-only mode. - * - * @param vdo The VDO to query - * - * @return true if the VDO is in read-only mode - **/ -bool inReadOnlyMode(const VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Check whether the VDO is in a clean state. - * - * @param vdo The VDO to query - * - * @return true if the VDO is clean - **/ -bool isClean(const VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Check whether the VDO was in a clean state when it was loaded. - * - * @param vdo The VDO to query - * - * @return true if the VDO was clean - **/ -bool wasClean(const VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Check whether the VDO requires a read-only mode rebuild. 
- * - * @param vdo The VDO to query - * - * @return true if the VDO requires a read-only rebuild - **/ -bool requiresReadOnlyRebuild(const VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Check whether a VDO requires rebuilding. - * - * @param vdo The VDO to query - * - * @return true if the VDO must be rebuilt - **/ -bool requiresRebuild(const VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Check whether a VDO should enter recovery mode. - * - * @param vdo The VDO to query - * - * @return true if the VDO requires recovery - **/ -bool requiresRecovery(const VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Check whether a VDO was replaying the recovery journal into the block map - * when it crashed. - * - * @param vdo The VDO to query - * - * @return true if the VDO crashed while reconstructing the - * block map - **/ -bool isReplaying(const VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Check whether the VDO is in recovery mode. - * - * @param vdo The VDO to query - * - * @return true if the VDO is in recovery mode - **/ -bool inRecoveryMode(const VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Put the VDO into recovery mode - * - * @param vdo The VDO - **/ -void enterRecoveryMode(VDO *vdo); - -/** - * Leave recovery mode if slab scrubbing has actually finished. - * - * @param vdo The VDO - **/ -void leaveRecoveryMode(VDO *vdo); - -/** - * Assert that we are running on the admin thread. - * - * @param vdo The VDO - * @param name The name of the function which should be running on the admin - * thread (for logging). - **/ -void assertOnAdminThread(VDO *vdo, const char *name); - -/** - * Assert that this function was called on the specified logical zone thread. - * - * @param vdo The VDO - * @param logicalZone The number of the logical zone - * @param name The name of the calling function - **/ -void assertOnLogicalZoneThread(const VDO *vdo, - ZoneCount logicalZone, - const char *name); - -/** - * Assert that this function was called on the specified physical zone thread. - * - * @param vdo The VDO - * @param physicalZone The number of the physical zone - * @param name The name of the calling function - **/ -void assertOnPhysicalZoneThread(const VDO *vdo, - ZoneCount physicalZone, - const char *name); - -/** - * Select the hash zone responsible for locking a given chunk name. - * - * @param vdo The VDO containing the hash zones - * @param name The chunk name - * - * @return The hash zone responsible for the chunk name - **/ -HashZone *selectHashZone(const VDO *vdo, const UdsChunkName *name) - __attribute__((warn_unused_result)); - -/** - * Get the physical zone responsible for a given physical block number of a - * data block in this VDO instance, or of the zero block (for which a NULL - * zone is returned). For any other block number that is not in the range of - * valid data block numbers in any slab, an error will be returned. This - * function is safe to call on invalid block numbers; it will not put the VDO - * into read-only mode. 
- * - * @param [in] vdo The VDO containing the physical zones - * @param [in] pbn The PBN of the data block - * @param [out] zonePtr A pointer to return the physical zone - * - * @return VDO_SUCCESS or VDO_OUT_OF_RANGE if the block number is invalid - * or an error code for any other failure - **/ -int getPhysicalZone(const VDO *vdo, - PhysicalBlockNumber pbn, - PhysicalZone **zonePtr) - __attribute__((warn_unused_result)); - -/**********************************************************************/ -// Asynchronous callback to share a duplicate block. This is only public so -// test code may compare it against the current callback in the completion. -void shareBlock(VDOCompletion *completion); - -#endif /* VDO_INTERNAL_H */ diff --git a/vdo/base/vdoLayout.c b/vdo/base/vdoLayout.c deleted file mode 100644 index 3dfce96..0000000 --- a/vdo/base/vdoLayout.c +++ /dev/null @@ -1,423 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoLayout.c#2 $ - */ - -#include "vdoLayout.h" -#include "vdoLayoutInternals.h" - -#include "logger.h" -#include "memoryAlloc.h" - -#include "blockMap.h" -#include "partitionCopy.h" -#include "slab.h" -#include "slabSummary.h" -#include "types.h" -#include "vdoInternal.h" - -#include "statusCodes.h" - -static const PartitionID REQUIRED_PARTITIONS[] = { - BLOCK_MAP_PARTITION, - BLOCK_ALLOCATOR_PARTITION, - RECOVERY_JOURNAL_PARTITION, - SLAB_SUMMARY_PARTITION, -}; - -static const uint8_t REQUIRED_PARTITION_COUNT = 4; - -/** - * Make a fixed layout for a VDO. 
- * - * @param [in] physicalBlocks The number of physical blocks in the VDO - * @param [in] startingOffset The starting offset of the layout - * @param [in] blockMapBlocks The size of the block map partition - * @param [in] journalBlocks The size of the journal partition - * @param [in] summaryBlocks The size of the slab summary partition - * @param [out] layoutPtr A pointer to hold the new FixedLayout - * - * @return VDO_SUCCESS or an error - **/ -__attribute__((warn_unused_result)) -static int makeVDOFixedLayout(BlockCount physicalBlocks, - PhysicalBlockNumber startingOffset, - BlockCount blockMapBlocks, - BlockCount journalBlocks, - BlockCount summaryBlocks, - FixedLayout **layoutPtr) -{ - BlockCount necessarySize - = (startingOffset + blockMapBlocks + journalBlocks + summaryBlocks); - if (necessarySize > physicalBlocks) { - return logErrorWithStringError(VDO_NO_SPACE, "Not enough space to" - " make a VDO"); - } - - FixedLayout *layout; - int result = makeFixedLayout(physicalBlocks - startingOffset, - startingOffset, &layout); - if (result != VDO_SUCCESS) { - return result; - } - - result = makeFixedLayoutPartition(layout, BLOCK_MAP_PARTITION, - blockMapBlocks, FROM_BEGINNING, 0); - if (result != VDO_SUCCESS) { - freeFixedLayout(&layout); - return result; - } - - result = makeFixedLayoutPartition(layout, SLAB_SUMMARY_PARTITION, - summaryBlocks, FROM_END, 0); - if (result != VDO_SUCCESS) { - freeFixedLayout(&layout); - return result; - } - - result = makeFixedLayoutPartition(layout, RECOVERY_JOURNAL_PARTITION, - journalBlocks, FROM_END, 0); - if (result != VDO_SUCCESS) { - freeFixedLayout(&layout); - return result; - } - - /* - * The block allocator no longer traffics in relative PBNs so the offset - * doesn't matter. We need to keep this partition around both for upgraded - * systems, and because we decided that all of the usable space in the - * volume, other than the super block, should be part of some partition. - */ - result = makeFixedLayoutPartition(layout, BLOCK_ALLOCATOR_PARTITION, - ALL_FREE_BLOCKS, FROM_BEGINNING, - blockMapBlocks); - if (result != VDO_SUCCESS) { - freeFixedLayout(&layout); - return result; - } - - *layoutPtr = layout; - return VDO_SUCCESS; -} - -/** - * Get the offset of a given partition. 
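/*
 * Illustrative worked example, not part of the original source: the partition
 * arrangement produced by makeVDOFixedLayout() above, assuming FROM_END
 * allocations stack backward from the end of the layout in creation order.
 * All numbers are hypothetical:
 *
 *   physicalBlocks = 1000, startingOffset = 1 (super block),
 *   blockMapBlocks = 100, journalBlocks = 64, summaryBlocks = 16
 *
 *   blocks   1 .. 100   block map        (100 blocks, FROM_BEGINNING)
 *   blocks 101 .. 919   block allocator  (819 blocks, remaining space)
 *   blocks 920 .. 983   recovery journal ( 64 blocks, FROM_END, second)
 *   blocks 984 .. 999   slab summary     ( 16 blocks, FROM_END, first)
 */
#include <stdint.h>

static uint64_t allocatorPartitionBlocks(uint64_t physicalBlocks,
                                         uint64_t startingOffset,
                                         uint64_t blockMapBlocks,
                                         uint64_t journalBlocks,
                                         uint64_t summaryBlocks)
{
  // Everything not claimed by the fixed-size partitions goes to the allocator:
  // 1000 - 1 - 100 - 64 - 16 = 819 in the example above.
  return (physicalBlocks - startingOffset - blockMapBlocks - journalBlocks
          - summaryBlocks);
}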
- * - * @param layout The layout containing the partition - * @param partitionID The ID of the partition whose offset is desired - * - * @return The offset of the partition (in blocks) - **/ -__attribute__((warn_unused_result)) -static BlockCount getPartitionOffset(VDOLayout *layout, - PartitionID partitionID) -{ - return getFixedLayoutPartitionOffset(getVDOPartition(layout, partitionID)); -} - -/**********************************************************************/ -int makeVDOLayout(BlockCount physicalBlocks, - PhysicalBlockNumber startingOffset, - BlockCount blockMapBlocks, - BlockCount journalBlocks, - BlockCount summaryBlocks, - VDOLayout **vdoLayoutPtr) -{ - VDOLayout *vdoLayout; - int result = ALLOCATE(1, VDOLayout, __func__, &vdoLayout); - if (result != VDO_SUCCESS) { - return result; - } - - result = makeVDOFixedLayout(physicalBlocks, startingOffset, blockMapBlocks, - journalBlocks, summaryBlocks, &vdoLayout->layout); - if (result != VDO_SUCCESS) { - freeVDOLayout(&vdoLayout); - return result; - } - - vdoLayout->startingOffset = startingOffset; - - *vdoLayoutPtr = vdoLayout; - return VDO_SUCCESS; -} - -/**********************************************************************/ -int decodeVDOLayout(Buffer *buffer, VDOLayout **vdoLayoutPtr) -{ - VDOLayout *vdoLayout; - int result = ALLOCATE(1, VDOLayout, __func__, &vdoLayout); - if (result != VDO_SUCCESS) { - return result; - } - - result = decodeFixedLayout(buffer, &vdoLayout->layout); - if (result != VDO_SUCCESS) { - freeVDOLayout(&vdoLayout); - return result; - } - - // Check that all the expected partitions exist - Partition *partition; - for (uint8_t i = 0; i < REQUIRED_PARTITION_COUNT; i++) { - result = getPartition(vdoLayout->layout, REQUIRED_PARTITIONS[i], - &partition); - if (result != VDO_SUCCESS) { - freeVDOLayout(&vdoLayout); - return logErrorWithStringError(result, - "VDO layout is missing required partition" - " %u", REQUIRED_PARTITIONS[i]); - } - } - - // XXX Assert this is the same as where we loaded the super block. - vdoLayout->startingOffset - = getPartitionOffset(vdoLayout, BLOCK_MAP_PARTITION); - - *vdoLayoutPtr = vdoLayout; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freeVDOLayout(VDOLayout **vdoLayoutPtr) -{ - VDOLayout *vdoLayout = *vdoLayoutPtr; - if (vdoLayout == NULL) { - return; - } - - freeCopyCompletion(&vdoLayout->copyCompletion); - freeFixedLayout(&vdoLayout->nextLayout); - freeFixedLayout(&vdoLayout->layout); - freeFixedLayout(&vdoLayout->previousLayout); - FREE(vdoLayout); - *vdoLayoutPtr = NULL; -} - -/** - * Get a partition from a FixedLayout in conditions where we expect that it can - * not fail. - * - * @param layout The FixedLayout from which to get the partition - * @param id The ID of the partition to retrieve - * - * @return The desired partition - **/ -__attribute__((warn_unused_result)) -static Partition *retrievePartition(FixedLayout *layout, PartitionID id) -{ - Partition *partition; - int result = getPartition(layout, id, &partition); - ASSERT_LOG_ONLY(result == VDO_SUCCESS, "VDOLayout has expected partition"); - return partition; -} - -/**********************************************************************/ -Partition *getVDOPartition(VDOLayout *vdoLayout, PartitionID id) -{ - return retrievePartition(vdoLayout->layout, id); -} - -/** - * Get a partition from a VDOLayout's next FixedLayout. This method should - * only be called when the VDOLayout is prepared to grow. 
- * - * @param vdoLayout The VDOLayout from which to get the partition - * @param id The ID of the desired partition - * - * @return The requested partition - **/ -__attribute__((warn_unused_result)) -static Partition *getPartitionFromNextLayout(VDOLayout *vdoLayout, - PartitionID id) -{ - ASSERT_LOG_ONLY(vdoLayout->nextLayout != NULL, - "VDOLayout is prepared to grow"); - return retrievePartition(vdoLayout->nextLayout, id); -} - -/** - * Get the size of a given partition. - * - * @param layout The layout containing the partition - * @param partitionID The partition ID whose size to find - * - * @return The size of the partition (in blocks) - **/ -__attribute__((warn_unused_result)) -static BlockCount getPartitionSize(VDOLayout *layout, PartitionID partitionID) -{ - return getFixedLayoutPartitionSize(getVDOPartition(layout, partitionID)); -} - -/**********************************************************************/ -int prepareToGrowVDOLayout(VDOLayout *vdoLayout, - BlockCount oldPhysicalBlocks, - BlockCount newPhysicalBlocks, - PhysicalLayer *layer) -{ - if (getNextVDOLayoutSize(vdoLayout) == newPhysicalBlocks) { - // We are already prepared to grow to the new size, so we're done. - return VDO_SUCCESS; - } - - // Make a copy completion if there isn't one - if (vdoLayout->copyCompletion == NULL) { - int result = makeCopyCompletion(layer, &vdoLayout->copyCompletion); - if (result != VDO_SUCCESS) { - return result; - } - } - - // Free any unused preparation. - freeFixedLayout(&vdoLayout->nextLayout); - - // Make a new layout with the existing partition sizes for everything but the - // block allocator partition. - int result = makeVDOFixedLayout(newPhysicalBlocks, - vdoLayout->startingOffset, - getPartitionSize(vdoLayout, - BLOCK_MAP_PARTITION), - getPartitionSize(vdoLayout, - RECOVERY_JOURNAL_PARTITION), - getPartitionSize(vdoLayout, - SLAB_SUMMARY_PARTITION), - &vdoLayout->nextLayout); - if (result != VDO_SUCCESS) { - freeCopyCompletion(&vdoLayout->copyCompletion); - return result; - } - - // Ensure the new journal and summary are entirely within the added blocks. - Partition *slabSummaryPartition - = getPartitionFromNextLayout(vdoLayout, SLAB_SUMMARY_PARTITION); - Partition *recoveryJournalPartition - = getPartitionFromNextLayout(vdoLayout, RECOVERY_JOURNAL_PARTITION); - BlockCount minNewSize - = (oldPhysicalBlocks - + getFixedLayoutPartitionSize(slabSummaryPartition) - + getFixedLayoutPartitionSize(recoveryJournalPartition)); - if (minNewSize > newPhysicalBlocks) { - // Copying the journal and summary would destroy some old metadata. - freeFixedLayout(&vdoLayout->nextLayout); - freeCopyCompletion(&vdoLayout->copyCompletion); - return VDO_INCREMENT_TOO_SMALL; - } - - return VDO_SUCCESS; -} - -/** - * Get the size of a VDO from the specified FixedLayout and the - * starting offset thereof. - * - * @param layout The fixed layout whose size to use - * @param startingOffset The starting offset of the layout - * - * @return The total size of a VDO (in blocks) with the given layout - **/ -__attribute__((warn_unused_result)) -static BlockCount getVDOSize(FixedLayout *layout, BlockCount startingOffset) -{ - // The FixedLayout does not include the super block or any earlier - // metadata; all that is captured in the VDOLayout's starting offset - return getTotalFixedLayoutSize(layout) + startingOffset; -} - -/**********************************************************************/ -BlockCount getNextVDOLayoutSize(VDOLayout *vdoLayout) -{ - return ((vdoLayout->nextLayout == NULL) - ? 
0 : getVDOSize(vdoLayout->nextLayout, vdoLayout->startingOffset)); -} - -/**********************************************************************/ -BlockCount getNextBlockAllocatorPartitionSize(VDOLayout *vdoLayout) -{ - if (vdoLayout->nextLayout == NULL) { - return 0; - } - - Partition *partition = getPartitionFromNextLayout(vdoLayout, - BLOCK_ALLOCATOR_PARTITION); - return getFixedLayoutPartitionSize(partition); -} - -/**********************************************************************/ -BlockCount growVDOLayout(VDOLayout *vdoLayout) -{ - ASSERT_LOG_ONLY(vdoLayout->nextLayout != NULL, - "VDO prepared to grow physical"); - vdoLayout->previousLayout = vdoLayout->layout; - vdoLayout->layout = vdoLayout->nextLayout; - vdoLayout->nextLayout = NULL; - - return getVDOSize(vdoLayout->layout, vdoLayout->startingOffset); -} - -/**********************************************************************/ -BlockCount revertVDOLayout(VDOLayout *vdoLayout) -{ - if ((vdoLayout->previousLayout != NULL) - && (vdoLayout->previousLayout != vdoLayout->layout)) { - // Only revert if there's something to revert to. - freeFixedLayout(&vdoLayout->layout); - vdoLayout->layout = vdoLayout->previousLayout; - vdoLayout->previousLayout = NULL; - } - - return getVDOSize(vdoLayout->layout, vdoLayout->startingOffset); -} - -/**********************************************************************/ -void finishVDOLayoutGrowth(VDOLayout *vdoLayout) -{ - if (vdoLayout->layout != vdoLayout->previousLayout) { - freeFixedLayout(&vdoLayout->previousLayout); - } - - if (vdoLayout->layout != vdoLayout->nextLayout) { - freeFixedLayout(&vdoLayout->nextLayout); - } - - freeCopyCompletion(&vdoLayout->copyCompletion); -} - -/**********************************************************************/ -void copyPartition(VDOLayout *layout, - PartitionID partitionID, - VDOCompletion *parent) -{ - copyPartitionAsync(layout->copyCompletion, - getVDOPartition(layout, partitionID), - getPartitionFromNextLayout(layout, partitionID), parent); -} - -/**********************************************************************/ -size_t getVDOLayoutEncodedSize(const VDOLayout *vdoLayout) -{ - return getFixedLayoutEncodedSize(vdoLayout->layout); -} - -/**********************************************************************/ -int encodeVDOLayout(const VDOLayout *vdoLayout, Buffer *buffer) -{ - return encodeFixedLayout(vdoLayout->layout, buffer); -} - diff --git a/vdo/base/vdoLayout.h b/vdo/base/vdoLayout.h deleted file mode 100644 index 3de24ae..0000000 --- a/vdo/base/vdoLayout.h +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoLayout.h#2 $ - */ - -/** - * VDOLayout is an object which manages the layout of a VDO. 
It wraps - * FixedLayout, but includes the knowledge of exactly which partitions a VDO is - * expected to have. Because of this knowledge, the VDOLayout validates the - * FixedLayout encoded in the super block at load time, obviating the need for - * subsequent error checking when other modules need to get partitions from the - * layout. - * - * The VDOLayout also manages the preparation and growth of the layout for grow - * physical operations. - **/ - -#ifndef VDO_LAYOUT_H -#define VDO_LAYOUT_H - -#include "fixedLayout.h" -#include "types.h" - -/** - * Make a VDO layout with the specified parameters. - * - * @param [in] physicalBlocks The number of physical blocks in the VDO - * @param [in] startingOffset The starting offset of the layout - * @param [in] blockMapBlocks The size of the block map partition - * @param [in] journalBlocks The size of the journal partition - * @param [in] summaryBlocks The size of the slab summary partition - * @param [out] vdoLayoutPtr A pointer to hold the new VDOLayout - * - * @return VDO_SUCCESS or an error - **/ -int makeVDOLayout(BlockCount physicalBlocks, - PhysicalBlockNumber startingOffset, - BlockCount blockMapBlocks, - BlockCount journalBlocks, - BlockCount summaryBlocks, - VDOLayout **vdoLayoutPtr) - __attribute__((warn_unused_result)); - -/** - * Decode a VDOLayout from a buffer. - * - * @param [in] buffer The buffer from which to decode - * @param [out] vdoLayoutPtr A pointer to hold the VDOLayout - * - * @return VDO_SUCCESS or an error - **/ -int decodeVDOLayout(Buffer *buffer, VDOLayout **vdoLayoutPtr) - __attribute__((warn_unused_result)); - -/** - * Free a VDOLayout and NULL out the reference to it. - * - * @param vdoLayoutPtr The pointer to a VDOLayout to free - **/ -void freeVDOLayout(VDOLayout **vdoLayoutPtr); - -/** - * Get a partition from a VDOLayout. Because the layout's FixedLayout has - * already been validated, this can not fail. - * - * @param vdoLayout The VDOLayout from which to get the partition - * @param id The ID of the desired partition - * - * @return The requested partition - **/ -Partition *getVDOPartition(VDOLayout *vdoLayout, PartitionID id) - __attribute__((warn_unused_result)); - -/** - * Prepare the layout to be grown. - * - * @param vdoLayout The layout to grow - * @param oldPhysicalBlocks The current size of the VDO - * @param newPhysicalBlocks The size to which the VDO will be grown - * @param layer The layer being grown - * - * @return VDO_SUCCESS or an error code - **/ -int prepareToGrowVDOLayout(VDOLayout *vdoLayout, - BlockCount oldPhysicalBlocks, - BlockCount newPhysicalBlocks, - PhysicalLayer *layer) - __attribute__((warn_unused_result)); - -/** - * Get the size of the next layout. - * - * @param vdoLayout The layout to check - * - * @return The size which was specified when the layout was prepared for growth - * or 0 if the layout is not prepared to grow - **/ -BlockCount getNextVDOLayoutSize(VDOLayout *vdoLayout) - __attribute__((warn_unused_result)); - -/** - * Get the size of the next block allocator partition. - * - * @param vdoLayout The VDOLayout which has been prepared to grow - * - * @return The size of the block allocator partition in the next layout or 0 - * if the layout is not prepared to grow - **/ -BlockCount getNextBlockAllocatorPartitionSize(VDOLayout *vdoLayout) - __attribute__((warn_unused_result)); - -/** - * Grow the layout by swapping in the prepared layout. 
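 *
 * (Illustrative sketch, not part of the original header; oldBlocks, newBlocks,
 * and parent are placeholders, and the real driver sequences these steps with
 * completions rather than straight-line calls.) Taken together, the functions
 * declared here support a grow-physical pass roughly like:
 *
 *   int result = prepareToGrowVDOLayout(vdoLayout, oldBlocks, newBlocks, layer);
 *   if (result != VDO_SUCCESS) {
 *     return result;
 *   }
 *   // Relocate the partitions that must live in the added space...
 *   copyPartition(vdoLayout, SLAB_SUMMARY_PARTITION, parent);
 *   copyPartition(vdoLayout, RECOVERY_JOURNAL_PARTITION, parent);
 *   // ...then swap in the prepared layout (or revertVDOLayout() on failure)
 *   // and release whatever is left over.
 *   BlockCount newSize = growVDOLayout(vdoLayout);
 *   finishVDOLayoutGrowth(vdoLayout);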
- * - * @param vdoLayout The layout to grow - * - * @return The new size of the VDO - **/ -BlockCount growVDOLayout(VDOLayout *vdoLayout) - __attribute__((warn_unused_result)); - -/** - * Revert the last growth attempt. - * - * @param vdoLayout The layout to revert - * - * @return The reverted size (in blocks) of the VDO - **/ -BlockCount revertVDOLayout(VDOLayout *vdoLayout) - __attribute__((warn_unused_result)); - -/** - * Clean up any unused resources once an attempt to grow has completed. - * - * @param vdoLayout The layout - **/ -void finishVDOLayoutGrowth(VDOLayout *vdoLayout); - -/** - * Copy a partition from the location specified in the current layout to that in - * the next layout. - * - * @param layout The VDOLayout which is prepared to grow - * @param partitionID The ID of the partition to copy - * @param parent The completion to notify when the copy is complete - **/ -void copyPartition(VDOLayout *layout, - PartitionID partitionID, - VDOCompletion *parent); - -/** - * Get the size of an encoded VDOLayout. - * - * @param vdoLayout The VDOLayout - * - * @return The encoded size of the VDOLayout - **/ -size_t getVDOLayoutEncodedSize(const VDOLayout *vdoLayout) - __attribute__((warn_unused_result)); - -/** - * Encode a VDOLayout into a buffer. - * - * @param vdoLayout The VDOLayout to encode - * @param buffer The buffer to encode into - * - * @return UDS_SUCCESS or an error - **/ -int encodeVDOLayout(const VDOLayout *vdoLayout, Buffer *buffer) - __attribute__((warn_unused_result)); - -#endif // VDO_LAYOUT_H diff --git a/vdo/base/vdoLayoutInternals.h b/vdo/base/vdoLayoutInternals.h deleted file mode 100644 index 5f038fe..0000000 --- a/vdo/base/vdoLayoutInternals.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoLayoutInternals.h#2 $ - */ - -#ifndef VDO_LAYOUT_INTERNALS_H -#define VDO_LAYOUT_INTERNALS_H - -#include "fixedLayout.h" -#include "types.h" - -struct vdoLayout { - // The current layout of the VDO - FixedLayout *layout; - // The next layout of the VDO - FixedLayout *nextLayout; - // The previous layout of the VDO - FixedLayout *previousLayout; - // The first block in the layouts - PhysicalBlockNumber startingOffset; - // A pointer to the copy completion (if there is one) - VDOCompletion *copyCompletion; -}; - -#endif // VDO_LAYOUT_INTERNALS_H diff --git a/vdo/base/vdoLoad.c b/vdo/base/vdoLoad.c deleted file mode 100644 index c72f39e..0000000 --- a/vdo/base/vdoLoad.c +++ /dev/null @@ -1,560 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoLoad.c#17 $ - */ - -#include "vdoLoad.h" - -#include "logger.h" -#include "memoryAlloc.h" - -#include "adminCompletion.h" -#include "blockMap.h" -#include "completion.h" -#include "constants.h" -#include "hashZone.h" -#include "header.h" -#include "logicalZone.h" -#include "physicalZone.h" -#include "readOnlyRebuild.h" -#include "recoveryJournal.h" -#include "releaseVersions.h" -#include "slabDepot.h" -#include "slabSummary.h" -#include "threadConfig.h" -#include "types.h" -#include "vdoInternal.h" -#include "vdoRecovery.h" -#include "volumeGeometry.h" - -/** - * Extract the VDO from an AdminCompletion, checking that the current operation - * is a load. - * - * @param completion The AdminCompletion's sub-task completion - * - * @return The VDO - **/ -static inline VDO *vdoFromLoadSubTask(VDOCompletion *completion) -{ - return vdoFromAdminSubTask(completion, ADMIN_OPERATION_LOAD); -} - -/** - * Finish aborting a load now that any entry to read-only mode is complete. - * This callback is registered in abortLoad(). - * - * @param completion The sub-task completion - **/ -static void finishAborting(VDOCompletion *completion) -{ - VDO *vdo = vdoFromLoadSubTask(completion); - vdo->closeRequired = false; - finishParentCallback(completion); -} - -/** - * Make sure the recovery journal is closed when aborting a load. - * - * @param completion The sub-task completion - **/ -static void closeRecoveryJournalForAbort(VDOCompletion *completion) -{ - VDO *vdo = vdoFromLoadSubTask(completion); - prepareAdminSubTask(vdo, finishAborting, finishAborting); - drainRecoveryJournal(vdo->recoveryJournal, ADMIN_STATE_SAVING, completion); -} - -/** - * Clean up after an error loading a VDO. This error handler is set in - * loadCallback() and loadVDOComponents(). - * - * @param completion The sub-task completion - **/ -static void abortLoad(VDOCompletion *completion) -{ - VDO *vdo = vdoFromLoadSubTask(completion); - logErrorWithStringError(completion->result, "aborting load"); - if (vdo->readOnlyNotifier == NULL) { - // There are no threads, so we're done - finishParentCallback(completion); - return; - } - - // Preserve the error. - setCompletionResult(completion->parent, completion->result); - if (vdo->recoveryJournal == NULL) { - prepareAdminSubTask(vdo, finishAborting, finishAborting); - } else { - prepareAdminSubTaskOnThread(vdo, closeRecoveryJournalForAbort, - closeRecoveryJournalForAbort, - getJournalZoneThread(getThreadConfig(vdo))); - } - - waitUntilNotEnteringReadOnlyMode(vdo->readOnlyNotifier, completion); -} - -/** - * Wait for the VDO to be in read-only mode. 
- * - * @param completion The sub-task completion - **/ -static void waitForReadOnlyMode(VDOCompletion *completion) -{ - prepareToFinishParent(completion, completion->parent); - setCompletionResult(completion, VDO_READ_ONLY); - VDO *vdo = vdoFromLoadSubTask(completion); - waitUntilNotEnteringReadOnlyMode(vdo->readOnlyNotifier, completion); -} - -/** - * Finish loading the VDO after an error, but leave it in read-only - * mode. This error handler is set in makeDirty(), scrubSlabs(), and - * loadVDOComponents(). - * - * @param completion The sub-task completion - **/ -static void continueLoadReadOnly(VDOCompletion *completion) -{ - VDO *vdo = vdoFromLoadSubTask(completion); - logErrorWithStringError(completion->result, - "Entering read-only mode due to load error"); - enterReadOnlyMode(vdo->readOnlyNotifier, completion->result); - waitForReadOnlyMode(completion); -} - -/** - * Exit recovery mode if necessary now that online slab scrubbing or loading - * is complete. This callback is registrered in scrubSlabs(). - * - * @param completion The slab scrubber completion - **/ -static void finishScrubbingSlabs(VDOCompletion *completion) -{ - VDO *vdo = completion->parent; - assertOnAdminThread(vdo, __func__); - if (inRecoveryMode(vdo)) { - leaveRecoveryMode(vdo); - } else { - logInfo("VDO commencing normal operation"); - } -} - -/** - * Handle an error scrubbing or loading all slabs after the VDO has come - * online. This error handler is registered in scrubSlabs(). - * - * @param completion The slab scrubber completion - **/ -static void handleScrubAllError(VDOCompletion *completion) -{ - VDO *vdo = completion->parent; - enterReadOnlyMode(vdo->readOnlyNotifier, completion->result); -} - -/** - * Initiate slab scrubbing if necessary. This callback is registered in - * prepareToComeOnline(). - * - * @param completion The sub-task completion - **/ -static void scrubSlabs(VDOCompletion *completion) -{ - VDO *vdo = vdoFromLoadSubTask(completion); - if (!hasUnrecoveredSlabs(vdo->depot)) { - finishParentCallback(completion); - return; - } - - if (requiresRecovery(vdo)) { - enterRecoveryMode(vdo); - } - - prepareAdminSubTask(vdo, finishParentCallback, continueLoadReadOnly); - scrubAllUnrecoveredSlabs(vdo->depot, vdo, finishScrubbingSlabs, - handleScrubAllError, 0, completion); -} - -/** - * This is the error handler for slab scrubbing. It is registered in - * prepareToComeOnline(). - * - * @param completion The sub-task completion - **/ -static void handleScrubbingError(VDOCompletion *completion) -{ - VDO *vdo = vdoFromLoadSubTask(completion); - enterReadOnlyMode(vdo->readOnlyNotifier, completion->result); - waitForReadOnlyMode(completion); -} - -/** - * This is the callback after the super block is written. It prepares the block - * allocator to come online and start allocating. It is registered in - * makeDirty(). - * - * @param completion The sub-task completion - **/ -static void prepareToComeOnline(VDOCompletion *completion) -{ - VDO *vdo = vdoFromLoadSubTask(completion); - SlabDepotLoadType loadType = NORMAL_LOAD; - if (requiresReadOnlyRebuild(vdo)) { - loadType = REBUILD_LOAD; - } else if (requiresRecovery(vdo)) { - loadType = RECOVERY_LOAD; - } - - initializeBlockMapFromJournal(vdo->blockMap, vdo->recoveryJournal); - - prepareAdminSubTask(vdo, scrubSlabs, handleScrubbingError); - prepareToAllocate(vdo->depot, loadType, completion); -} - -/** - * Mark the super block as dirty now that everything has been loaded or - * rebuilt. 
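 *
 * (Editorial note, not in the original comment.) loadCallback() registers this
 * as the step to run after rebuild, recovery, or a plain slab depot load
 * completes; it records the VDO_DIRTY state via saveVDOComponentsAsync() and
 * then hands off to prepareToComeOnline().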
- * - * @param completion The sub-task completion - **/ -static void makeDirty(VDOCompletion *completion) -{ - VDO *vdo = vdoFromLoadSubTask(completion); - if (isReadOnly(vdo->readOnlyNotifier)) { - finishCompletion(completion->parent, VDO_READ_ONLY); - return; - } - - vdo->state = VDO_DIRTY; - prepareAdminSubTask(vdo, prepareToComeOnline, continueLoadReadOnly); - saveVDOComponentsAsync(vdo, completion); -} - -/** - * Callback to do the destructive parts of a load now that the new VDO device - * is being resumed. - * - * @param completion The sub-task completion - **/ -static void loadCallback(VDOCompletion *completion) -{ - VDO *vdo = vdoFromLoadSubTask(completion); - assertOnAdminThread(vdo, __func__); - - // Prepare the recovery journal for new entries. - openRecoveryJournal(vdo->recoveryJournal, vdo->depot, vdo->blockMap); - vdo->closeRequired = true; - if (isReadOnly(vdo->readOnlyNotifier)) { - // In read-only mode we don't use the allocator and it may not - // even be readable, so use the default structure. - finishCompletion(completion->parent, VDO_READ_ONLY); - return; - } - - if (requiresReadOnlyRebuild(vdo)) { - prepareAdminSubTask(vdo, makeDirty, abortLoad); - launchRebuild(vdo, completion); - return; - } - - if (requiresRebuild(vdo)) { - prepareAdminSubTask(vdo, makeDirty, continueLoadReadOnly); - launchRecovery(vdo, completion); - return; - } - - prepareAdminSubTask(vdo, makeDirty, continueLoadReadOnly); - loadSlabDepot(vdo->depot, - (wasNew(vdo) ? ADMIN_STATE_FORMATTING : ADMIN_STATE_LOADING), - completion, NULL); -} - -/**********************************************************************/ -int performVDOLoad(VDO *vdo) -{ - return performAdminOperation(vdo, ADMIN_OPERATION_LOAD, NULL, loadCallback, - loadCallback); -} - -/**********************************************************************/ -__attribute__((warn_unused_result)) -static int startVDODecode(VDO *vdo, bool validateConfig) -{ - int result = validateVDOVersion(vdo); - if (result != VDO_SUCCESS) { - return result; - } - - result = decodeVDOComponent(vdo); - if (result != VDO_SUCCESS) { - return result; - } - - if (!validateConfig) { - return VDO_SUCCESS; - } - - if (vdo->loadConfig.nonce != vdo->nonce) { - return logErrorWithStringError(VDO_BAD_NONCE, "Geometry nonce %" PRIu64 - " does not match superblock nonce %llu", - vdo->loadConfig.nonce, vdo->nonce); - } - - BlockCount blockCount = vdo->layer->getBlockCount(vdo->layer); - return validateVDOConfig(&vdo->config, blockCount, true); -} - -/**********************************************************************/ -__attribute__((warn_unused_result)) -static int finishVDODecode(VDO *vdo) -{ - Buffer *buffer = getComponentBuffer(vdo->superBlock); - const ThreadConfig *threadConfig = getThreadConfig(vdo); - int result = makeRecoveryJournal(vdo->nonce, vdo->layer, - getVDOPartition(vdo->layout, - RECOVERY_JOURNAL_PARTITION), - vdo->completeRecoveries, - vdo->config.recoveryJournalSize, - RECOVERY_JOURNAL_TAIL_BUFFER_SIZE, - vdo->readOnlyNotifier, threadConfig, - &vdo->recoveryJournal); - if (result != VDO_SUCCESS) { - return result; - } - - result = decodeRecoveryJournal(vdo->recoveryJournal, buffer); - if (result != VDO_SUCCESS) { - return result; - } - - result = decodeSlabDepot(buffer, threadConfig, vdo->nonce, vdo->layer, - getVDOPartition(vdo->layout, - SLAB_SUMMARY_PARTITION), - vdo->readOnlyNotifier, vdo->recoveryJournal, - &vdo->depot); - if (result != VDO_SUCCESS) { - return result; - } - - result = decodeBlockMap(buffer, vdo->config.logicalBlocks, 
threadConfig, - &vdo->blockMap); - if (result != VDO_SUCCESS) { - return result; - } - - ASSERT_LOG_ONLY((contentLength(buffer) == 0), - "All decoded component data was used"); - return VDO_SUCCESS; -} - -/** - * Decode the component data portion of a super block and fill in the - * corresponding portions of the VDO being loaded. This will also allocate the - * recovery journal and slab depot. If this method is called with an - * asynchronous layer (i.e. a thread config which specifies at least one base - * thread), the block map and packer will be constructed as well. - * - * @param vdo The VDO being loaded - * @param validateConfig Whether to validate the config - * - * @return VDO_SUCCESS or an error - **/ -__attribute__((warn_unused_result)) -static int decodeVDO(VDO *vdo, bool validateConfig) -{ - int result = startVDODecode(vdo, validateConfig); - if (result != VDO_SUCCESS) { - return result; - } - - const ThreadConfig *threadConfig = getThreadConfig(vdo); - result = makeReadOnlyNotifier(inReadOnlyMode(vdo), threadConfig, vdo->layer, - &vdo->readOnlyNotifier); - if (result != VDO_SUCCESS) { - return result; - } - - result = enableReadOnlyEntry(vdo); - if (result != VDO_SUCCESS) { - return result; - } - - result = decodeVDOLayout(getComponentBuffer(vdo->superBlock), &vdo->layout); - if (result != VDO_SUCCESS) { - return result; - } - - result = finishVDODecode(vdo); - if (result != VDO_SUCCESS) { - return result; - } - - result = makeFlusher(vdo); - if (result != VDO_SUCCESS) { - return result; - } - - BlockCount maximumAge = getConfiguredBlockMapMaximumAge(vdo); - BlockCount journalLength - = getRecoveryJournalLength(vdo->config.recoveryJournalSize); - if ((maximumAge > (journalLength / 2)) || (maximumAge < 1)) { - return VDO_BAD_CONFIGURATION; - } - result = makeBlockMapCaches(vdo->blockMap, vdo->layer, - vdo->readOnlyNotifier, vdo->recoveryJournal, - vdo->nonce, getConfiguredCacheSize(vdo), - maximumAge); - if (result != VDO_SUCCESS) { - return result; - } - - result = ALLOCATE(threadConfig->hashZoneCount, HashZone *, __func__, - &vdo->hashZones); - if (result != VDO_SUCCESS) { - return result; - } - - for (ZoneCount zone = 0; zone < threadConfig->hashZoneCount; zone++) { - result = makeHashZone(vdo, zone, &vdo->hashZones[zone]); - if (result != VDO_SUCCESS) { - return result; - } - } - - result = makeLogicalZones(vdo, &vdo->logicalZones); - if (result != VDO_SUCCESS) { - return result; - } - - result = ALLOCATE(threadConfig->physicalZoneCount, PhysicalZone *, __func__, - &vdo->physicalZones); - if (result != VDO_SUCCESS) { - return result; - } - - for (ZoneCount zone = 0; zone < threadConfig->physicalZoneCount; zone++) { - result = makePhysicalZone(vdo, zone, &vdo->physicalZones[zone]); - if (result != VDO_SUCCESS) { - return result; - } - } - - return makePacker(vdo->layer, DEFAULT_PACKER_INPUT_BINS, - DEFAULT_PACKER_OUTPUT_BINS, threadConfig, &vdo->packer); -} - -/** - * Load the components of a VDO. This is the super block load callback - * set by loadCallback(). - * - * @param completion The sub-task completion - **/ -static void loadVDOComponents(VDOCompletion *completion) -{ - VDO *vdo = vdoFromLoadSubTask(completion); - - prepareCompletion(completion, finishParentCallback, abortLoad, - completion->callbackThreadID, completion->parent); - finishCompletion(completion, decodeVDO(vdo, true)); -} - -/** - * Callback to initiate a pre-load, registered in prepareToLoadVDO(). 
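 *
 * (Illustrative sketch, not part of the original comment.) Together with
 * performVDOLoad(), this callback gives the two-phase load described in
 * vdoLoad.h, roughly:
 *
 *   result = prepareToLoadVDO(vdo, &loadConfig);  // read-only: super block and components
 *   ...
 *   result = performVDOLoad(vdo);                 // pre-resume: the destructive parts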
- * - * @param completion The sub-task completion - **/ -static void preLoadCallback(VDOCompletion *completion) -{ - VDO *vdo = vdoFromLoadSubTask(completion); - assertOnAdminThread(vdo, __func__); - prepareAdminSubTask(vdo, loadVDOComponents, abortLoad); - loadSuperBlockAsync(completion, getFirstBlockOffset(vdo), &vdo->superBlock); -} - -/**********************************************************************/ -int prepareToLoadVDO(VDO *vdo, const VDOLoadConfig *loadConfig) -{ - vdo->loadConfig = *loadConfig; - return performAdminOperation(vdo, ADMIN_OPERATION_LOAD, NULL, - preLoadCallback, preLoadCallback); -} - -/**********************************************************************/ -__attribute__((warn_unused_result)) -static int decodeSynchronousVDO(VDO *vdo, bool validateConfig) -{ - int result = startVDODecode(vdo, validateConfig); - if (result != VDO_SUCCESS) { - return result; - } - - result = decodeVDOLayout(getComponentBuffer(vdo->superBlock), &vdo->layout); - if (result != VDO_SUCCESS) { - return result; - } - - return finishVDODecode(vdo); -} - -/**********************************************************************/ -int loadVDOSuperblock(PhysicalLayer *layer, - VolumeGeometry *geometry, - bool validateConfig, - VDODecoder *decoder, - VDO **vdoPtr) -{ - VDO *vdo; - int result = makeVDO(layer, &vdo); - if (result != VDO_SUCCESS) { - return result; - } - - setLoadConfigFromGeometry(geometry, &vdo->loadConfig); - result = loadSuperBlock(layer, getFirstBlockOffset(vdo), &vdo->superBlock); - if (result != VDO_SUCCESS) { - freeVDO(&vdo); - return result; - } - - result = ((decoder == NULL) - ? decodeSynchronousVDO(vdo, validateConfig) - : decoder(vdo, validateConfig)); - if (result != VDO_SUCCESS) { - freeVDO(&vdo); - return result; - } - - *vdoPtr = vdo; - return VDO_SUCCESS; - -} -/**********************************************************************/ -int loadVDO(PhysicalLayer *layer, - bool validateConfig, - VDODecoder *decoder, - VDO **vdoPtr) -{ - VolumeGeometry geometry; - int result = loadVolumeGeometry(layer, &geometry); - if (result != VDO_SUCCESS) { - return result; - } - - return loadVDOSuperblock(layer, &geometry, validateConfig, decoder, vdoPtr); -} diff --git a/vdo/base/vdoLoad.h b/vdo/base/vdoLoad.h deleted file mode 100644 index 893d6e4..0000000 --- a/vdo/base/vdoLoad.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoLoad.h#3 $ - */ - -#ifndef VDO_LOAD_H -#define VDO_LOAD_H - -#include "volumeGeometry.h" -#include "types.h" - -/** - * A function which decodes a VDO from a super block. 
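 *
 * (Illustrative sketch, not from the original header.) A user-space tool that
 * does not need a custom decoder passes NULL and gets the default synchronous
 * decode:
 *
 *   VDO *vdo;
 *   int result = loadVDO(layer, true, NULL, &vdo);
 *   if (result == VDO_SUCCESS) {
 *     // ... inspect the decoded components ...
 *     freeVDO(&vdo);
 *   }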
- * - * @param vdo The VDO to be decoded (its super block must already - * be loaded) - * @param validateConfig If true, the VDO's configuration will - * be validated before the decode is attempted - * - * @return VDO_SUCCESS or an error - **/ -typedef int VDODecoder(VDO *vdo, bool validateConfig); - -/** - * Load a VDO for normal operation. This method must not be called from a base - * thread. - * - * @param vdo The VDO to load - * - * @return VDO_SUCCESS or an error - **/ -int performVDOLoad(VDO *vdo) - __attribute__((warn_unused_result)); - -/** - * Perpare a VDO for loading by reading structures off disk. This method does - * not alter the on-disk state. It should be called from the VDO constructor, - * whereas performVDOLoad() will be called during pre-resume if the VDO has - * not been resumed before. - **/ -int prepareToLoadVDO(VDO *vdo, const VDOLoadConfig *loadConfig) - __attribute__((warn_unused_result)); - -/** - * Synchronously load a VDO from a specified super block location for use by - * user-space tools. - * - * @param [in] layer The physical layer the VDO sits on - * @param [in] geometry A pointer to the geometry for the volume - * @param [in] validateConfig Whether to validate the VDO against the layer - * @param [in] decoder The VDO decoder to use, if NULL, the default - * decoder will be used - * @param [out] vdoPtr A pointer to hold the decoded VDO - * - * @return VDO_SUCCESS or an error - **/ -int loadVDOSuperblock(PhysicalLayer *layer, - VolumeGeometry *geometry, - bool validateConfig, - VDODecoder *decoder, - VDO **vdoPtr) - __attribute__((warn_unused_result)); - -/** - * Synchronously load a VDO volume for use by user-space tools. - * - * @param [in] layer The physical layer the VDO sits on - * @param [in] validateConfig Whether to validate the VDO against the layer - * @param [in] decoder The VDO decoder to use, if NULL, the default - * decoder will be used - * @param [out] vdoPtr A pointer to hold the decoded VDO - * - * @return VDO_SUCCESS or an error - **/ -int loadVDO(PhysicalLayer *layer, - bool validateConfig, - VDODecoder *decoder, - VDO **vdoPtr) - __attribute__((warn_unused_result)); - -#endif /* VDO_LOAD_H */ diff --git a/vdo/base/vdoPageCache.c b/vdo/base/vdoPageCache.c deleted file mode 100644 index c8f4585..0000000 --- a/vdo/base/vdoPageCache.c +++ /dev/null @@ -1,1369 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoPageCache.c#11 $ - */ - -#include "vdoPageCacheInternals.h" - -#if __KERNEL__ -#include -#endif - -#include "errors.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" - -#include "adminState.h" -#include "constants.h" -#include "numUtils.h" -#include "readOnlyNotifier.h" -#include "statusCodes.h" -#include "types.h" -#include "vio.h" - -enum { - LOG_INTERVAL = 4000, - DISPLAY_INTERVAL = 100000, -}; - -/**********************************************************************/ -static char *getPageBuffer(PageInfo *info) -{ - VDOPageCache *cache = info->cache; - return &cache->pages[(info - cache->infos) * VDO_BLOCK_SIZE]; -} - -/** - * Allocate components of the cache which require their own allocation. The - * caller is responsible for all clean up on errors. - * - * @param cache The cache being constructed - * - * @return VDO_SUCCESS or an error code - **/ -__attribute__((warn_unused_result)) -static int allocateCacheComponents(VDOPageCache *cache) -{ - int result = ALLOCATE(cache->pageCount, PageInfo, "page infos", - &cache->infos); - if (result != UDS_SUCCESS) { - return result; - } - - uint64_t size = cache->pageCount * (uint64_t) VDO_BLOCK_SIZE; - result = allocateMemory(size, VDO_BLOCK_SIZE, "cache pages", &cache->pages); - if (result != UDS_SUCCESS) { - return result; - } - - return makeIntMap(cache->pageCount, 0, &cache->pageMap); -} - -/** - * Initialize all page info structures and put them on the free list. - * - * @param cache The cache to initialize - * - * @return VDO_SUCCESS or an error - **/ -static int initializeInfo(VDOPageCache *cache) -{ - initializeRing(&cache->freeList); - PageInfo *info; - for (info = cache->infos; info < cache->infos + cache->pageCount; ++info) { - info->cache = cache; - info->state = PS_FREE; - info->pbn = NO_PAGE; - - if (cache->layer->createMetadataVIO != NULL) { - int result = createVIO(cache->layer, VIO_TYPE_BLOCK_MAP, - VIO_PRIORITY_METADATA, info, getPageBuffer(info), - &info->vio); - if (result != VDO_SUCCESS) { - return result; - } - - // The thread ID should never change. 
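      // (Editorial note, not in the original source.) Every cache operation
      // for this zone runs on cache->zone->threadID (see assertOnCacheThread()),
      // so the VIO's callback thread can be pinned once here and reused for
      // every read and write the cache issues with this VIO.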
- info->vio->completion.callbackThreadID = cache->zone->threadID; - } - - initializeRing(&info->listNode); - pushRingNode(&cache->freeList, &info->listNode); - initializeRing(&info->lruNode); - } - - relaxedStore64(&cache->stats.counts.freePages, cache->pageCount); - return VDO_SUCCESS; -} - -/**********************************************************************/ -static void writeDirtyPagesCallback(RingNode *node, void *context); - -/**********************************************************************/ -int makeVDOPageCache(PhysicalLayer *layer, - PageCount pageCount, - VDOPageReadFunction *readHook, - VDOPageWriteFunction *writeHook, - size_t pageContextSize, - BlockCount maximumAge, - BlockMapZone *zone, - VDOPageCache **cachePtr) -{ - int result = ASSERT(pageContextSize <= MAX_PAGE_CONTEXT_SIZE, - "page context size %zu cannot exceed %u bytes", - pageContextSize, MAX_PAGE_CONTEXT_SIZE); - if (result != VDO_SUCCESS) { - return result; - } - - VDOPageCache *cache; - result = ALLOCATE(1, VDOPageCache, "page cache", &cache); - if (result != UDS_SUCCESS) { - return result; - } - - cache->layer = layer; - cache->pageCount = pageCount; - cache->readHook = readHook; - cache->writeHook = writeHook; - cache->zone = zone; - - result = allocateCacheComponents(cache); - if (result != VDO_SUCCESS) { - freeVDOPageCache(&cache); - return result; - } - - result = initializeInfo(cache); - if (result != VDO_SUCCESS) { - freeVDOPageCache(&cache); - return result; - } - - result = makeDirtyLists(maximumAge, writeDirtyPagesCallback, cache, - &cache->dirtyLists); - if (result != VDO_SUCCESS) { - freeVDOPageCache(&cache); - return result; - } - - // initialize empty circular queues - initializeRing(&cache->lruList); - initializeRing(&cache->outgoingList); - - *cachePtr = cache; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freeVDOPageCache(VDOPageCache **cachePtr) -{ - VDOPageCache *cache = *cachePtr; - if (cache == NULL) { - return; - } - - if (cache->infos != NULL) { - PageInfo *info; - for (info = cache->infos; info < cache->infos + cache->pageCount; ++info) { - freeVIO(&info->vio); - } - } - - freeDirtyLists(&cache->dirtyLists); - freeIntMap(&cache->pageMap); - FREE(cache->infos); - FREE(cache->pages); - FREE(cache); - *cachePtr = NULL; -} - -/**********************************************************************/ -void setVDOPageCacheInitialPeriod(VDOPageCache *cache, SequenceNumber period) -{ - setCurrentPeriod(cache->dirtyLists, period); -} - -/**********************************************************************/ -void setVDOPageCacheRebuildMode(VDOPageCache *cache, bool rebuilding) -{ - cache->rebuilding = rebuilding; -} - -/** - * Assert that a function has been called on the VDO page cache's thread. - * - * @param cache the page cache - * @param functionName the name of the function - **/ -static inline void assertOnCacheThread(VDOPageCache *cache, - const char *functionName) -{ - ThreadID threadID = getCallbackThreadID(); - ASSERT_LOG_ONLY((threadID == cache->zone->threadID), - "%s() must only be called on cache thread %d, not thread %d", - functionName, cache->zone->threadID, threadID); -} - -/** - * Assert that a page cache may issue I/O. - * - * @param cache the page cache - **/ -static inline void assertIOAllowed(VDOPageCache *cache) -{ - ASSERT_LOG_ONLY(!isQuiescent(&cache->zone->state), - "VDO page cache may issue I/O"); -} - -/** - * Log and, if enabled, report cache pressure. 
- * - * @param cache the page cache - **/ -static void reportCachePressure(VDOPageCache *cache) -{ - relaxedAdd64(&cache->stats.cachePressure, 1); - if (cache->waiterCount > cache->pageCount) { - if ((cache->pressureReport % LOG_INTERVAL) == 0) { - logInfo("page cache pressure %llu", - relaxedLoad64(&cache->stats.cachePressure)); - } - - if (++cache->pressureReport >= DISPLAY_INTERVAL) { - cache->pressureReport = 0; - } - } -} - -/**********************************************************************/ -const char *vpcPageStateName(PageState state) -{ - static const char *stateNames[] = { - "FREE", - "INCOMING", - "FAILED", - "RESIDENT", - "DIRTY", - "OUTGOING" - }; - STATIC_ASSERT(COUNT_OF(stateNames) == PAGE_STATE_COUNT); - - int result = ASSERT(state < COUNT_OF(stateNames), - "Unknown PageState value %d", state); - if (result != UDS_SUCCESS) { - return "[UNKNOWN PAGE STATE]"; - } - - return stateNames[state]; -} - -/** - * Update the counter associated with a given state. - * - * @param info the page info to count - * @param delta the delta to apply to the counter - **/ -static void updateCounter(PageInfo *info, int32_t delta) -{ - VDOPageCache *cache = info->cache; - switch (info->state) { - case PS_FREE: - relaxedAdd64(&cache->stats.counts.freePages, delta); - return; - - case PS_INCOMING: - relaxedAdd64(&cache->stats.counts.incomingPages, delta); - return; - - case PS_OUTGOING: - relaxedAdd64(&cache->stats.counts.outgoingPages, delta); - return; - - case PS_FAILED: - relaxedAdd64(&cache->stats.counts.failedPages, delta); - return; - - case PS_RESIDENT: - relaxedAdd64(&cache->stats.counts.cleanPages, delta); - return; - - case PS_DIRTY: - relaxedAdd64(&cache->stats.counts.dirtyPages, delta); - return; - - default: - return; - } -} - -/** - * Update the lru information for an active page. - **/ -static void updateLru(PageInfo *info) -{ - VDOPageCache *cache = info->cache; - - if (cache->lruList.prev != &info->lruNode) { - pushRingNode(&cache->lruList, &info->lruNode); - } -} - -/** - * Set the state of a PageInfo and put it on the right list, adjusting - * counters. - * - * @param info the PageInfo to modify - * @param newState the new state for the PageInfo - **/ -static void setInfoState(PageInfo *info, PageState newState) -{ - if (newState == info->state) { - return; - } - - updateCounter(info, -1); - info->state = newState; - updateCounter(info, 1); - - switch (info->state) { - case PS_FREE: - case PS_FAILED: - pushRingNode(&info->cache->freeList, &info->listNode); - return; - - case PS_OUTGOING: - pushRingNode(&info->cache->outgoingList, &info->listNode); - return; - - case PS_DIRTY: - return; - - default: - unspliceRingNode(&info->listNode); - } -} - -/** - * Set the pbn for an info, updating the map as needed. - * - * @param info The page info - * @param pbn The physical block number to set - **/ -__attribute__((warn_unused_result)) -static int setInfoPBN(PageInfo *info, PhysicalBlockNumber pbn) -{ - VDOPageCache *cache = info->cache; - - // Either the new or the old page number must be NO_PAGE. - int result = ASSERT((pbn == NO_PAGE) || (info->pbn == NO_PAGE), - "Must free a page before reusing it."); - if (result != VDO_SUCCESS) { - return result; - } - - if (info->pbn != NO_PAGE) { - intMapRemove(cache->pageMap, info->pbn); - } - - info->pbn = pbn; - - if (pbn != NO_PAGE) { - result = intMapPut(cache->pageMap, pbn, info, true, NULL); - if (result != UDS_SUCCESS) { - return result; - } - } - return VDO_SUCCESS; -} - -/** - * Reset page info to represent an unallocated page. 
- **/ -static int resetPageInfo(PageInfo *info) -{ - int result = ASSERT(info->busy == 0, "VDO Page must not be busy"); - if (result != UDS_SUCCESS) { - return result; - } - - result = ASSERT(!hasWaiters(&info->waiting), - "VDO Page must not have waiters"); - if (result != UDS_SUCCESS) { - return result; - } - - result = setInfoPBN(info, NO_PAGE); - setInfoState(info, PS_FREE); - unspliceRingNode(&info->lruNode); - return result; -} - -/** - * Find a free page. - * - * @param cache the page cache - * - * @return a pointer to the page info structure (if found), NULL otherwise - **/ -__attribute__((warn_unused_result)) -static PageInfo *findFreePage(VDOPageCache *cache) -{ - if (cache->freeList.next == &cache->freeList) { - return NULL; - } - PageInfo *info = pageInfoFromListNode(cache->freeList.next); - unspliceRingNode(&info->listNode); - return info; -} - -/**********************************************************************/ -PageInfo *vpcFindPage(VDOPageCache *cache, PhysicalBlockNumber pbn) -{ - if ((cache->lastFound != NULL) - && (cache->lastFound->pbn == pbn)) { - return cache->lastFound; - } - cache->lastFound = intMapGet(cache->pageMap, pbn); - return cache->lastFound; -} - -/** - * Determine which page is least recently used. - * - * @param cache the page cache structure - * - * @return a pointer to the info structure for a relevant page, - * or NULL if no such page can be found. The page can be - * dirty or resident. - * - * @note Picks the least recently used from among the non-busy entries - * at the front of each of the lru ring. - * Since whenever we mark a page busy we also put it to the end - * of the ring it is unlikely that the entries at the front - * are busy unless the queue is very short, but not impossible. - **/ -__attribute__((warn_unused_result)) -static PageInfo *selectLRUPage(VDOPageCache *cache) -{ - PageInfoNode *lru; - for (lru = cache->lruList.next; - lru != &cache->lruList; - lru = lru->next) { - PageInfo *info = pageInfoFromLRUNode(lru); - if ((info->busy == 0) && !isInFlight(info)) { - return info; - } - } - - return NULL; -} - -/**********************************************************************/ -AtomicPageCacheStatistics *getVDOPageCacheStatistics(VDOPageCache *cache) -{ - return &cache->stats; -} - -// ASYNCHRONOUS INTERFACE BEYOND THIS POINT - -/** - * Helper to complete the VDO Page Completion request successfully. - * - * @param info the page info representing the result page - * @param vdoPageComp the VDO page completion to complete - **/ -static void completeWithPage(PageInfo *info, VDOPageCompletion *vdoPageComp) -{ - bool available = vdoPageComp->writable ? isPresent(info) : isValid(info); - if (!available) { - logErrorWithStringError(VDO_BAD_PAGE, - "Requested cache page %llu in state %s is" - " not %s", - info->pbn, vpcPageStateName(info->state), - vdoPageComp->writable ? "present" : "valid"); - finishCompletion(&vdoPageComp->completion, VDO_BAD_PAGE); - return; - } - - vdoPageComp->info = info; - vdoPageComp->ready = true; - finishCompletion(&vdoPageComp->completion, VDO_SUCCESS); -} - -/** - * Complete a page completion with an error code. Implements WaiterCallback. - * - * @param waiter The page completion, as a waiter - * @param resultPtr A pointer to the error code. 
- **/ -static void completeWaiterWithError(Waiter *waiter, void *resultPtr) -{ - int *result = resultPtr; - VDOPageCompletion *completion = pageCompletionFromWaiter(waiter); - finishCompletion(&completion->completion, *result); -} - -/** - * Complete a queue of VDOPageCompletions with an error code. - * - * @param [in] result the error result - * @param [in, out] queue a pointer to the queue - * - * @note upon completion the queue will be empty - **/ -static void distributeErrorOverQueue(int result, WaitQueue *queue) -{ - notifyAllWaiters(queue, completeWaiterWithError, &result); -} - -/** - * Complete a page completion with a page. Implements WaiterCallback. - * - * @param waiter The page completion, as a waiter - * @param pageInfo The page info to complete with - **/ -static void completeWaiterWithPage(Waiter *waiter, void *pageInfo) -{ - PageInfo *info = pageInfo; - VDOPageCompletion *completion = pageCompletionFromWaiter(waiter); - completeWithPage(info, completion); -} - -/** - * Complete a queue of VDOPageCompletions with a page result. - * - * @param [in] info the page info describing the page - * @param [in, out] queue a pointer to a queue of waiters - * - * @return the number of pages distributed - * - * @note upon completion the queue will be empty - * - **/ -static unsigned int distributePageOverQueue(PageInfo *info, WaitQueue *queue) -{ - updateLru(info); - - size_t pages = countWaiters(queue); - - /* - * Increment the busy count once for each pending completion so that - * this page does not stop being busy until all completions have - * been processed (VDO-83). - */ - info->busy += pages; - - notifyAllWaiters(queue, completeWaiterWithPage, info); - return pages; -} - -/** - * Set a persistent error which all requests will receive in the future. - * - * @param cache the page cache - * @param context a string describing what triggered the error - * @param result the error result - * - * Once triggered, all enqueued completions will get this error. - * Any future requests will result in this error as well. - **/ -static void setPersistentError(VDOPageCache *cache, - const char *context, - int result) -{ - // If we're already read-only, there's no need to log. 
- ReadOnlyNotifier *notifier = cache->zone->readOnlyNotifier; - if ((result != VDO_READ_ONLY) && !isReadOnly(notifier)) { - logErrorWithStringError(result, "VDO Page Cache persistent error: %s", - context); - enterReadOnlyMode(notifier, result); - } - - assertOnCacheThread(cache, __func__); - - distributeErrorOverQueue(result, &cache->freeWaiters); - cache->waiterCount = 0; - - PageInfo *info; - for (info = cache->infos; info < cache->infos + cache->pageCount; ++info) { - distributeErrorOverQueue(result, &info->waiting); - } -} - -/**********************************************************************/ -void initVDOPageCompletion(VDOPageCompletion *pageCompletion, - VDOPageCache *cache, - PhysicalBlockNumber pbn, - bool writable, - void *parent, - VDOAction *callback, - VDOAction *errorHandler) -{ - ASSERT_LOG_ONLY((pageCompletion->waiter.nextWaiter == NULL), - "New page completion was not already on a wait queue"); - - *pageCompletion = (VDOPageCompletion) { - .pbn = pbn, - .writable = writable, - .cache = cache, - }; - - VDOCompletion *completion = &pageCompletion->completion; - initializeCompletion(completion, VDO_PAGE_COMPLETION, cache->layer); - prepareCompletion(completion, callback, errorHandler, cache->zone->threadID, - parent); -} - -/** - * Helper function to check that a completion represents a successfully - * completed VDO Page Completion referring to a valid page. - * - * @param completion a VDO completion - * @param writable whether a writable page is required - * - * @return the embedding completion if valid, NULL if not - **/ -__attribute__((warn_unused_result)) -static VDOPageCompletion *validateCompletedPage(VDOCompletion *completion, - bool writable) -{ - VDOPageCompletion *vpc = asVDOPageCompletion(completion); - - int result = ASSERT(vpc->ready, "VDO Page completion not ready"); - if (result != UDS_SUCCESS) { - return NULL; - } - - result = ASSERT(vpc->info != NULL, "VDO Page Completion must be complete"); - if (result != UDS_SUCCESS) { - return NULL; - } - - result = ASSERT(vpc->info->pbn == vpc->pbn, - "VDO Page Completion pbn must be consistent"); - if (result != UDS_SUCCESS) { - return NULL; - } - - result = ASSERT(isValid(vpc->info), - "VDO Page Completion page must be valid"); - if (result != UDS_SUCCESS) { - return NULL; - } - - if (writable) { - result = ASSERT(vpc->writable, "VDO Page Completion is writable"); - if (result != UDS_SUCCESS) { - return NULL; - } - } - - return vpc; -} - -/**********************************************************************/ -bool isPageCacheActive(VDOPageCache *cache) -{ - return ((cache->outstandingReads != 0) || (cache->outstandingWrites != 0)); -} - -/** - * VIO callback used when a page has been loaded. - * - * @param completion A completion for the VIO, the parent of which is a - * PageInfo. - **/ -static void pageIsLoaded(VDOCompletion *completion) -{ - PageInfo *info = completion->parent; - VDOPageCache *cache = info->cache; - assertOnCacheThread(cache, __func__); - - setInfoState(info, PS_RESIDENT); - distributePageOverQueue(info, &info->waiting); - - /* - * Don't decrement until right before calling checkForDrainComplete() to - * ensure that the above work can't cause the page cache to be freed out from - * under us. - */ - cache->outstandingReads--; - checkForDrainComplete(cache->zone); -} - -/** - * Handle page load errors. 
- * - * @param completion The page read VIO - **/ -static void handleLoadError(VDOCompletion *completion) -{ - int result = completion->result; - PageInfo *info = completion->parent; - VDOPageCache *cache = info->cache; - assertOnCacheThread(cache, __func__); - - enterReadOnlyMode(cache->zone->readOnlyNotifier, result); - relaxedAdd64(&cache->stats.failedReads, 1); - setInfoState(info, PS_FAILED); - distributeErrorOverQueue(result, &info->waiting); - resetPageInfo(info); - - /* - * Don't decrement until right before calling checkForDrainComplete() to - * ensure that the above work can't cause the page cache to be freed out from - * under us. - */ - cache->outstandingReads--; - checkForDrainComplete(cache->zone); -} - -/** - * Run the read hook after a page is loaded. This callback is registered in - * launchPageLoad() when there is a read hook. - * - * @param completion The page load completion - **/ -static void runReadHook(VDOCompletion *completion) -{ - PageInfo *info = completion->parent; - completion->callback = pageIsLoaded; - resetCompletion(completion); - int result = info->cache->readHook(getPageBuffer(info), info->pbn, - info->cache->zone, info->context); - continueCompletion(completion, result); -} - -/** - * Handle a read error during a read-only rebuild. - * - * @param completion The page load completion - **/ -static void handleRebuildReadError(VDOCompletion *completion) -{ - PageInfo *info = completion->parent; - VDOPageCache *cache = info->cache; - assertOnCacheThread(cache, __func__); - - // We are doing a read-only rebuild, so treat this as a successful read - // of an uninitialized page. - relaxedAdd64(&cache->stats.failedReads, 1); - memset(getPageBuffer(info), 0, VDO_BLOCK_SIZE); - resetCompletion(completion); - if (cache->readHook != NULL) { - runReadHook(completion); - } else { - pageIsLoaded(completion); - } -} - -/** - * Begin the process of loading a page. - * - * @param info the page info representing where to load the page - * @param pbn the absolute pbn of the desired page - * - * @return VDO_SUCCESS or an error code - **/ -__attribute__((warn_unused_result)) -static int launchPageLoad(PageInfo *info, PhysicalBlockNumber pbn) -{ - VDOPageCache *cache = info->cache; - assertIOAllowed(cache); - - int result = setInfoPBN(info, pbn); - if (result != VDO_SUCCESS) { - return result; - } - - result = ASSERT((info->busy == 0), "Page is not busy before loading."); - if (result != VDO_SUCCESS) { - return result; - } - - setInfoState(info, PS_INCOMING); - cache->outstandingReads++; - relaxedAdd64(&cache->stats.pagesLoaded, 1); - launchReadMetadataVIO(info->vio, pbn, - (cache->readHook != NULL) ? runReadHook : pageIsLoaded, - (cache->rebuilding - ? handleRebuildReadError : handleLoadError)); - return VDO_SUCCESS; -} - -/**********************************************************************/ -static void writePages(VDOCompletion *completion); - -/** - * Handle errors flushing the layer. - * - * @param completion The flush VIO - **/ -static void handleFlushError(VDOCompletion *completion) -{ - VDOPageCache *cache = ((PageInfo *) completion->parent)->cache; - setPersistentError(cache, "flush failed", completion->result); - writePages(completion); -} - -/** - * Attempt to save the outgoing pages by first flushing the layer. 
- * - * @param cache The cache - **/ -static void savePages(VDOPageCache *cache) -{ - if ((cache->pagesInFlush > 0) || (cache->pagesToFlush == 0)) { - return; - } - - assertIOAllowed(cache); - - PageInfo *info = pageInfoFromListNode(cache->outgoingList.next); - cache->pagesInFlush = cache->pagesToFlush; - cache->pagesToFlush = 0; - relaxedAdd64(&cache->stats.flushCount, 1); - - VIO *vio = info->vio; - PhysicalLayer *layer = vio->completion.layer; - - /* - * We must make sure that the recovery journal entries that changed these - * pages were successfully persisted, and thus must issue a flush before - * each batch of pages is written to ensure this. However, in sync mode, - * every journal block is written with FUA, thus guaranteeing the journal - * persisted already. - */ - if (layer->getWritePolicy(layer) != WRITE_POLICY_SYNC) { - launchFlush(vio, writePages, handleFlushError); - return; - } - - writePages(&vio->completion); -} - -/** - * Add a page to the outgoing list of pages waiting to be saved. Once in the - * list, a page may not be used until it has been written out. - * - * @param info The page to save - **/ -static void schedulePageSave(PageInfo *info) -{ - if (info->busy > 0) { - info->writeStatus = WRITE_STATUS_DEFERRED; - return; - } - - info->cache->pagesToFlush++; - info->cache->outstandingWrites++; - setInfoState(info, PS_OUTGOING); -} - -/**********************************************************************/ -static void writeDirtyPagesCallback(RingNode *expired, void *context) -{ - while (!isRingEmpty(expired)) { - schedulePageSave(pageInfoFromListNode(chopRingNode(expired))); - } - - savePages((VDOPageCache *) context); -} - -/** - * Add a page to outgoing pages waiting to be saved, and then start saving - * pages if another save is not in progress. - * - * @param info The page to save - **/ -static void launchPageSave(PageInfo *info) -{ - schedulePageSave(info); - savePages(info->cache); -} - -/** - * Determine whether a given VDOPageCompletion (as a waiter) is requesting a - * given page number. Implements WaiterMatch. - * - * @param waiter The page completion in question - * @param context A pointer to the pbn of the desired page - * - * @return true if the page completion is for the desired page number - **/ -static bool completionNeedsPage(Waiter *waiter, void *context) -{ - PhysicalBlockNumber *pbn = context; - return (pageCompletionFromWaiter(waiter)->pbn == *pbn); -} - -/** - * Allocate a free page to the first completion in the waiting queue, - * and any other completions that match it in page number. - **/ -static void allocateFreePage(PageInfo *info) -{ - VDOPageCache *cache = info->cache; - assertOnCacheThread(cache, __func__); - - if (!hasWaiters(&cache->freeWaiters)) { - if (relaxedLoad64(&cache->stats.cachePressure) > 0) { - logInfo("page cache pressure relieved"); - relaxedStore64(&cache->stats.cachePressure, 0); - } - return; - } - - int result = resetPageInfo(info); - if (result != VDO_SUCCESS) { - setPersistentError(cache, "cannot reset page info", result); - return; - } - - Waiter *oldestWaiter = getFirstWaiter(&cache->freeWaiters); - PhysicalBlockNumber pbn = pageCompletionFromWaiter(oldestWaiter)->pbn; - - // Remove all entries which match the page number in question - // and push them onto the page info's wait queue. 
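  // (Editorial note, not in the original source.) Queuing every waiter for
  // this PBN on the page's own wait queue lets the single read launched below
  // satisfy all of them when pageIsLoaded() distributes the page.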
- dequeueMatchingWaiters(&cache->freeWaiters, completionNeedsPage, - &pbn, &info->waiting); - cache->waiterCount -= countWaiters(&info->waiting); - - result = launchPageLoad(info, pbn); - if (result != VDO_SUCCESS) { - distributeErrorOverQueue(result, &info->waiting); - } -} - -/** - * Begin the process of discarding a page. - * - * @param cache the page cache - * - * @note If no page is discardable, increments a count of deferred frees so - * that the next release of a page which is no longer busy will kick - * off another discard cycle. This is an indication that the cache is - * not big enough. - * - * @note If the selected page is not dirty, immediately allocates the page - * to the oldest completion waiting for a free page. - **/ -static void discardAPage(VDOPageCache *cache) -{ - PageInfo *info = selectLRUPage(cache); - if (info == NULL) { - reportCachePressure(cache); - return; - } - - if (!isDirty(info)) { - allocateFreePage(info); - return; - } - - ASSERT_LOG_ONLY(!isInFlight(info), - "page selected for discard is not in flight"); - - ++cache->discardCount; - info->writeStatus = WRITE_STATUS_DISCARD; - launchPageSave(info); -} - -/** - * Helper used to trigger a discard so that the completion can get a different - * page. - * - * @param vdoPageComp the VDO Page completion - **/ -static void discardPageForCompletion(VDOPageCompletion *vdoPageComp) -{ - VDOPageCache *cache = vdoPageComp->cache; - - ++cache->waiterCount; - - int result = enqueueWaiter(&cache->freeWaiters, &vdoPageComp->waiter); - if (result != VDO_SUCCESS) { - setPersistentError(cache, "cannot enqueue waiter", result); - } - - discardAPage(cache); -} - -/** - * Helper used to trigger a discard if the cache needs another free page. - * - * @param cache the page cache - **/ -static void discardPageIfNeeded(VDOPageCache *cache) -{ - if (cache->waiterCount > cache->discardCount) { - discardAPage(cache); - } -} - -/**********************************************************************/ -void advanceVDOPageCachePeriod(VDOPageCache *cache, SequenceNumber period) -{ - assertOnCacheThread(cache, __func__); - advancePeriod(cache->dirtyLists, period); -} - -/** - * Inform the cache that a write has finished (possibly with an error). - * - * @param info The info structure for the page whose write just completed - * - * @return true if the page write was a discard - **/ -static bool writeHasFinished(PageInfo *info) -{ - assertOnCacheThread(info->cache, __func__); - info->cache->outstandingWrites--; - - bool wasDiscard = (info->writeStatus == WRITE_STATUS_DISCARD); - info->writeStatus = WRITE_STATUS_NORMAL; - return wasDiscard; -} - -/** - * Handler for page write errors. - * - * @param completion The page write VIO - **/ -static void handlePageWriteError(VDOCompletion *completion) -{ - int result = completion->result; - PageInfo *info = completion->parent; - VDOPageCache *cache = info->cache; - - // If we're already read-only, write failures are to be expected. 
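  /*
   * Editor's note (added commentary, not part of the original source): in the
   * kernel build the log message below is guarded by a static ratelimit state
   * (DEFINE_RATELIMIT_STATE / __ratelimit), so a device that fails every
   * write cannot flood the log; once the burst allowance for the interval is
   * used up the message is silently dropped. The userspace build has no such
   * limiter and logs every failure.
   */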
- if (result != VDO_READ_ONLY) { -#if __KERNEL__ - static DEFINE_RATELIMIT_STATE(errorLimiter, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - - if (__ratelimit(&errorLimiter)) { - logError("failed to write block map page %llu", info->pbn); - } -#else - logError("failed to write block map page %llu", info->pbn); -#endif - } - - setInfoState(info, PS_DIRTY); - relaxedAdd64(&cache->stats.failedWrites, 1); - setPersistentError(cache, "cannot write page", result); - - if (!writeHasFinished(info)) { - discardPageIfNeeded(cache); - } - - checkForDrainComplete(cache->zone); -} - -/** - * VIO callback used when a page has been written out. - * - * @param completion A completion for the VIO, the parent of which - * is embedded in PageInfo. - **/ -static void pageIsWrittenOut(VDOCompletion *completion) -{ - PageInfo *info = completion->parent; - VDOPageCache *cache = info->cache; - - if (cache->writeHook != NULL) { - bool rewrite = cache->writeHook(getPageBuffer(info), cache->zone, - info->context); - if (rewrite) { - launchWriteMetadataVIOWithFlush(info->vio, info->pbn, pageIsWrittenOut, - handlePageWriteError, true, false); - return; - } - } - - bool wasDiscard = writeHasFinished(info); - bool reclaimed = (!wasDiscard || (info->busy > 0) - || hasWaiters(&info->waiting)); - - setInfoState(info, PS_RESIDENT); - - uint32_t reclamations = distributePageOverQueue(info, &info->waiting); - relaxedAdd64(&cache->stats.reclaimed, reclamations); - - if (wasDiscard) { - cache->discardCount--; - } - - if (reclaimed) { - discardPageIfNeeded(cache); - } else { - allocateFreePage(info); - } - - checkForDrainComplete(cache->zone); -} - -/** - * Write the batch of pages which were covered by the layer flush which just - * completed. This callback is registered in savePages(). - * - * @param flushCompletion The flush VIO - **/ -static void writePages(VDOCompletion *flushCompletion) -{ - VDOPageCache *cache = ((PageInfo *) flushCompletion->parent)->cache; - - /* - * We need to cache these two values on the stack since in the error case - * below, it is possible for the last page info to cause the page cache to - * get freed. Hence once we launch the last page, it may be unsafe to - * dereference the cache [VDO-4724]. - */ - bool hasUnflushedPages = (cache->pagesToFlush > 0); - PageCount pagesInFlush = cache->pagesInFlush; - cache->pagesInFlush = 0; - while (pagesInFlush-- > 0) { - PageInfo *info = pageInfoFromListNode(chopRingNode(&cache->outgoingList)); - if (isReadOnly(info->cache->zone->readOnlyNotifier)) { - VDOCompletion *completion = &info->vio->completion; - resetCompletion(completion); - completion->callback = pageIsWrittenOut; - completion->errorHandler = handlePageWriteError; - finishCompletion(completion, VDO_READ_ONLY); - continue; - } - relaxedAdd64(&info->cache->stats.pagesSaved, 1); - launchWriteMetadataVIO(info->vio, info->pbn, pageIsWrittenOut, - handlePageWriteError); - } - - if (hasUnflushedPages) { - // If there are unflushed pages, the cache can't have been freed, so this - // call is safe. 
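  /*
   * Editor's sketch (added commentary, not part of the original source): the
   * shape of this function is the usual "copy what you need before the last
   * launch" pattern for objects that may be freed by their own I/O:
   *
   *   bool      hasUnflushedPages = (cache->pagesToFlush > 0);  // read first
   *   PageCount pagesInFlush      = cache->pagesInFlush;
   *   ... launch every write; completing the last one may free the cache ...
   *   if (hasUnflushedPages) {   // only then is "cache" known to still exist
   *     savePages(cache);
   *   }
   *
   * After the launches the function tests only the stack copies; it touches
   * the cache again only once hasUnflushedPages proves it cannot have been
   * freed.
   */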
- savePages(cache); - } -} - -/**********************************************************************/ -void releaseVDOPageCompletion(VDOCompletion *completion) -{ - if (completion == NULL) { - return; - } - - PageInfo *discardInfo = NULL; - VDOPageCompletion *pageCompletion; - if (completion->result == VDO_SUCCESS) { - pageCompletion = validateCompletedPage(completion, false); - if (--pageCompletion->info->busy == 0) { - discardInfo = pageCompletion->info; - } - } else { - // Do not check for errors if the completion was not successful. - pageCompletion = asVDOPageCompletion(completion); - } - ASSERT_LOG_ONLY((pageCompletion->waiter.nextWaiter == NULL), - "Page being released after leaving all queues"); - - VDOPageCache *cache = pageCompletion->cache; - assertOnCacheThread(cache, __func__); - memset(pageCompletion, 0, sizeof(VDOPageCompletion)); - - if (discardInfo != NULL) { - if (discardInfo->writeStatus == WRITE_STATUS_DEFERRED) { - discardInfo->writeStatus = WRITE_STATUS_NORMAL; - launchPageSave(discardInfo); - } - // if there are excess requests for pages (that have not already started - // discards) we need to discard some page (which may be this one) - discardPageIfNeeded(cache); - } -} - -/** - * Helper function to load a page as described by a VDO Page Completion. - * - * @param info the page info representing where to load the page - * @param vdoPageComp the VDO Page Completion describing the page - **/ -static void loadPageForCompletion(PageInfo *info, - VDOPageCompletion *vdoPageComp) -{ - int result = enqueueWaiter(&info->waiting, &vdoPageComp->waiter); - if (result != VDO_SUCCESS) { - finishCompletion(&vdoPageComp->completion, result); - return; - } - - result = launchPageLoad(info, vdoPageComp->pbn); - if (result != VDO_SUCCESS) { - distributeErrorOverQueue(result, &info->waiting); - } -} - -/**********************************************************************/ -void getVDOPageAsync(VDOCompletion *completion) -{ - VDOPageCompletion *vdoPageComp = asVDOPageCompletion(completion); - VDOPageCache *cache = vdoPageComp->cache; - assertOnCacheThread(cache, __func__); - - if (vdoPageComp->writable && isReadOnly(cache->zone->readOnlyNotifier)) { - finishCompletion(completion, VDO_READ_ONLY); - return; - } - - if (vdoPageComp->writable) { - relaxedAdd64(&cache->stats.writeCount, 1); - } else { - relaxedAdd64(&cache->stats.readCount, 1); - } - - PageInfo *info = vpcFindPage(cache, vdoPageComp->pbn); - if (info != NULL) { - // The page is in the cache already. - if ((info->writeStatus == WRITE_STATUS_DEFERRED) || isIncoming(info) - || (isOutgoing(info) && vdoPageComp->writable)) { - // The page is unusable until it has finished I/O. - relaxedAdd64(&cache->stats.waitForPage, 1); - int result = enqueueWaiter(&info->waiting, &vdoPageComp->waiter); - if (result != VDO_SUCCESS) { - finishCompletion(&vdoPageComp->completion, result); - } - - return; - } - - if (isValid(info)) { - // The page is usable. - relaxedAdd64(&cache->stats.foundInCache, 1); - if (!isPresent(info)) { - relaxedAdd64(&cache->stats.readOutgoing, 1); - } - updateLru(info); - ++info->busy; - completeWithPage(info, vdoPageComp); - return; - } - // Something horrible has gone wrong. - ASSERT_LOG_ONLY(false, "Info found in a usable state."); - } - - // The page must be fetched. - info = findFreePage(cache); - if (info != NULL) { - relaxedAdd64(&cache->stats.fetchRequired, 1); - loadPageForCompletion(info, vdoPageComp); - return; - } - - // The page must wait for a page to be discarded. 
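  /*
   * Editor's sketch (not part of the original source): for reference, a
   * typical caller of this API looks roughly like the following, where
   * handlePage() and handlePageError() are illustrative names only:
   *
   *   VDOPageCompletion pageCompletion;
   *   initVDOPageCompletion(&pageCompletion, cache, pbn, false, parent,
   *                         handlePage, handlePageError);
   *   getVDOPageAsync(&pageCompletion.completion);
   *
   *   // later, in handlePage(VDOCompletion *completion):
   *   const void *page = dereferenceReadableVDOPage(completion);
   *   // ... examine the page ...
   *   releaseVDOPageCompletion(completion);  // page may now be reclaimed
   */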
- relaxedAdd64(&cache->stats.discardRequired, 1); - discardPageForCompletion(vdoPageComp); -} - -/**********************************************************************/ -void markCompletedVDOPageDirty(VDOCompletion *completion, - SequenceNumber oldDirtyPeriod, - SequenceNumber newDirtyPeriod) -{ - VDOPageCompletion *vdoPageComp = validateCompletedPage(completion, true); - if (vdoPageComp == NULL) { - return; - } - - PageInfo *info = vdoPageComp->info; - setInfoState(info, PS_DIRTY); - addToDirtyLists(info->cache->dirtyLists, &info->listNode, oldDirtyPeriod, - newDirtyPeriod); -} - -/**********************************************************************/ -void requestVDOPageWrite(VDOCompletion *completion) -{ - VDOPageCompletion *vdoPageComp = validateCompletedPage(completion, true); - if (vdoPageComp == NULL) { - return; - } - - PageInfo *info = vdoPageComp->info; - setInfoState(info, PS_DIRTY); - launchPageSave(info); -} - -/**********************************************************************/ -static void *dereferencePageCompletion(VDOPageCompletion *completion) -{ - return ((completion != NULL) ? getPageBuffer(completion->info) : NULL); -} - -/**********************************************************************/ -const void *dereferenceReadableVDOPage(VDOCompletion *completion) -{ - return dereferencePageCompletion(validateCompletedPage(completion, false)); -} - -/**********************************************************************/ -void *dereferenceWritableVDOPage(VDOCompletion *completion) -{ - return dereferencePageCompletion(validateCompletedPage(completion, true)); -} - -/**********************************************************************/ -void *getVDOPageCompletionContext(VDOCompletion *completion) -{ - VDOPageCompletion *pageCompletion = asVDOPageCompletion(completion); - PageInfo *info = ((pageCompletion != NULL) ? pageCompletion->info : NULL); - return (((info != NULL) && isValid(info)) ? info->context : NULL); -} - -/**********************************************************************/ -void drainVDOPageCache(VDOPageCache *cache) -{ - assertOnCacheThread(cache, __func__); - ASSERT_LOG_ONLY(isDraining(&cache->zone->state), - "drainVDOPageCache() called during block map drain"); - - if (!isSuspending(&cache->zone->state)) { - flushDirtyLists(cache->dirtyLists); - savePages(cache); - } -} - -/**********************************************************************/ -int invalidateVDOPageCache(VDOPageCache *cache) -{ - assertOnCacheThread(cache, __func__); - - // Make sure we don't throw away any dirty pages. - PageInfo *info; - for (info = cache->infos; info < cache->infos + cache->pageCount; info++) { - int result = ASSERT(!isDirty(info), "cache must have no dirty pages"); - if (result != VDO_SUCCESS) { - return result; - } - } - - // Reset the pageMap by re-allocating it. - freeIntMap(&cache->pageMap); - return makeIntMap(cache->pageCount, 0, &cache->pageMap); -} diff --git a/vdo/base/vdoPageCache.h b/vdo/base/vdoPageCache.h deleted file mode 100644 index e6a944d..0000000 --- a/vdo/base/vdoPageCache.h +++ /dev/null @@ -1,385 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoPageCache.h#7 $ - */ - -#ifndef VDO_PAGE_CACHE_H -#define VDO_PAGE_CACHE_H - -#include "adminState.h" -#include "atomic.h" -#include "completion.h" -#include "types.h" -#include "waitQueue.h" - -/** - * Structure describing page meta data (defined internally). - **/ -typedef struct pageInfo PageInfo; - -/** - * Structure describing entire page cache. - * (Unfortunately the name "PageCache" is already taken by Albireo.) - **/ -typedef struct vdoPageCache VDOPageCache; - -/** - * Generation counter for page references. - **/ -typedef uint32_t VDOPageGeneration; - -/** - * Page-state count statistics sub-structure. - **/ -typedef struct { - /* free pages */ - Atomic64 freePages; - /* clean (resident) pages */ - Atomic64 cleanPages; - /* dirty pages per era */ - Atomic64 dirtyPages; - /* pages incoming */ - Atomic64 incomingPages; - /* pages outgoing */ - Atomic64 outgoingPages; - /* pages in failed state */ - Atomic64 failedPages; -} AtomicPageStateCounts; - -/** - * Statistics and debugging fields for the page cache. - */ -typedef struct { - /* counts of how many pages are in each state */ - AtomicPageStateCounts counts; - /* how many times free page not available */ - Atomic64 cachePressure; - /* number of getVDOPageAsync() for read */ - Atomic64 readCount; - /* number or getVDOPageAsync() for write */ - Atomic64 writeCount; - /* number of times pages failed to read */ - Atomic64 failedReads; - /* number of times pages failed to write */ - Atomic64 failedWrites; - /* number of gets that are reclaimed */ - Atomic64 reclaimed; - /* number of gets for outgoing pages */ - Atomic64 readOutgoing; - /* number of gets that were already there */ - Atomic64 foundInCache; - /* number of gets requiring discard */ - Atomic64 discardRequired; - /* number of gets enqueued for their page */ - Atomic64 waitForPage; - /* number of gets that have to fetch */ - Atomic64 fetchRequired; - /* number of page fetches */ - Atomic64 pagesLoaded; - /* number of page saves */ - Atomic64 pagesSaved; - /* number of flushes initiated */ - Atomic64 flushCount; -} AtomicPageCacheStatistics; - -/** - * Signature for a function to call when a page is read into the cache. - * - *
If specified, this function is called when a page is fetched from disk. - * - * @param rawPage The raw memory of the freshly-fetched page - * @param pbn The absolute physical block number of the page - * @param zone The block map zone to which the cache belongs - * @param pageContext A pointer to client-specific data for the new page - * - * @return VDO_SUCCESS on success or VDO_BAD_PAGE if the page is incorrectly - * formatted - **/ -typedef int VDOPageReadFunction(void *rawPage, - PhysicalBlockNumber pbn, - BlockMapZone *zone, - void *pageContext); - -/** - * Signature for a function to call when a page is written from the cache. - * - *
If specified, this function is called when a page is written to disk. - * - * @param rawPage The raw memory of the freshly-written page - * @param zone The block map zone to which the cache belongs - * @param pageContext A pointer to client-specific data for the new page - * - * @return whether the page needs to be rewritten - **/ -typedef bool VDOPageWriteFunction(void *rawPage, - BlockMapZone *zone, - void *pageContext); - -/** - * Construct a PageCache. - * - * @param [in] layer The physical layer to read and write - * @param [in] pageCount The number of cache pages to hold - * @param [in] readHook The function to be called when a page is read - * into the cache - * @param [in] writeHook The function to be called after a page is - * written from the cache - * @param [in] pageContextSize The size of the per-page context that will be - * passed to the read and write hooks - * @param [in] maximumAge The number of journal blocks before a dirtied - * page is considered old and must be written - * out - * @param [in] zone The block map zone which owns this cache - * @param [out] cachePtr A pointer to hold the cache - * - * @return a success or error code - **/ -int makeVDOPageCache(PhysicalLayer *layer, - PageCount pageCount, - VDOPageReadFunction *readHook, - VDOPageWriteFunction *writeHook, - size_t pageContextSize, - BlockCount maximumAge, - BlockMapZone *zone, - VDOPageCache **cachePtr) - __attribute__((warn_unused_result)); - -/** - * Free the page cache structure and null out the reference to it. - * - * @param cachePtr a pointer to the cache to free - **/ -void freeVDOPageCache(VDOPageCache **cachePtr); - -/** - * Set the initial dirty period for a page cache. - * - * @param cache The cache - * @param period The initial dirty period to set - **/ -void setVDOPageCacheInitialPeriod(VDOPageCache *cache, SequenceNumber period); - -/** - * Switch the page cache into or out of read-only rebuild mode. - * - * @param cache The cache - * @param rebuilding true if the cache should be put into - * read-only rebuild mode, false otherwise - **/ -void setVDOPageCacheRebuildMode(VDOPageCache *cache, bool rebuilding); - -/** - * Check whether a page cache is active (i.e. has any active lookups, - * outstanding I/O, or pending I/O). - * - * @param cache The cache to check - * - * @return true if the cache is active - **/ -bool isPageCacheActive(VDOPageCache *cache) - __attribute__((warn_unused_result)); - -/** - * Advance the dirty period for a page cache. - * - * @param cache The cache to advance - * @param period The new dirty period - **/ -void advanceVDOPageCachePeriod(VDOPageCache *cache, SequenceNumber period); - -/** - * Write one or more batches of dirty pages. - * - * All writable pages in the ancient era and some number in the old era - * are scheduled for writing. - * - * @param cache the VDO page cache - * @param batches how many batches to write now - * @param total how many batches (including those being written now) remain - * in this era - **/ -void writeVDOPageCachePages(VDOPageCache *cache, - size_t batches, - size_t total); - -/** - * Rotate the dirty page eras. - * - * Move all pages in the old era to the ancient era and then move - * the current era bin into the old era. - * - * @param cache the VDO page cache - **/ -void rotateVDOPageCacheEras(VDOPageCache *cache); - -// ASYNC - -/** - * A completion awaiting a specific page. Also a live reference into the - * page once completed, until freed. 
- **/ -typedef struct { - /** The generic completion */ - VDOCompletion completion; - /** The cache involved */ - VDOPageCache *cache; - /** The waiter for the pending list */ - Waiter waiter; - /** The absolute physical block number of the page on disk */ - PhysicalBlockNumber pbn; - /** Whether the page may be modified */ - bool writable; - /** Whether the page is available */ - bool ready; - /** The info structure for the page, only valid when ready */ - PageInfo *info; -} VDOPageCompletion; - -/** - * Initialize a VDO Page Completion, requesting a particular page from the - * cache. - * - * @param pageCompletion The VDOPageCompletion to initialize - * @param cache The VDO page cache - * @param pbn The absolute physical block of the desired page - * @param writable Whether the page can be modified - * @param parent The parent object - * @param callback The completion callback - * @param errorHandler The handler for page errors - * - * @note Once a completion has occurred for the getVDOPageAsync operation, - * the underlying page shall be busy (stuck in memory) until the - * VDOCompletion returned by this operation has been released. - **/ -void initVDOPageCompletion(VDOPageCompletion *pageCompletion, - VDOPageCache *cache, - PhysicalBlockNumber pbn, - bool writable, - void *parent, - VDOAction *callback, - VDOAction *errorHandler); - -/** - * Release a VDO Page Completion. - * - * The page referenced by this completion (if any) will no longer be - * held busy by this completion. If a page becomes discardable and - * there are completions awaiting free pages then a new round of - * page discarding is started. - * - * @param completion The completion to release - **/ -void releaseVDOPageCompletion(VDOCompletion *completion); - -/** - * Asynchronous operation to get a VDO page. - * - * May cause another page to be discarded (potentially writing a dirty page) - * and the one nominated by the completion to be loaded from disk. - * - * When the page becomes available the callback registered in the completion - * provided is triggered. Once triggered the page is marked busy until - * the completion is destroyed. - * - * @param completion the completion initialized my initVDOPageCompletion(). - **/ -void getVDOPageAsync(VDOCompletion *completion); - -/** - * Mark a VDO page referenced by a completed VDOPageCompletion as dirty. - * - * @param completion a VDO Page Completion whose callback has been called - * @param oldDirtyPeriod the period in which the page was already dirty (0 if - * it wasn't) - * @param newDirtyPeriod the period in which the page is now dirty - **/ -void markCompletedVDOPageDirty(VDOCompletion *completion, - SequenceNumber oldDirtyPeriod, - SequenceNumber newDirtyPeriod); - -/** - * Request that a VDO page be written out as soon as it is not busy. - * - * @param completion the VDOPageCompletion containing the page - **/ -void requestVDOPageWrite(VDOCompletion *completion); - -/** - * Access the raw memory for a read-only page of a completed VDOPageCompletion. - * - * @param completion a vdo page completion whose callback has been called - * - * @return a pointer to the raw memory at the beginning of the page, or - * NULL if the page is not available. - **/ -const void *dereferenceReadableVDOPage(VDOCompletion *completion); - -/** - * Access the raw memory for a writable page of a completed VDOPageCompletion. 
- * - * @param completion a vdo page completion whose callback has been called - * - * @return a pointer to the raw memory at the beginning of the page, or - * NULL if the page is not available, or if the page is read-only - **/ -void *dereferenceWritableVDOPage(VDOCompletion *completion); - -/** - * Get the per-page client context for the page in a page completion whose - * callback has been invoked. Should only be called after dereferencing the - * page completion to validate the page. - * - * @param completion a vdo page completion whose callback has been invoked - * - * @return a pointer to the per-page client context, or NULL if - * the page is not available - **/ -void *getVDOPageCompletionContext(VDOCompletion *completion); - -/** - * Drain I/O for a page cache. - * - * @param cache The cache to drain - **/ -void drainVDOPageCache(VDOPageCache *cache); - -/** - * Invalidate all entries in the VDO page cache. There must not be any - * dirty pages in the cache. - * - * @param cache the cache to invalidate - * - * @return a success or error code - **/ -int invalidateVDOPageCache(VDOPageCache *cache) - __attribute__((warn_unused_result)); - -// STATISTICS & TESTING - -/** - * Get current cache statistics. - * - * @param cache the page cache - * - * @return the statistics - **/ -AtomicPageCacheStatistics *getVDOPageCacheStatistics(VDOPageCache *cache) - __attribute__((warn_unused_result)); - -#endif // VDO_PAGE_CACHE_H diff --git a/vdo/base/vdoPageCacheInternals.h b/vdo/base/vdoPageCacheInternals.h deleted file mode 100644 index 4e2c67f..0000000 --- a/vdo/base/vdoPageCacheInternals.h +++ /dev/null @@ -1,295 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoPageCacheInternals.h#8 $ - */ - -#ifndef VDO_PAGE_CACHE_INTERNALS_H -#define VDO_PAGE_CACHE_INTERNALS_H - -#include "vdoPageCache.h" - -#ifndef __KERNEL__ -# include -#endif - -#include "blockMapInternals.h" -#include "completion.h" -#include "dirtyLists.h" -#include "intMap.h" -#include "physicalLayer.h" -#include "ringNode.h" - -enum { - MAX_PAGE_CONTEXT_SIZE = 8, -}; - -static const PhysicalBlockNumber NO_PAGE = 0xFFFFFFFFFFFFFFFF; - -/** - * A PageInfoNode is a ring node. - **/ -typedef RingNode PageInfoNode; - -/** - * The VDO Page Cache abstraction. 
- **/ -struct vdoPageCache { - /** the physical layer to page to */ - PhysicalLayer *layer; - /** number of pages in cache */ - PageCount pageCount; - /** function to call on page read */ - VDOPageReadFunction *readHook; - /** function to call on page write */ - VDOPageWriteFunction *writeHook; - /** number of pages to write in the current batch */ - PageCount pagesInBatch; - /** Whether the VDO is doing a read-only rebuild */ - bool rebuilding; - - /** array of page information entries */ - PageInfo *infos; - /** raw memory for pages */ - char *pages; - /** cache last found page info */ - PageInfo *lastFound; - /** map of page number to info */ - IntMap *pageMap; - /** master LRU list (all infos) */ - PageInfoNode lruList; - /** dirty pages by period */ - DirtyLists *dirtyLists; - /** free page list (oldest first) */ - PageInfoNode freeList; - /** outgoing page list */ - PageInfoNode outgoingList; - /** number of read I/O operations pending */ - PageCount outstandingReads; - /** number of write I/O operations pending */ - PageCount outstandingWrites; - /** number of pages covered by the current flush */ - PageCount pagesInFlush; - /** number of pages waiting to be included in the next flush */ - PageCount pagesToFlush; - /** number of discards in progress */ - unsigned int discardCount; - /** how many VPCs waiting for free page */ - unsigned int waiterCount; - /** queue of waiters who want a free page */ - WaitQueue freeWaiters; - /** statistics */ - AtomicPageCacheStatistics stats; - /** counter for pressure reports */ - uint32_t pressureReport; - /** the block map zone to which this cache belongs */ - BlockMapZone *zone; -}; - -/** - * The state of a page buffer. If the page buffer is free no particular page is - * bound to it, otherwise the page buffer is bound to particular page whose - * absolute pbn is in the pbn field. If the page is resident or dirty the page - * data is stable and may be accessed. Otherwise the page is in flight - * (incoming or outgoing) and its data should not be accessed. - * - * @note Update the static data in vpcPageStateName() and vpcPageStateFlag() - * if you change this enumeration. - **/ -typedef enum __attribute__((packed)) pageState { - /* this page buffer is not being used */ - PS_FREE, - /* this page is being read from store */ - PS_INCOMING, - /* attempt to load this page failed */ - PS_FAILED, - /* this page is valid and un-modified */ - PS_RESIDENT, - /* this page is valid and modified */ - PS_DIRTY, - /* this page is being written and should not be used */ - PS_OUTGOING, - /* not a state */ - PAGE_STATE_COUNT, -} PageState; - -/** - * The write status of page - **/ -typedef enum __attribute__((packed)) { - WRITE_STATUS_NORMAL, - WRITE_STATUS_DISCARD, - WRITE_STATUS_DEFERRED, -} WriteStatus; - -/** - * Per-page-slot information. 
- **/ -struct pageInfo { - /** Preallocated page VIO */ - VIO *vio; - /** back-link for references */ - VDOPageCache *cache; - /** the pbn of the page */ - PhysicalBlockNumber pbn; - /** page is busy (temporarily locked) */ - uint16_t busy; - /** the write status the page */ - WriteStatus writeStatus; - /** page state */ - PageState state; - /** queue of completions awaiting this item */ - WaitQueue waiting; - /** state linked list node */ - PageInfoNode listNode; - /** LRU node */ - PageInfoNode lruNode; - /** Space for per-page client data */ - byte context[MAX_PAGE_CONTEXT_SIZE]; -}; - -// PAGE INFO LIST OPERATIONS - -/**********************************************************************/ -static inline PageInfo *pageInfoFromListNode(PageInfoNode *node) -{ - if (node == NULL) { - return NULL; - } - return (PageInfo *) ((uintptr_t) node - offsetof(PageInfo, listNode)); -} - -/**********************************************************************/ -static inline PageInfo *pageInfoFromLRUNode(PageInfoNode *node) -{ - if (node == NULL) { - return NULL; - } - return (PageInfo *) ((uintptr_t) node - offsetof(PageInfo, lruNode)); -} - -// PAGE INFO STATE ACCESSOR FUNCTIONS - -/**********************************************************************/ -static inline bool isFree(const PageInfo *info) -{ - return info->state == PS_FREE; -} - -/**********************************************************************/ -static inline bool isAvailable(const PageInfo *info) -{ - return (info->state == PS_FREE) || (info->state == PS_FAILED); -} - -/**********************************************************************/ -static inline bool isPresent(const PageInfo *info) -{ - return (info->state == PS_RESIDENT) || (info->state == PS_DIRTY); -} - -/**********************************************************************/ -static inline bool isDirty(const PageInfo *info) -{ - return info->state == PS_DIRTY; -} - -/**********************************************************************/ -static inline bool isResident(const PageInfo *info) -{ - return info->state == PS_RESIDENT; -} - -/**********************************************************************/ -static inline bool isInFlight(const PageInfo *info) -{ - return (info->state == PS_INCOMING) || (info->state == PS_OUTGOING); -} - -/**********************************************************************/ -static inline bool isIncoming(const PageInfo *info) -{ - return info->state == PS_INCOMING; -} - -/**********************************************************************/ -static inline bool isOutgoing(const PageInfo *info) -{ - return info->state == PS_OUTGOING; -} - -/**********************************************************************/ -static inline bool isValid(const PageInfo *info) -{ - return isPresent(info) || isOutgoing(info); -} - -// COMPLETION CONVERSIONS - -/**********************************************************************/ -static inline VDOPageCompletion *asVDOPageCompletion(VDOCompletion *completion) -{ - assertCompletionType(completion->type, VDO_PAGE_COMPLETION); - return (VDOPageCompletion *) ((uintptr_t) completion - - offsetof(VDOPageCompletion, completion)); -} - -/**********************************************************************/ -static inline -VDOPageCompletion *pageCompletionFromWaiter(Waiter *waiter) -{ - if (waiter == NULL) { - return NULL; - } - - VDOPageCompletion *completion = (VDOPageCompletion *) - ((uintptr_t) waiter - offsetof(VDOPageCompletion, waiter)); - assertCompletionType(completion->completion.type, 
VDO_PAGE_COMPLETION); - return completion; -} - -// COMMONLY USED FUNCTIONS - -// All of these functions are prefixed "vpc" in order to prevent namespace -// issues (ordinarily they would be static). - -/** - * Find the page info (if any) associated with a given pbn. - * - * @param cache the page cache - * @param pbn the absolute physical block number of the page - * - * @return the page info for the page if available, or NULL if not - **/ -PageInfo *vpcFindPage(VDOPageCache *cache, PhysicalBlockNumber pbn) - __attribute__((warn_unused_result)); - -/** - * Return the name of a page state. - * - * @param state a page state - * - * @return a pointer to a static page state name - * - * @note If the page state is invalid a static string is returned and the - * invalid state is logged. - **/ -const char *vpcPageStateName(PageState state) - __attribute__((warn_unused_result)); - -#endif // VDO_PAGE_CACHE_INTERNALS_H diff --git a/vdo/base/vdoRecovery.c b/vdo/base/vdoRecovery.c deleted file mode 100644 index 97e72eb..0000000 --- a/vdo/base/vdoRecovery.c +++ /dev/null @@ -1,1257 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoRecovery.c#16 $ - */ - -#include "vdoRecoveryInternals.h" - -#include "logger.h" -#include "memoryAlloc.h" - -#include "blockAllocator.h" -#include "blockAllocatorInternals.h" -#include "blockMapInternals.h" -#include "blockMapPage.h" -#include "blockMapRecovery.h" -#include "completion.h" -#include "numUtils.h" -#include "packedRecoveryJournalBlock.h" -#include "recoveryJournal.h" -#include "recoveryUtils.h" -#include "slab.h" -#include "slabDepot.h" -#include "slabJournal.h" -#include "slabJournalInternals.h" -#include "vdoInternal.h" -#include "waitQueue.h" - -enum { - // The int map needs capacity of twice the number of VIOs in the system. - INT_MAP_CAPACITY = MAXIMUM_USER_VIOS * 2, - // There can be as many missing decrefs as there are VIOs in the system. - MAXIMUM_SYNTHESIZED_DECREFS = MAXIMUM_USER_VIOS, -}; - -typedef struct missingDecref { - /** A waiter for queueing this object */ - Waiter waiter; - /** The parent of this object */ - RecoveryCompletion *recovery; - /** Whether this decref is complete */ - bool complete; - /** The slot for which the last decref was lost */ - BlockMapSlot slot; - /** The penultimate block map entry for this LBN */ - DataLocation penultimateMapping; - /** The page completion used to fetch the block map page for this LBN */ - VDOPageCompletion pageCompletion; - /** The journal point which will be used for this entry */ - JournalPoint journalPoint; - /** The slab journal to which this entry will be applied */ - SlabJournal *slabJournal; -} MissingDecref; - -/** - * Convert a Waiter to the missing decref of which it is a part. 
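 *
 * Editor's note (added commentary, not part of the original source): the
 * waiter is deliberately the first member of MissingDecref, so this
 * conversion is a plain pointer cast; the STATIC_ASSERT in the function body
 * enforces that layout. Compare pageCompletionFromWaiter(), which must use
 * the offsetof() subtraction because its waiter is not the first member.
 *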
- * - * @param waiter The Waiter to convert - * - * @return The MissingDecref wrapping the Waiter - **/ -__attribute__((warn_unused_result)) -static inline MissingDecref *asMissingDecref(Waiter *waiter) -{ - STATIC_ASSERT(offsetof(MissingDecref, waiter) == 0); - return (MissingDecref *) waiter; -} - -/** - * Enqueue a MissingDecref. If the enqueue fails, enter read-only mode. - * - * @param queue The queue on which to enqueue the decref - * @param decref The MissingDecref to enqueue - * - * @return VDO_SUCCESS or an error - **/ -static int enqueueMissingDecref(WaitQueue *queue, MissingDecref *decref) -{ - int result = enqueueWaiter(queue, &decref->waiter); - if (result != VDO_SUCCESS) { - enterReadOnlyMode(decref->recovery->vdo->readOnlyNotifier, result); - setCompletionResult(&decref->recovery->completion, result); - FREE(decref); - } - - return result; -} - -/** - * Convert a BlockMapSlot into a unique uint64_t. - * - * @param slot The block map slot to convert. - * - * @return a one-to-one mappable uint64_t. - **/ -static uint64_t slotAsNumber(BlockMapSlot slot) -{ - return (((uint64_t) slot.pbn << 10) + slot.slot); -} - -/** - * Create a MissingDecref and enqueue it to wait for a determination of its - * penultimate mapping. - * - * @param [in] recovery The parent recovery completion - * @param [in] entry The recovery journal entry for the increment which is - * missing a decref - * @param [out] decrefPtr A pointer to hold the new MissingDecref - * - * @return VDO_SUCCESS or an error code - **/ -__attribute__((warn_unused_result)) -static int makeMissingDecref(RecoveryCompletion *recovery, - RecoveryJournalEntry entry, - MissingDecref **decrefPtr) -{ - MissingDecref *decref; - int result = ALLOCATE(1, MissingDecref, __func__, &decref); - if (result != VDO_SUCCESS) { - return result; - } - - decref->recovery = recovery; - result = enqueueMissingDecref(&recovery->missingDecrefs[0], decref); - if (result != VDO_SUCCESS) { - return result; - } - - /* - * Each synthsized decref needs a unique journal point. Otherwise, in the - * event of a crash, we would be unable to tell which synthesized decrefs had - * already been committed in the slab journals. Instead of using real - * recovery journal space for this, we can use fake journal points between - * the last currently valid entry in the tail block and the first journal - * entry in the next block. We can't overflow the entry count since the - * number of synthesized decrefs is bounded by the DataVIO limit. - * - * It is vital that any given missing decref always have the same fake - * journal point since a failed recovery may be retried with a different - * number of zones after having written out some slab journal blocks. Since - * the missing decrefs are always read out of the journal in the same order, - * we can assign them a journal point when they are read. Their subsequent - * use will ensure that, for any given slab journal, they are applied in - * the order dictated by these assigned journal points. - */ - decref->slot = entry.slot; - decref->journalPoint = recovery->nextSynthesizedJournalPoint; - recovery->nextSynthesizedJournalPoint.entryCount++; - recovery->missingDecrefCount++; - recovery->incompleteDecrefCount++; - - *decrefPtr = decref; - return VDO_SUCCESS; -} - -/** - * Move the given recovery point forward by one entry. 
- * - * @param point The recovery point to alter - **/ -static void incrementRecoveryPoint(RecoveryPoint *point) -{ - point->entryCount++; - if ((point->sectorCount == (SECTORS_PER_BLOCK - 1)) - && (point->entryCount == RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR)) { - point->sequenceNumber++; - point->sectorCount = 1; - point->entryCount = 0; - } - - if (point->entryCount == RECOVERY_JOURNAL_ENTRIES_PER_SECTOR) { - point->sectorCount++; - point->entryCount = 0; - return; - } -} - -/** - * Move the given recovery point backwards by one entry. - * - * @param point The recovery point to alter - **/ -static void decrementRecoveryPoint(RecoveryPoint *point) -{ - STATIC_ASSERT(RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR > 0); - - if ((point->sectorCount <= 1) && (point->entryCount == 0)) { - point->sequenceNumber--; - point->sectorCount = SECTORS_PER_BLOCK - 1; - point->entryCount = RECOVERY_JOURNAL_ENTRIES_PER_LAST_SECTOR - 1; - return; - } - - if (point->entryCount == 0) { - point->sectorCount--; - point->entryCount = RECOVERY_JOURNAL_ENTRIES_PER_SECTOR - 1; - return; - } - - point->entryCount--; -} - -/** - * Check whether the first point precedes the second point. - * - * @param first The first recovery point - * @param second The second recovery point - * - * @return true if the first point precedes the second point - **/ -__attribute__((warn_unused_result)) -static bool beforeRecoveryPoint(const RecoveryPoint *first, - const RecoveryPoint *second) -{ - if (first->sequenceNumber < second->sequenceNumber) { - return true; - } - - if (first->sequenceNumber > second->sequenceNumber) { - return false; - } - - if (first->sectorCount < second->sectorCount) { - return true; - } - - return ((first->sectorCount == second->sectorCount) - && (first->entryCount < second->entryCount)); -} - -/** - * Prepare the sub-task completion. - * - * @param recovery The RecoveryCompletion whose sub-task completion is to - * be prepared - * @param callback The callback to register for the next sub-task - * @param errorHandler The error handler for the next sub-task - * @param zoneType The type of zone on which the callback or errorHandler - * should run - **/ -static void prepareSubTask(RecoveryCompletion *recovery, - VDOAction callback, - VDOAction errorHandler, - ZoneType zoneType) -{ - const ThreadConfig *threadConfig = getThreadConfig(recovery->vdo); - ThreadID threadID; - switch (zoneType) { - case ZONE_TYPE_LOGICAL: - // All blockmap access is done on single thread, so use logical zone 0. 
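  /*
   * Editor's note (added commentary, not part of the original source): this
   * switch maps the requested zone type to a thread -- ZONE_TYPE_LOGICAL runs
   * the sub-task on logical zone 0's thread (all block map work during
   * recovery is funnelled through that one zone), ZONE_TYPE_PHYSICAL runs it
   * on the thread of the allocator currently being recovered, and
   * ZONE_TYPE_ADMIN (also the default) runs it on the admin thread.
   */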
- threadID = getLogicalZoneThread(threadConfig, 0); - break; - - case ZONE_TYPE_PHYSICAL: - threadID = recovery->allocator->threadID; - break; - - case ZONE_TYPE_ADMIN: - default: - threadID = getAdminThread(threadConfig); - } - - prepareCompletion(&recovery->subTaskCompletion, callback, errorHandler, - threadID, recovery); -} - -/**********************************************************************/ -int makeRecoveryCompletion(VDO *vdo, RecoveryCompletion **recoveryPtr) -{ - const ThreadConfig *threadConfig = getThreadConfig(vdo); - RecoveryCompletion *recovery; - int result = ALLOCATE_EXTENDED(RecoveryCompletion, - threadConfig->physicalZoneCount, RingNode, - __func__, &recovery); - if (result != VDO_SUCCESS) { - return result; - } - - recovery->vdo = vdo; - for (ZoneCount z = 0; z < threadConfig->physicalZoneCount; z++) { - initializeWaitQueue(&recovery->missingDecrefs[z]); - } - - result = initializeEnqueueableCompletion(&recovery->completion, - RECOVERY_COMPLETION, vdo->layer); - if (result != VDO_SUCCESS) { - freeRecoveryCompletion(&recovery); - return result; - } - - result = initializeEnqueueableCompletion(&recovery->subTaskCompletion, - SUB_TASK_COMPLETION, vdo->layer); - if (result != VDO_SUCCESS) { - freeRecoveryCompletion(&recovery); - return result; - } - - result = makeIntMap(INT_MAP_CAPACITY, 0, &recovery->slotEntryMap); - if (result != VDO_SUCCESS) { - freeRecoveryCompletion(&recovery); - return result; - } - - *recoveryPtr = recovery; - return VDO_SUCCESS; -} - -/** - * A waiter callback to free MissingDecrefs. - * - * Implements WaiterCallback. - **/ -static void freeMissingDecref(Waiter *waiter, - void *context __attribute__((unused))) -{ - FREE(asMissingDecref(waiter)); -} - -/**********************************************************************/ -void freeRecoveryCompletion(RecoveryCompletion **recoveryPtr) -{ - RecoveryCompletion *recovery = *recoveryPtr; - if (recovery == NULL) { - return; - } - - freeIntMap(&recovery->slotEntryMap); - const ThreadConfig *threadConfig = getThreadConfig(recovery->vdo); - for (ZoneCount z = 0; z < threadConfig->physicalZoneCount; z++) { - notifyAllWaiters(&recovery->missingDecrefs[z], freeMissingDecref, NULL); - } - - FREE(recovery->journalData); - FREE(recovery->entries); - destroyEnqueueable(&recovery->subTaskCompletion); - destroyEnqueueable(&recovery->completion); - FREE(recovery); - *recoveryPtr = NULL; -} - -/** - * Finish recovering, free the recovery completion and notify the parent. - * - * @param completion The recovery completion - **/ -static void finishRecovery(VDOCompletion *completion) -{ - VDOCompletion *parent = completion->parent; - RecoveryCompletion *recovery = asRecoveryCompletion(completion); - VDO *vdo = recovery->vdo; - uint64_t recoveryCount = ++vdo->completeRecoveries; - initializeRecoveryJournalPostRecovery(vdo->recoveryJournal, - recoveryCount, recovery->highestTail); - freeRecoveryCompletion(&recovery); - logInfo("Rebuild complete."); - - // Now that we've freed the recovery completion and its vast array of - // journal entries, we can allocate refcounts. - int result = allocateSlabRefCounts(vdo->depot); - finishCompletion(parent, result); -} - -/** - * Handle a recovery error. 
- * - * @param completion The recovery completion - **/ -static void abortRecovery(VDOCompletion *completion) -{ - VDOCompletion *parent = completion->parent; - int result = completion->result; - RecoveryCompletion *recovery = asRecoveryCompletion(completion); - freeRecoveryCompletion(&recovery); - logWarning("Recovery aborted"); - finishCompletion(parent, result); -} - -/** - * Abort a recovery if there is an error. - * - * @param result The result to check - * @param recovery The recovery completion - * - * @return true if the result was an error - **/ -__attribute__((warn_unused_result)) -static bool abortRecoveryOnError(int result, RecoveryCompletion *recovery) -{ - if (result == VDO_SUCCESS) { - return false; - } - - finishCompletion(&recovery->completion, result); - return true; -} - -/** - * Unpack the recovery journal entry associated with the given recovery point. - * - * @param recovery The recovery completion - * @param point The recovery point - * - * @return The unpacked contents of the matching recovery journal entry - **/ -static RecoveryJournalEntry getEntry(const RecoveryCompletion *recovery, - const RecoveryPoint *point) -{ - RecoveryJournal *journal = recovery->vdo->recoveryJournal; - PhysicalBlockNumber blockNumber - = getRecoveryJournalBlockNumber(journal, point->sequenceNumber); - off_t sectorOffset - = (blockNumber * VDO_BLOCK_SIZE) + (point->sectorCount * VDO_SECTOR_SIZE); - PackedJournalSector *sector - = (PackedJournalSector *) &recovery->journalData[sectorOffset]; - return unpackRecoveryJournalEntry(§or->entries[point->entryCount]); -} - -/** - * Create an array of all valid journal entries, in order, and store it in the - * recovery completion. - * - * @param recovery The recovery completion - * - * @return VDO_SUCCESS or an error code - **/ -static int extractJournalEntries(RecoveryCompletion *recovery) -{ - // Allocate a NumberedBlockMapping array just large enough to transcribe - // every increment PackedRecoveryJournalEntry from every valid journal block. - int result = ALLOCATE(recovery->increfCount, NumberedBlockMapping, __func__, - &recovery->entries); - if (result != VDO_SUCCESS) { - return result; - } - - RecoveryPoint recoveryPoint = { - .sequenceNumber = recovery->blockMapHead, - .sectorCount = 1, - .entryCount = 0, - }; - while (beforeRecoveryPoint(&recoveryPoint, &recovery->tailRecoveryPoint)) { - RecoveryJournalEntry entry = getEntry(recovery, &recoveryPoint); - result = validateRecoveryJournalEntry(recovery->vdo, &entry); - if (result != VDO_SUCCESS) { - enterReadOnlyMode(recovery->vdo->readOnlyNotifier, result); - return result; - } - - if (isIncrementOperation(entry.operation)) { - recovery->entries[recovery->entryCount] = (NumberedBlockMapping) { - .blockMapSlot = entry.slot, - .blockMapEntry = packPBN(entry.mapping.pbn, entry.mapping.state), - .number = recovery->entryCount, - }; - recovery->entryCount++; - } - - incrementRecoveryPoint(&recoveryPoint); - } - - result = ASSERT((recovery->entryCount <= recovery->increfCount), - "approximate incref count is an upper bound"); - if (result != VDO_SUCCESS) { - enterReadOnlyMode(recovery->vdo->readOnlyNotifier, result); - } - - return result; -} - -/** - * Extract journal entries and recover the block map. This callback is - * registered in startSuperBlockSave(). 
- * - * @param completion The sub-task completion - **/ -static void launchBlockMapRecovery(VDOCompletion *completion) -{ - RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); - VDO *vdo = recovery->vdo; - assertOnLogicalZoneThread(vdo, 0, __func__); - - // Extract the journal entries for the block map recovery. - int result = extractJournalEntries(recovery); - if (abortRecoveryOnError(result, recovery)) { - return; - } - - prepareToFinishParent(completion, &recovery->completion); - recoverBlockMap(vdo, recovery->entryCount, recovery->entries, completion); -} - -/** - * Finish flushing all slab journals and start a write of the super block. - * This callback is registered in addSynthesizedEntries(). - * - * @param completion The sub-task completion - **/ -static void startSuperBlockSave(VDOCompletion *completion) -{ - RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); - VDO *vdo = recovery->vdo; - assertOnAdminThread(vdo, __func__); - - logInfo("Saving recovery progress"); - vdo->state = VDO_REPLAYING; - - // The block map access which follows the super block save must be done - // on a logical thread. - prepareSubTask(recovery, launchBlockMapRecovery, finishParentCallback, - ZONE_TYPE_LOGICAL); - saveVDOComponentsAsync(vdo, completion); -} - -/** - * The callback from loading the slab depot. It will update the logical blocks - * and block map data blocks counts in the recovery journal and then drain the - * slab depot in order to commit the recovered slab journals. It is registered - * in applyToDepot(). - * - * @param completion The sub-task completion - **/ -static void finishRecoveringDepot(VDOCompletion *completion) -{ - RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); - VDO *vdo = recovery->vdo; - assertOnAdminThread(vdo, __func__); - - logInfo("Replayed %zu journal entries into slab journals", - recovery->entriesAddedToSlabJournals); - logInfo("Synthesized %zu missing journal entries", - recovery->missingDecrefCount); - vdo->recoveryJournal->logicalBlocksUsed = recovery->logicalBlocksUsed; - vdo->recoveryJournal->blockMapDataBlocks = recovery->blockMapDataBlocks; - - prepareSubTask(recovery, startSuperBlockSave, finishParentCallback, - ZONE_TYPE_ADMIN); - drainSlabDepot(vdo->depot, ADMIN_STATE_RECOVERING, completion); -} - -/** - * The error handler for recovering slab journals. It will skip any remaining - * recovery on the current zone and propagate the error. It is registered in - * addSlabJournalEntries() and addSynthesizedEntries(). - * - * @param completion The completion of the block allocator being recovered - **/ -static void handleAddSlabJournalEntryError(VDOCompletion *completion) -{ - RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); - notifySlabJournalsAreRecovered(recovery->allocator, completion->result); -} - -/** - * Add synthesized entries into slab journals, waiting when necessary. 
- * - * @param completion The allocator completion - **/ -static void addSynthesizedEntries(VDOCompletion *completion) -{ - RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); - - // Get ready in case we need to enqueue again - prepareCompletion(completion, addSynthesizedEntries, - handleAddSlabJournalEntryError, - completion->callbackThreadID, recovery); - WaitQueue *missingDecrefs - = &recovery->missingDecrefs[recovery->allocator->zoneNumber]; - while (hasWaiters(missingDecrefs)) { - MissingDecref *decref = asMissingDecref(getFirstWaiter(missingDecrefs)); - if (!attemptReplayIntoSlabJournal(decref->slabJournal, - decref->penultimateMapping.pbn, - DATA_DECREMENT, &decref->journalPoint, - completion)) { - return; - } - - dequeueNextWaiter(missingDecrefs); - FREE(decref); - } - - notifySlabJournalsAreRecovered(recovery->allocator, VDO_SUCCESS); -} - -/** - * Determine the LBNs used count as of the end of the journal (but - * not including any changes to that count from entries that will be - * synthesized later). - * - * @param recovery The recovery completion - * - * @return VDO_SUCCESS or an error - **/ -static int computeUsages(RecoveryCompletion *recovery) -{ - RecoveryJournal *journal = recovery->vdo->recoveryJournal; - PackedJournalHeader *tailHeader - = getJournalBlockHeader(journal, recovery->journalData, recovery->tail); - - RecoveryBlockHeader unpacked; - unpackRecoveryBlockHeader(tailHeader, &unpacked); - recovery->logicalBlocksUsed = unpacked.logicalBlocksUsed; - recovery->blockMapDataBlocks = unpacked.blockMapDataBlocks; - - RecoveryPoint recoveryPoint = { - .sequenceNumber = recovery->tail, - .sectorCount = 1, - .entryCount = 0, - }; - while (beforeRecoveryPoint(&recoveryPoint, &recovery->tailRecoveryPoint)) { - RecoveryJournalEntry entry = getEntry(recovery, &recoveryPoint); - if (isMappedLocation(&entry.mapping)) { - switch (entry.operation) { - case DATA_INCREMENT: - recovery->logicalBlocksUsed++; - break; - - case DATA_DECREMENT: - recovery->logicalBlocksUsed--; - break; - - case BLOCK_MAP_INCREMENT: - recovery->blockMapDataBlocks++; - break; - - default: - return logErrorWithStringError(VDO_CORRUPT_JOURNAL, - "Recovery journal entry at " - "sequence number %" PRIu64 - ", sector %u, entry %u had invalid " - "operation %u", - recoveryPoint.sequenceNumber, - recoveryPoint.sectorCount, - recoveryPoint.entryCount, - entry.operation); - } - } - - incrementRecoveryPoint(&recoveryPoint); - } - - return VDO_SUCCESS; -} - -/** - * Advance the current recovery and journal points. - * - * @param recovery The RecoveryCompletion whose points are to be - * advanced - * @param entriesPerBlock The number of entries in a recovery journal block - **/ -static void advancePoints(RecoveryCompletion *recovery, - JournalEntryCount entriesPerBlock) -{ - incrementRecoveryPoint(&recovery->nextRecoveryPoint); - advanceJournalPoint(&recovery->nextJournalPoint, entriesPerBlock); -} - -/** - * Replay recovery journal entries into the slab journals of the allocator - * currently being recovered, waiting for slab journal tailblock space when - * necessary. This method is its own callback. - * - * @param completion The allocator completion - **/ -static void addSlabJournalEntries(VDOCompletion *completion) -{ - RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); - VDO *vdo = recovery->vdo; - RecoveryJournal *journal = vdo->recoveryJournal; - - // Get ready in case we need to enqueue again. 
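  /*
   * Editor's note (added commentary, not part of the original source): this
   * function is its own callback. It re-prepares the completion before doing
   * any work so that, when attemptReplayIntoSlabJournal() has no tail-block
   * space and returns false (the completion is re-run once space frees up),
   * the retry re-enters this function and the loop resumes from
   * recovery->nextRecoveryPoint, which is only advanced once the current
   * entry has been handled. addSynthesizedEntries() uses the same
   * wait-and-retry shape for the synthesized decrefs.
   */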
- prepareCompletion(completion, addSlabJournalEntries, - handleAddSlabJournalEntryError, - completion->callbackThreadID, recovery); - for (RecoveryPoint *recoveryPoint = &recovery->nextRecoveryPoint; - beforeRecoveryPoint(recoveryPoint, &recovery->tailRecoveryPoint); - advancePoints(recovery, journal->entriesPerBlock)) { - RecoveryJournalEntry entry = getEntry(recovery, recoveryPoint); - int result = validateRecoveryJournalEntry(vdo, &entry); - if (result != VDO_SUCCESS) { - enterReadOnlyMode(journal->readOnlyNotifier, result); - finishCompletion(completion, result); - return; - } - - if (entry.mapping.pbn == ZERO_BLOCK) { - continue; - } - - Slab *slab = getSlab(vdo->depot, entry.mapping.pbn); - if (slab->allocator != recovery->allocator) { - continue; - } - - if (!attemptReplayIntoSlabJournal(slab->journal, entry.mapping.pbn, - entry.operation, - &recovery->nextJournalPoint, - completion)) { - return; - } - - recovery->entriesAddedToSlabJournals++; - } - - logInfo("Recreating missing journal entries for zone %u", - recovery->allocator->zoneNumber); - addSynthesizedEntries(completion); -} - -/**********************************************************************/ -void replayIntoSlabJournals(BlockAllocator *allocator, - VDOCompletion *completion, - void *context) -{ - RecoveryCompletion *recovery = context; - assertOnPhysicalZoneThread(recovery->vdo, allocator->zoneNumber, __func__); - if ((recovery->journalData == NULL) || isReplaying(recovery->vdo)) { - // there's nothing to replay - notifySlabJournalsAreRecovered(allocator, VDO_SUCCESS); - return; - } - - recovery->allocator = allocator; - recovery->nextRecoveryPoint = (RecoveryPoint) { - .sequenceNumber = recovery->slabJournalHead, - .sectorCount = 1, - .entryCount = 0, - }; - - recovery->nextJournalPoint = (JournalPoint) { - .sequenceNumber = recovery->slabJournalHead, - .entryCount = 0, - }; - - logInfo("Replaying entries into slab journals for zone %u", - allocator->zoneNumber); - completion->parent = recovery; - addSlabJournalEntries(completion); -} - -/** - * A waiter callback to enqueue a MissingDecref on the queue for the physical - * zone in which it will be applied. - * - * Implements WaiterCallback. - **/ -static void queueOnPhysicalZone(Waiter *waiter, void *context) -{ - MissingDecref *decref = asMissingDecref(waiter); - DataLocation mapping = decref->penultimateMapping; - if (isMappedLocation(&mapping)) { - decref->recovery->logicalBlocksUsed--; - } - - if (mapping.pbn == ZERO_BLOCK) { - // Decrefs of zero are not applied to slab journals. - FREE(decref); - return; - } - - decref->slabJournal = getSlabJournal((SlabDepot *) context, mapping.pbn); - ZoneCount zoneNumber = decref->slabJournal->slab->allocator->zoneNumber; - enqueueMissingDecref(&decref->recovery->missingDecrefs[zoneNumber], decref); -} - -/** - * Queue each missing decref on the slab journal to which it is to be applied - * then load the slab depot. This callback is registered in - * findSlabJournalEntries(). 
- * - * @param completion The sub-task completion - **/ -static void applyToDepot(VDOCompletion *completion) -{ - RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); - assertOnAdminThread(recovery->vdo, __func__); - prepareSubTask(recovery, finishRecoveringDepot, finishParentCallback, - ZONE_TYPE_ADMIN); - - SlabDepot *depot = getSlabDepot(recovery->vdo); - notifyAllWaiters(&recovery->missingDecrefs[0], queueOnPhysicalZone, depot); - if (abortRecoveryOnError(recovery->completion.result, recovery)) { - return; - } - - loadSlabDepot(depot, ADMIN_STATE_LOADING_FOR_RECOVERY, completion, recovery); -} - -/** - * Validate the location of the penultimate mapping for a MissingDecref. If it - * is valid, enqueue it for the appropriate physical zone or account for it. - * Otherwise, dispose of it and signal an error. - * - * @param decref The decref whose penultimate mapping has just been found - * @param location The penultimate mapping - * @param errorCode The error code to use if the location is invalid - **/ -static int recordMissingDecref(MissingDecref *decref, - DataLocation location, - int errorCode) -{ - RecoveryCompletion *recovery = decref->recovery; - recovery->incompleteDecrefCount--; - if (isValidLocation(&location) - && isPhysicalDataBlock(recovery->vdo->depot, location.pbn)) { - decref->penultimateMapping = location; - decref->complete = true; - return VDO_SUCCESS; - } - - // The location was invalid - enterReadOnlyMode(recovery->vdo->readOnlyNotifier, errorCode); - setCompletionResult(&recovery->completion, errorCode); - logErrorWithStringError(errorCode, - "Invalid mapping for pbn %llu with state %u", - location.pbn, location.state); - return errorCode; -} - -/** - * Find the block map slots with missing decrefs. - * - * To find the slots missing decrefs, we iterate through the journal in reverse - * so we see decrefs before increfs; if we see an incref before its paired - * decref, we instantly know this incref is missing its decref. - * - * Simultaneously, we attempt to determine the missing decref. If there is a - * missing decref, and at least two increfs for that slot, we know we should - * decref the PBN from the penultimate incref. Otherwise, there is only one - * incref for that slot: we must synthesize the decref out of the block map - * instead of the recovery journal. - * - * @param recovery The recovery completion - * - * @return VDO_SUCCESS or an error code - **/ -__attribute__((warn_unused_result)) -static int findMissingDecrefs(RecoveryCompletion *recovery) -{ - IntMap *slotEntryMap = recovery->slotEntryMap; - // This placeholder decref is used to mark lbns for which we have observed a - // decref but not the paired incref (going backwards through the journal). - MissingDecref foundDecref; - - // A buffer is allocated based on the number of incRef entries found, so use - // the earliest head. - SequenceNumber head = minSequenceNumber(recovery->blockMapHead, - recovery->slabJournalHead); - RecoveryPoint headPoint = { - .sequenceNumber = head, - .sectorCount = 1, - .entryCount = 0, - }; - - // Set up for the first fake journal point that will be used for a - // synthesized entry. 
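  /*
   * Giving the fake point the tail sequence number and an entry count of
   * entriesPerBlock makes it compare after every entry actually present in
   * the journal, so synthesized decrefs are presumably always ordered behind
   * the real entries they compensate for.
   */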
- recovery->nextSynthesizedJournalPoint = (JournalPoint) { - .sequenceNumber = recovery->tail, - .entryCount = recovery->vdo->recoveryJournal->entriesPerBlock, - }; - - RecoveryPoint recoveryPoint = recovery->tailRecoveryPoint; - while (beforeRecoveryPoint(&headPoint, &recoveryPoint)) { - decrementRecoveryPoint(&recoveryPoint); - RecoveryJournalEntry entry = getEntry(recovery, &recoveryPoint); - - if (!isIncrementOperation(entry.operation)) { - // Observe that we've seen a decref before its incref, but only if - // the IntMap does not contain an unpaired incref for this lbn. - int result = intMapPut(slotEntryMap, slotAsNumber(entry.slot), - &foundDecref, false, NULL); - if (result != VDO_SUCCESS) { - return result; - } - - continue; - } - - recovery->increfCount++; - - MissingDecref *decref - = intMapRemove(slotEntryMap, slotAsNumber(entry.slot)); - if (entry.operation == BLOCK_MAP_INCREMENT) { - if (decref != NULL) { - return logErrorWithStringError(VDO_CORRUPT_JOURNAL, - "decref found for block map block %" - PRIu64 " with state %u", - entry.mapping.pbn, entry.mapping.state); - } - - // There are no decrefs for block map pages, so they can't be missing. - continue; - } - - if (decref == &foundDecref) { - // This incref already had a decref in the intmap, so we know it is - // not missing its decref. - continue; - } - - if (decref == NULL) { - // This incref is missing a decref. Add a missing decref object. - int result = makeMissingDecref(recovery, entry, &decref); - if (result != VDO_SUCCESS) { - return result; - } - - result = intMapPut(slotEntryMap, slotAsNumber(entry.slot), decref, - false, NULL); - if (result != VDO_SUCCESS) { - return result; - } - - continue; - } - - /* - * This MissingDecref was left here by an incref without a decref. - * We now know what its penultimate mapping is, and all entries - * before here in the journal are paired, decref before incref, so - * we needn't remember it in the intmap any longer. - */ - int result = recordMissingDecref(decref, entry.mapping, - VDO_CORRUPT_JOURNAL); - if (result != VDO_SUCCESS) { - return result; - } - } - - return VDO_SUCCESS; -} - -/** - * Process a fetched block map page for a missing decref. This callback is - * registered in findSlabJournalEntries(). - * - * @param completion The page completion which has just finished loading - **/ -static void processFetchedPage(VDOCompletion *completion) -{ - MissingDecref *currentDecref = completion->parent; - RecoveryCompletion *recovery = currentDecref->recovery; - assertOnLogicalZoneThread(recovery->vdo, 0, __func__); - - const BlockMapPage *page = dereferenceReadableVDOPage(completion); - DataLocation location - = unpackBlockMapEntry(&page->entries[currentDecref->slot.slot]); - releaseVDOPageCompletion(completion); - recordMissingDecref(currentDecref, location, VDO_BAD_MAPPING); - if (recovery->incompleteDecrefCount == 0) { - completeCompletion(&recovery->subTaskCompletion); - } -} - -/** - * Handle an error fetching a block map page for a missing decref. - * This error handler is registered in findSlabJournalEntries(). 
- * - * @param completion The page completion which has just finished loading - **/ -static void handleFetchError(VDOCompletion *completion) -{ - MissingDecref *decref = completion->parent; - RecoveryCompletion *recovery = decref->recovery; - assertOnLogicalZoneThread(recovery->vdo, 0, __func__); - - // If we got a VDO_OUT_OF_RANGE error, it is because the pbn we read from - // the journal was bad, so convert the error code - setCompletionResult(&recovery->subTaskCompletion, - ((completion->result == VDO_OUT_OF_RANGE) - ? VDO_CORRUPT_JOURNAL : completion->result)); - releaseVDOPageCompletion(completion); - if (--recovery->incompleteDecrefCount == 0) { - completeCompletion(&recovery->subTaskCompletion); - } -} - -/** - * The waiter callback to requeue a missing decref and launch its page fetch. - * - * Implements WaiterCallback. - **/ -static void launchFetch(Waiter *waiter, void *context) -{ - MissingDecref *decref = asMissingDecref(waiter); - RecoveryCompletion *recovery = decref->recovery; - if (enqueueMissingDecref(&recovery->missingDecrefs[0], decref) - != VDO_SUCCESS) { - return; - } - - if (decref->complete) { - // We've already found the mapping for this decref, no fetch needed. - return; - } - - BlockMapZone *zone = context; - initVDOPageCompletion(&decref->pageCompletion, zone->pageCache, - decref->slot.pbn, false, decref, processFetchedPage, - handleFetchError); - getVDOPageAsync(&decref->pageCompletion.completion); -} - -/** - * Find all entries which need to be replayed into the slab journals. - * - * @param completion The sub-task completion - **/ -static void findSlabJournalEntries(VDOCompletion *completion) -{ - RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); - VDO *vdo = recovery->vdo; - - // We need to be on logical zone 0's thread since we are going to use its - // page cache. - assertOnLogicalZoneThread(vdo, 0, __func__); - int result = findMissingDecrefs(recovery); - if (abortRecoveryOnError(result, recovery)) { - return; - } - - prepareSubTask(recovery, applyToDepot, finishParentCallback, - ZONE_TYPE_ADMIN); - - /* - * Increment the incompleteDecrefCount so that the fetch callback can't - * complete the sub-task while we are still processing the queue of missing - * decrefs. - */ - if (recovery->incompleteDecrefCount++ > 0) { - // Fetch block map pages to fill in the incomplete missing decrefs. - notifyAllWaiters(&recovery->missingDecrefs[0], launchFetch, - getBlockMapZone(getBlockMap(vdo), 0)); - } - - if (--recovery->incompleteDecrefCount == 0) { - completeCompletion(completion); - } -} - -/** - * Find the contiguous range of journal blocks. 
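 *
 * Starting from the earlier of the block map head and the slab journal head,
 * each block is checked for an exact, well-formed header, and its sectors are
 * scanned in order until a torn or short sector is seen. The scan stops at
 * the first bad header, torn sector, or partially filled block, leaving
 * recovery->tail at the last block which held any valid entries.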
- * - * @param recovery The recovery completion - * - * @return true if there were valid journal blocks - **/ -static bool findContiguousRange(RecoveryCompletion *recovery) -{ - RecoveryJournal *journal = recovery->vdo->recoveryJournal; - SequenceNumber head - = minSequenceNumber(recovery->blockMapHead, recovery->slabJournalHead); - - bool foundEntries = false; - for (SequenceNumber i = head; i <= recovery->highestTail; i++) { - recovery->tail = i; - recovery->tailRecoveryPoint = (RecoveryPoint) { - .sequenceNumber = i, - .sectorCount = 0, - .entryCount = 0, - }; - - PackedJournalHeader *packedHeader - = getJournalBlockHeader(journal, recovery->journalData, i); - RecoveryBlockHeader header; - unpackRecoveryBlockHeader(packedHeader, &header); - - if (!isExactRecoveryJournalBlock(journal, &header, i) - || (header.entryCount > journal->entriesPerBlock)) { - // A bad block header was found so this must be the end of the journal. - break; - } - - JournalEntryCount blockEntries = header.entryCount; - // Examine each sector in turn to determine the last valid sector. - for (uint8_t j = 1; j < SECTORS_PER_BLOCK; j++) { - PackedJournalSector *sector = getJournalBlockSector(packedHeader, j); - - // A bad sector means that this block was torn. - if (!isValidRecoveryJournalSector(&header, sector)) { - break; - } - - JournalEntryCount sectorEntries = minBlock(sector->entryCount, - blockEntries); - if (sectorEntries > 0) { - foundEntries = true; - recovery->tailRecoveryPoint.sectorCount++; - recovery->tailRecoveryPoint.entryCount = sectorEntries; - blockEntries -= sectorEntries; - } - - // If this sector is short, the later sectors can't matter. - if ((sectorEntries < RECOVERY_JOURNAL_ENTRIES_PER_SECTOR) - || (blockEntries == 0)) { - break; - } - } - - // If this block was not filled, or if it tore, no later block can matter. - if ((header.entryCount != journal->entriesPerBlock) - || (blockEntries > 0)) { - break; - } - } - - // Set the tail to the last valid tail block, if there is one. - if (foundEntries && (recovery->tailRecoveryPoint.sectorCount == 0)) { - recovery->tail--; - } - - return foundEntries; -} - -/** - * Count the number of increment entries in the journal. - * - * @param recovery The recovery completion - **/ -static int countIncrementEntries(RecoveryCompletion *recovery) -{ - RecoveryPoint recoveryPoint = { - .sequenceNumber = recovery->blockMapHead, - .sectorCount = 1, - .entryCount = 0, - }; - while (beforeRecoveryPoint(&recoveryPoint, &recovery->tailRecoveryPoint)) { - RecoveryJournalEntry entry = getEntry(recovery, &recoveryPoint); - int result = validateRecoveryJournalEntry(recovery->vdo, &entry); - if (result != VDO_SUCCESS) { - enterReadOnlyMode(recovery->vdo->readOnlyNotifier, result); - return result; - } - if (isIncrementOperation(entry.operation)) { - recovery->increfCount++; - } - incrementRecoveryPoint(&recoveryPoint); - } - - return VDO_SUCCESS; -} - -/** - * Determine the limits of the valid recovery journal and prepare to replay - * into the slab journals and block map. 
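 *
 * If no valid entries are found, the journal data is discarded and only the
 * slab depot is loaded. If the VDO is replaying, the increfs are counted so
 * the block map recovery completion can be sized, and the entries are applied
 * directly to the block map. Otherwise, the logical block and block map usage
 * counts are computed and recovery moves on to finding the entries to replay
 * into the slab journals.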
- * - * @param completion The sub-task completion - **/ -static void prepareToApplyJournalEntries(VDOCompletion *completion) -{ - RecoveryCompletion *recovery = asRecoveryCompletion(completion->parent); - VDO *vdo = recovery->vdo; - RecoveryJournal *journal = vdo->recoveryJournal; - logInfo("Finished reading recovery journal"); - bool foundEntries = findHeadAndTail(journal, recovery->journalData, - &recovery->highestTail, - &recovery->blockMapHead, - &recovery->slabJournalHead); - if (foundEntries) { - foundEntries = findContiguousRange(recovery); - } - - // Both reap heads must be behind the tail. - if ((recovery->blockMapHead > recovery->tail) - || (recovery->slabJournalHead > recovery->tail)) { - int result = logErrorWithStringError(VDO_CORRUPT_JOURNAL, - "Journal tail too early. " - "block map head: %" PRIu64 - ", slab journal head: %" PRIu64 - ", tail: %llu", - recovery->blockMapHead, - recovery->slabJournalHead, - recovery->tail); - finishCompletion(&recovery->completion, result); - return; - } - - if (!foundEntries) { - // This message must be recognizable by VDOTest::RebuildBase. - logInfo("Replaying 0 recovery entries into block map"); - // We still need to load the SlabDepot. - FREE(recovery->journalData); - recovery->journalData = NULL; - prepareSubTask(recovery, finishParentCallback, finishParentCallback, - ZONE_TYPE_ADMIN); - loadSlabDepot(getSlabDepot(vdo), ADMIN_STATE_LOADING_FOR_RECOVERY, - completion, recovery); - return; - } - - logInfo("Highest-numbered recovery journal block has sequence number" - " %llu, and the highest-numbered usable block is %" - PRIu64, recovery->highestTail, recovery->tail); - - if (isReplaying(vdo)) { - // We need to know how many entries the block map rebuild completion will - // need to hold. - int result = countIncrementEntries(recovery); - if (result != VDO_SUCCESS) { - finishCompletion(&recovery->completion, result); - return; - } - - // We need to access the block map from a logical zone. - prepareSubTask(recovery, launchBlockMapRecovery, finishParentCallback, - ZONE_TYPE_LOGICAL); - loadSlabDepot(vdo->depot, ADMIN_STATE_LOADING_FOR_RECOVERY, completion, - recovery); - return; - } - - int result = computeUsages(recovery); - if (abortRecoveryOnError(result, recovery)) { - return; - } - - prepareSubTask(recovery, findSlabJournalEntries, finishParentCallback, - ZONE_TYPE_LOGICAL); - invokeCallback(completion); -} - -/**********************************************************************/ -void launchRecovery(VDO *vdo, VDOCompletion *parent) -{ - // Note: This message must be recognizable by Permabit::VDODeviceBase. - logWarning("Device was dirty, rebuilding reference counts"); - - RecoveryCompletion *recovery; - int result = makeRecoveryCompletion(vdo, &recovery); - if (result != VDO_SUCCESS) { - finishCompletion(parent, result); - return; - } - - VDOCompletion *completion = &recovery->completion; - prepareCompletion(completion, finishRecovery, abortRecovery, - parent->callbackThreadID, parent); - prepareSubTask(recovery, prepareToApplyJournalEntries, finishParentCallback, - ZONE_TYPE_ADMIN); - loadJournalAsync(vdo->recoveryJournal, &recovery->subTaskCompletion, - &recovery->journalData); -} diff --git a/vdo/base/vdoRecovery.h b/vdo/base/vdoRecovery.h deleted file mode 100644 index f817a05..0000000 --- a/vdo/base/vdoRecovery.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoRecovery.h#2 $ - */ - -#ifndef VDO_RECOVERY_H -#define VDO_RECOVERY_H - -#include "completion.h" -#include "vdo.h" - -/** - * Replay recovery journal entries in the the slab journals of slabs owned by a - * given BlockAllocator. - * - * @param allocator The allocator whose slab journals are to be recovered - * @param completion The completion to use for waiting on slab journal space - * @param context The slab depot load context supplied by a recovery when - * it loads the depot - **/ -void replayIntoSlabJournals(BlockAllocator *allocator, - VDOCompletion *completion, - void *context); - -/** - * Construct a recovery completion and launch it. Apply all valid journal block - * entries to all VDO structures. This function performs the offline portion of - * recovering a VDO from a crash. - * - * @param vdo The vdo to recover - * @param parent The completion to notify when the offline portion of the - * recovery is complete - **/ -void launchRecovery(VDO *vdo, VDOCompletion *parent); - -#endif // VDO_RECOVERY_H diff --git a/vdo/base/vdoRecoveryInternals.h b/vdo/base/vdoRecoveryInternals.h deleted file mode 100644 index b0414c1..0000000 --- a/vdo/base/vdoRecoveryInternals.h +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoRecoveryInternals.h#2 $ - */ - -#ifndef VDO_RECOVERY_INTERNALS_H -#define VDO_RECOVERY_INTERNALS_H - -#include "vdoRecovery.h" - -#include "blockMapRecovery.h" -#include "intMap.h" -#include "journalPoint.h" -#include "ringNode.h" -#include "types.h" -#include "waitQueue.h" - -/** - * The absolute position of an entry in the recovery journal, including - * the sector number and the entry number within the sector. 
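 *
 * A point is presumably ordered by (sequenceNumber, sectorCount, entryCount),
 * which is what lets beforeRecoveryPoint() and the increment and decrement
 * helpers walk the journal one entry at a time in either direction.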
- **/ -typedef struct { - SequenceNumber sequenceNumber; // Block sequence number - uint8_t sectorCount; // Sector number - JournalEntryCount entryCount; // Entry number -} RecoveryPoint; - -typedef struct { - /** The completion header */ - VDOCompletion completion; - /** The sub-task completion */ - VDOCompletion subTaskCompletion; - /** The VDO in question */ - VDO *vdo; - /** The BlockAllocator whose journals are being recovered */ - BlockAllocator *allocator; - /** A buffer to hold the data read off disk */ - char *journalData; - /** The number of increfs */ - size_t increfCount; - - /** The entry data for the block map recovery */ - NumberedBlockMapping *entries; - /** The number of entries in the entry array */ - size_t entryCount; - /** The sequence number of the first valid block for block map recovery */ - SequenceNumber blockMapHead; - /** The sequence number of the first valid block for slab journal replay */ - SequenceNumber slabJournalHead; - /** The sequence number of the last valid block of the journal (if known) */ - SequenceNumber tail; - /** - * The highest sequence number of the journal, not the same as the tail, - * since the tail ignores blocks after the first hole. - */ - SequenceNumber highestTail; - - /** A location just beyond the last valid entry of the journal */ - RecoveryPoint tailRecoveryPoint; - /** The location of the next recovery journal entry to apply */ - RecoveryPoint nextRecoveryPoint; - /** The number of logical blocks currently known to be in use */ - BlockCount logicalBlocksUsed; - /** The number of block map data blocks known to be allocated */ - BlockCount blockMapDataBlocks; - /** The journal point to give to the next synthesized decref */ - JournalPoint nextJournalPoint; - /** The number of entries played into slab journals */ - size_t entriesAddedToSlabJournals; - - // Decref synthesis fields - - /** An intMap for use in finding which slots are missing decrefs */ - IntMap *slotEntryMap; - /** The number of synthesized decrefs */ - size_t missingDecrefCount; - /** The number of incomplete decrefs */ - size_t incompleteDecrefCount; - /** The fake journal point of the next missing decref */ - JournalPoint nextSynthesizedJournalPoint; - /** The queue of missing decrefs */ - WaitQueue missingDecrefs[]; -} RecoveryCompletion; - -/** - * Convert a generic completion to a RecoveryCompletion. - * - * @param completion The completion to convert - * - * @return The RecoveryCompletion - **/ -__attribute__((warn_unused_result)) -static inline RecoveryCompletion * -asRecoveryCompletion(VDOCompletion *completion) -{ - STATIC_ASSERT(offsetof(RecoveryCompletion, completion) == 0); - assertCompletionType(completion->type, RECOVERY_COMPLETION); - return (RecoveryCompletion *) completion; -} - -/** - * Allocate and initialize a RecoveryCompletion. - * - * @param vdo The VDO in question - * @param recoveryPtr A pointer to hold the new RecoveryCompletion - * - * @return VDO_SUCCESS or a status code - **/ -int makeRecoveryCompletion(VDO *vdo, RecoveryCompletion **recoveryPtr) - __attribute__((warn_unused_result)); - -/** - * Free a RecoveryCompletion and all underlying structures. - * - * @param recoveryPtr A pointer to the recovery completion to free - **/ -void freeRecoveryCompletion(RecoveryCompletion **recoveryPtr); - -#endif // VDO_RECOVERY_INTERNALS_H diff --git a/vdo/base/vdoResize.c b/vdo/base/vdoResize.c deleted file mode 100644 index ee3271d..0000000 --- a/vdo/base/vdoResize.c +++ /dev/null @@ -1,250 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoResize.c#15 $ - */ - -#include "vdoResize.h" - -#include "logger.h" - -#include "adminCompletion.h" -#include "completion.h" -#include "recoveryJournal.h" -#include "slabDepot.h" -#include "slabSummary.h" -#include "vdoInternal.h" -#include "vdoLayout.h" - -typedef enum { - GROW_PHYSICAL_PHASE_START = 0, - GROW_PHYSICAL_PHASE_COPY_SUMMARY, - GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS, - GROW_PHYSICAL_PHASE_USE_NEW_SLABS, - GROW_PHYSICAL_PHASE_END, - GROW_PHYSICAL_PHASE_ERROR, -} GrowPhysicalPhase; - -static const char *GROW_PHYSICAL_PHASE_NAMES[] = { - "GROW_PHYSICAL_PHASE_START", - "GROW_PHYSICAL_PHASE_COPY_SUMMARY", - "GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS", - "GROW_PHYSICAL_PHASE_USE_NEW_SLABS", - "GROW_PHYSICAL_PHASE_END", - "GROW_PHYSICAL_PHASE_ERROR", -}; - -/** - * Implements ThreadIDGetterForPhase. - **/ -__attribute__((warn_unused_result)) -static ThreadID getThreadIDForPhase(AdminCompletion *adminCompletion) -{ - return getAdminThread(getThreadConfig(adminCompletion->completion.parent)); -} - -/** - * Callback to initiate a grow physical, registered in performGrowPhysical(). - * - * @param completion The sub-task completion - **/ -static void growPhysicalCallback(VDOCompletion *completion) -{ - AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); - assertAdminOperationType(adminCompletion, ADMIN_OPERATION_GROW_PHYSICAL); - assertAdminPhaseThread(adminCompletion, __func__, GROW_PHYSICAL_PHASE_NAMES); - - VDO *vdo = adminCompletion->completion.parent; - switch (adminCompletion->phase++) { - case GROW_PHYSICAL_PHASE_START: - if (isReadOnly(vdo->readOnlyNotifier)) { - logErrorWithStringError(VDO_READ_ONLY, - "Can't grow physical size of a read-only VDO"); - setCompletionResult(resetAdminSubTask(completion), VDO_READ_ONLY); - break; - } - - if (startOperationWithWaiter(&vdo->adminState, - ADMIN_STATE_SUSPENDED_OPERATION, - &adminCompletion->completion, NULL)) { - // Copy the journal into the new layout. 
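      /*
       * The recovery journal (here) and the slab summary (in the next phase)
       * are copied into their new partitions before the enlarged
       * configuration is saved in GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS, so
       * an interrupted grow should still leave the on-disk layout describing
       * the old geometry.
       */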
- copyPartition(vdo->layout, RECOVERY_JOURNAL_PARTITION, - resetAdminSubTask(completion)); - } - return; - - case GROW_PHYSICAL_PHASE_COPY_SUMMARY: - copyPartition(vdo->layout, SLAB_SUMMARY_PARTITION, - resetAdminSubTask(completion)); - return; - - case GROW_PHYSICAL_PHASE_UPDATE_COMPONENTS: - vdo->config.physicalBlocks = growVDOLayout(vdo->layout); - updateSlabDepotSize(vdo->depot); - saveVDOComponentsAsync(vdo, resetAdminSubTask(completion)); - return; - - case GROW_PHYSICAL_PHASE_USE_NEW_SLABS: - useNewSlabs(vdo->depot, resetAdminSubTask(completion)); - return; - - case GROW_PHYSICAL_PHASE_END: - setSlabSummaryOrigin(getSlabSummary(vdo->depot), - getVDOPartition(vdo->layout, SLAB_SUMMARY_PARTITION)); - setRecoveryJournalPartition(vdo->recoveryJournal, - getVDOPartition(vdo->layout, - RECOVERY_JOURNAL_PARTITION)); - break; - - case GROW_PHYSICAL_PHASE_ERROR: - enterReadOnlyMode(vdo->readOnlyNotifier, completion->result); - break; - - default: - setCompletionResult(resetAdminSubTask(completion), UDS_BAD_STATE); - } - - finishVDOLayoutGrowth(vdo->layout); - finishOperationWithResult(&vdo->adminState, completion->result); -} - -/** - * Handle an error during the grow physical process. - * - * @param completion The sub-task completion - **/ -static void handleGrowthError(VDOCompletion *completion) -{ - adminCompletionFromSubTask(completion)->phase = GROW_PHYSICAL_PHASE_ERROR; - growPhysicalCallback(completion); -} - -/**********************************************************************/ -int performGrowPhysical(VDO *vdo, BlockCount newPhysicalBlocks) -{ - BlockCount oldPhysicalBlocks = vdo->config.physicalBlocks; - - // Skip any noop grows. - if (oldPhysicalBlocks == newPhysicalBlocks) { - return VDO_SUCCESS; - } - - if (newPhysicalBlocks != getNextVDOLayoutSize(vdo->layout)) { - /* - * Either the VDO isn't prepared to grow, or it was prepared to grow - * to a different size. Doing this check here relies on the fact that - * the call to this method is done under the dmsetup message lock. - */ - finishVDOLayoutGrowth(vdo->layout); - abandonNewSlabs(vdo->depot); - return VDO_PARAMETER_MISMATCH; - } - - // Validate that we are prepared to grow appropriately. - BlockCount newDepotSize = getNextBlockAllocatorPartitionSize(vdo->layout); - BlockCount preparedDepotSize = getNewDepotSize(vdo->depot); - if (preparedDepotSize != newDepotSize) { - return VDO_PARAMETER_MISMATCH; - } - - int result = performAdminOperation(vdo, ADMIN_OPERATION_GROW_PHYSICAL, - getThreadIDForPhase, growPhysicalCallback, - handleGrowthError); - if (result != VDO_SUCCESS) { - return result; - } - - logInfo("Physical block count was %llu, now %llu", - oldPhysicalBlocks, newPhysicalBlocks); - return VDO_SUCCESS; -} - -/** - * Callback to check that we're not in recovery mode, used in - * prepareToGrowPhysical(). - * - * @param completion The sub-task completion - **/ -static void checkMayGrowPhysical(VDOCompletion *completion) -{ - AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); - assertAdminOperationType(adminCompletion, - ADMIN_OPERATION_PREPARE_GROW_PHYSICAL); - - VDO *vdo = adminCompletion->completion.parent; - assertOnAdminThread(vdo, __func__); - - resetAdminSubTask(completion); - - // This check can only be done from a base code thread. - if (isReadOnly(vdo->readOnlyNotifier)) { - finishCompletion(completion->parent, VDO_READ_ONLY); - return; - } - - // This check should only be done from a base code thread. 
- if (inRecoveryMode(vdo)) { - finishCompletion(completion->parent, VDO_RETRY_AFTER_REBUILD); - return; - } - - completeCompletion(completion->parent); -} - -/**********************************************************************/ -int prepareToGrowPhysical(VDO *vdo, BlockCount newPhysicalBlocks) -{ - BlockCount currentPhysicalBlocks = vdo->config.physicalBlocks; - if (newPhysicalBlocks < currentPhysicalBlocks) { - return logErrorWithStringError(VDO_NOT_IMPLEMENTED, - "Removing physical storage from a VDO is " - "not supported"); - } - - if (newPhysicalBlocks == currentPhysicalBlocks) { - logWarning("Requested physical block count %" PRIu64 - " not greater than %llu", - newPhysicalBlocks, currentPhysicalBlocks); - finishVDOLayoutGrowth(vdo->layout); - abandonNewSlabs(vdo->depot); - return VDO_PARAMETER_MISMATCH; - } - - int result = performAdminOperation(vdo, - ADMIN_OPERATION_PREPARE_GROW_PHYSICAL, - getThreadIDForPhase, checkMayGrowPhysical, - finishParentCallback); - if (result != VDO_SUCCESS) { - return result; - } - - result = prepareToGrowVDOLayout(vdo->layout, currentPhysicalBlocks, - newPhysicalBlocks, vdo->layer); - if (result != VDO_SUCCESS) { - return result; - } - - BlockCount newDepotSize = getNextBlockAllocatorPartitionSize(vdo->layout); - result = prepareToGrowSlabDepot(vdo->depot, newDepotSize); - if (result != VDO_SUCCESS) { - finishVDOLayoutGrowth(vdo->layout); - return result; - } - - return VDO_SUCCESS; -} diff --git a/vdo/base/vdoResize.h b/vdo/base/vdoResize.h deleted file mode 100644 index 76bfc1f..0000000 --- a/vdo/base/vdoResize.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoResize.h#1 $ - */ - -#ifndef VDO_RESIZE_H -#define VDO_RESIZE_H - -#include "types.h" - -/** - * Make the completion for an asynchronous resize. - * - * @param vdo The VDO - * @param newPhysicalBlocks The new physical size in blocks - * @param completionPtr A pointer to hold the completion - * - * @return VDO_SUCCESS or an error - **/ -int makeResizeVDOCompletion(VDO *vdo, - BlockCount newPhysicalBlocks, - VDOCompletion **completionPtr) - __attribute__((warn_unused_result)); - -/** - * Free the completion for an asynchronous resize, and NULL out the - * reference to it. - * - * @param completionPtr A reference to the completion to free - **/ -void freeResizeVDOCompletion(VDOCompletion **completionPtr); - -/** - * Grow the physical size of the VDO. This method may only be called when the - * VDO has been suspended and must not be called from a base thread. 
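 *
 * A minimal sketch of one plausible calling sequence (the suspend and resume
 * surrounding the grow are the caller's responsibility and are shown only
 * for illustration):
 *
 *   int result = prepareToGrowPhysical(vdo, newPhysicalBlocks);
 *   if (result == VDO_SUCCESS) {
 *     result = performVDOSuspend(vdo, false);
 *   }
 *   if (result == VDO_SUCCESS) {
 *     result = performGrowPhysical(vdo, newPhysicalBlocks);
 *   }
 *   if (result == VDO_SUCCESS) {
 *     result = performVDOResume(vdo);
 *   }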
- * - * @param vdo The VDO to resize - * @param newPhysicalBlocks The new physical size in blocks - * - * @return VDO_SUCCESS or an error - **/ -int performGrowPhysical(VDO *vdo, BlockCount newPhysicalBlocks); - -/** - * Prepare to resize the VDO, allocating memory as needed. - * - * @param vdo The VDO - * @param newPhysicalBlocks The new physical size in blocks - **/ -int prepareToGrowPhysical(VDO *vdo, BlockCount newPhysicalBlocks) - __attribute__((warn_unused_result)); - -#endif /* VDO_RESIZE_H */ diff --git a/vdo/base/vdoResizeLogical.c b/vdo/base/vdoResizeLogical.c deleted file mode 100644 index 97a06d1..0000000 --- a/vdo/base/vdoResizeLogical.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoResizeLogical.c#6 $ - */ - -#include "vdoResizeLogical.h" - -#include "logger.h" - -#include "adminCompletion.h" -#include "blockMap.h" -#include "completion.h" -#include "vdoInternal.h" - -typedef enum { - GROW_LOGICAL_PHASE_START = 0, - GROW_LOGICAL_PHASE_GROW_BLOCK_MAP, - GROW_LOGICAL_PHASE_END, - GROW_LOGICAL_PHASE_ERROR, -} GrowLogicalPhase; - -static const char *GROW_LOGICAL_PHASE_NAMES[] = { - "GROW_LOGICAL_PHASE_START", - "GROW_LOGICAL_PHASE_GROW_BLOCK_MAP", - "GROW_LOGICAL_PHASE_END", - "GROW_LOGICAL_PHASE_ERROR", -}; - -/** - * Implements ThreadIDGetterForPhase. - **/ -__attribute__((warn_unused_result)) -static ThreadID getThreadIDForPhase(AdminCompletion *adminCompletion) -{ - return getAdminThread(getThreadConfig(adminCompletion->completion.parent)); -} - -/** - * Callback to initiate a grow logical, registered in performGrowLogical(). 
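 *
 * In GROW_LOGICAL_PHASE_START the new logical size, taken from the block
 * map's prepared growth, is written into the super block; in
 * GROW_LOGICAL_PHASE_GROW_BLOCK_MAP the block map itself is grown. A failure
 * in either step routes through GROW_LOGICAL_PHASE_ERROR, which puts the VDO
 * into read-only mode.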
- * - * @param completion The sub-task completion - **/ -static void growLogicalCallback(VDOCompletion *completion) -{ - AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); - assertAdminOperationType(adminCompletion, ADMIN_OPERATION_GROW_LOGICAL); - assertAdminPhaseThread(adminCompletion, __func__, GROW_LOGICAL_PHASE_NAMES); - - VDO *vdo = adminCompletion->completion.parent; - switch (adminCompletion->phase++) { - case GROW_LOGICAL_PHASE_START: - if (isReadOnly(vdo->readOnlyNotifier)) { - logErrorWithStringError(VDO_READ_ONLY, - "Can't grow logical size of a read-only VDO"); - finishCompletion(resetAdminSubTask(completion), VDO_READ_ONLY); - return; - } - - if (startOperationWithWaiter(&vdo->adminState, - ADMIN_STATE_SUSPENDED_OPERATION, - &adminCompletion->completion, NULL)) { - - vdo->config.logicalBlocks = getNewEntryCount(getBlockMap(vdo)); - saveVDOComponentsAsync(vdo, resetAdminSubTask(completion)); - } - - return; - - case GROW_LOGICAL_PHASE_GROW_BLOCK_MAP: - growBlockMap(getBlockMap(vdo), resetAdminSubTask(completion)); - return; - - case GROW_LOGICAL_PHASE_END: - break; - - case GROW_LOGICAL_PHASE_ERROR: - enterReadOnlyMode(vdo->readOnlyNotifier, completion->result); - break; - - default: - setCompletionResult(resetAdminSubTask(completion), UDS_BAD_STATE); - } - - finishOperationWithResult(&vdo->adminState, completion->result); -} - -/** - * Handle an error during the grow physical process. - * - * @param completion The sub-task completion - **/ -static void handleGrowthError(VDOCompletion *completion) -{ - AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); - if (adminCompletion->phase == GROW_LOGICAL_PHASE_GROW_BLOCK_MAP) { - // We've failed to write the new size in the super block, so set our - // in memory config back to the old size. - VDO *vdo = adminCompletion->completion.parent; - BlockMap *map = getBlockMap(vdo); - vdo->config.logicalBlocks = getNumberOfBlockMapEntries(map); - abandonBlockMapGrowth(map); - } - - adminCompletion->phase = GROW_LOGICAL_PHASE_ERROR; - growLogicalCallback(completion); -} - -/**********************************************************************/ -int performGrowLogical(VDO *vdo, BlockCount newLogicalBlocks) -{ - if (getNewEntryCount(getBlockMap(vdo)) != newLogicalBlocks) { - return VDO_PARAMETER_MISMATCH; - } - - return performAdminOperation(vdo, ADMIN_OPERATION_GROW_LOGICAL, - getThreadIDForPhase, growLogicalCallback, - handleGrowthError); -} - -/**********************************************************************/ -int prepareToGrowLogical(VDO *vdo, BlockCount newLogicalBlocks) -{ - if (newLogicalBlocks < vdo->config.logicalBlocks) { - return logErrorWithStringError(VDO_PARAMETER_MISMATCH, - "Can't shrink VDO logical size from its " - "current value of %llu", - vdo->config.logicalBlocks); - } - - if (newLogicalBlocks == vdo->config.logicalBlocks) { - return logErrorWithStringError(VDO_PARAMETER_MISMATCH, - "Can't grow VDO logical size to its " - "current value of %llu", - vdo->config.logicalBlocks); - } - - return prepareToGrowBlockMap(getBlockMap(vdo), newLogicalBlocks); -} diff --git a/vdo/base/vdoResizeLogical.h b/vdo/base/vdoResizeLogical.h deleted file mode 100644 index fbea60d..0000000 --- a/vdo/base/vdoResizeLogical.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoResizeLogical.h#1 $ - */ - -#ifndef VDO_RESIZE_LOGICAL_H -#define VDO_RESIZE_LOGICAL_H - -#include "types.h" - -/** - * Grow the logical size of the VDO. This method may only be called when the - * VDO has been suspended and must not be called from a base thread. - * - * @param vdo The VDO to grow - * @param newLogicalBlocks The size to which the VDO should be grown - * - * @return VDO_SUCCESS or an error - **/ -int performGrowLogical(VDO *vdo, BlockCount newLogicalBlocks); - -/** - * Prepare to grow the logical size of the VDO. This method may only be called - * while the VDO is running. - * - * @param vdo The VDO to prepare for growth - * @param newLogicalBlocks The size to which the VDO should be grown - * - * @return VDO_SUCCESS or an error - **/ -int prepareToGrowLogical(VDO *vdo, BlockCount newLogicalBlocks); - -#endif /* VDO_RESIZE_LOGICAL_H */ diff --git a/vdo/base/vdoResume.c b/vdo/base/vdoResume.c deleted file mode 100644 index a10c2ef..0000000 --- a/vdo/base/vdoResume.c +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoResume.c#3 $ - */ - -#include "vdoResume.h" - -#include "logger.h" - -#include "adminCompletion.h" -#include "blockMap.h" -#include "completion.h" -#include "logicalZone.h" -#include "recoveryJournal.h" -#include "slabDepot.h" -#include "slabSummary.h" -#include "threadConfig.h" -#include "vdoInternal.h" - -typedef enum { - RESUME_PHASE_START = 0, - RESUME_PHASE_ALLOW_READ_ONLY_MODE, - RESUME_PHASE_DEPOT, - RESUME_PHASE_JOURNAL, - RESUME_PHASE_BLOCK_MAP, - RESUME_PHASE_LOGICAL_ZONES, - RESUME_PHASE_PACKER, - RESUME_PHASE_END, -} ResumePhase; - -static const char *RESUME_PHASE_NAMES[] = { - "RESUME_PHASE_START", - "RESUME_PHASE_ALLOW_READ_ONLY_MODE", - "RESUME_PHASE_DEPOT", - "RESUME_PHASE_JOURNAL", - "RESUME_PHASE_BLOCK_MAP", - "RESUME_PHASE_LOGICAL_ZONES", - "RESUME_PHASE_PACKER", - "RESUME_PHASE_END", -}; - -/** - * Implements ThreadIDGetterForPhase. 
- **/ -__attribute__((warn_unused_result)) -static ThreadID getThreadIDForPhase(AdminCompletion *adminCompletion) -{ - const ThreadConfig *threadConfig - = getThreadConfig(adminCompletion->completion.parent); - switch (adminCompletion->phase) { - case RESUME_PHASE_JOURNAL: - return getJournalZoneThread(threadConfig); - - case RESUME_PHASE_PACKER: - return getPackerZoneThread(threadConfig); - - default: - return getAdminThread(threadConfig); - } -} - -/** - * Update the VDO state and save the super block. - * - * @param vdo The VDO being resumed - * @param completion The AdminCompletion's sub-task completion - **/ -static void writeSuperBlock(VDO *vdo, VDOCompletion *completion) -{ - switch (vdo->state) { - case VDO_CLEAN: - case VDO_NEW: - vdo->state = VDO_DIRTY; - saveVDOComponentsAsync(vdo, completion); - return; - - case VDO_DIRTY: - case VDO_READ_ONLY_MODE: - case VDO_FORCE_REBUILD: - case VDO_RECOVERING: - case VDO_REBUILD_FOR_UPGRADE: - // No need to write the super block in these cases - completeCompletion(completion); - return; - - case VDO_REPLAYING: - default: - finishCompletion(completion, UDS_BAD_STATE); - } -} - -/** - * Callback to resume a VDO. - * - * @param completion The sub-task completion - **/ -static void resumeCallback(VDOCompletion *completion) -{ - AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); - assertAdminOperationType(adminCompletion, ADMIN_OPERATION_RESUME); - assertAdminPhaseThread(adminCompletion, __func__, RESUME_PHASE_NAMES); - - VDO *vdo = adminCompletion->completion.parent; - switch (adminCompletion->phase++) { - case RESUME_PHASE_START: - if (startResuming(&vdo->adminState, ADMIN_STATE_RESUMING, - &adminCompletion->completion, NULL)) { - writeSuperBlock(vdo, completion); - } - return; - - case RESUME_PHASE_ALLOW_READ_ONLY_MODE: - allowReadOnlyModeEntry(vdo->readOnlyNotifier, - resetAdminSubTask(completion)); - return; - - case RESUME_PHASE_DEPOT: - resumeSlabDepot(vdo->depot, resetAdminSubTask(completion)); - return; - - case RESUME_PHASE_JOURNAL: - resumeRecoveryJournal(vdo->recoveryJournal, resetAdminSubTask(completion)); - return; - - case RESUME_PHASE_BLOCK_MAP: - resumeBlockMap(vdo->blockMap, resetAdminSubTask(completion)); - return; - - case RESUME_PHASE_LOGICAL_ZONES: - resumeLogicalZones(vdo->logicalZones,resetAdminSubTask(completion)); - return; - - case RESUME_PHASE_PACKER: - resumePacker(vdo->packer, resetAdminSubTask(completion)); - return; - - case RESUME_PHASE_END: - break; - - default: - setCompletionResult(resetAdminSubTask(completion), UDS_BAD_STATE); - } - - finishResumingWithResult(&vdo->adminState, completion->result); -} - -/**********************************************************************/ -int performVDOResume(VDO *vdo) -{ - return performAdminOperation(vdo, ADMIN_OPERATION_RESUME, - getThreadIDForPhase, resumeCallback, - preserveErrorAndContinue); -} diff --git a/vdo/base/vdoResume.h b/vdo/base/vdoResume.h deleted file mode 100644 index 1ef25b2..0000000 --- a/vdo/base/vdoResume.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoResume.h#1 $ - */ - -#ifndef VDO_RESUME_H -#define VDO_RESUME_H - -#include "types.h" - -/** - * Resume a suspended VDO. - * - * @param vdo The VDO to resume - * - * @return VDO_SUCCESS or an error - **/ -int performVDOResume(VDO *vdo); - -#endif /* VDO_RESUME_H */ diff --git a/vdo/base/vdoState.c b/vdo/base/vdoState.c deleted file mode 100644 index 00d3986..0000000 --- a/vdo/base/vdoState.c +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoState.c#1 $ - */ - -#include "vdoState.h" - -#include "permassert.h" - -static const char *VDO_STATE_NAMES[] = { - [VDO_CLEAN] = "CLEAN", - [VDO_DIRTY] = "DIRTY", - [VDO_FORCE_REBUILD] = "FORCE_REBUILD", - [VDO_NEW] = "NEW", - [VDO_READ_ONLY_MODE] = "READ_ONLY_MODE", - [VDO_REBUILD_FOR_UPGRADE] = "REBUILD_FOR_UPGRADE", - [VDO_RECOVERING] = "RECOVERING", - [VDO_REPLAYING] = "REPLAYING", -}; - -/**********************************************************************/ -const char *getVDOStateName(VDOState state) -{ - // Catch if a state has been added without updating the name array. - STATIC_ASSERT(COUNT_OF(VDO_STATE_NAMES) == VDO_STATE_COUNT); - - int result = ASSERT(state < COUNT_OF(VDO_STATE_NAMES), - "VDOState value %u must have a registered name", state); - if (result != UDS_SUCCESS) { - return "INVALID VDO STATE CODE"; - } - - return VDO_STATE_NAMES[state]; -} - -/**********************************************************************/ -const char *describeVDOState(VDOState state) -{ - // These strings should all fit in the 15 chars of VDOStatistics.mode. - switch (state) { - case VDO_RECOVERING: - return "recovering"; - - case VDO_READ_ONLY_MODE: - return "read-only"; - - default: - return "normal"; - } -} diff --git a/vdo/base/vdoState.h b/vdo/base/vdoState.h deleted file mode 100644 index 5843565..0000000 --- a/vdo/base/vdoState.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoState.h#2 $ - */ - -#ifndef VDO_STATE_H -#define VDO_STATE_H - -/** - * The current operating mode of the VDO. These are persistent on disk - * so the values must not change. - **/ -typedef enum { - VDO_DIRTY = 0, - VDO_NEW = 1, - VDO_CLEAN = 2, - VDO_READ_ONLY_MODE = 3, - VDO_FORCE_REBUILD = 4, - VDO_RECOVERING = 5, - VDO_REPLAYING = 6, - VDO_REBUILD_FOR_UPGRADE = 7, - - // Keep VDO_STATE_COUNT at the bottom. - VDO_STATE_COUNT -} VDOState; - -/** - * Get the name of a VDO state code for logging purposes. - * - * @param state The state code - * - * @return The name of the state code - **/ -const char *getVDOStateName(VDOState state) - __attribute__((warn_unused_result)); - -/** - * Return a user-visible string describing the current VDO state. - * - * @param state The VDO state to describe - * - * @return A string constant describing the state - **/ -const char *describeVDOState(VDOState state) - __attribute__((warn_unused_result)); - -#endif // VDO_STATE_H diff --git a/vdo/base/vdoSuspend.c b/vdo/base/vdoSuspend.c deleted file mode 100644 index e919f19..0000000 --- a/vdo/base/vdoSuspend.c +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoSuspend.c#4 $ - */ - -#include "vdoSuspend.h" - -#include "logger.h" - -#include "adminCompletion.h" -#include "blockMap.h" -#include "completion.h" -#include "logicalZone.h" -#include "recoveryJournal.h" -#include "slabDepot.h" -#include "slabSummary.h" -#include "threadConfig.h" -#include "vdoInternal.h" - -typedef enum { - SUSPEND_PHASE_START = 0, - SUSPEND_PHASE_PACKER, - SUSPEND_PHASE_LOGICAL_ZONES, - SUSPEND_PHASE_BLOCK_MAP, - SUSPEND_PHASE_JOURNAL, - SUSPEND_PHASE_DEPOT, - SUSPEND_PHASE_WRITE_SUPER_BLOCK, - SUSPEND_PHASE_END, -} SuspendPhase; - -static const char *SUSPEND_PHASE_NAMES[] = { - "SUSPEND_PHASE_START", - "SUSPEND_PHASE_PACKER", - "SUSPEND_PHASE_LOGICAL_ZONES", - "SUSPEND_PHASE_BLOCK_MAP", - "SUSPEND_PHASE_JOURNAL", - "SUSPEND_PHASE_DEPOT", - "SUSPEND_PHASE_WRITE_SUPER_BLOCK", - "SUSPEND_PHASE_END", -}; - -/** - * Implements ThreadIDGetterForPhase. 
- **/ -__attribute__((warn_unused_result)) -static ThreadID getThreadIDForPhase(AdminCompletion *adminCompletion) -{ - const ThreadConfig *threadConfig - = getThreadConfig(adminCompletion->completion.parent); - switch (adminCompletion->phase) { - case SUSPEND_PHASE_PACKER: - return getPackerZoneThread(threadConfig); - - case SUSPEND_PHASE_JOURNAL: - return getJournalZoneThread(threadConfig); - - default: - return getAdminThread(threadConfig); - } -} - -/** - * Update the VDO state and save the super block. - * - * @param vdo The VDO being suspended - * @param completion The AdminCompletion's sub-task completion - **/ -static void writeSuperBlock(VDO *vdo, VDOCompletion *completion) -{ - switch (vdo->state) { - case VDO_DIRTY: - case VDO_NEW: - vdo->state = VDO_CLEAN; - break; - - case VDO_CLEAN: - case VDO_READ_ONLY_MODE: - case VDO_FORCE_REBUILD: - case VDO_RECOVERING: - case VDO_REBUILD_FOR_UPGRADE: - break; - - case VDO_REPLAYING: - default: - finishCompletion(completion, UDS_BAD_STATE); - return; - } - - saveVDOComponentsAsync(vdo, completion); -} - -/** - * Callback to initiate a suspend, registered in performVDOSuspend(). - * - * @param completion The sub-task completion - **/ -static void suspendCallback(VDOCompletion *completion) -{ - AdminCompletion *adminCompletion = adminCompletionFromSubTask(completion); - ASSERT_LOG_ONLY(((adminCompletion->type == ADMIN_OPERATION_SUSPEND) - || (adminCompletion->type == ADMIN_OPERATION_SAVE)), - "unexpected admin operation type %u is neither " - "suspend nor save", adminCompletion->type); - assertAdminPhaseThread(adminCompletion, __func__, SUSPEND_PHASE_NAMES); - - VDO *vdo = adminCompletion->completion.parent; - switch (adminCompletion->phase++) { - case SUSPEND_PHASE_START: - if (!startDraining(&vdo->adminState, - ((adminCompletion->type == ADMIN_OPERATION_SUSPEND) - ? ADMIN_STATE_SUSPENDING : ADMIN_STATE_SAVING), - &adminCompletion->completion, NULL)) { - return; - } - - if (!vdo->closeRequired) { - // There's nothing to do. - break; - } - - waitUntilNotEnteringReadOnlyMode(vdo->readOnlyNotifier, - resetAdminSubTask(completion)); - return; - - case SUSPEND_PHASE_PACKER: - /* - * If the VDO was already resumed from a prior suspend while read-only, - * some of the components may not have been resumed. By setting a read-only - * error here, we guarantee that the result of this suspend will be - * VDO_READ_ONLY and not VDO_INVALID_ADMIN_STATE in that case. - */ - if (inReadOnlyMode(vdo)) { - setCompletionResult(&adminCompletion->completion, VDO_READ_ONLY); - } - - drainPacker(vdo->packer, resetAdminSubTask(completion)); - return; - - case SUSPEND_PHASE_LOGICAL_ZONES: - drainLogicalZones(vdo->logicalZones, vdo->adminState.state, - resetAdminSubTask(completion)); - return; - - case SUSPEND_PHASE_BLOCK_MAP: - drainBlockMap(vdo->blockMap, vdo->adminState.state, - resetAdminSubTask(completion)); - return; - - case SUSPEND_PHASE_JOURNAL: - drainRecoveryJournal(vdo->recoveryJournal, vdo->adminState.state, - resetAdminSubTask(completion)); - return; - - case SUSPEND_PHASE_DEPOT: - drainSlabDepot(vdo->depot, vdo->adminState.state, - resetAdminSubTask(completion)); - return; - - case SUSPEND_PHASE_WRITE_SUPER_BLOCK: - if (isSuspending(&vdo->adminState) - || (adminCompletion->completion.result != VDO_SUCCESS)) { - // If we didn't save the VDO or there was an error, we're done. 
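      /*
       * Only a save (ADMIN_OPERATION_SAVE, which drains with
       * ADMIN_STATE_SAVING) rewrites the super block to mark the VDO clean;
       * a plain suspend skips this phase and leaves the VDO dirty, presumably
       * so a crash before the next resume still triggers recovery.
       */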
- break; - } - - writeSuperBlock(vdo, resetAdminSubTask(completion)); - return; - - case SUSPEND_PHASE_END: - break; - - default: - setCompletionResult(completion, UDS_BAD_STATE); - } - - finishDrainingWithResult(&vdo->adminState, completion->result); -} - -/**********************************************************************/ -int performVDOSuspend(VDO *vdo, bool save) -{ - return performAdminOperation(vdo, (save - ? ADMIN_OPERATION_SAVE - : ADMIN_OPERATION_SUSPEND), - getThreadIDForPhase, suspendCallback, - preserveErrorAndContinue); -} diff --git a/vdo/base/vdoSuspend.h b/vdo/base/vdoSuspend.h deleted file mode 100644 index 39172dc..0000000 --- a/vdo/base/vdoSuspend.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vdoSuspend.h#1 $ - */ - -#ifndef VDO_SUSPEND_H -#define VDO_SUSPEND_H - -#include "types.h" - -/** - * Ensure that the VDO has no outstanding I/O and will issue none until it is - * resumed. - * - * @param vdo The VDO to suspend - * @param save If true, all dirty metadata will be flushed as - * well - * - * @return VDO_SUCCESS or an error - **/ -int performVDOSuspend(VDO *vdo, bool save); - -#endif /* VDO_SUSPEND_H */ diff --git a/vdo/base/vio.c b/vdo/base/vio.c deleted file mode 100644 index 9bd678d..0000000 --- a/vdo/base/vio.c +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vio.c#5 $ - */ - -#include "vio.h" - -#include "logger.h" - -#include "dataVIO.h" -#include "vdoInternal.h" - -#ifdef __KERNEL__ -#include -#endif - -/**********************************************************************/ -void freeVIO(VIO **vioPtr) -{ - VIO *vio = *vioPtr; - if (vio == NULL) { - return; - } - - vio->completion.layer->freeVIO(vioPtr); -} - -/**********************************************************************/ -void initializeVIO(VIO *vio, - VIOType type, - VIOPriority priority, - VDOCompletion *parent, - VDO *vdo, - PhysicalLayer *layer) -{ - vio->vdo = vdo; - vio->type = type; - vio->priority = priority; - - VDOCompletion *completion = vioAsCompletion(vio); - initializeCompletion(completion, VIO_COMPLETION, layer); - completion->parent = parent; -} - -/**********************************************************************/ -void vioDoneCallback(VDOCompletion *completion) -{ - VIO *vio = asVIO(completion); - completion->callback = vio->callback; - completion->errorHandler = vio->errorHandler; - completeCompletion(completion); -} - -/**********************************************************************/ -const char *getVIOReadWriteFlavor(const VIO *vio) -{ - if (isReadVIO(vio)) { - return "read"; - } - return (isWriteVIO(vio) ? "write" : "read-modify-write"); -} - -/**********************************************************************/ -void updateVIOErrorStats(VIO *vio, const char *format, ...) -{ - int priority; - int result = vioAsCompletion(vio)->result; - switch (result) { - case VDO_READ_ONLY: - atomicAdd64(&vio->vdo->errorStats.readOnlyErrorCount, 1); - return; - - case VDO_NO_SPACE: - atomicAdd64(&vio->vdo->errorStats.noSpaceErrorCount, 1); - priority = LOG_DEBUG; - break; - - default: - priority = LOG_ERR; - } - -#ifdef __KERNEL__ - static DEFINE_RATELIMIT_STATE(errorLimiter, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - - if (!__ratelimit(&errorLimiter)) { - return; - } -#endif - - va_list args; - va_start(args, format); - vLogWithStringError(priority, result, format, args); - va_end(args); -} - -/** - * Handle an error from a metadata I/O. - * - * @param completion The VIO - **/ -static void handleMetadataIOError(VDOCompletion *completion) -{ - VIO *vio = asVIO(completion); - updateVIOErrorStats(vio, - "Completing %s VIO of type %u for physical block %" - PRIu64 " with error", - getVIOReadWriteFlavor(vio), vio->type, vio->physical); - vioDoneCallback(completion); -} - -/**********************************************************************/ -void launchMetadataVIO(VIO *vio, - PhysicalBlockNumber physical, - VDOAction *callback, - VDOAction *errorHandler, - VIOOperation operation) -{ - vio->operation = operation; - vio->physical = physical; - vio->callback = callback; - vio->errorHandler = errorHandler; - - VDOCompletion *completion = vioAsCompletion(vio); - resetCompletion(completion); - completion->callback = vioDoneCallback; - completion->errorHandler = handleMetadataIOError; - - if (isReadVIO(vio)) { - completion->layer->readMetadata(vio); - } else { - completion->layer->writeMetadata(vio); - } -} - -/** - * Handle a flush error. 
- * - * @param completion The flush VIO - **/ -static void handleFlushError(VDOCompletion *completion) -{ - logErrorWithStringError(completion->result, "Error flushing layer"); - completion->errorHandler = asVIO(completion)->errorHandler; - completeCompletion(completion); -} - -/**********************************************************************/ -void launchFlush(VIO *vio, VDOAction *callback, VDOAction *errorHandler) -{ - VDOCompletion *completion = vioAsCompletion(vio); - resetCompletion(completion); - completion->callback = callback; - completion->errorHandler = handleFlushError; - vio->errorHandler = errorHandler; - vio->operation = VIO_FLUSH_BEFORE; - vio->physical = ZERO_BLOCK; - - PhysicalLayer *layer = completion->layer; - if (layer->getWritePolicy(layer) == WRITE_POLICY_SYNC) { - // XXX It is dangerous to be subtly dropping flushes possibly - // needed for correctness in sync mode. - finishCompletion(completion, VDO_SUCCESS); - return; - } - - layer->flush(vio); -} diff --git a/vdo/base/vio.h b/vdo/base/vio.h deleted file mode 100644 index 8129cc6..0000000 --- a/vdo/base/vio.h +++ /dev/null @@ -1,351 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vio.h#3 $ - */ - -#ifndef VIO_H -#define VIO_H - -#include - -#include "completion.h" -#include "trace.h" -#include "types.h" -#include "vdo.h" - -/** - * A representation of a single block which may be passed between the VDO base - * and the physical layer. - **/ -struct vio { - /* The completion for this VIO */ - VDOCompletion completion; - - /* The functions to call when this VIO's operation is complete */ - VDOAction *callback; - VDOAction *errorHandler; - - /* The VDO handling this VIO */ - VDO *vdo; - - /* The address on the underlying device of the block to be read/written */ - PhysicalBlockNumber physical; - - /* The type of request this VIO is servicing */ - VIOOperation operation; - - /* The queueing priority of the VIO operation */ - VIOPriority priority; - - /* The VIO type is used for statistics and instrumentation. */ - VIOType type; - - /* Used for logging and debugging */ - Trace *trace; -}; - -/** - * Convert a generic VDOCompletion to a VIO. - * - * @param completion The completion to convert - * - * @return The completion as a VIO - **/ -static inline VIO *asVIO(VDOCompletion *completion) -{ - STATIC_ASSERT(offsetof(VIO, completion) == 0); - assertCompletionType(completion->type, VIO_COMPLETION); - return (VIO *) completion; -} - -/** - * Convert a VIO to a generic completion. - * - * @param vio The VIO to convert - * - * @return The VIO as a completion - **/ -static inline VDOCompletion *vioAsCompletion(VIO *vio) -{ - return &vio->completion; -} - -/** - * Create a VIO. 
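asVIO() above works because the VDOCompletion is the first member of struct vio, so a pointer to the embedded completion is also a pointer to the enclosing VIO; the STATIC_ASSERT documents and enforces that layout. A small standalone illustration of the idiom, using generic stand-in types rather than the VDO structs:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct completion { int result; };

/* The completion is deliberately the first member... */
struct vio_like {
  struct completion completion;
  int physical;
};

/* ...so a completion pointer converts back to its container by a cast. */
static struct vio_like *as_vio_like(struct completion *c)
{
  /* The real code checks this at compile time with STATIC_ASSERT; a
     runtime assert keeps the sketch simple. */
  assert(offsetof(struct vio_like, completion) == 0);
  return (struct vio_like *) c;
}

int main(void)
{
  struct vio_like v = { .completion = { .result = 0 }, .physical = 42 };
  struct completion *c = &v.completion;
  printf("physical=%d\n", as_vio_like(c)->physical);
  return 0;
}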
- * - * @param [in] layer The physical layer - * @param [in] vioType The type of VIO to create - * @param [in] priority The relative priority to assign to the VIO - * @param [in] parent The parent of the VIO - * @param [in] data The buffer - * @param [out] vioPtr A pointer to hold the new VIO - * - * @return VDO_SUCCESS or an error - **/ -static inline int createVIO(PhysicalLayer *layer, - VIOType vioType, - VIOPriority priority, - void *parent, - char *data, - VIO **vioPtr) -{ - return layer->createMetadataVIO(layer, vioType, priority, parent, data, - vioPtr); -} - -/** - * Destroy a vio. The pointer to the VIO will be nulled out. - * - * @param vioPtr A pointer to the VIO to destroy - **/ -void freeVIO(VIO **vioPtr); - -/** - * Initialize a VIO. - * - * @param vio The VIO to initialize - * @param type The VIO type - * @param priority The relative priority of the VIO - * @param parent The parent (the extent completion) to assign to the VIO - * completion - * @param vdo The VDO for this VIO - * @param layer The layer for this VIO - **/ -void initializeVIO(VIO *vio, - VIOType type, - VIOPriority priority, - VDOCompletion *parent, - VDO *vdo, - PhysicalLayer *layer); - -/** - * The very last step in processing a VIO. Set the VIO's completion's callback - * and error handler from the fields set in the VIO itself on launch and then - * actually complete the VIO's completion. - * - * @param completion The VIO - **/ -void vioDoneCallback(VDOCompletion *completion); - -/** - * Get the name of a VIO's operation. - * - * @param vio The VIO - * - * @return The name of the VIO's operation (read, write, or read-modify-write) - **/ -const char *getVIOReadWriteFlavor(const VIO *vio); - -/** - * Update per-VIO error stats and log the error. - * - * @param vio The VIO which got an error - * @param format The format of the message to log (a printf style format) - **/ -void updateVIOErrorStats(VIO *vio, const char *format, ...) - __attribute__((format(printf, 2, 3))); - -/** - * Add a trace record for the current source location. - * - * @param vio The VIO structure to be updated - * @param location The source-location descriptor to be recorded - **/ -static inline void vioAddTraceRecord(VIO *vio, TraceLocation location) -{ - if (unlikely(vio->trace != NULL)) { - addTraceRecord(vio->trace, location); - } -} - -/** - * Check whether a VIO is servicing an external data request. - * - * @param vio The VIO to check - **/ -static inline bool isDataVIO(VIO *vio) -{ - return isDataVIOType(vio->type); -} - -/** - * Check whether a VIO is for compressed block writes - * - * @param vio The VIO to check - **/ -static inline bool isCompressedWriteVIO(VIO *vio) -{ - return isCompressedWriteVIOType(vio->type); -} - -/** - * Check whether a VIO is for metadata - * - * @param vio The VIO to check - **/ -static inline bool isMetadataVIO(VIO *vio) -{ - return isMetadataVIOType(vio->type); -} - -/** - * Check whether a VIO is a read. - * - * @param vio The VIO - * - * @return true if the VIO is a read - **/ -static inline bool isReadVIO(const VIO *vio) -{ - return ((vio->operation & VIO_READ_WRITE_MASK) == VIO_READ); -} - -/** - * Check whether a VIO is a read-modify-write. - * - * @param vio The VIO - * - * @return true if the VIO is a read-modify-write - **/ -static inline bool isReadModifyWriteVIO(const VIO *vio) -{ - return ((vio->operation & VIO_READ_WRITE_MASK) == VIO_READ_MODIFY_WRITE); -} - -/** - * Check whether a VIO is a write. 
- * - * @param vio The VIO - * - * @return true if the VIO is a write - **/ -static inline bool isWriteVIO(const VIO *vio) -{ - return ((vio->operation & VIO_READ_WRITE_MASK) == VIO_WRITE); -} - -/** - * Check whether a VIO requires a flush before doing its I/O. - * - * @param vio The VIO - * - * @return true if the VIO requires a flush before - **/ -static inline bool vioRequiresFlushBefore(const VIO *vio) -{ - return ((vio->operation & VIO_FLUSH_BEFORE) == VIO_FLUSH_BEFORE); -} - -/** - * Check whether a VIO requires a flush after doing its I/O. - * - * @param vio The VIO - * - * @return true if the VIO requires a flush after - **/ -static inline bool vioRequiresFlushAfter(const VIO *vio) -{ - return ((vio->operation & VIO_FLUSH_AFTER) == VIO_FLUSH_AFTER); -} - -/** - * Launch a metadata VIO. - * - * @param vio The VIO to launch - * @param physical The physical block number to read or write - * @param callback The function to call when the VIO completes its I/O - * @param errorHandler The handler for write errors - * @param operation The operation to perform (read or write) - **/ -void launchMetadataVIO(VIO *vio, - PhysicalBlockNumber physical, - VDOAction *callback, - VDOAction *errorHandler, - VIOOperation operation); - -/** - * Launch a metadata read VIO. - * - * @param vio The VIO to launch - * @param physical The physical block number to read - * @param callback The function to call when the VIO completes its read - * @param errorHandler The handler for write errors - **/ -static inline void launchReadMetadataVIO(VIO *vio, - PhysicalBlockNumber physical, - VDOAction *callback, - VDOAction *errorHandler) -{ - launchMetadataVIO(vio, physical, callback, errorHandler, VIO_READ); -} - -/** - * Launch a metadata write VIO. - * - * @param vio The VIO to launch - * @param physical The physical block number to write - * @param callback The function to call when the VIO completes its write - * @param errorHandler The handler for write errors - **/ -static inline void launchWriteMetadataVIO(VIO *vio, - PhysicalBlockNumber physical, - VDOAction *callback, - VDOAction *errorHandler) -{ - launchMetadataVIO(vio, physical, callback, errorHandler, VIO_WRITE); -} - -/** - * Launch a metadata write VIO optionally flushing the layer before and/or - * after the write operation. - * - * @param vio The VIO to launch - * @param physical The physical block number to write - * @param callback The function to call when the VIO completes its - * operation - * @param errorHandler The handler for flush or write errors - * @param flushBefore Whether or not to flush before writing - * @param flushAfter Whether or not to flush after writing - **/ -static inline -void launchWriteMetadataVIOWithFlush(VIO *vio, - PhysicalBlockNumber physical, - VDOAction *callback, - VDOAction *errorHandler, - bool flushBefore, - bool flushAfter) -{ - launchMetadataVIO(vio, physical, callback, errorHandler, - (VIO_WRITE - | (flushBefore ? VIO_FLUSH_BEFORE : 0) - | (flushAfter ? VIO_FLUSH_AFTER : 0))); -} - -/** - * Issue a flush to the layer. If the layer does not require flushing, this - * method will immediately finish the VIO with which it was called. Care must - * be taken to avoid introducing a stack overflow in that case. 
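The predicates above treat a VIOOperation as a small bit field: the low bits select read, write, or read-modify-write, and separate bits request flushes before or after the I/O, which is why launchWriteMetadataVIOWithFlush() can compose the operation with a bitwise OR. A compact sketch of that encoding with made-up flag values (the real VIO_* constants are defined in the VDO type headers):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical flag layout mirroring the read/write + flush composition. */
enum {
  OP_READ         = 0x1,
  OP_WRITE        = 0x2,
  OP_RW_MASK      = 0x3,
  OP_FLUSH_BEFORE = 0x4,
  OP_FLUSH_AFTER  = 0x8,
};

static bool is_write(unsigned op)     { return (op & OP_RW_MASK) == OP_WRITE; }
static bool flush_before(unsigned op) { return (op & OP_FLUSH_BEFORE) != 0; }
static bool flush_after(unsigned op)  { return (op & OP_FLUSH_AFTER) != 0; }

/* Compose a write operation with optional flushes, as the helper above does. */
static unsigned make_write_op(bool before, bool after)
{
  return OP_WRITE | (before ? OP_FLUSH_BEFORE : 0) | (after ? OP_FLUSH_AFTER : 0);
}

int main(void)
{
  unsigned op = make_write_op(true, false);
  printf("write=%d before=%d after=%d\n",
         is_write(op), flush_before(op), flush_after(op));
  return 0;
}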
- * - * @param vio The VIO to notify when the flush is complete - * @param callback The function to call when the flush is complete - * @param errorHandler The handler for flush errors - **/ -void launchFlush(VIO *vio, VDOAction *callback, VDOAction *errorHandler); - -#endif // VIO_H diff --git a/vdo/base/vioPool.c b/vdo/base/vioPool.c deleted file mode 100644 index 3d5ce07..0000000 --- a/vdo/base/vioPool.c +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioPool.c#5 $ - */ - -#include "vioPool.h" - -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" - -#include "constants.h" -#include "vio.h" -#include "types.h" - -/** - * An VIOPool is a collection of preallocated VIOs. - **/ -struct vioPool { - /** The number of objects managed by the pool */ - size_t size; - /** The list of objects which are available */ - RingNode available; - /** The queue of requestors waiting for objects from the pool */ - WaitQueue waiting; - /** The number of objects currently in use */ - size_t busyCount; - /** The list of objects which are in use */ - RingNode busy; - /** The number of requests when no object was available */ - uint64_t outageCount; - /** The ID of the thread on which this pool may be used */ - ThreadID threadID; - /** The buffer backing the pool's VIOs */ - char *buffer; - /** The pool entries */ - VIOPoolEntry entries[]; -}; - -/**********************************************************************/ -int makeVIOPool(PhysicalLayer *layer, - size_t poolSize, - ThreadID threadID, - VIOConstructor *vioConstructor, - void *context, - VIOPool **poolPtr) -{ - VIOPool *pool; - int result = ALLOCATE_EXTENDED(VIOPool, poolSize, VIOPoolEntry, __func__, - &pool); - if (result != VDO_SUCCESS) { - return result; - } - - pool->threadID = threadID; - initializeRing(&pool->available); - initializeRing(&pool->busy); - - result = ALLOCATE(poolSize * VDO_BLOCK_SIZE, char, "VIO pool buffer", - &pool->buffer); - if (result != VDO_SUCCESS) { - freeVIOPool(&pool); - return result; - } - - char *ptr = pool->buffer; - for (size_t i = 0; i < poolSize; i++) { - VIOPoolEntry *entry = &pool->entries[i]; - entry->buffer = ptr; - entry->context = context; - result = vioConstructor(layer, entry, ptr, &entry->vio); - if (result != VDO_SUCCESS) { - freeVIOPool(&pool); - return result; - } - - ptr += VDO_BLOCK_SIZE; - initializeRing(&entry->node); - pushRingNode(&pool->available, &entry->node); - pool->size++; - } - - *poolPtr = pool; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freeVIOPool(VIOPool **poolPtr) -{ - if (*poolPtr == NULL) { - return; - } - - // Remove all available entries from the object pool. 
- VIOPool *pool = *poolPtr; - ASSERT_LOG_ONLY(!hasWaiters(&pool->waiting), - "VIO pool must not have any waiters when being freed"); - ASSERT_LOG_ONLY((pool->busyCount == 0), - "VIO pool must not have %zu busy entries when being freed", - pool->busyCount); - ASSERT_LOG_ONLY(isRingEmpty(&pool->busy), - "VIO pool must not have busy entries when being freed"); - - VIOPoolEntry *entry; - while ((entry = asVIOPoolEntry(chopRingNode(&pool->available))) != NULL) { - freeVIO(&entry->vio); - } - - // Make sure every VIOPoolEntry has been removed. - for (size_t i = 0; i < pool->size; i++) { - VIOPoolEntry *entry = &pool->entries[i]; - ASSERT_LOG_ONLY(isRingEmpty(&entry->node), "VIO Pool entry still in use:" - " VIO is in use for physical block %" PRIu64 - " for operation %u", - entry->vio->physical, - entry->vio->operation); - } - - FREE(pool->buffer); - FREE(pool); - *poolPtr = NULL; -} - -/**********************************************************************/ -bool isVIOPoolBusy(VIOPool *pool) -{ - return (pool->busyCount != 0); -} - -/**********************************************************************/ -int acquireVIOFromPool(VIOPool *pool, Waiter *waiter) -{ - ASSERT_LOG_ONLY((pool->threadID == getCallbackThreadID()), - "acquire from active VIOPool called from correct thread"); - - if (isRingEmpty(&pool->available)) { - pool->outageCount++; - return enqueueWaiter(&pool->waiting, waiter); - } - - pool->busyCount++; - RingNode *entry = chopRingNode(&pool->available); - pushRingNode(&pool->busy, entry); - (*waiter->callback)(waiter, entry); - return VDO_SUCCESS; -} - -/**********************************************************************/ -void returnVIOToPool(VIOPool *pool, VIOPoolEntry *entry) -{ - ASSERT_LOG_ONLY((pool->threadID == getCallbackThreadID()), - "vio pool entry returned on same thread as it was acquired"); - entry->vio->completion.errorHandler = NULL; - if (hasWaiters(&pool->waiting)) { - notifyNextWaiter(&pool->waiting, NULL, entry); - return; - } - - pushRingNode(&pool->available, &entry->node); - --pool->busyCount; -} - -/**********************************************************************/ -uint64_t getVIOPoolOutageCount(VIOPool *pool) -{ - return pool->outageCount; -} diff --git a/vdo/base/vioPool.h b/vdo/base/vioPool.h deleted file mode 100644 index bab3dbe..0000000 --- a/vdo/base/vioPool.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioPool.h#4 $ - */ - -#ifndef VIO_POOL_H -#define VIO_POOL_H - -#include "permassert.h" - -#include "completion.h" -#include "types.h" -#include "waitQueue.h" - -/** - * A VIOPool is a collection of preallocated VIOs used to write arbitrary - * metadata blocks. 
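In outline, the pool hands out preallocated entries from an available list, counts an outage (and, in the real code, queues the waiter) when the list is empty, and pushes returned entries back for reuse. A minimal single-threaded sketch of that discipline, using a plain free list instead of the RingNode and WaitQueue machinery:

#include <stddef.h>
#include <stdio.h>

/* One pool entry: just a free-list link in this sketch. */
struct entry {
  struct entry *next;
};

struct pool {
  struct entry *available;   /* entries ready to hand out */
  size_t        busy_count;  /* entries currently in use */
  unsigned long outages;     /* acquisitions that found the pool empty */
};

/* Put an entry on the available list (used both to seed and to return). */
static void pool_push(struct pool *p, struct entry *e)
{
  e->next = p->available;
  p->available = e;
}

/* Hand out an entry, or record an outage when none is available. */
static struct entry *pool_acquire(struct pool *p)
{
  struct entry *e = p->available;
  if (e == NULL) {
    p->outages++;            /* the real pool queues the waiter instead */
    return NULL;
  }
  p->available = e->next;
  p->busy_count++;
  return e;
}

/* Return a previously acquired entry. */
static void pool_return(struct pool *p, struct entry *e)
{
  pool_push(p, e);
  p->busy_count--;
}

int main(void)
{
  struct entry e1 = { 0 }, e2 = { 0 };
  struct pool p = { 0 };
  pool_push(&p, &e1);
  pool_push(&p, &e2);

  struct entry *a = pool_acquire(&p);
  struct entry *b = pool_acquire(&p);
  struct entry *c = pool_acquire(&p);  /* empty: counts an outage */
  pool_return(&p, a);
  pool_return(&p, b);
  printf("busy=%zu outages=%lu empty_acquire=%p\n",
         p.busy_count, p.outages, (void *) c);
  return 0;
}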
- **/ - -/** - * An VIOPoolEntry is the pair of VIO and buffer whether in use or not. - **/ -typedef struct { - RingNode node; - VIO *vio; - void *buffer; - void *parent; - void *context; -} VIOPoolEntry; - -/** - * A function which constructs a VIO for a pool. - * - * @param [in] layer The physical layer in which the VIO will operate - * @param [in] parent The parent of the VIO - * @param [in] buffer The data buffer for the VIO - * @param [out] vioPtr A pointer to hold the new VIO - **/ -typedef int VIOConstructor(PhysicalLayer *layer, - void *parent, - void *buffer, - VIO **vioPtr); - -/** - * Create a new VIO pool. - * - * @param [in] layer the physical layer to write to and read from - * @param [in] poolSize the number of VIOs in the pool - * @param [in] threadID the ID of the thread using this pool - * @param [in] vioConstructor the constructor for VIOs in the pool - * @param [in] context the context that each entry will have - * @param [out] poolPtr the resulting pool - * - * @return a success or error code - **/ -int makeVIOPool(PhysicalLayer *layer, - size_t poolSize, - ThreadID threadID, - VIOConstructor *vioConstructor, - void *context, - VIOPool **poolPtr) - __attribute__((warn_unused_result)); - -/** - * Destroy a VIO pool - * - * @param poolPtr the pointer holding the pool, which will be nulled out - **/ -void freeVIOPool(VIOPool **poolPtr); - -/** - * Check whether an VIO pool has outstanding entries. - * - * @return true if the pool is busy - **/ -bool isVIOPoolBusy(VIOPool *pool) - __attribute__((warn_unused_result)); - -/** - * Acquire a VIO and buffer from the pool (asynchronous). - * - * @param pool the VIO pool - * @param waiter object that is requesting a VIO - * - * @return VDO_SUCCESS or an error - **/ -int acquireVIOFromPool(VIOPool *pool, Waiter *waiter); - -/** - * Return a VIO and its buffer to the pool. - * - * @param pool the VIO pool - * @param entry a VIO pool entry - **/ -void returnVIOToPool(VIOPool *pool, VIOPoolEntry *entry); - -/** - * Convert a RingNode to the VIOPoolEntry that contains it. - * - * @param node The RingNode to convert - * - * @return The VIOPoolEntry wrapping the RingNode - **/ -static inline VIOPoolEntry *asVIOPoolEntry(RingNode *node) -{ - STATIC_ASSERT(offsetof(VIOPoolEntry, node) == 0); - return (VIOPoolEntry *) node; -} - -/** - * Return the outage count of an VIO pool. - * - * @param pool The pool - * - * @return the number of times an acquisition request had to wait - **/ -uint64_t getVIOPoolOutageCount(VIOPool *pool) - __attribute__((warn_unused_result)); - -#endif // VIO_POOL_H diff --git a/vdo/base/vioRead.c b/vdo/base/vioRead.c deleted file mode 100644 index ab73727..0000000 --- a/vdo/base/vioRead.c +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioRead.c#1 $ - */ - -#include "vioRead.h" - -#include "logger.h" - -#include "blockMap.h" -#include "dataVIO.h" -#include "vdoInternal.h" -#include "vioWrite.h" - -/** - * Do the modify-write part of a read-modify-write cycle. This callback is - * registered in readBlock(). - * - * @param completion The DataVIO which has just finished its read - **/ -static void modifyForPartialWrite(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInLogicalZone(dataVIO); - - if (completion->result != VDO_SUCCESS) { - completeDataVIO(completion); - return; - } - - completion->layer->applyPartialWrite(dataVIO); - VIO *vio = dataVIOAsVIO(dataVIO); - vio->operation = VIO_WRITE | (vio->operation & ~VIO_READ_WRITE_MASK); - dataVIO->isPartialWrite = true; - launchWriteDataVIO(dataVIO); -} - -/** - * Read a block asynchronously. This is the callback registered in - * readBlockMapping(). - * - * @param completion The DataVIO to read - **/ -static void readBlock(VDOCompletion *completion) -{ - if (completion->result != VDO_SUCCESS) { - completeDataVIO(completion); - return; - } - - DataVIO *dataVIO = asDataVIO(completion); - VIO *vio = asVIO(completion); - completion->callback - = (isReadVIO(vio) ? completeDataVIO : modifyForPartialWrite); - - if (dataVIO->mapped.pbn == ZERO_BLOCK) { - completion->layer->zeroDataVIO(dataVIO); - invokeCallback(completion); - return; - } - - vio->physical = dataVIO->mapped.pbn; - dataVIO->lastAsyncOperation = READ_DATA; - completion->layer->readData(dataVIO); -} - -/** - * Read the DataVIO's mapping from the block map. This callback is registered - * in launchReadDataVIO(). - * - * @param completion The DataVIO to be read - **/ -static void readBlockMapping(VDOCompletion *completion) -{ - if (completion->result != VDO_SUCCESS) { - completeDataVIO(completion); - return; - } - - DataVIO *dataVIO = asDataVIO(completion); - assertInLogicalZone(dataVIO); - setLogicalCallback(dataVIO, readBlock, THIS_LOCATION("$F;cb=readBlock")); - dataVIO->lastAsyncOperation = GET_MAPPED_BLOCK; - getMappedBlockAsync(dataVIO); -} - -/**********************************************************************/ -void launchReadDataVIO(DataVIO *dataVIO) -{ - assertInLogicalZone(dataVIO); - dataVIO->lastAsyncOperation = FIND_BLOCK_MAP_SLOT; - // Go find the block map slot for the LBN mapping. - findBlockMapSlotAsync(dataVIO, readBlockMapping, - getLogicalZoneThreadID(dataVIO->logical.zone)); -} - -/** - * Release the logical block lock which a read DataVIO obtained now that it - * is done. - * - * @param completion The DataVIO - **/ -static void releaseLogicalLock(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInLogicalZone(dataVIO); - releaseLogicalBlockLock(dataVIO); - vioDoneCallback(completion); -} - -/** - * Clean up a DataVIO which has finished processing a read. - * - * @param dataVIO The DataVIO to clean up - **/ -void cleanupReadDataVIO(DataVIO *dataVIO) -{ - launchLogicalCallback(dataVIO, releaseLogicalLock, - THIS_LOCATION("$F;cb=releaseLL")); -} diff --git a/vdo/base/vioRead.h b/vdo/base/vioRead.h deleted file mode 100644 index ae2fa37..0000000 --- a/vdo/base/vioRead.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
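The read path above resolves the logical block's mapping first, then either zero-fills the data (unmapped blocks map to ZERO_BLOCK) or reads the mapped physical block, and a read-modify-write continues into the write path once the old data is in hand. A condensed, synchronous sketch of that decision with stand-in types (the real code runs each step as a separate callback):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SKETCH_ZERO_BLOCK 0   /* stands in for the real ZERO_BLOCK sentinel */

struct read_request {
  uint64_t mapped_pbn;        /* physical block the LBN currently maps to */
  bool     is_partial_write;  /* read-modify-write rather than a plain read */
  char     data[16];          /* stand-in for a full block buffer */
};

/* Pretend to read a physical block into the buffer. */
static void read_physical(uint64_t pbn, char *data, size_t len)
{
  (void) pbn;
  memset(data, 'D', len);
}

/*
 * Mirror of the readBlock() decision: an unmapped (zero) block is satisfied
 * by zero-filling, anything else is read from its mapped location. A
 * read-modify-write would then apply the caller's data and launch a write.
 */
static void do_read(struct read_request *req)
{
  if (req->mapped_pbn == SKETCH_ZERO_BLOCK) {
    memset(req->data, 0, sizeof(req->data));
  } else {
    read_physical(req->mapped_pbn, req->data, sizeof(req->data));
  }
  if (req->is_partial_write) {
    /* apply the partial data and continue into the write path (omitted) */
  }
}

int main(void)
{
  struct read_request unmapped = { .mapped_pbn = SKETCH_ZERO_BLOCK };
  struct read_request mapped   = { .mapped_pbn = 12345 };
  do_read(&unmapped);
  do_read(&mapped);
  printf("unmapped[0]=%d mapped[0]=%c\n", unmapped.data[0], mapped.data[0]);
  return 0;
}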
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioRead.h#1 $ - */ - -#ifndef VIO_READ_H -#define VIO_READ_H - -#include "types.h" - -/** - * Start the asynchronous processing of the DataVIO for a read or - * read-modify-write request which has acquired a lock on its logical block. - * The first step is to perform a block map lookup. - * - * @param dataVIO The DataVIO doing the read - **/ -void launchReadDataVIO(DataVIO *dataVIO); - -/** - * Clean up a DataVIO which has finished processing a read. - * - * @param dataVIO The DataVIO to clean up - **/ -void cleanupReadDataVIO(DataVIO *dataVIO); - -#endif /* VIO_READ_H */ diff --git a/vdo/base/vioWrite.c b/vdo/base/vioWrite.c deleted file mode 100644 index ac2bb53..0000000 --- a/vdo/base/vioWrite.c +++ /dev/null @@ -1,1201 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioWrite.c#9 $ - */ - -/* - * This file contains almost all of the VDO write path, which begins with - * writeExtent(). The progression through the callbacks which make up the - * write path depends upon whether or not the write policy is synchronous or - * asynchronous. The paths would proceed as outlined in the pseudo-code here - * if this were normal, synchronous code without callbacks. Complications - * involved in waiting on locks are not included. 
- * - * ###################################################################### - * writeExtentSynchronous(extent) - * { - * foreach (vio in extent) { - * launchWriteVIO() - * # allocateBlockForWrite() - * if (!trim and !zero-block) { - * allocate block - * if (vio is compressed) { - * completeCompressedBlockWrite() - * finishVIO() - * return - * } - * writeBlock() - * } - * finishBlockWrite() - * addJournalEntry() # Increment - * if (vio->newMapped is not ZERO_BLOCK) { - * journalIncrementForWrite() - * } - * acknowledgeWriteCallback() - * readOldBlockMapping() - * journalUnmappingForWrite() - * if (vio->mapped is not ZERO_BLOCK) { - * journalDecrementForWrite() - * } - * updateBlockMapForWrite() - * if (trim || zero-block) { - * finishVIO() - * return - * } - * - * prepareForDedupe() - * hashData() - * resolveHashZone() - * acquireHashLock() - * attemptDedupe() (query albireo) - * if (isDuplicate) { - * verifyAdvice() (read verify) - * if (isDuplicate and canAddReference) { - * shareBlock() - * addJournalEntryForDedupe() - * incrementForDedupe() - * journalUnmappingForDedupe() - * if (vio->mapped is not ZERO_BLOCK) { - * decrementForDedupe() - * } - * updateBlockMapForDedupe() - * finishVIO() - * return - * } - * } - * - * if (not canAddReference) { - * layer->updateAlbireo() - * } - * # compressData() - * if (compressing and not mooted and has no waiters) { - * layer->compressVIO() - * packCompressedData() - * if (compressed) { - * journalCompressedBlocks() - * incrementForDedupe() - * readOldBlockMappingForDedupe() - * journalUnmappingForDedupe() - * if (vio->mapped is not ZERO_BLOCK) { - * decrementForDedupe() - * } - * updateBlockMapForDedupe() - * } - * } - * - * finishVIO() - * } - * } - * - * ###################################################################### - * writeExtentAsynchronous(extent) - * { - * foreach (vio in extent) { - * launchWriteVIO() - * # allocateBlockForWrite() - * if (trim || zero-block) { - * acknowledgeWrite() - * } else { - * allocateAndLockBlock() - * if (vio is compressed) { - * writeBlock() - * completeCompressedBlockWrite() - * finishVIO() - * return - * } - * - * acknowledgeWrite() - * prepareForDedupe() - * hashData() - * resolveHashZone() - * acquireHashLock() - * attemptDedupe() (query albireo) - * if (isDuplicate) { - * verifyAdvice() (read verify) - * if (isDuplicate and canAddReference) { - * shareBlock() - * addJournalEntryForDedupe() - * incrementForDedupe() - * readOldBlockMappingForDedupe() - * journalUnmappingForDedupe() - * if (vio->mapped is not ZERO_BLOCK) { - * decrementForDedupe() - * } - * updateBlockMapForDedupe() - * finishVIO() - * return - * } - * } - * - * if (not canAddReference) { - * layer->updateAlbireo() - * } - * # compressData() - * if (compressing and not mooted and has no waiters) { - * layer->compressVIO() - * packCompressedData() - * if (compressed) { - * journalCompressedBlocks() - * journalIncrementForDedupe() - * readOldBlockMappingForDedupe() - * journalUnmappingForDedupe() - * if (vio->mapped is not ZERO_BLOCK) { - * decrementForDedupe() - * } - * updateBlockMapForDedupe() - * finishVIO() - * return - * } - * } - * - * writeBlock() - * } - * - * finishBlockWrite() - * addJournalEntry() # Increment - * if (vio->newMapped is not ZERO_BLOCK) { - * journalIncrementForWrite() - * } - * readOldBlockMappingForWrite() - * journalUnmappingForWrite() - * if (vio->mapped is not ZERO_BLOCK) { - * journalDecrementForWrite() - * } - * updateBlockMapForWrite() - * finishVIO() - * } - * } - */ - -#include "vioWrite.h" - -#include 
"logger.h" - -#include "allocatingVIO.h" -#include "atomic.h" -#include "blockMap.h" -#include "compressionState.h" -#include "dataVIO.h" -#include "hashLock.h" -#include "recoveryJournal.h" -#include "referenceOperation.h" -#include "slab.h" -#include "slabDepot.h" -#include "slabJournal.h" -#include "vdoInternal.h" -#include "vioRead.h" - -/** - * The steps taken cleaning up a VIO, in the order they are performed. - **/ -typedef enum dataVIOCleanupStage { - VIO_CLEANUP_START = 0, - VIO_RELEASE_ALLOCATED = VIO_CLEANUP_START, - VIO_RELEASE_RECOVERY_LOCKS, - VIO_RELEASE_HASH_LOCK, - VIO_RELEASE_LOGICAL, - VIO_CLEANUP_DONE -} DataVIOCleanupStage; - -/** - * Actions to take on error used by abortOnError(). - **/ -typedef enum { - NOT_READ_ONLY, - READ_ONLY_IF_ASYNC, - READ_ONLY, -} ReadOnlyAction; - -// Forward declarations required because of circular function references. -static void performCleanupStage(DataVIO *dataVIO, DataVIOCleanupStage stage); -static void writeBlock(DataVIO *dataVIO); - -/** - * Check whether we are in async mode. - * - * @param dataVIO A DataVIO containing a pointer to the VDO whose write - * policy we want to check - * - * @return true if we are in async mode - **/ -static inline bool isAsync(DataVIO *dataVIO) -{ - return (getWritePolicy(getVDOFromDataVIO(dataVIO)) != WRITE_POLICY_SYNC); -} - -/** - * Release the PBN lock and/or the reference on the allocated block at the - * end of processing a DataVIO. - * - * @param completion The DataVIO - **/ -static void releaseAllocatedLock(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInAllocatedZone(dataVIO); - releaseAllocationLock(dataVIOAsAllocatingVIO(dataVIO)); - performCleanupStage(dataVIO, VIO_RELEASE_RECOVERY_LOCKS); -} - -/** - * Release the logical block lock and flush generation lock at the end of - * processing a DataVIO. - * - * @param completion The DataVIO - **/ -static void releaseLogicalLock(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInLogicalZone(dataVIO); - releaseLogicalBlockLock(dataVIO); - releaseFlushGenerationLock(dataVIO); - performCleanupStage(dataVIO, VIO_CLEANUP_DONE); -} - -/** - * Release the hash lock at the end of processing a DataVIO. - * - * @param completion The DataVIO - **/ -static void cleanHashLock(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInHashZone(dataVIO); - releaseHashLock(dataVIO); - performCleanupStage(dataVIO, VIO_RELEASE_LOGICAL); -} - -/** - * Make some assertions about a DataVIO which has finished cleaning up - * and do its final callback. - * - * @param dataVIO The DataVIO which has finished cleaning up - **/ -static void finishCleanup(DataVIO *dataVIO) -{ - ASSERT_LOG_ONLY(dataVIOAsAllocatingVIO(dataVIO)->allocationLock == NULL, - "complete DataVIO has no allocation lock"); - ASSERT_LOG_ONLY(dataVIO->hashLock == NULL, - "complete DataVIO has no hash lock"); - vioDoneCallback(dataVIOAsCompletion(dataVIO)); -} - -/** - * Perform the next step in the process of cleaning up a DataVIO. 
- * - * @param dataVIO The DataVIO to clean up - * @param stage The cleanup stage to perform - **/ -static void performCleanupStage(DataVIO *dataVIO, DataVIOCleanupStage stage) -{ - switch (stage) { - case VIO_RELEASE_ALLOCATED: - if (hasAllocation(dataVIO)) { - launchAllocatedZoneCallback(dataVIO, releaseAllocatedLock, - THIS_LOCATION("$F;cb=releaseAllocLock")); - return; - } - // fall through - - case VIO_RELEASE_RECOVERY_LOCKS: - if ((dataVIO->recoverySequenceNumber > 0) - && !isOrWillBeReadOnly(dataVIOAsVIO(dataVIO)->vdo->readOnlyNotifier) - && (dataVIOAsCompletion(dataVIO)->result != VDO_READ_ONLY)) { - logWarning("VDO not read-only when cleaning DataVIO with RJ lock"); - } - // fall through - - case VIO_RELEASE_HASH_LOCK: - if (dataVIO->hashLock != NULL) { - launchHashZoneCallback(dataVIO, cleanHashLock, - THIS_LOCATION("$F;cb=cleanHashLock")); - return; - } - // fall through - - case VIO_RELEASE_LOGICAL: - if (!isCompressedWriteDataVIO(dataVIO)) { - launchLogicalCallback(dataVIO, releaseLogicalLock, - THIS_LOCATION("$F;cb=releaseLL")); - return; - } - // fall through - - default: - finishCleanup(dataVIO); - } -} - -/** - * Return a DataVIO that encountered an error to its hash lock so it can - * update the hash lock state accordingly. This continuation is registered in - * abortOnError(), and must be called in the hash zone of the DataVIO. - * - * @param completion The completion of the DataVIO to return to its hash lock - **/ -static void finishWriteDataVIOWithError(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInHashZone(dataVIO); - continueHashLockOnError(dataVIO); -} - -/** - * Check whether a result is an error, and if so abort the DataVIO associated - * with the error. - * - * @param result The result to check - * @param dataVIO The DataVIO - * @param readOnlyAction The conditions under which the VDO should be put - * into read-only mode if the result is an error - * - * @return true if the result is an error - **/ -static bool abortOnError(int result, - DataVIO *dataVIO, - ReadOnlyAction readOnlyAction) -{ - if (result == VDO_SUCCESS) { - return false; - } - - if ((result == VDO_READ_ONLY) - || (readOnlyAction == READ_ONLY) - || ((readOnlyAction == READ_ONLY_IF_ASYNC) && isAsync(dataVIO))) { - ReadOnlyNotifier *notifier = dataVIOAsVIO(dataVIO)->vdo->readOnlyNotifier; - if (!isReadOnly(notifier)) { - if (result != VDO_READ_ONLY) { - logErrorWithStringError(result, "Preparing to enter read-only mode:" - " DataVIO for LBN %llu (becoming mapped" - " to %llu, previously mapped" - " to %llu, allocated %llu) is" - " completing with a fatal error after" - " operation %s", dataVIO->logical.lbn, - dataVIO->newMapped.pbn, dataVIO->mapped.pbn, - getDataVIOAllocation(dataVIO), - getOperationName(dataVIO)); - } - - enterReadOnlyMode(notifier, result); - } - } - - if (dataVIO->hashLock != NULL) { - launchHashZoneCallback(dataVIO, finishWriteDataVIOWithError, - THIS_LOCATION(NULL)); - } else { - finishDataVIO(dataVIO, result); - } - return true; -} - -/** - * Return a DataVIO that finished writing, compressing, or deduplicating to - * its hash lock so it can share the result with any DataVIOs waiting in the - * hash lock, or update albireo, or simply release its share of the lock. This - * continuation is registered in updateBlockMapForWrite(), - * updateBlockMapForDedupe(), and abortDeduplication(), and must be called in - * the hash zone of the DataVIO. 
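performCleanupStage() above leans on deliberate switch fall-through: a DataVIO can enter at whichever stage it has reached, release that resource if it holds one, and fall into every later stage, returning early only when it must hop to another zone's thread first. A tiny standalone sketch of the resume-at-stage idea, run inline rather than across threads:

#include <stdio.h>

enum stage { RELEASE_ALLOCATED, RELEASE_RECOVERY, RELEASE_HASH, RELEASE_LOGICAL, DONE };

struct vio_sketch {
  int has_allocation;
  int has_hash_lock;
};

/*
 * Each case releases one resource if held; fall-through lets a caller enter
 * at any stage and still perform every later one. The real code returns
 * after launching a callback on another thread and re-enters at the next
 * stage; here everything runs inline.
 */
static void cleanup(struct vio_sketch *v, enum stage s)
{
  switch (s) {
  case RELEASE_ALLOCATED:
    if (v->has_allocation) { v->has_allocation = 0; printf("released allocation\n"); }
    /* fall through */
  case RELEASE_RECOVERY:
    /* recovery-journal locks would be released here */
    /* fall through */
  case RELEASE_HASH:
    if (v->has_hash_lock) { v->has_hash_lock = 0; printf("released hash lock\n"); }
    /* fall through */
  case RELEASE_LOGICAL:
    printf("released logical lock\n");
    /* fall through */
  default:
    printf("cleanup done\n");
  }
}

int main(void)
{
  struct vio_sketch v = { .has_allocation = 1, .has_hash_lock = 1 };
  cleanup(&v, RELEASE_ALLOCATED);   /* full cleanup from the start */
  cleanup(&v, RELEASE_HASH);        /* resume later; earlier stages already done */
  return 0;
}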
- * - * @param completion The completion of the DataVIO to return to its hash lock - **/ -static void finishWriteDataVIO(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInHashZone(dataVIO); - if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) { - return; - } - continueHashLock(dataVIO); -} - -/** - * Abort the data optimization process. - * - * @param dataVIO The DataVIO which does not deduplicate or compress - **/ -static void abortDeduplication(DataVIO *dataVIO) -{ - if (!hasAllocation(dataVIO)) { - // There was no space to write this block and we failed to deduplicate - // or compress it. - finishDataVIO(dataVIO, VDO_NO_SPACE); - return; - } - - if (isAsync(dataVIO)) { - // We failed to deduplicate or compress an async DataVIO, so now we need - // to actually write the data. - writeBlock(dataVIO); - return; - } - - if (dataVIO->hashLock == NULL) { - // We failed to compress a synchronous DataVIO that is a hash collision, - // which means it can't dedpe or be used for dedupe, so it's done now. - finishDataVIO(dataVIO, VDO_SUCCESS); - return; - } - - /* - * This synchronous DataVIO failed to compress and so is finished, but must - * now return to its hash lock so other DataVIOs with the same data can - * deduplicate against the uncompressed block it wrote. - */ - launchHashZoneCallback(dataVIO, finishWriteDataVIO, THIS_LOCATION(NULL)); -} - -/** - * Update the block map now that we've added an entry in the recovery journal - * for a block we have just shared. This is the callback registered in - * decrementForDedupe(). - * - * @param completion The completion of the write in progress - **/ -static void updateBlockMapForDedupe(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInLogicalZone(dataVIO); - if (abortOnError(completion->result, dataVIO, READ_ONLY)) { - return; - } - - if (dataVIO->hashLock != NULL) { - setHashZoneCallback(dataVIO, finishWriteDataVIO, THIS_LOCATION(NULL)); - } else { - completion->callback = completeDataVIO; - } - dataVIO->lastAsyncOperation = PUT_MAPPED_BLOCK_FOR_DEDUPE; - putMappedBlockAsync(dataVIO); -} - -/** - * Make a recovery journal increment. - * - * @param dataVIO The DataVIO - * @param lock The PBNLock on the block being incremented - **/ -static void journalIncrement(DataVIO *dataVIO, PBNLock *lock) -{ - setUpReferenceOperationWithLock(DATA_INCREMENT, dataVIO->newMapped.pbn, - dataVIO->newMapped.state, lock, - &dataVIO->operation); - addRecoveryJournalEntry(getVDOFromDataVIO(dataVIO)->recoveryJournal, - dataVIO); -} - -/** - * Make a recovery journal decrement entry. - * - * @param dataVIO The DataVIO - **/ -static void journalDecrement(DataVIO *dataVIO) -{ - setUpReferenceOperationWithZone(DATA_DECREMENT, dataVIO->mapped.pbn, - dataVIO->mapped.state, dataVIO->mapped.zone, - &dataVIO->operation); - addRecoveryJournalEntry(getVDOFromDataVIO(dataVIO)->recoveryJournal, - dataVIO); -} - -/** - * Make a reference count change. - * - * @param dataVIO The DataVIO - **/ -static void updateReferenceCount(DataVIO *dataVIO) -{ - SlabDepot *depot = getVDOFromDataVIO(dataVIO)->depot; - PhysicalBlockNumber pbn = dataVIO->operation.pbn; - int result = ASSERT(isPhysicalDataBlock(depot, pbn), - "Adding slab journal entry for impossible PBN %" PRIu64 - "for LBN %llu", pbn, dataVIO->logical.lbn); - if (abortOnError(result, dataVIO, READ_ONLY)) { - return; - } - - addSlabJournalEntry(getSlabJournal(depot, pbn), dataVIO); -} - -/** - * Do the decref after a successful dedupe or compression. 
This is the callback - * registered by journalUnmappingForDedupe(). - * - * @param completion The completion of the write in progress - **/ -static void decrementForDedupe(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInMappedZone(dataVIO); - if (abortOnError(completion->result, dataVIO, READ_ONLY)) { - return; - } - - AllocatingVIO *allocatingVIO = dataVIOAsAllocatingVIO(dataVIO); - if (allocatingVIO->allocation == dataVIO->mapped.pbn) { - /* - * If we are about to release the reference on the allocated block, - * we must release the PBN lock on it first so that the allocator will - * not allocate a write-locked block. - */ - releaseAllocationLock(allocatingVIO); - } - - setLogicalCallback(dataVIO, updateBlockMapForDedupe, - THIS_LOCATION("$F;js=dec")); - dataVIO->lastAsyncOperation = JOURNAL_DECREMENT_FOR_DEDUPE; - updateReferenceCount(dataVIO); -} - -/** - * Write the appropriate journal entry for removing the mapping of logical to - * mapped, for dedupe or compression. This is the callback registered in - * readOldBlockMappingForDedupe(). - * - * @param completion The completion of the write in progress - **/ -static void journalUnmappingForDedupe(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInJournalZone(dataVIO); - if (abortOnError(completion->result, dataVIO, READ_ONLY)) { - return; - } - - if (dataVIO->mapped.pbn == ZERO_BLOCK) { - setLogicalCallback(dataVIO, updateBlockMapForDedupe, - THIS_LOCATION("$F;j=dedupe;js=unmap;cb=updateBM")); - } else { - setMappedZoneCallback(dataVIO, decrementForDedupe, - THIS_LOCATION("$F;j=dedupe;js=unmap;cb=decDedupe")); - } - dataVIO->lastAsyncOperation = JOURNAL_UNMAPPING_FOR_DEDUPE; - journalDecrement(dataVIO); -} - -/** - * Get the previous PBN mapped to this LBN from the block map, so as to make - * an appropriate journal entry referencing the removal of this LBN->PBN - * mapping, for dedupe or compression. This callback is registered in - * incrementForDedupe(). - * - * @param completion The completion of the write in progress - **/ -static void readOldBlockMappingForDedupe(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInLogicalZone(dataVIO); - if (abortOnError(completion->result, dataVIO, READ_ONLY)) { - return; - } - - dataVIO->lastAsyncOperation = GET_MAPPED_BLOCK_FOR_DEDUPE; - setJournalCallback(dataVIO, journalUnmappingForDedupe, - THIS_LOCATION("$F;cb=journalUnmapDedupe")); - getMappedBlockAsync(dataVIO); -} - -/** - * Do the incref after compression. This is the callback registered by - * addRecoveryJournalEntryForCompression(). - * - * @param completion The completion of the write in progress - **/ -static void incrementForCompression(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInNewMappedZone(dataVIO); - if (abortOnError(completion->result, dataVIO, READ_ONLY)) { - return; - } - - ASSERT_LOG_ONLY(isCompressed(dataVIO->newMapped.state), - "Impossible attempt to update reference counts for a block " - "which was not compressed (logical block %llu)", - dataVIO->logical.lbn); - - /* - * If we are synchronous and allocated a block, we know the one we - * allocated is the block we need to decrement, so there is no need - * to look in the block map. 
- */ - if (isAsync(dataVIO) || !hasAllocation(dataVIO)) { - setLogicalCallback(dataVIO, readOldBlockMappingForDedupe, - THIS_LOCATION("$F;cb=readOldBlockMappingForDedupe")); - } else { - setJournalCallback(dataVIO, journalUnmappingForDedupe, - THIS_LOCATION("$F;cb=journalUnmappingForDedupe")); - } - dataVIO->lastAsyncOperation = JOURNAL_INCREMENT_FOR_COMPRESSION; - updateReferenceCount(dataVIO); -} - -/** - * Add a recovery journal entry for the increment resulting from compression. - * - * @param completion The DataVIO which has been compressed - **/ -static void addRecoveryJournalEntryForCompression(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInJournalZone(dataVIO); - if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) { - return; - } - - if (!isCompressed(dataVIO->newMapped.state)) { - abortDeduplication(dataVIO); - return; - } - - setNewMappedZoneCallback(dataVIO, incrementForCompression, - THIS_LOCATION("$F($dup);js=map/$dup;" - "cb=incCompress($dup)")); - dataVIO->lastAsyncOperation = JOURNAL_MAPPING_FOR_COMPRESSION; - journalIncrement(dataVIO, getDuplicateLock(dataVIO)); -} - -/** - * Attempt to pack the compressed DataVIO into a block. This is the callback - * registered in compressData(). - * - * @param completion The completion of a compressed DataVIO - **/ -static void packCompressedData(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInPackerZone(dataVIO); - - // XXX this is a callback, so there should probably be an error check here - // even if we think compression can't currently return one. - - if (!mayPackDataVIO(dataVIO)) { - abortDeduplication(dataVIO); - return; - } - - setJournalCallback(dataVIO, addRecoveryJournalEntryForCompression, - THIS_LOCATION("$F;cb=update(compress)")); - dataVIO->lastAsyncOperation = PACK_COMPRESSED_BLOCK; - attemptPacking(dataVIO); -} - -/**********************************************************************/ -void compressData(DataVIO *dataVIO) -{ - ASSERT_LOG_ONLY(!dataVIO->isDuplicate, - "compressing a non-duplicate block"); - if (!mayCompressDataVIO(dataVIO)) { - abortDeduplication(dataVIO); - return; - } - - dataVIO->lastAsyncOperation = COMPRESS_DATA; - setPackerCallback(dataVIO, packCompressedData, THIS_LOCATION("$F;cb=pack")); - dataVIOAsCompletion(dataVIO)->layer->compressDataVIO(dataVIO); -} - -/** - * Do the incref after deduplication. This is the callback registered by - * addRecoveryJournalEntryForDedupe(). - * - * @param completion The completion of the write in progress - **/ -static void incrementForDedupe(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInNewMappedZone(dataVIO); - if (abortOnError(completion->result, dataVIO, READ_ONLY)) { - return; - } - - ASSERT_LOG_ONLY(dataVIO->isDuplicate, - "Impossible attempt to update reference counts for a block " - "which was not a duplicate (logical block %llu)", - dataVIO->logical.lbn); - - /* - * If we are synchronous and allocated a block, we know the one we - * allocated is the block we need to decrement, so there is no need - * to look in the block map. 
- */ - if (isAsync(dataVIO) || !hasAllocation(dataVIO)) { - setLogicalCallback(dataVIO, readOldBlockMappingForDedupe, - THIS_LOCATION("$F;cb=readOldBlockMappingForDedupe")); - } else { - setJournalCallback(dataVIO, journalUnmappingForDedupe, - THIS_LOCATION("$F;cb=journalUnmappingForDedupe")); - } - dataVIO->lastAsyncOperation = JOURNAL_INCREMENT_FOR_DEDUPE; - updateReferenceCount(dataVIO); -} - -/** - * Add a recovery journal entry for the increment resulting from deduplication. - * This callback is registered in shareBlock(). - * - * @param completion The DataVIO which has been deduplicated - **/ -static void addRecoveryJournalEntryForDedupe(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInJournalZone(dataVIO); - if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) { - return; - } - - setNewMappedZoneCallback(dataVIO, incrementForDedupe, - THIS_LOCATION("$F($dup);js=map/$dup;" - "cb=incDedupe($dup)")); - dataVIO->lastAsyncOperation = JOURNAL_MAPPING_FOR_DEDUPE; - journalIncrement(dataVIO, getDuplicateLock(dataVIO)); -} - -/** - * Share a block in the block map if it is a duplicate. This is the lock - * callback registered in acquirePBNReadLock(). This is only public so - * test code can compare the function to the current callback in a completion. - * - * @param completion The completion of the write in progress - **/ -void shareBlock(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInDuplicateZone(dataVIO); - if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) { - return; - } - - if (!dataVIO->isDuplicate) { - compressData(dataVIO); - return; - } - - dataVIO->newMapped = dataVIO->duplicate; - launchJournalCallback(dataVIO, addRecoveryJournalEntryForDedupe, - THIS_LOCATION("$F;cb=addJournalEntryDup")); -} - -/** - * Route the DataVIO to the HashZone responsible for the chunk name to acquire - * a hash lock on that name, or join with a existing hash lock managing - * concurrent dedupe for that name. This is the callback registered in - * resolveHashZone(). - * - * @param completion The DataVIO to lock - **/ -static void lockHashInZone(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInHashZone(dataVIO); - // Shouldn't have had any errors since all we did was switch threads. - if (abortOnError(completion->result, dataVIO, READ_ONLY)) { - return; - } - - int result = acquireHashLock(dataVIO); - if (abortOnError(result, dataVIO, READ_ONLY)) { - return; - } - - if (dataVIO->hashLock == NULL) { - // It's extremely unlikely, but in the case of a hash collision, the - // DataVIO will not obtain a reference to the lock and cannot deduplicate. - compressData(dataVIO); - return; - } - - enterHashLock(dataVIO); -} - -/** - * Set the hash zone (and flag the chunk name as set) while still on the - * thread that just hashed the data to set the chunk name. This is the - * callback registered by prepareForDedupe(). - * - * @param completion The DataVIO whose chunk name was just generated, as a - * completion - **/ -static void resolveHashZone(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - // We don't care what thread we are on. 
- if (abortOnError(completion->result, dataVIO, READ_ONLY)) { - return; - } - - ASSERT_LOG_ONLY(!dataVIO->isZeroBlock, "zero blocks should not be hashed"); - - dataVIO->hashZone - = selectHashZone(getVDOFromDataVIO(dataVIO), &dataVIO->chunkName); - dataVIO->lastAsyncOperation = ACQUIRE_HASH_LOCK; - launchHashZoneCallback(dataVIO, lockHashInZone, THIS_LOCATION(NULL)); -} - -/** - * Prepare for the dedupe path after a synchronous write or an asynchronous - * allocation. This callback is registered in updateBlockMapForWrite() for - * sync, and continueWriteAfterAllocation() (via acknowledgeWrite()) for - * async. It is also called directly from the latter when allocation fails. - * - * @param completion The completion of the write in progress - **/ -static void prepareForDedupe(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - // We don't care what thread we are on - dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); - if (abortOnError(completion->result, dataVIO, READ_ONLY)) { - return; - } - - if (!isAsync(dataVIO)) { - // Remember which block we wrote so we will decrement the reference to it - // if we deduplicate. This avoids having to look it up in the block map. - dataVIO->mapped = dataVIO->newMapped; - } - - ASSERT_LOG_ONLY(!dataVIO->isZeroBlock, - "must not prepare to dedupe zero blocks"); - - // Before we can dedupe, we need to know the chunk name, so the first step - // is to hash the block data. - dataVIO->lastAsyncOperation = HASH_DATA; - // XXX this is the wrong thread to run this callback, but we don't yet have - // a mechanism for running it on the CPU thread immediately after hashing. - setAllocatedZoneCallback(dataVIO, resolveHashZone, THIS_LOCATION(NULL)); - completion->layer->hashData(dataVIO); -} - -/** - * Update the block map after a data write (or directly for a ZERO_BLOCK write - * or trim). This callback is registered in decrementForWrite() and - * journalUnmappingForWrite(). - * - * @param completion The completion of the write in progress - **/ -static void updateBlockMapForWrite(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInLogicalZone(dataVIO); - dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); - if (abortOnError(completion->result, dataVIO, READ_ONLY)) { - return; - } - - if (dataVIO->isZeroBlock || isTrimDataVIO(dataVIO)) { - completion->callback = completeDataVIO; - } else if (!isAsync(dataVIO)) { - // Synchronous DataVIOs branch off to the hash/dedupe path after finishing - // the uncompressed write of their data. - completion->callback = prepareForDedupe; - } else if (dataVIO->hashLock != NULL) { - // Async writes will be finished, but must return to the hash lock to - // allow other DataVIOs with the same data to dedupe against the write. - setHashZoneCallback(dataVIO, finishWriteDataVIO, THIS_LOCATION(NULL)); - } else { - // Async writes without a hash lock (hash collisions) will be finished. - completion->callback = completeDataVIO; - } - - dataVIO->lastAsyncOperation = PUT_MAPPED_BLOCK; - putMappedBlockAsync(dataVIO); -} - -/** - * Do the decref after a successful block write. This is the callback - * by journalUnmappingForWrite() if the old mapping was not the zero block. 
- * - * @param completion The completion of the write in progress - **/ -static void decrementForWrite(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInMappedZone(dataVIO); - if (abortOnError(completion->result, dataVIO, READ_ONLY)) { - return; - } - - dataVIO->lastAsyncOperation = JOURNAL_DECREMENT_FOR_WRITE; - setLogicalCallback(dataVIO, updateBlockMapForWrite, THIS_LOCATION(NULL)); - updateReferenceCount(dataVIO); -} - -/** - * Write the appropriate journal entry for unmapping logical to mapped for a - * write. This is the callback registered in readOldBlockMappingForWrite(). - * - * @param completion The completion of the write in progress - **/ -static void journalUnmappingForWrite(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInJournalZone(dataVIO); - if (abortOnError(completion->result, dataVIO, READ_ONLY)) { - return; - } - - if (dataVIO->mapped.pbn == ZERO_BLOCK) { - setLogicalCallback(dataVIO, updateBlockMapForWrite, - THIS_LOCATION("$F;js=unmap;cb=updateBMwrite")); - } else { - setMappedZoneCallback(dataVIO, decrementForWrite, - THIS_LOCATION("$F;js=unmap;cb=decWrite")); - } - dataVIO->lastAsyncOperation = JOURNAL_UNMAPPING_FOR_WRITE; - journalDecrement(dataVIO); -} - -/** - * Get the previous PBN mapped to this LBN from the block map for a write, so - * as to make an appropriate journal entry referencing the removal of this - * LBN->PBN mapping. This callback is registered in finishBlockWrite() in the - * async path, and is registered in acknowledgeWrite() in the sync path. - * - * @param completion The completion of the write in progress - **/ -static void readOldBlockMappingForWrite(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInLogicalZone(dataVIO); - if (abortOnError(completion->result, dataVIO, READ_ONLY)) { - return; - } - - setJournalCallback(dataVIO, journalUnmappingForWrite, - THIS_LOCATION("$F;cb=journalUnmapWrite")); - dataVIO->lastAsyncOperation = GET_MAPPED_BLOCK_FOR_WRITE; - getMappedBlockAsync(dataVIO); -} - -/** - * Acknowledge a write to the requestor. - * - * @param dataVIO The DataVIO being acknowledged - **/ -static void acknowledgeWrite(DataVIO *dataVIO) -{ - ASSERT_LOG_ONLY(dataVIO->hasFlushGenerationLock, - "write VIO to be acknowledged has a flush generation lock"); - dataVIO->lastAsyncOperation = ACKNOWLEDGE_WRITE; - dataVIOAsCompletion(dataVIO)->layer->acknowledgeDataVIO(dataVIO); -} - -/** - * Acknowledge a write now that we have made an entry in the recovery - * journal. This is the callback registered in finishBlockWrite() in - * synchronous mode. - * - * @param completion The completion of the write in progress - **/ -static void acknowledgeWriteCallback(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - if (abortOnError(completion->result, dataVIO, READ_ONLY)) { - return; - } - - setLogicalCallback(dataVIO, readOldBlockMappingForWrite, - THIS_LOCATION(NULL)); - acknowledgeWrite(dataVIO); -} - -/**********************************************************************/ -static VDOAction *getWriteIncrementCallback(DataVIO *dataVIO) -{ - return (isAsync(dataVIO) - ? readOldBlockMappingForWrite : acknowledgeWriteCallback); -} - -/** - * Do the incref after a successful block write. This is the callback - * registered by finishBlockWrite(). 
- * - * @param completion The completion of the write in progress - **/ -static void incrementForWrite(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInAllocatedZone(dataVIO); - if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) { - return; - } - - /* - * Now that the data has been written, it's safe to deduplicate against the - * block. Downgrade the allocation lock to a read lock so it can be used - * later by the hash lock (which we don't have yet in sync mode). - */ - downgradePBNWriteLock(dataVIOAsAllocatingVIO(dataVIO)->allocationLock); - - dataVIO->lastAsyncOperation = JOURNAL_INCREMENT_FOR_WRITE; - setLogicalCallback(dataVIO, getWriteIncrementCallback(dataVIO), - THIS_LOCATION(NULL)); - updateReferenceCount(dataVIO); -} - -/** - * Add an entry in the recovery journal after a successful block write. This is - * the callback registered by writeBlock(). It is also registered in - * allocateBlockForWrite(). - * - * @param completion The completion of the write in progress - **/ -static void finishBlockWrite(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - assertInJournalZone(dataVIO); - if (abortOnError(completion->result, dataVIO, READ_ONLY_IF_ASYNC)) { - return; - } - - if (dataVIO->newMapped.pbn == ZERO_BLOCK) { - setLogicalCallback(dataVIO, getWriteIncrementCallback(dataVIO), - THIS_LOCATION("$F;js=writeZero")); - } else { - setAllocatedZoneCallback(dataVIO, incrementForWrite, - THIS_LOCATION("$F;js=mapWrite")); - } - dataVIO->lastAsyncOperation = JOURNAL_MAPPING_FOR_WRITE; - journalIncrement(dataVIO, dataVIOAsAllocatingVIO(dataVIO)->allocationLock); -} - -/** - * Write data to the underlying storage. - * - * @param dataVIO The DataVIO to write - **/ -static void writeBlock(DataVIO *dataVIO) -{ - dataVIO->lastAsyncOperation = WRITE_DATA; - setJournalCallback(dataVIO, finishBlockWrite, - THIS_LOCATION("$F(data);cb=finishWrite")); - dataVIOAsCompletion(dataVIO)->layer->writeData(dataVIO); -} - -/** - * Continue the write path for a DataVIO now that block allocation is complete - * (the DataVIO may or may not have actually received an allocation). This - * callback is registered in continueWriteWithBlockMapSlot(). - * - * @param allocatingVIO The DataVIO which has finished the allocation process - * (as an AllocatingVIO) - **/ -static void continueWriteAfterAllocation(AllocatingVIO *allocatingVIO) -{ - DataVIO *dataVIO = allocatingVIOAsDataVIO(allocatingVIO); - if (abortOnError(dataVIOAsCompletion(dataVIO)->result, dataVIO, - NOT_READ_ONLY)) { - return; - } - - if (!hasAllocation(dataVIO)) { - prepareForDedupe(dataVIOAsCompletion(dataVIO)); - return; - } - - atomicStoreBool(&dataVIO->hasAllocation, true); - dataVIO->newMapped = (ZonedPBN) { - .zone = allocatingVIO->zone, - .pbn = allocatingVIO->allocation, - .state = MAPPING_STATE_UNCOMPRESSED, - }; - - if (!isAsync(dataVIO)) { - writeBlock(dataVIO); - return; - } - - // XXX prepareForDedupe can run from any thread, so this is a place where - // running the callback on the kernel thread would save a thread switch. - setAllocatedZoneCallback(dataVIO, prepareForDedupe, THIS_LOCATION(NULL)); - if (vioRequiresFlushAfter(allocatingVIOAsVIO(allocatingVIO))) { - invokeCallback(dataVIOAsCompletion(dataVIO)); - return; - } - - acknowledgeWrite(dataVIO); -} - -/** - * Continue the write path for a VIO now that block map slot resolution is - * complete. This callback is registered in launchWriteDataVIO(). 
- * - * @param completion The DataVIO to write - **/ -static void continueWriteWithBlockMapSlot(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - // We don't care what thread we're on. - if (abortOnError(completion->result, dataVIO, NOT_READ_ONLY)) { - return; - } - - if (dataVIO->treeLock.treeSlots[0].blockMapSlot.pbn == ZERO_BLOCK) { - int result = ASSERT(isTrimDataVIO(dataVIO), - "dataVIO with no block map page is a trim"); - if (abortOnError(result, dataVIO, READ_ONLY)) { - return; - } - - // This is a trim for a block on a block map page which has not been - // allocated, so there's nothing more we need to do. - finishDataVIO(dataVIO, VDO_SUCCESS); - return; - } - - if (dataVIO->isZeroBlock || isTrimDataVIO(dataVIO)) { - // We don't need to write any data, so skip allocation and just update - // the block map and reference counts (via the journal). - dataVIO->newMapped.pbn = ZERO_BLOCK; - launchJournalCallback(dataVIO, finishBlockWrite, - THIS_LOCATION("$F;cb=finishWrite")); - return; - } - - allocateDataBlock(dataVIOAsAllocatingVIO(dataVIO), - getAllocationSelector(dataVIO->logical.zone), - VIO_WRITE_LOCK, continueWriteAfterAllocation); -} - -/**********************************************************************/ -void launchWriteDataVIO(DataVIO *dataVIO) -{ - if (isReadOnly(dataVIOAsVIO(dataVIO)->vdo->readOnlyNotifier)) { - finishDataVIO(dataVIO, VDO_READ_ONLY); - return; - } - - // Write requests join the current flush generation. - int result = acquireFlushGenerationLock(dataVIO); - if (abortOnError(result, dataVIO, NOT_READ_ONLY)) { - return; - } - - // Go find the block map slot for the LBN mapping. - dataVIO->lastAsyncOperation = FIND_BLOCK_MAP_SLOT; - findBlockMapSlotAsync(dataVIO, continueWriteWithBlockMapSlot, - getLogicalZoneThreadID(dataVIO->logical.zone)); -} - -/**********************************************************************/ -void cleanupWriteDataVIO(DataVIO *dataVIO) -{ - performCleanupStage(dataVIO, VIO_CLEANUP_START); -} diff --git a/vdo/base/vioWrite.h b/vdo/base/vioWrite.h deleted file mode 100644 index 6effc91..0000000 --- a/vdo/base/vioWrite.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/vioWrite.h#1 $ - */ - -#ifndef VIO_WRITE_H -#define VIO_WRITE_H - -#include "types.h" - -/** - * Release the PBN read lock if it is held. - * - * @param dataVIO The possible lock holder - **/ -void releasePBNReadLock(DataVIO *dataVIO); - -/** - * Start the asynchronous processing of a DataVIO for a write request which has - * acquired a lock on its logical block by joining the current flush generation - * and then attempting to allocate a physical block. 
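
/*
 * A minimal sketch of the continuation style used throughout the write path
 * above: each stage stores the next stage's callback in the completion and
 * then launches its work; when that work finishes, the stored callback runs
 * and registers the stage after it.  In the real code the callback fires
 * later, on the appropriate zone thread, once the asynchronous work is done;
 * here each stage simply calls the next directly to show the hand-off.  The
 * stage names are invented placeholders, not actual VDO functions.
 */
#include <stdio.h>

typedef struct sketchCompletion SketchCompletion;
typedef void SketchAction(SketchCompletion *completion);

struct sketchCompletion {
  SketchAction *callback;   /* the next stage to run */
  int           result;     /* success or an error code */
};

/* Run whatever callback the previous stage registered. */
static void sketchInvoke(SketchCompletion *completion)
{
  completion->callback(completion);
}

static void sketchFinish(SketchCompletion *completion)
{
  printf("finished with result %d\n", completion->result);
}

static void sketchUpdateBlockMap(SketchCompletion *completion)
{
  printf("stage: update the block map\n");
  completion->callback = sketchFinish;          /* register the next stage */
  sketchInvoke(completion);
}

static void sketchJournalEntry(SketchCompletion *completion)
{
  printf("stage: make a journal entry\n");
  completion->callback = sketchUpdateBlockMap;  /* register the next stage */
  sketchInvoke(completion);
}

int main(void)
{
  SketchCompletion completion = { .callback = sketchJournalEntry, .result = 0 };
  sketchInvoke(&completion);   /* journal entry -> block map update -> finish */
  return 0;
}
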
- * - * @param dataVIO The DataVIO doing the write - **/ -void launchWriteDataVIO(DataVIO *dataVIO); - -/** - * Clean up a DataVIO which has finished processing a write. - * - * @param dataVIO The DataVIO to clean up - **/ -void cleanupWriteDataVIO(DataVIO *dataVIO); - -/** - * Continue a write by attempting to compress the data. This is a re-entry - * point to vioWrite used by hash locks. - * - * @param dataVIO The DataVIO to be compressed - **/ -void compressData(DataVIO *dataVIO); - -#endif /* VIO_WRITE_H */ diff --git a/vdo/base/volumeGeometry.c b/vdo/base/volumeGeometry.c deleted file mode 100644 index 32b2e5f..0000000 --- a/vdo/base/volumeGeometry.c +++ /dev/null @@ -1,564 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/volumeGeometry.c#10 $ - */ - -#include "volumeGeometry.h" - -#include "buffer.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "numeric.h" -#include "permassert.h" - -#include "constants.h" -#include "header.h" -#include "physicalLayer.h" -#include "releaseVersions.h" -#include "statusCodes.h" -#include "types.h" - -enum { - GEOMETRY_BLOCK_LOCATION = 0, - MAGIC_NUMBER_SIZE = 8, -}; - -typedef struct { - char magicNumber[MAGIC_NUMBER_SIZE]; - Header header; - VolumeGeometry geometry; - CRC32Checksum checksum; -} __attribute__((packed)) GeometryBlock; - -static const Header GEOMETRY_BLOCK_HEADER_4_0 = { - .id = GEOMETRY_BLOCK, - .version = { - .majorVersion = 4, - .minorVersion = 0, - }, - // Note: this size isn't just the payload size following the header, like it - // is everywhere else in VDO. - .size = sizeof(GeometryBlock), -}; - -static const byte MAGIC_NUMBER[MAGIC_NUMBER_SIZE + 1] = "dmvdo001"; - -static const ReleaseVersionNumber COMPATIBLE_RELEASE_VERSIONS[] = { - MAGNESIUM_RELEASE_VERSION_NUMBER, -}; - -/** - * Determine whether the supplied release version can be understood by - * the VDO code. - * - * @param version The release version number to check - * - * @return True if the given version can be loaded. - **/ -static inline bool isLoadableReleaseVersion(ReleaseVersionNumber version) -{ - if (version == CURRENT_RELEASE_VERSION_NUMBER) { - return true; - } - - for (unsigned int i = 0; i < COUNT_OF(COMPATIBLE_RELEASE_VERSIONS); i++) { - if (version == COMPATIBLE_RELEASE_VERSIONS[i]) { - return true; - } - } - - return false; -} - -/** - * Decode the on-disk representation of an index configuration from a buffer. 
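
/*
 * A self-contained sketch of the decoding style used by decodeIndexConfig()
 * and the other decode* helpers that follow: walk a byte buffer with a
 * cursor, read each fixed-width field as little-endian regardless of host
 * byte order, and fail as soon as a read would run past the end.  The
 * SketchBuffer type and the two-field payload are invented for illustration;
 * they are not the real Buffer API or index-config encoding.
 */
#include <stdint.h>
#include <stdio.h>

typedef struct {
  const uint8_t *data;
  size_t         length;
  size_t         offset;   /* decode cursor */
} SketchBuffer;

/* Read a 32-bit little-endian field; return 0 on success, -1 on underrun. */
static int sketchGetUInt32LE(SketchBuffer *buffer, uint32_t *value)
{
  if (buffer->length - buffer->offset < sizeof(uint32_t)) {
    return -1;
  }
  const uint8_t *p = buffer->data + buffer->offset;
  *value = ((uint32_t) p[0]
            | ((uint32_t) p[1] << 8)
            | ((uint32_t) p[2] << 16)
            | ((uint32_t) p[3] << 24));
  buffer->offset += sizeof(uint32_t);
  return 0;
}

int main(void)
{
  /* Two little-endian uint32_t fields: 1, then 250. */
  const uint8_t encoded[] = { 0x01, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x00, 0x00 };
  SketchBuffer buffer = { .data = encoded, .length = sizeof(encoded) };

  uint32_t first, second;
  if ((sketchGetUInt32LE(&buffer, &first) != 0)
      || (sketchGetUInt32LE(&buffer, &second) != 0)) {
    fprintf(stderr, "buffer too short\n");
    return 1;
  }
  printf("first=%u second=%u\n", first, second);
  return 0;
}
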
- * - * @param buffer A buffer positioned at the start of the encoding - * @param config The structure to receive the decoded fields - * - * @return UDS_SUCCESS or an error - **/ -static int decodeIndexConfig(Buffer *buffer, IndexConfig *config) -{ - uint32_t mem; - int result = getUInt32LEFromBuffer(buffer, &mem); - if (result != VDO_SUCCESS) { - return result; - } - - uint32_t checkpointFrequency; - result = getUInt32LEFromBuffer(buffer, &checkpointFrequency); - if (result != VDO_SUCCESS) { - return result; - } - - bool sparse; - result = getBoolean(buffer, &sparse); - if (result != VDO_SUCCESS) { - return result; - } - - *config = (IndexConfig) { - .mem = mem, - .checkpointFrequency = checkpointFrequency, - .sparse = sparse, - }; - return VDO_SUCCESS; -} - -/** - * Encode the on-disk representation of an index configuration into a buffer. - * - * @param config The index configuration to encode - * @param buffer A buffer positioned at the start of the encoding - * - * @return UDS_SUCCESS or an error - **/ -static int encodeIndexConfig(const IndexConfig *config, Buffer *buffer) -{ - int result = putUInt32LEIntoBuffer(buffer, config->mem); - if (result != VDO_SUCCESS) { - return result; - } - - result = putUInt32LEIntoBuffer(buffer, config->checkpointFrequency); - if (result != VDO_SUCCESS) { - return result; - } - - return putBoolean(buffer, config->sparse); -} - -/** - * Decode the on-disk representation of a volume region from a buffer. - * - * @param buffer A buffer positioned at the start of the encoding - * @param region The structure to receive the decoded fields - * - * @return UDS_SUCCESS or an error - **/ -static int decodeVolumeRegion(Buffer *buffer, VolumeRegion *region) -{ - VolumeRegionID id; - int result = getUInt32LEFromBuffer(buffer, &id); - if (result != VDO_SUCCESS) { - return result; - } - - PhysicalBlockNumber startBlock; - result = getUInt64LEFromBuffer(buffer, &startBlock); - if (result != VDO_SUCCESS) { - return result; - } - - *region = (VolumeRegion) { - .id = id, - .startBlock = startBlock, - }; - return VDO_SUCCESS; -} - -/** - * Encode the on-disk representation of a volume region into a buffer. - * - * @param region The region to encode - * @param buffer A buffer positioned at the start of the encoding - * - * @return UDS_SUCCESS or an error - **/ -static int encodeVolumeRegion(const VolumeRegion *region, Buffer *buffer) -{ - int result = putUInt32LEIntoBuffer(buffer, region->id); - if (result != VDO_SUCCESS) { - return result; - } - - return putUInt64LEIntoBuffer(buffer, region->startBlock); -} - -/** - * Decode the on-disk representation of a volume geometry from a buffer. 
- * - * @param buffer A buffer positioned at the start of the encoding - * @param geometry The structure to receive the decoded fields - * - * @return UDS_SUCCESS or an error - **/ -static int decodeVolumeGeometry(Buffer *buffer, VolumeGeometry *geometry) -{ - ReleaseVersionNumber releaseVersion; - int result = getUInt32LEFromBuffer(buffer, &releaseVersion); - if (result != VDO_SUCCESS) { - return result; - } - - Nonce nonce; - result = getUInt64LEFromBuffer(buffer, &nonce); - if (result != VDO_SUCCESS) { - return result; - } - - geometry->releaseVersion = releaseVersion; - geometry->nonce = nonce; - - result = getBytesFromBuffer(buffer, sizeof(UUID), geometry->uuid); - if (result != VDO_SUCCESS) { - return result; - } - - for (VolumeRegionID id = 0; id < VOLUME_REGION_COUNT; id++) { - result = decodeVolumeRegion(buffer, &geometry->regions[id]); - if (result != VDO_SUCCESS) { - return result; - } - } - - return decodeIndexConfig(buffer, &geometry->indexConfig); -} - -/** - * Encode the on-disk representation of a volume geometry into a buffer. - * - * @param geometry The geometry to encode - * @param buffer A buffer positioned at the start of the encoding - * - * @return UDS_SUCCESS or an error - **/ -static int encodeVolumeGeometry(const VolumeGeometry *geometry, Buffer *buffer) -{ - int result = putUInt32LEIntoBuffer(buffer, geometry->releaseVersion); - if (result != VDO_SUCCESS) { - return result; - } - - result = putUInt64LEIntoBuffer(buffer, geometry->nonce); - if (result != VDO_SUCCESS) { - return result; - } - - result = putBytes(buffer, sizeof(UUID), geometry->uuid); - if (result != VDO_SUCCESS) { - return result; - } - - for (VolumeRegionID id = 0; id < VOLUME_REGION_COUNT; id++) { - result = encodeVolumeRegion(&geometry->regions[id], buffer); - if (result != VDO_SUCCESS) { - return result; - } - } - - return encodeIndexConfig(&geometry->indexConfig, buffer); -} - -/** - * Decode the on-disk representation of a geometry block, up to but not - * including the checksum, from a buffer. - * - * @param buffer A buffer positioned at the start of the block - * @param geometry The structure to receive the decoded volume geometry fields - * - * @return UDS_SUCCESS or an error - **/ -static int decodeGeometryBlock(Buffer *buffer, VolumeGeometry *geometry) -{ - if (!hasSameBytes(buffer, MAGIC_NUMBER, MAGIC_NUMBER_SIZE)) { - return VDO_BAD_MAGIC; - } - - int result = skipForward(buffer, MAGIC_NUMBER_SIZE); - if (result != VDO_SUCCESS) { - return result; - } - - Header header; - result = decodeHeader(buffer, &header); - if (result != VDO_SUCCESS) { - return result; - } - - result = validateHeader(&GEOMETRY_BLOCK_HEADER_4_0, &header, true, __func__); - if (result != VDO_SUCCESS) { - return result; - } - - result = decodeVolumeGeometry(buffer, geometry); - if (result != VDO_SUCCESS) { - return result; - } - - // Leave the CRC for the caller to decode and verify. - return ASSERT(header.size - == (uncompactedAmount(buffer) + sizeof(CRC32Checksum)), - "should have decoded up to the geometry checksum"); -} - -/** - * Encode the on-disk representation of a geometry block, up to but not - * including the checksum, into a buffer. 
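
/*
 * A minimal sketch of the validation pattern used around these geometry
 * block routines: decode everything up to (but not including) the trailing
 * checksum, checksum the same span of raw bytes, and compare the result with
 * the stored value, rejecting the block on a mismatch.  The bitwise CRC-32
 * (IEEE polynomial) and the 12-byte payload below are illustrative
 * assumptions, not the actual geometry block layout or the layer's
 * updateCRC32 implementation.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t sketchCRC32(const uint8_t *data, size_t length)
{
  uint32_t crc = 0xFFFFFFFFu;
  for (size_t i = 0; i < length; i++) {
    crc ^= data[i];
    for (int bit = 0; bit < 8; bit++) {
      crc = (crc >> 1) ^ ((crc & 1u) ? 0xEDB88320u : 0u);
    }
  }
  return crc ^ 0xFFFFFFFFu;
}

int main(void)
{
  /* 12 bytes of "encoded" content followed by its little-endian checksum. */
  uint8_t block[16] = "dmvdo001....";
  uint32_t checksum = sketchCRC32(block, 12);
  block[12] = (uint8_t) checksum;
  block[13] = (uint8_t) (checksum >> 8);
  block[14] = (uint8_t) (checksum >> 16);
  block[15] = (uint8_t) (checksum >> 24);

  /* A reader recomputes the checksum over the same span and compares. */
  uint32_t saved = ((uint32_t) block[12]
                    | ((uint32_t) block[13] << 8)
                    | ((uint32_t) block[14] << 16)
                    | ((uint32_t) block[15] << 24));
  puts((sketchCRC32(block, 12) == saved) ? "checksum ok" : "checksum mismatch");
  return 0;
}
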
- * - * @param geometry The volume geometry to encode into the block - * @param buffer A buffer positioned at the start of the block - * - * @return UDS_SUCCESS or an error - **/ -static int encodeGeometryBlock(const VolumeGeometry *geometry, Buffer *buffer) -{ - int result = putBytes(buffer, MAGIC_NUMBER_SIZE, MAGIC_NUMBER); - if (result != VDO_SUCCESS) { - return result; - } - - result = encodeHeader(&GEOMETRY_BLOCK_HEADER_4_0, buffer); - if (result != VDO_SUCCESS) { - return result; - } - - result = encodeVolumeGeometry(geometry, buffer); - if (result != VDO_SUCCESS) { - return result; - } - - // Leave the CRC for the caller to compute and encode. - return ASSERT(GEOMETRY_BLOCK_HEADER_4_0.size - == (contentLength(buffer) + sizeof(CRC32Checksum)), - "should have decoded up to the geometry checksum"); -} - -/** - * Allocate a block-size buffer to read the geometry from the physical layer, - * read the block, and return the buffer. - * - * @param [in] layer The physical layer containing the block to read - * @param [out] blockPtr A pointer to receive the allocated buffer - * - * @return VDO_SUCCESS or an error code - **/ -static int readGeometryBlock(PhysicalLayer *layer, byte **blockPtr) -{ - int result = ASSERT(layer->reader != NULL, "Layer must have a sync reader"); - if (result != VDO_SUCCESS) { - return result; - } - - char *block; - result = layer->allocateIOBuffer(layer, VDO_BLOCK_SIZE, "geometry block", - &block); - if (result != VDO_SUCCESS) { - return result; - } - - result = layer->reader(layer, GEOMETRY_BLOCK_LOCATION, 1, block, NULL); - if (result != VDO_SUCCESS) { - FREE(block); - return result; - } - - *blockPtr = (byte *) block; - return VDO_SUCCESS; -} - -/**********************************************************************/ -int loadVolumeGeometry(PhysicalLayer *layer, VolumeGeometry *geometry) -{ - byte *block; - int result = readGeometryBlock(layer, &block); - if (result != VDO_SUCCESS) { - return result; - } - - Buffer *buffer; - result = wrapBuffer(block, VDO_BLOCK_SIZE, VDO_BLOCK_SIZE, &buffer); - if (result != VDO_SUCCESS) { - FREE(block); - return result; - } - - result = decodeGeometryBlock(buffer, geometry); - if (result != VDO_SUCCESS) { - freeBuffer(&buffer); - FREE(block); - return result; - } - - // Checksum everything decoded so far. - CRC32Checksum checksum = layer->updateCRC32(INITIAL_CHECKSUM, block, - uncompactedAmount(buffer)); - CRC32Checksum savedChecksum; - result = getUInt32LEFromBuffer(buffer, &savedChecksum); - if (result != VDO_SUCCESS) { - freeBuffer(&buffer); - FREE(block); - return result; - } - - // Finished all decoding. Everything that follows is validation code. - freeBuffer(&buffer); - FREE(block); - - if (!isLoadableReleaseVersion(geometry->releaseVersion)) { - return logErrorWithStringError(VDO_UNSUPPORTED_VERSION, - "release version %d cannot be loaded", - geometry->releaseVersion); - } - - return ((checksum == savedChecksum) ? 
VDO_SUCCESS : VDO_CHECKSUM_MISMATCH); -} - -/************************************************************************/ -int computeIndexBlocks(IndexConfig *indexConfig, BlockCount *indexBlocksPtr) -{ - UdsConfiguration udsConfiguration = NULL; - int result = indexConfigToUdsConfiguration(indexConfig, &udsConfiguration); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, "error creating index config"); - } - - uint64_t indexBytes; - result = udsComputeIndexSize(udsConfiguration, 0, &indexBytes); - udsFreeConfiguration(udsConfiguration); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, "error computing index size"); - } - - BlockCount indexBlocks = indexBytes / VDO_BLOCK_SIZE; - if ((((uint64_t) indexBlocks) * VDO_BLOCK_SIZE) != indexBytes) { - return logErrorWithStringError(VDO_PARAMETER_MISMATCH, "index size must be" - " a multiple of block size %d", - VDO_BLOCK_SIZE); - } - - *indexBlocksPtr = indexBlocks; - return VDO_SUCCESS; -} - -/**********************************************************************/ -int initializeVolumeGeometry(Nonce nonce, - UUID uuid, - IndexConfig *indexConfig, - VolumeGeometry *geometry) -{ - BlockCount indexSize = 0; - if (indexConfig != NULL) { - int result = computeIndexBlocks(indexConfig, &indexSize); - if (result != VDO_SUCCESS) { - return result; - } - } - - *geometry = (VolumeGeometry) { - .releaseVersion = CURRENT_RELEASE_VERSION_NUMBER, - .nonce = nonce, - .regions = { - [INDEX_REGION] = { - .id = INDEX_REGION, - .startBlock = 1, - }, - [DATA_REGION] = { - .id = DATA_REGION, - .startBlock = 1 + indexSize, - } - } - }; - memcpy(geometry->uuid, uuid, sizeof(UUID)); - if (indexSize > 0) { - memcpy(&geometry->indexConfig, indexConfig, sizeof(IndexConfig)); - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -int clearVolumeGeometry(PhysicalLayer *layer) -{ - char *block; - int result = layer->allocateIOBuffer(layer, VDO_BLOCK_SIZE, "geometry block", - &block); - if (result != VDO_SUCCESS) { - return result; - } - - result = layer->writer(layer, GEOMETRY_BLOCK_LOCATION, 1, block, NULL); - FREE(block); - return result; -} - -/**********************************************************************/ -int writeVolumeGeometry(PhysicalLayer *layer, VolumeGeometry *geometry) -{ - char *block; - int result = layer->allocateIOBuffer(layer, VDO_BLOCK_SIZE, "geometry block", - &block); - if (result != VDO_SUCCESS) { - return result; - } - - Buffer *buffer; - result = wrapBuffer((byte *) block, VDO_BLOCK_SIZE, 0, &buffer); - if (result != VDO_SUCCESS) { - FREE(block); - return result; - } - - result = encodeGeometryBlock(geometry, buffer); - if (result != VDO_SUCCESS) { - freeBuffer(&buffer); - FREE(block); - return result; - } - - // Checksum everything encoded so far and then encode the checksum. - CRC32Checksum checksum = layer->updateCRC32(INITIAL_CHECKSUM, (byte *) block, - contentLength(buffer)); - result = putUInt32LEIntoBuffer(buffer, checksum); - if (result != VDO_SUCCESS) { - freeBuffer(&buffer); - FREE(block); - return result; - } - - // Write it. 
- result = layer->writer(layer, GEOMETRY_BLOCK_LOCATION, 1, block, NULL); - freeBuffer(&buffer); - FREE(block); - return result; -} - -/************************************************************************/ -int indexConfigToUdsConfiguration(IndexConfig *indexConfig, - UdsConfiguration *udsConfigPtr) -{ - UdsConfiguration udsConfiguration; - int result = udsInitializeConfiguration(&udsConfiguration, indexConfig->mem); - if (result != UDS_SUCCESS) { - return logErrorWithStringError(result, "error initializing configuration"); - } - - udsConfigurationSetSparse(udsConfiguration, indexConfig->sparse); - - *udsConfigPtr = udsConfiguration; - return VDO_SUCCESS; -} - -/************************************************************************/ -void indexConfigToUdsParameters(IndexConfig *indexConfig, - struct uds_parameters *userParams) -{ - userParams->checkpoint_frequency = indexConfig->checkpointFrequency; -} diff --git a/vdo/base/volumeGeometry.h b/vdo/base/volumeGeometry.h deleted file mode 100644 index c06cdde..0000000 --- a/vdo/base/volumeGeometry.h +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/volumeGeometry.h#5 $ - */ - -#ifndef VOLUME_GEOMETRY_H -#define VOLUME_GEOMETRY_H - -#include "uds.h" - -#include "types.h" - -struct indexConfig { - uint32_t mem; - uint32_t checkpointFrequency; - bool sparse; -} __attribute__((packed)); - -typedef enum { - INDEX_REGION = 0, - DATA_REGION = 1, - VOLUME_REGION_COUNT, -} VolumeRegionID; - -typedef struct { - /** The ID of the region */ - VolumeRegionID id; - /** - * The absolute starting offset on the device. The region continues until - * the next region begins. - */ - PhysicalBlockNumber startBlock; -} __attribute__((packed)) VolumeRegion; - -/** A binary UUID is 16 bytes. */ -typedef unsigned char UUID[16]; - -typedef struct { - /** The release version number of this volume */ - ReleaseVersionNumber releaseVersion; - /** The nonce of this volume */ - Nonce nonce; - /** The UUID of this volume */ - UUID uuid; - /** The regions in ID order */ - VolumeRegion regions[VOLUME_REGION_COUNT]; - /** The index config */ - IndexConfig indexConfig; -} __attribute__((packed)) VolumeGeometry; - -/** - * Get the start of the index region from a geometry. - * - * @param geometry The geometry - * - * @return The start of the index region - **/ -__attribute__((warn_unused_result)) -static inline PhysicalBlockNumber getIndexRegionOffset(VolumeGeometry geometry) -{ - return geometry.regions[INDEX_REGION].startBlock; -} - -/** - * Get the start of the data region from a geometry. 
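
/*
 * A small worked example of the region layout that initializeVolumeGeometry()
 * sets up and these accessors expose: the index region starts at block 1
 * (block 0 holds the geometry block), the data region starts immediately
 * after the index, and the index region size falls out as the difference of
 * the two offsets, as in getIndexRegionSize().  The 2576-block index size is
 * an arbitrary illustrative value.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  uint64_t indexBlocks      = 2576;            /* from computeIndexBlocks() */
  uint64_t indexRegionStart = 1;               /* block 0 is the geometry   */
  uint64_t dataRegionStart  = indexRegionStart + indexBlocks;

  printf("index region: start %llu, size %llu blocks\n",
         (unsigned long long) indexRegionStart,
         (unsigned long long) (dataRegionStart - indexRegionStart));
  printf("data region:  start %llu\n", (unsigned long long) dataRegionStart);
  return 0;
}
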
- * - * @param geometry The geometry - * - * @return The start of the data region - **/ -__attribute__((warn_unused_result)) -static inline PhysicalBlockNumber getDataRegionOffset(VolumeGeometry geometry) -{ - return geometry.regions[DATA_REGION].startBlock; -} - -/** - * Get the size of the index region from a geometry. - * - * @param geometry The geometry - * - * @return the size of the index region - **/ -__attribute__((warn_unused_result)) -static inline PhysicalBlockNumber getIndexRegionSize(VolumeGeometry geometry) -{ - return getDataRegionOffset(geometry) - getIndexRegionOffset(geometry); -} - -/** - * Read the volume geometry from a layer. - * - * @param layer The layer to read and parse the geometry from - * @param geometry The geometry to be loaded - **/ -int loadVolumeGeometry(PhysicalLayer *layer, VolumeGeometry *geometry) -__attribute__((warn_unused_result)); - -/** - * Initialize a VolumeGeometry for a VDO. - * - * @param nonce The nonce for the VDO - * @param uuid The uuid for the VDO - * @param indexConfig The index config of the VDO. - * @param geometry The geometry being initialized - * - * @return VDO_SUCCESS or an error - **/ -int initializeVolumeGeometry(Nonce nonce, - UUID uuid, - IndexConfig *indexConfig, - VolumeGeometry *geometry) - __attribute__((warn_unused_result)); - -/** - * Zero out the geometry on a layer. - * - * @param layer The layer to clear - * - * @return VDO_SUCCESS or an error - **/ -int clearVolumeGeometry(PhysicalLayer *layer) - __attribute__((warn_unused_result)); - -/** - * Write a geometry block for a VDO. - * - * @param layer The layer on which to write. - * @param geometry The VolumeGeometry to be written - * - * @return VDO_SUCCESS or an error - **/ -int writeVolumeGeometry(PhysicalLayer *layer, VolumeGeometry *geometry) -__attribute__((warn_unused_result)); - -/** - * Convert a index config to a UDS configuration, which can be used by UDS. - * - * @param [in] indexConfig The index config to convert - * @param [out] udsConfigPtr A pointer to return the UDS configuration - * - * @return VDO_SUCCESS or an error - **/ -int indexConfigToUdsConfiguration(IndexConfig *indexConfig, - UdsConfiguration *udsConfigPtr) -__attribute__((warn_unused_result)); - -/** - * Modify the uds_parameters to match the requested index config. - * - * @param indexConfig The index config to convert - * @param userParams The uds_parameters to modify - **/ -void indexConfigToUdsParameters(IndexConfig *indexConfig, - struct uds_parameters *userParams); - -/** - * Compute the index size in blocks from the IndexConfig. - * - * @param [in] indexConfig The index config - * @param [out] indexBlocksPtr A pointer to return the index size in blocks - * - * @return VDO_SUCCESS or an error - **/ -int computeIndexBlocks(IndexConfig *indexConfig, BlockCount *indexBlocksPtr) -__attribute__((warn_unused_result)); - -/** - * Set load config fields from a volume geometry. - * - * @param [in] geometry The geometry to use - * @param [out] loadConfig The load config to set - **/ -static inline void setLoadConfigFromGeometry(VolumeGeometry *geometry, - VDOLoadConfig *loadConfig) -{ - loadConfig->firstBlockOffset = getDataRegionOffset(*geometry); - loadConfig->releaseVersion = geometry->releaseVersion; - loadConfig->nonce = geometry->nonce; -} - -#endif // VOLUME_GEOMETRY_H diff --git a/vdo/base/waitQueue.c b/vdo/base/waitQueue.c deleted file mode 100644 index 3d7f175..0000000 --- a/vdo/base/waitQueue.c +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/waitQueue.c#1 $ - */ - -#include "waitQueue.h" - -#include "permassert.h" - -#include "statusCodes.h" - -/**********************************************************************/ -int enqueueWaiter(WaitQueue *queue, Waiter *waiter) -{ - int result = ASSERT((waiter->nextWaiter == NULL), - "new waiter must not already be in a waiter queue"); - if (result != VDO_SUCCESS) { - return result; - } - - if (queue->lastWaiter == NULL) { - // The queue is empty, so form the initial circular list by self-linking - // the initial waiter. - waiter->nextWaiter = waiter; - } else { - // Splice the new waiter in at the end of the queue. - waiter->nextWaiter = queue->lastWaiter->nextWaiter; - queue->lastWaiter->nextWaiter = waiter; - } - // In both cases, the waiter we added to the ring becomes the last waiter. - queue->lastWaiter = waiter; - queue->queueLength += 1; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void transferAllWaiters(WaitQueue *fromQueue, WaitQueue *toQueue) -{ - // If the source queue is empty, there's nothing to do. - if (!hasWaiters(fromQueue)) { - return; - } - - if (hasWaiters(toQueue)) { - // Both queues are non-empty. Splice the two circular lists together by - // swapping the next (head) pointers in the list tails. - Waiter *fromHead = fromQueue->lastWaiter->nextWaiter; - Waiter *toHead = toQueue->lastWaiter->nextWaiter; - toQueue->lastWaiter->nextWaiter = fromHead; - fromQueue->lastWaiter->nextWaiter = toHead; - } - - toQueue->lastWaiter = fromQueue->lastWaiter; - toQueue->queueLength += fromQueue->queueLength; - initializeWaitQueue(fromQueue); -} - -/**********************************************************************/ -void notifyAllWaiters(WaitQueue *queue, - WaiterCallback *callback, - void *context) -{ - // Copy and empty the queue first, avoiding the possibility of an infinite - // loop if entries are returned to the queue by the callback function. - WaitQueue waiters; - initializeWaitQueue(&waiters); - transferAllWaiters(queue, &waiters); - - // Drain the copied queue, invoking the callback on every entry. - while (notifyNextWaiter(&waiters, callback, context)) { - // All the work is done by the loop condition. - } -} - -/**********************************************************************/ -Waiter *getFirstWaiter(const WaitQueue *queue) -{ - Waiter *lastWaiter = queue->lastWaiter; - if (lastWaiter == NULL) { - // There are no waiters, so we're done. - return NULL; - } - - // The queue is circular, so the last entry links to the head of the queue. 
- return lastWaiter->nextWaiter; -} - -/**********************************************************************/ -int dequeueMatchingWaiters(WaitQueue *queue, - WaiterMatch *matchMethod, - void *matchContext, - WaitQueue *matchedQueue) -{ - WaitQueue matchedWaiters; - initializeWaitQueue(&matchedWaiters); - - WaitQueue iterationQueue; - initializeWaitQueue(&iterationQueue); - transferAllWaiters(queue, &iterationQueue); - while (hasWaiters(&iterationQueue)) { - Waiter *waiter = dequeueNextWaiter(&iterationQueue); - int result = VDO_SUCCESS; - if (!matchMethod(waiter, matchContext)) { - result = enqueueWaiter(queue, waiter); - } else { - result = enqueueWaiter(&matchedWaiters, waiter); - } - if (result != VDO_SUCCESS) { - transferAllWaiters(&matchedWaiters, queue); - transferAllWaiters(&iterationQueue, queue); - return result; - } - } - - transferAllWaiters(&matchedWaiters, matchedQueue); - return VDO_SUCCESS; -} - -/**********************************************************************/ -Waiter *dequeueNextWaiter(WaitQueue *queue) -{ - Waiter *firstWaiter = getFirstWaiter(queue); - if (firstWaiter == NULL) { - return NULL; - } - - Waiter *lastWaiter = queue->lastWaiter; - if (firstWaiter == lastWaiter) { - // The queue has a single entry, so just empty it out by nulling the tail. - queue->lastWaiter = NULL; - } else { - // The queue has more than one entry, so splice the first waiter out of - // the circular queue. - lastWaiter->nextWaiter = firstWaiter->nextWaiter; - } - - // The waiter is no longer in a wait queue. - firstWaiter->nextWaiter = NULL; - queue->queueLength -= 1; - return firstWaiter; -} - -/**********************************************************************/ -bool notifyNextWaiter(WaitQueue *queue, - WaiterCallback *callback, - void *context) -{ - Waiter *waiter = dequeueNextWaiter(queue); - if (waiter == NULL) { - return false; - } - - if (callback == NULL) { - callback = waiter->callback; - } - (*callback)(waiter, context); - return true; -} - -/**********************************************************************/ -const Waiter *getNextWaiter(const WaitQueue *queue, const Waiter *waiter) -{ - Waiter *firstWaiter = getFirstWaiter(queue); - if (waiter == NULL) { - return firstWaiter; - } - return ((waiter->nextWaiter != firstWaiter) ? waiter->nextWaiter : NULL); -} diff --git a/vdo/base/waitQueue.h b/vdo/base/waitQueue.h deleted file mode 100644 index 5eb754e..0000000 --- a/vdo/base/waitQueue.h +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/base/waitQueue.h#1 $ - */ - -#ifndef WAIT_QUEUE_H -#define WAIT_QUEUE_H - -#include "common.h" - -/** - * A wait queue is a circular list of entries waiting to be notified of a - * change in a condition. 
Keeping a circular list allows the queue structure - * to simply be a pointer to the tail (newest) entry in the queue, supporting - * constant-time enqueue and dequeue operations. A null pointer is an empty - * queue. - * - * An empty queue: - * queue0.lastWaiter -> NULL - * - * A singleton queue: - * queue1.lastWaiter -> entry1 -> entry1 -> [...] - * - * A three-element queue: - * queue2.lastWaiter -> entry3 -> entry1 -> entry2 -> entry3 -> [...] - **/ - -typedef struct waiter Waiter; - -typedef struct { - /** The tail of the queue, the last (most recently added) entry */ - Waiter *lastWaiter; - /** The number of waiters currently in the queue */ - size_t queueLength; -} WaitQueue; - -/** - * Callback type for functions which will be called to resume processing of a - * waiter after it has been removed from its wait queue. - **/ -typedef void WaiterCallback(Waiter *waiter, void *context); - -/** - * Method type for Waiter matching methods. - * - * A WaiterMatch method returns false if the waiter does not match. - **/ -typedef bool WaiterMatch(Waiter *waiter, void *context); - -/** - * The queue entry structure for entries in a WaitQueue. - **/ -struct waiter { - /** - * The next waiter in the queue. If this entry is the last waiter, then this - * is actually a pointer back to the head of the queue. - **/ - struct waiter *nextWaiter; - - /** Optional waiter-specific callback to invoke when waking this waiter. */ - WaiterCallback *callback; -}; - -/** - * Check whether a Waiter is waiting. - * - * @param waiter The waiter to check - * - * @return true if the waiter is on some WaitQueue - **/ -static inline bool isWaiting(Waiter *waiter) -{ - return (waiter->nextWaiter != NULL); -} - -/** - * Initialize a wait queue. - * - * @param queue The queue to initialize - **/ -static inline void initializeWaitQueue(WaitQueue *queue) -{ - *queue = (WaitQueue) { - .lastWaiter = NULL, - .queueLength = 0, - }; -} - -/** - * Check whether a wait queue has any entries waiting in it. - * - * @param queue The queue to query - * - * @return true if there are any waiters in the queue - **/ -__attribute__((warn_unused_result)) -static inline bool hasWaiters(const WaitQueue *queue) -{ - return (queue->lastWaiter != NULL); -} - -/** - * Add a waiter to the tail end of a wait queue. The waiter must not already - * be waiting in a queue. - * - * @param queue The queue to which to add the waiter - * @param waiter The waiter to add to the queue - * - * @return VDO_SUCCESS or an error code - **/ -int enqueueWaiter(WaitQueue *queue, Waiter *waiter) - __attribute__((warn_unused_result)); - -/** - * Notify all the entries waiting in a queue to continue execution by invoking - * a callback function on each of them in turn. The queue is copied and - * emptied before invoking any callbacks, and only the waiters that were in - * the queue at the start of the call will be notified. - * - * @param queue The wait queue containing the waiters to notify - * @param callback The function to call to notify each waiter, or NULL - * to invoke the callback field registered in each waiter - * @param context The context to pass to the callback function - **/ -void notifyAllWaiters(WaitQueue *queue, - WaiterCallback *callback, - void *context); - -/** - * Notify the next entry waiting in a queue to continue execution by invoking - * a callback function on it after removing it from the queue. 
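
/*
 * A self-contained userspace sketch of the circular-list trick described at
 * the top of this header: the queue is just a pointer to the tail entry, the
 * tail's next pointer is the head, and both enqueue at the tail and dequeue
 * from the head are constant time.  The names here are invented for
 * illustration; the real implementation is enqueueWaiter() and
 * dequeueNextWaiter() in waitQueue.c above.
 */
#include <stddef.h>
#include <stdio.h>

typedef struct sketchEntry {
  struct sketchEntry *next;   /* next entry; the tail points back to the head */
  int                 value;
} SketchEntry;

typedef struct {
  SketchEntry *tail;          /* NULL means the queue is empty */
} SketchQueue;

static void sketchEnqueue(SketchQueue *queue, SketchEntry *entry)
{
  if (queue->tail == NULL) {
    entry->next = entry;                  /* single entry links to itself */
  } else {
    entry->next = queue->tail->next;      /* new tail points at the head */
    queue->tail->next = entry;
  }
  queue->tail = entry;
}

static SketchEntry *sketchDequeue(SketchQueue *queue)
{
  if (queue->tail == NULL) {
    return NULL;
  }
  SketchEntry *head = queue->tail->next;
  if (head == queue->tail) {
    queue->tail = NULL;                   /* last entry removed */
  } else {
    queue->tail->next = head->next;       /* splice the head out */
  }
  head->next = NULL;
  return head;
}

int main(void)
{
  SketchQueue queue = { .tail = NULL };
  SketchEntry entries[3] = { { .value = 1 }, { .value = 2 }, { .value = 3 } };
  for (int i = 0; i < 3; i++) {
    sketchEnqueue(&queue, &entries[i]);
  }
  for (SketchEntry *e; (e = sketchDequeue(&queue)) != NULL; ) {
    printf("%d\n", e->value);             /* prints 1, 2, 3: FIFO order */
  }
  return 0;
}
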
- * - * @param queue The wait queue containing the waiter to notify - * @param callback The function to call to notify the waiter, or NULL - * to invoke the callback field registered in the waiter - * @param context The context to pass to the callback function - * - * @return true if there was a waiter in the queue - **/ -bool notifyNextWaiter(WaitQueue *queue, - WaiterCallback *callback, - void *context); - -/** - * Transfer all waiters from one wait queue to a second queue, emptying the - * first queue. - * - * @param fromQueue The queue containing the waiters to move - * @param toQueue The queue that will receive the waiters from the - * the first queue - **/ -void transferAllWaiters(WaitQueue *fromQueue, WaitQueue *toQueue); - -/** - * Return the waiter that is at the head end of a wait queue. - * - * @param queue The queue from which to get the first waiter - * - * @return The first (oldest) waiter in the queue, or NULL if - * the queue is empty - **/ -Waiter *getFirstWaiter(const WaitQueue *queue); - -/** - * Remove all waiters that match based on the specified matching method and - * append them to a WaitQueue. - * - * @param queue The wait queue to process - * @param matchMethod The method to determine matching - * @param matchContext Contextual info for the match method - * @param matchedQueue A WaitQueue to store matches - * - * @return VDO_SUCCESS or an error code - **/ -int dequeueMatchingWaiters(WaitQueue *queue, - WaiterMatch *matchMethod, - void *matchContext, - WaitQueue *matchedQueue); - -/** - * Remove the first waiter from the head end of a wait queue. The caller will - * be responsible for waking the waiter by invoking the correct callback - * function to resume its execution. - * - * @param queue The wait queue from which to remove the first entry - * - * @return The first (oldest) waiter in the queue, or NULL if - * the queue is empty - **/ -Waiter *dequeueNextWaiter(WaitQueue *queue); - -/** - * Count the number of waiters in a wait queue. - * - * @param queue The wait queue to query - * - * @return the number of waiters in the queue - **/ -__attribute__((warn_unused_result)) -static inline size_t countWaiters(const WaitQueue *queue) -{ - return queue->queueLength; -} - -/** - * Get the waiter after this one, for debug iteration. - * - * @param queue The wait queue - * @param waiter A waiter - * - * @return the next waiter, or NULL - **/ -const Waiter *getNextWaiter(const WaitQueue *queue, const Waiter *waiter) - __attribute__((warn_unused_result)); - -#endif // WAIT_QUEUE_H diff --git a/vdo/kernel/batchProcessor.c b/vdo/kernel/batchProcessor.c deleted file mode 100644 index 5845960..0000000 --- a/vdo/kernel/batchProcessor.c +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/batchProcessor.c#2 $ - */ - -#include "batchProcessor.h" - -#include "memoryAlloc.h" - -#include "constants.h" - -#include "kernelLayer.h" - -/* - * On memory ordering: - * - * The producer thread does: enqueue item on queue (xchg, which is - * implicitly interlocked, then a store), memory barrier, then atomic - * cmpxchg of the state field. The x86 architecture spec says the - * xchg, store, lock-cmpxchg sequence cannot be reordered, but on - * architectures using load-linked and store-conditional for the - * cmpxchg, like AArch64, the LL can be reordered with the store, so - * we add a barrier. - * - * The consumer thread, when it is running out of work, does: read - * queue (find empty), set state, mfence, read queue again just to be - * sure. The set-state and read-queue cannot be reordered with respect - * to the mfence (but without the mfence, the read could be moved - * before the set). - * - * The xchg and mfence impose a total order across processors, and - * each processor sees the stores done by the other processor in the - * required order. If the xchg happens before the mfence, the - * consumer's "read queue again" operation will see the update. If the - * mfence happens first, the producer's "cmpxchg state" will see its - * updated value. - * - * These are the semantics implemented by memory set to WB (write-back - * caching) mode on x86-64. So, the simple analysis is that no wakeups - * should be missed. - * - * It's a little subtler with funnel queues, since one interrupted or - * delayed enqueue operation (see the commentary in funnelQueuePut) - * can cause another, concurrent enqueue operation to complete without - * actually making the entry visible to the consumer. In essence, one - * update makes no new work items visible to the consumer, and the - * other (when it eventually completes) makes two (or more) work items - * visible, and each one ensures that the consumer will process what - * it has made visible. - */ - -typedef enum batchProcessorState { - BATCH_PROCESSOR_IDLE, - BATCH_PROCESSOR_ENQUEUED, -} BatchProcessorState; - -struct batchProcessor { - spinlock_t consumerLock; - FunnelQueue *queue; - KvdoWorkItem workItem; - atomic_t state; - BatchProcessorCallback callback; - void *closure; - KernelLayer *layer; -}; - -static void scheduleBatchProcessing(BatchProcessor *batch); - -/** - * Apply the batch processing function to the accumulated set of - * objects. - * - * Runs in a "CPU queue". - * - * @param [in] item The work item embedded in the BatchProcessor - **/ -static void batchProcessorWork(KvdoWorkItem *item) -{ - BatchProcessor *batch = container_of(item, BatchProcessor, workItem); - spin_lock(&batch->consumerLock); - while (!isFunnelQueueEmpty(batch->queue)) { - batch->callback(batch, batch->closure); - } - atomic_set(&batch->state, BATCH_PROCESSOR_IDLE); - memoryFence(); - bool needReschedule = !isFunnelQueueEmpty(batch->queue); - spin_unlock(&batch->consumerLock); - if (needReschedule) { - scheduleBatchProcessing(batch); - } -} - -/** - * Ensure that the batch-processing function is scheduled to run. - * - * If we're the thread that switches the BatchProcessor state from - * idle to enqueued, we're the thread responsible for actually - * enqueueing it. If some other thread got there first, or it was - * already enqueued, it's not our problem. 
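
/*
 * A minimal C11 sketch of the scheduling hand-off described above: whichever
 * caller atomically flips the state from IDLE to ENQUEUED owns the job of
 * queueing the work item; everyone else can return immediately.  The worker
 * sets the state back to IDLE before its final emptiness check, so a
 * concurrent producer either sees its new item processed or wins the flip
 * and reschedules.  This is a userspace illustration with invented names,
 * not the kernel atomics the real code uses.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum { SKETCH_IDLE, SKETCH_ENQUEUED };

static atomic_int sketchState = SKETCH_IDLE;

/* Returns true for the one caller that should actually enqueue the work. */
static bool sketchTrySchedule(void)
{
  int expected = SKETCH_IDLE;
  return atomic_compare_exchange_strong(&sketchState, &expected,
                                        SKETCH_ENQUEUED);
}

/* Called by the worker when it thinks it has drained the queue. */
static void sketchWorkerGoIdle(void)
{
  atomic_store(&sketchState, SKETCH_IDLE);
  /* ...then re-check the queue and call sketchTrySchedule() if not empty. */
}

int main(void)
{
  printf("first schedule wins:    %d\n", sketchTrySchedule());   /* 1 */
  printf("second is a no-op:      %d\n", sketchTrySchedule());   /* 0 */
  sketchWorkerGoIdle();
  printf("after idle, wins again: %d\n", sketchTrySchedule());   /* 1 */
  return 0;
}
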
- * - * @param [in] batch The BatchProcessor control data - **/ -static void scheduleBatchProcessing(BatchProcessor *batch) -{ - /* - * We want this to be very fast in the common cases. - * - * In testing on our "mgh" class machines (HP ProLiant DL380p Gen8, - * Intel Xeon E5-2690, 2.9GHz), it appears that under some - * conditions it's a little faster to use a memory fence and then - * read the "state" field, skipping the cmpxchg if the state is - * already set to BATCH_PROCESSOR_ENQUEUED. (Sometimes slightly - * faster still if we prefetch the state field first.) Note that the - * read requires the fence, otherwise it could be executed before - * the preceding store by the FunnelQueue code to the "next" - * pointer, which can, very rarely, result in failing to issue a - * wakeup when needed. - * - * However, the gain is small, and in testing on our older "harvard" - * class machines (Intel Xeon X5680, 3.33GHz) it was a clear win to - * skip all of that and go right for the cmpxchg. - * - * Of course, the tradeoffs may be sensitive to the particular work - * going on, cache pressure, etc. - */ - smp_mb(); - BatchProcessorState oldState - = atomic_cmpxchg(&batch->state, BATCH_PROCESSOR_IDLE, - BATCH_PROCESSOR_ENQUEUED); - bool doSchedule = (oldState == BATCH_PROCESSOR_IDLE); - if (doSchedule) { - enqueueCPUWorkQueue(batch->layer, &batch->workItem); - } -} - -/**********************************************************************/ -int makeBatchProcessor(KernelLayer *layer, - BatchProcessorCallback callback, - void *closure, - BatchProcessor **batchPtr) -{ - BatchProcessor *batch; - - int result = ALLOCATE(1, BatchProcessor, "batchProcessor", &batch); - if (result != UDS_SUCCESS) { - return result; - } - result = makeFunnelQueue(&batch->queue); - if (result != UDS_SUCCESS) { - FREE(batch); - return result; - } - - spin_lock_init(&batch->consumerLock); - setupWorkItem(&batch->workItem, batchProcessorWork, - (KvdoWorkFunction) callback, CPU_Q_ACTION_COMPLETE_KVIO); - atomic_set(&batch->state, BATCH_PROCESSOR_IDLE); - batch->callback = callback; - batch->closure = closure; - batch->layer = layer; - - *batchPtr = batch; - return UDS_SUCCESS; -} - -/**********************************************************************/ -void addToBatchProcessor(BatchProcessor *batch, KvdoWorkItem *item) -{ - funnelQueuePut(batch->queue, &item->workQueueEntryLink); - scheduleBatchProcessing(batch); -} - -/**********************************************************************/ -KvdoWorkItem *nextBatchItem(BatchProcessor *batch) -{ - FunnelQueueEntry *fqEntry = funnelQueuePoll(batch->queue); - if (fqEntry == NULL) { - return NULL; - } - - return container_of(fqEntry, KvdoWorkItem, workQueueEntryLink); -} - -/**********************************************************************/ -void condReschedBatchProcessor(BatchProcessor *batch) -{ - cond_resched_lock(&batch->consumerLock); -} - -/**********************************************************************/ -void freeBatchProcessor(BatchProcessor **batchPtr) -{ - BatchProcessor *batch = *batchPtr; - if (batch) { - memoryFence(); - BUG_ON(atomic_read(&batch->state) == BATCH_PROCESSOR_ENQUEUED); - freeFunnelQueue(batch->queue); - FREE(batch); - *batchPtr = NULL; - } -} diff --git a/vdo/kernel/batchProcessor.h b/vdo/kernel/batchProcessor.h deleted file mode 100644 index 5e348c6..0000000 --- a/vdo/kernel/batchProcessor.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/batchProcessor.h#2 $ - */ - -#ifndef BATCHPROCESSOR_H -#define BATCHPROCESSOR_H - -#include "kernelTypes.h" -#include "util/funnelQueue.h" - -/** - * Control data for managing collections of objects to be operated on - * by a specified function. May be used when the work function is - * lightweight enough or cache-contentious enough that it makes sense - * to try to accumulate multiple objects and operate on them all at - * once in one thread. - * - * The work function is run in one of the kernel layer's "CPU queues", - * and care is taken to ensure that only one invocation can be running - * or scheduled at any given time. It can loop calling nextBatchItem - * repeatedly until there are no more objects to operate on. It should - * also call condReschedBatchProcessor now and then, to play nicely - * with the OS scheduler. - * - * Objects to operate on are manipulated through a FunnelQueueEntry - * object which must be contained within them. - **/ -typedef struct batchProcessor BatchProcessor; - -typedef void (*BatchProcessorCallback)(BatchProcessor *batch, void *closure); - -/** - * Creates a batch-processor control structure. - * - * @param [in] layer The kernel layer data, used to enqueue work items - * @param [in] callback A function to process the accumulated objects - * @param [in] closure A private data pointer for use by the callback - * @param [out] batchPtr Where to store the pointer to the new object - * - * @return UDS_SUCCESS or an error code - **/ -int makeBatchProcessor(KernelLayer *layer, - BatchProcessorCallback callback, - void *closure, - BatchProcessor **batchPtr); - -/** - * Adds an object to the processing queue. - * - *
If the callback function is not currently running or scheduled to be run, - * it gets queued up to run. - * - * @param [in] batch The batch-processor data - * @param [in] item The handle on the new object to add - **/ -void addToBatchProcessor(BatchProcessor *batch, KvdoWorkItem *item); - -/** - * Fetches the next object in the processing queue. - * - * @param [in] batch The batch-processor data - * - * @return An object pointer or NULL - **/ -KvdoWorkItem *nextBatchItem(BatchProcessor *batch) - __attribute__((warn_unused_result)); - -/** - * Free the batch-processor data and null out the pointer. - * - * @param [in,out] batchPtr Where the BatchProcessor pointer is stored - **/ -void freeBatchProcessor(BatchProcessor **batchPtr); - -/** - * Yield control to the scheduler if the kernel has indicated that - * other work needs to run on the current processor. - * - * The data structure is needed so that the spin lock can be - * (conditionally) released and re-acquired. - * - * @param [in] batch The batch-processor data - **/ -void condReschedBatchProcessor(BatchProcessor *batch); - -#endif // BATCHPROCESSOR_H diff --git a/vdo/kernel/bio.c b/vdo/kernel/bio.c deleted file mode 100644 index a8e3a5e..0000000 --- a/vdo/kernel/bio.c +++ /dev/null @@ -1,320 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/bio.c#8 $ - */ - -#include "bio.h" - -#include "logger.h" -#include "memoryAlloc.h" -#include "numeric.h" - -#include "flush.h" -#include "recoveryJournal.h" - -#include "bioIterator.h" -#include "ioSubmitter.h" - -/** - * Gets the raw buffer from a biovec. 
- * - * @param biovec The biovec in question - * - * @return the buffer - **/ -static char *getBufferForBiovec(struct bio_vec *biovec) -{ - return (page_address(biovec->bv_page) + biovec->bv_offset); -} - -/**********************************************************************/ -void bioCopyDataIn(BIO *bio, char *dataPtr) -{ - struct bio_vec *biovec; - for (BioIterator iter = createBioIterator(bio); - (biovec = getNextBiovec(&iter)) != NULL; - advanceBioIterator(&iter)) { - memcpy(dataPtr, getBufferForBiovec(biovec), biovec->bv_len); - dataPtr += biovec->bv_len; - } -} - -/**********************************************************************/ -void bioCopyDataOut(BIO *bio, char *dataPtr) -{ - struct bio_vec *biovec; - for (BioIterator iter = createBioIterator(bio); - (biovec = getNextBiovec(&iter)) != NULL; - advanceBioIterator(&iter)) { - memcpy(getBufferForBiovec(biovec), dataPtr, biovec->bv_len); - flush_dcache_page(biovec->bv_page); - dataPtr += biovec->bv_len; - } -} - -/**********************************************************************/ -void setBioOperation(BIO *bio, unsigned int operation) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) - bio->bi_opf &= ~REQ_OP_MASK; - bio->bi_opf |= operation; -#else - - unsigned int OPERATION_MASK = WRITE | REQ_DISCARD | REQ_FLUSH; - - // Clear the relevant bits - bio->bi_rw &= ~OPERATION_MASK; - // Set the operation we care about - bio->bi_rw |= operation; -#endif -} - -/**********************************************************************/ -void freeBio(BIO *bio, KernelLayer *layer) -{ - bio_put(bio); -} - -/**********************************************************************/ -void countBios(AtomicBioStats *bioStats, BIO *bio) -{ - if (isWriteBio(bio)) { - atomic64_inc(&bioStats->write); - } else { - atomic64_inc(&bioStats->read); - } - if (isDiscardBio(bio)) { - atomic64_inc(&bioStats->discard); - } - if (isFlushBio(bio)) { - atomic64_inc(&bioStats->flush); - } - if (isFUABio(bio)) { - atomic64_inc(&bioStats->fua); - } -} - -/** - * The function determines whether a buffer contains all zeroes. - * - * @param buffer The buffer to check - * @param length The length of the buffer - * - * @return true is all zeroes, false otherwise - **/ -static inline bool isAllZeros(const char *buffer, unsigned int length) -{ - /* - * Handle expected common case of even the first word being nonzero, - * without getting into the more expensive (for one iteration) loop - * below. - */ - if (likely(length >= sizeof(uint64_t))) { - if (GET_UNALIGNED(uint64_t, buffer) != 0) { - return false; - } - - unsigned int wordCount = length / sizeof(uint64_t); - - // Unroll to process 64 bytes at a time - unsigned int chunkCount = wordCount / 8; - while (chunkCount-- > 0) { - uint64_t word0 = GET_UNALIGNED(uint64_t, buffer); - uint64_t word1 = GET_UNALIGNED(uint64_t, buffer + 1 * sizeof(uint64_t)); - uint64_t word2 = GET_UNALIGNED(uint64_t, buffer + 2 * sizeof(uint64_t)); - uint64_t word3 = GET_UNALIGNED(uint64_t, buffer + 3 * sizeof(uint64_t)); - uint64_t word4 = GET_UNALIGNED(uint64_t, buffer + 4 * sizeof(uint64_t)); - uint64_t word5 = GET_UNALIGNED(uint64_t, buffer + 5 * sizeof(uint64_t)); - uint64_t word6 = GET_UNALIGNED(uint64_t, buffer + 6 * sizeof(uint64_t)); - uint64_t word7 = GET_UNALIGNED(uint64_t, buffer + 7 * sizeof(uint64_t)); - uint64_t or = (word0 | word1 | word2 | word3 - | word4 | word5 | word6 | word7); - // Prevent compiler from using 8*(cmp;jne). 
- __asm__ __volatile__ ("" : : "g" (or)); - if (or != 0) { - return false; - } - buffer += 8 * sizeof(uint64_t); - } - wordCount %= 8; - - // Unroll to process 8 bytes at a time. - // (Is this still worthwhile?) - while (wordCount-- > 0) { - if (GET_UNALIGNED(uint64_t, buffer) != 0) { - return false; - } - buffer += sizeof(uint64_t); - } - length %= sizeof(uint64_t); - // Fall through to finish up anything left over. - } - - while (length-- > 0) { - if (*buffer++ != 0) { - return false; - } - } - return true; -} - -/**********************************************************************/ -bool bioIsZeroData(BIO *bio) -{ - struct bio_vec *biovec; - for (BioIterator iter = createBioIterator(bio); - (biovec = getNextBiovec(&iter)) != NULL; - advanceBioIterator(&iter)) { - if (!isAllZeros(getBufferForBiovec(biovec), biovec->bv_len)) { - return false; - } - } - return true; -} - -/**********************************************************************/ -void bioZeroData(BIO *bio) -{ - zero_fill_bio(bio); -} - -/**********************************************************************/ -static void setBioSize(BIO *bio, BlockSize bioSize) -{ -#ifdef USE_BI_ITER - bio->bi_iter.bi_size = bioSize; -#else - bio->bi_size = bioSize; -#endif -} - -/** - * Initialize a bio. - * - * @param bio The bio to initialize - * @param layer The layer to which it belongs. - **/ -static void initializeBio(BIO *bio, KernelLayer *layer) -{ - // Save off important info so it can be set back later - unsigned short vcnt = bio->bi_vcnt; - void *pvt = bio->bi_private; - bio_reset(bio); // Memsets large portion of bio. Reset all needed fields. - bio->bi_private = pvt; - bio->bi_vcnt = vcnt; - bio->bi_end_io = completeAsyncBio; - setBioSector(bio, (sector_t) -1); // Sector will be set later on. - setBioBlockDevice(bio, getKernelLayerBdev(layer)); -} - -/**********************************************************************/ -void resetBio(BIO *bio, KernelLayer *layer) -{ - initializeBio(bio, layer); - setBioSize(bio, VDO_BLOCK_SIZE); -} - -/**********************************************************************/ -int allocateBio(KernelLayer *layer, unsigned int bvecCount, BIO **bioPtr) -{ - BIO *bio = bio_alloc_bioset(GFP_NOIO, bvecCount, layer->bioset); - if (IS_ERR(bio)) { - logError("bio allocation failure %ld", PTR_ERR(bio)); - return PTR_ERR(bio); - } - - initializeBio(bio, layer); - - *bioPtr = bio; - return VDO_SUCCESS; -} - -/**********************************************************************/ -int createBio(KernelLayer *layer, char *data, BIO **bioPtr) -{ - BIO *bio = NULL; - if (data == NULL) { - int result = allocateBio(layer, 0, &bio); - if (result != VDO_SUCCESS) { - return result; - } - - *bioPtr = bio; - return VDO_SUCCESS; - } - - unsigned int len = VDO_BLOCK_SIZE; - unsigned long kaddr = (unsigned long) data; - unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = kaddr >> PAGE_SHIFT; - const int bvecCount = end - start; - - int result = allocateBio(layer, bvecCount, &bio); - if (result != VDO_SUCCESS) { - return result; - } - - int offset = offset_in_page(kaddr); - for (unsigned int i = 0; (i < bvecCount) && (len > 0); i++) { - unsigned int bytes = PAGE_SIZE - offset; - if (bytes > len) { - bytes = len; - } - - struct page *page - = is_vmalloc_addr(data) ? 
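/*
 * Sketch (not from the original source): the same zero test as isAllZeros()
 * above, without the manual unrolling, as portable user-space C.  memcpy()
 * stands in for GET_UNALIGNED; word-sized chunks are compared first, then the
 * tail is finished byte by byte.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static bool is_all_zeros(const char *buffer, size_t length)
{
  while (length >= sizeof(uint64_t)) {
    uint64_t word;
    memcpy(&word, buffer, sizeof(word));   // unaligned-safe load
    if (word != 0) {
      return false;
    }
    buffer += sizeof(word);
    length -= sizeof(word);
  }
  while (length-- > 0) {                   // leftover tail bytes
    if (*buffer++ != 0) {
      return false;
    }
  }
  return true;
}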
vmalloc_to_page(data) : virt_to_page(data); - int bytesAdded = bio_add_page(bio, page, bytes, offset); - if (bytesAdded != bytes) { - freeBio(bio, layer); - return logErrorWithStringError(VDO_BIO_CREATION_FAILED, - "Could only add %i bytes to bio", - bytesAdded); - - } - - data += bytes; - len -= bytes; - offset = 0; - } - - *bioPtr = bio; - return VDO_SUCCESS; -} - -/**********************************************************************/ -void prepareFlushBIO(BIO *bio, - void *context, - struct block_device *device, - bio_end_io_t *endIOCallback) -{ - clearBioOperationAndFlags(bio); - /* - * One would think we could use REQ_OP_FLUSH on new kernels, but some - * layers of the stack don't recognize that as a flush. So do it - * like blkdev_issue_flush() and make it a write+flush. - */ - setBioOperationWrite(bio); - setBioOperationFlagPreflush(bio); - bio->bi_end_io = endIOCallback; - bio->bi_private = context; - bio->bi_vcnt = 0; - setBioBlockDevice(bio, device); - setBioSize(bio, 0); - setBioSector(bio, 0); -} diff --git a/vdo/kernel/bio.h b/vdo/kernel/bio.h deleted file mode 100644 index 1ba8234..0000000 --- a/vdo/kernel/bio.h +++ /dev/null @@ -1,367 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/bio.h#6 $ - */ - -#ifndef BIO_H -#define BIO_H - -#include -#include -#include - -#include "kernelTypes.h" - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) -#define USE_BI_ITER 1 -#endif - -/** - * Copy the bio data to a char array. - * - * @param bio The bio to copy the data from - * @param dataPtr The local array to copy the data to - **/ -void bioCopyDataIn(BIO *bio, char *dataPtr); - -/** - * Copy a char array to the bio data. - * - * @param bio The bio to copy the data to - * @param dataPtr The local array to copy the data from - **/ -void bioCopyDataOut(BIO *bio, char *dataPtr); - -/** - * Set the bi_rw or equivalent field of a bio to a particular data - * operation. Intended to be called only by setBioOperationRead() etc. 
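/*
 * Sketch (not from the original source): the bvec count computed in
 * createBio() above is just the number of pages the buffer touches.  A
 * runnable illustration of that arithmetic, assuming the usual 4 KiB pages.
 */
#include <stdio.h>

#define SKETCH_PAGE_SHIFT 12
#define SKETCH_PAGE_SIZE  (1UL << SKETCH_PAGE_SHIFT)

static unsigned long pages_spanned(unsigned long kaddr, unsigned long len)
{
  unsigned long end   = (kaddr + len + SKETCH_PAGE_SIZE - 1) >> SKETCH_PAGE_SHIFT;
  unsigned long start = kaddr >> SKETCH_PAGE_SHIFT;
  return end - start;
}

int main(void)
{
  // A 4096-byte buffer starting 512 bytes into a page straddles two pages.
  printf("%lu\n", pages_spanned(0x1200UL, 4096UL));  // prints 2
  return 0;
}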
- * - * @param bio The bio to modify - * @param operation The operation to set it to - **/ -void setBioOperation(BIO *bio, unsigned int operation); - -/**********************************************************************/ -static inline void setBioOperationRead(BIO *bio) -{ - setBioOperation(bio, READ); -} - -/**********************************************************************/ -static inline void setBioOperationWrite(BIO *bio) -{ - setBioOperation(bio, WRITE); -} - -/**********************************************************************/ -static inline void clearBioOperationAndFlags(BIO *bio) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) - bio->bi_opf = 0; -#else - bio->bi_rw = 0; -#endif -} - -/**********************************************************************/ -static inline void copyBioOperationAndFlags(BIO *to, BIO *from) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) - to->bi_opf = from->bi_opf; -#else - to->bi_rw = from->bi_rw; -#endif -} - -/**********************************************************************/ -static inline void setBioOperationFlag(BIO *bio, unsigned int flag) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) - bio->bi_opf |= flag; -#else - bio->bi_rw |= flag; -#endif -} - -/**********************************************************************/ -static inline void clearBioOperationFlag(BIO *bio, unsigned int flag) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) - bio->bi_opf &= ~flag; -#else - bio->bi_rw &= ~flag; -#endif -} - -/**********************************************************************/ -static inline void setBioOperationFlagPreflush(BIO *bio) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) - setBioOperationFlag(bio, REQ_PREFLUSH); -#else - // Preflushes and empty flushes are not currently distinguished. 
- setBioOperation(bio, WRITE_FLUSH); -#endif -} - -/**********************************************************************/ -static inline void setBioOperationFlagSync(BIO *bio) -{ - setBioOperationFlag(bio, REQ_SYNC); -} - -/**********************************************************************/ -static inline void clearBioOperationFlagSync(BIO *bio) -{ - clearBioOperationFlag(bio, REQ_SYNC); -} - -/**********************************************************************/ -static inline void setBioOperationFlagFua(BIO *bio) -{ - setBioOperationFlag(bio, REQ_FUA); -} - -/**********************************************************************/ -static inline void clearBioOperationFlagFua(BIO *bio) -{ - clearBioOperationFlag(bio, REQ_FUA); -} - -/**********************************************************************/ -static inline bool isDiscardBio(BIO *bio) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) - return (bio != NULL) && (bio_op(bio) == REQ_OP_DISCARD); -#else - return (bio != NULL) && ((bio->bi_rw & REQ_DISCARD) != 0); -#endif -} - -/**********************************************************************/ -static inline bool isFlushBio(BIO *bio) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) - return (bio_op(bio) == REQ_OP_FLUSH) || ((bio->bi_opf & REQ_PREFLUSH) != 0); -#else - return (bio->bi_rw & REQ_FLUSH) != 0; -#endif -} - -/**********************************************************************/ -static inline bool isFUABio(BIO *bio) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) - return (bio->bi_opf & REQ_FUA) != 0; -#else - return (bio->bi_rw & REQ_FUA) != 0; -#endif -} - -/**********************************************************************/ -static inline bool isReadBio(BIO *bio) -{ - return bio_data_dir(bio) == READ; -} - -/**********************************************************************/ -static inline bool isWriteBio(BIO *bio) -{ - return bio_data_dir(bio) == WRITE; -} - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) -/** - * Get the error from the bio. - * - * @param bio The bio - * - * @return the bio's error if any - **/ -static inline int getBioResult(BIO *bio) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,13,0) - return blk_status_to_errno(bio->bi_status); -#else - return bio->bi_error; -#endif -} -#endif // newer than 4.4 - -/** - * Set the block device for a bio. - * - * @param bio The bio to modify - * @param device The new block device for the bio - **/ -static inline void setBioBlockDevice(BIO *bio, struct block_device *device) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,14,0) - bio_set_dev(bio, device); -#else - bio->bi_bdev = device; -#endif -} - -/** - * Get a bio's size. - * - * @param bio The bio - * - * @return the bio's size - **/ -static inline unsigned int getBioSize(BIO *bio) -{ -#ifdef USE_BI_ITER - return bio->bi_iter.bi_size; -#else - return bio->bi_size; -#endif -} - -/** - * Set the bio's sector. - * - * @param bio The bio - * @param sector The sector - **/ -static inline void setBioSector(BIO *bio, sector_t sector) -{ -#ifdef USE_BI_ITER - bio->bi_iter.bi_sector = sector; -#else - bio->bi_sector = sector; -#endif -} - -/** - * Get the bio's sector. - * - * @param bio The bio - * - * @return the sector - **/ -static inline sector_t getBioSector(BIO *bio) -{ -#ifdef USE_BI_ITER - return bio->bi_iter.bi_sector; -#else - return bio->bi_sector; -#endif -} - -/** - * Tell the kernel we've completed processing of this bio. 
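/*
 * Sketch (not from the original source): how the accessors above compose when
 * preparing a data write with FUA semantics.  The bio is assumed to have been
 * allocated already (for example with allocateBio()); error handling omitted.
 */
static void prepareFuaDataWrite(BIO                 *bio,
                                struct block_device *bdev,
                                sector_t             sector)
{
  clearBioOperationAndFlags(bio);  // start from a clean operation/flag state
  setBioOperationWrite(bio);       // data write
  setBioOperationFlagFua(bio);     // force unit access on completion
  setBioBlockDevice(bio, bdev);    // target device
  setBioSector(bio, sector);       // 512-byte sector on that device
}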
- * - * @param bio The bio to complete - * @param error A system error code, or 0 for success - **/ -static inline void completeBio(BIO *bio, int error) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,13,0) - bio->bi_status = errno_to_blk_status(error); - bio_endio(bio); -#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) - bio->bi_error = error; - bio_endio(bio); -#else - bio_endio(bio, error); -#endif -} - -/** - * Frees up a bio structure - * - * @param bio The bio to free - * @param layer The layer the bio was created in - **/ -void freeBio(BIO *bio, KernelLayer *layer); - -/** - * Count the statistics for the bios. This is used for calls into VDO and - * for calls out of VDO. - * - * @param bioStats Statistics structure to update - * @param bio The bio - **/ -void countBios(AtomicBioStats *bioStats, BIO *bio); - -/** - * Reset a bio so it can be used again. - * - * @param bio The bio to reset - * @param layer The physical layer - **/ -void resetBio(BIO *bio, KernelLayer *layer); - -/** - * Check to see whether a bio's data are all zeroes. - * - * @param bio The bio - * - * @return true if the bio's data are all zeroes - **/ -bool bioIsZeroData(BIO *bio); - -/** - * Set a bio's data to all zeroes. - * - * @param [in] bio The bio - **/ -void bioZeroData(BIO *bio); - -/** - * Create a new bio structure for kernel buffer storage. - * - * @param [in] layer The physical layer - * @param [in] data The buffer (can be NULL) - * @param [out] bioPtr A pointer to hold new bio - * - * @return VDO_SUCCESS or an error - **/ -int createBio(KernelLayer *layer, char *data, BIO **bioPtr); - -/** - * Prepare a BIO to issue a flush to the device below. - * - * @param bio The flush BIO - * @param context The context for the callback - * @param device The device to flush - * @param endIOCallback The function to call when the flush is complete - **/ -void prepareFlushBIO(BIO *bio, - void *context, - struct block_device *device, - bio_end_io_t *endIOCallback); - -/** - * Perform IO with a bio, waiting for completion and returning its result. - * The bio must already have its sector, block device, and operation set. - * - * @param bio The bio to do IO with - * - * @return The bio result - **/ -static inline int submitBioAndWait(BIO *bio) -{ - submit_bio_wait(bio); - return getBioResult(bio); -} - -#endif /* BIO_H */ diff --git a/vdo/kernel/bioIterator.h b/vdo/kernel/bioIterator.h deleted file mode 100644 index 7445261..0000000 --- a/vdo/kernel/bioIterator.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
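/*
 * Sketch (not from the original source): the declarations above combine into
 * a synchronous single-block read.  createBio() attaches the buffer and the
 * layer's block device, so only the sector and operation still need setting
 * before submitBioAndWait().
 */
static int readBlockSync(KernelLayer *layer, char *buffer, sector_t sector)
{
  BIO *bio = NULL;
  int result = createBio(layer, buffer, &bio);
  if (result != VDO_SUCCESS) {
    return result;
  }

  setBioSector(bio, sector);
  setBioOperationRead(bio);
  result = submitBioAndWait(bio);  // returns the bio's completion status

  freeBio(bio, layer);
  return result;
}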
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/bioIterator.h#1 $ - */ - -#ifndef BIO_ITERATOR_H -#define BIO_ITERATOR_H - -#include - -#include "bio.h" -#include "kernelTypes.h" - -typedef struct { - BIO *bio; -#ifdef USE_BI_ITER - struct bvec_iter iter; - // Needed so we can store the return value of bio_iter_iovec. - struct bio_vec temp; -#else - int index; -#endif -} BioIterator; - -/** - * Create an iterator over a bio's data. - * - * @param bio The bio to iterate over - * - * @return An iterator over a bio - **/ -static BioIterator createBioIterator(BIO *bio) -{ - BioIterator iterator = { - .bio = bio, -#ifdef USE_BI_ITER - .iter = bio->bi_iter, -#else - .index = bio->bi_idx, -#endif - }; - return iterator; -} - -/** - * Get the next biovec from the iterator, or NULL if there are no more. - * - * @param iterator The iterator from which to get data - * - * @return The next biovec from the iterator, or NULL. - **/ -static struct bio_vec *getNextBiovec(BioIterator *iterator) -{ - BIO *bio = iterator->bio; -#ifdef USE_BI_ITER - if (iterator->iter.bi_size == 0) { - return NULL; - } - - iterator->temp = bio_iter_iovec(bio, iterator->iter); - return &iterator->temp; -#else - if (iterator->index >= bio->bi_vcnt) { - return NULL; - } - return bio_iovec_idx(bio, iterator->index); -#endif -} - -/** - * Advance the iterator to the next biovec in the bio. - * - * @param [in,out] iterator The iterator to advance - **/ -static void advanceBioIterator(BioIterator *iterator) -{ -#ifdef USE_BI_ITER - bio_advance_iter(iterator->bio, &iterator->iter, iterator->temp.bv_len); -#else - iterator->index++; -#endif -} - -#endif /* BIO_ITERATOR_H */ diff --git a/vdo/kernel/bufferPool.c b/vdo/kernel/bufferPool.c deleted file mode 100644 index 9c950ca..0000000 --- a/vdo/kernel/bufferPool.c +++ /dev/null @@ -1,252 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/bufferPool.c#1 $ - */ - -#include "bufferPool.h" - -#include -#include - -#include "logger.h" -#include "memoryAlloc.h" - -#include "statusCodes.h" - -/* - * For list nodes on the free-object list, the data field describes - * the object available for reuse. - * - * For nodes on the "spare" list, the data field is meaningless; - * they're just nodes available for use when we need to add an object - * pointer to the freeObjectList. - * - * These are both "free lists", in a sense; don't get confused! 
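/*
 * Sketch (not from the original source): the canonical iteration idiom for
 * BioIterator, here just totalling the byte count of a bio's segments.
 * bioCopyDataIn() and bioCopyDataOut() in bio.c use the same pattern.
 */
static unsigned int countBioBytes(BIO *bio)
{
  unsigned int total = 0;
  struct bio_vec *biovec;
  for (BioIterator iter = createBioIterator(bio);
       (biovec = getNextBiovec(&iter)) != NULL;
       advanceBioIterator(&iter)) {
    total += biovec->bv_len;
  }
  return total;
}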
- */ -typedef struct { - struct list_head list; // links in current list - void *data; // element data, if on free list -} BufferElement; - -struct bufferPool { - const char *name; // Pool name - void *data; // Associated pool data - spinlock_t lock; // Locks this object - unsigned int size; // Total number of buffers - struct list_head freeObjectList; // List of free buffers - struct list_head spareListNodes; // Unused list nodes - unsigned int numBusy; // Number of buffers in use - unsigned int maxBusy; // Maximum value of the above - BufferAllocateFunction *alloc; // Allocate function for buffer data - BufferFreeFunction *free; // Free function for buffer data - BufferDumpFunction *dump; // Dump function for buffer data - BufferElement *bhead; // Array of BufferElement structures - void **objects; -}; - -/*************************************************************************/ -int makeBufferPool(const char *poolName, - unsigned int size, - BufferAllocateFunction *allocateFunction, - BufferFreeFunction *freeFunction, - BufferDumpFunction *dumpFunction, - void *poolData, - BufferPool **poolPtr) -{ - BufferPool *pool; - - int result = ALLOCATE(1, BufferPool, "buffer pool", &pool); - if (result != VDO_SUCCESS) { - logError("buffer pool allocation failure %d", result); - return result; - } - - result = ALLOCATE(size, BufferElement, "buffer pool elements", &pool->bhead); - if (result != VDO_SUCCESS) { - logError("buffer element array allocation failure %d", result); - freeBufferPool(&pool); - return result; - } - - result = ALLOCATE(size, void *, "object pointers", &pool->objects); - if (result != VDO_SUCCESS) { - logError("buffer object array allocation failure %d", result); - freeBufferPool(&pool); - return result; - } - - pool->name = poolName; - pool->alloc = allocateFunction; - pool->free = freeFunction; - pool->dump = dumpFunction; - pool->data = poolData; - pool->size = size; - spin_lock_init(&pool->lock); - INIT_LIST_HEAD(&pool->freeObjectList); - INIT_LIST_HEAD(&pool->spareListNodes); - BufferElement *bh = pool->bhead; - for (int i = 0; i < pool->size; i++) { - result = pool->alloc(pool->data, &bh->data); - if (result != VDO_SUCCESS) { - logError("verify buffer data allocation failure %d", result); - freeBufferPool(&pool); - return result; - } - pool->objects[i] = bh->data; - list_add(&bh->list, &pool->freeObjectList); - bh++; - } - pool->numBusy = pool->maxBusy = 0; - - *poolPtr = pool; - return VDO_SUCCESS; -} - -/*************************************************************************/ -void freeBufferPool(BufferPool **poolPtr) -{ - BufferPool *pool = *poolPtr; - if (pool == NULL) { - return; - } - - ASSERT_LOG_ONLY((pool->numBusy == 0), "freeing busy buffer pool, numBusy=%d", - pool->numBusy); - if (pool->objects != NULL) { - for (int i = 0; i < pool->size; i++) { - if (pool->objects[i] != NULL) { - pool->free(pool->data, pool->objects[i]); - } - } - FREE(pool->objects); - } - FREE(pool->bhead); - FREE(pool); - *poolPtr = NULL; -} - -/*************************************************************************/ -static bool inFreeList(BufferPool *pool, void *data) -{ - struct list_head *node; - list_for_each(node, &pool->freeObjectList) { - if (container_of(node, BufferElement, list)->data == data) { - return true; - } - } - return false; -} - -/*************************************************************************/ -void dumpBufferPool(BufferPool *pool, bool dumpElements) -{ - // In order that syslog can empty its buffer, sleep after 35 elements for - // 4ms (till the second 
clock tick). These numbers chosen in October - // 2012 running on an lfarm. - enum { ELEMENTS_PER_BATCH = 35 }; - enum { SLEEP_FOR_SYSLOG = 4 }; - - if (pool == NULL) { - return; - } - spin_lock(&pool->lock); - logInfo("%s: %u of %u busy (max %u)", pool->name, pool->numBusy, pool->size, - pool->maxBusy); - if (dumpElements && (pool->dump != NULL)) { - int dumped = 0; - for (int i = 0; i < pool->size; i++) { - if (!inFreeList(pool, pool->objects[i])) { - pool->dump(pool->data, pool->objects[i]); - if (++dumped >= ELEMENTS_PER_BATCH) { - spin_unlock(&pool->lock); - dumped = 0; - msleep(SLEEP_FOR_SYSLOG); - spin_lock(&pool->lock); - } - } - } - } - spin_unlock(&pool->lock); -} - -/*************************************************************************/ -int allocBufferFromPool(BufferPool *pool, void **dataPtr) -{ - if (pool == NULL) { - return UDS_INVALID_ARGUMENT; - } - - spin_lock(&pool->lock); - if (unlikely(list_empty(&pool->freeObjectList))) { - spin_unlock(&pool->lock); - logDebug("no free buffers"); - return -ENOMEM; - } - - BufferElement *bh = list_first_entry(&pool->freeObjectList, BufferElement, - list); - list_move(&bh->list, &pool->spareListNodes); - pool->numBusy++; - if (pool->numBusy > pool->maxBusy) { - pool->maxBusy = pool->numBusy; - } - *dataPtr = bh->data; - spin_unlock(&pool->lock); - return VDO_SUCCESS; - -} - -/*************************************************************************/ -static bool freeBufferToPoolInternal(BufferPool *pool, void *data) -{ - if (unlikely(list_empty(&pool->spareListNodes))) { - return false; - } - BufferElement *bh = list_first_entry(&pool->spareListNodes, BufferElement, - list); - list_move(&bh->list, &pool->freeObjectList); - bh->data = data; - pool->numBusy--; - return true; -} - -/*************************************************************************/ -void freeBufferToPool(BufferPool *pool, void *data) -{ - spin_lock(&pool->lock); - bool success = freeBufferToPoolInternal(pool, data); - spin_unlock(&pool->lock); - if (!success) { - logDebug("trying to add to free list when already full"); - } -} - -/*************************************************************************/ -void freeBuffersToPool(BufferPool *pool, void **data, int count) -{ - spin_lock(&pool->lock); - bool success = true; - for (int i = 0; (i < count) && success; i++) { - success = freeBufferToPoolInternal(pool, data[i]); - } - spin_unlock(&pool->lock); - if (!success) { - logDebug("trying to add to free list when already full"); - } -} diff --git a/vdo/kernel/bufferPool.h b/vdo/kernel/bufferPool.h deleted file mode 100644 index 9c505c9..0000000 --- a/vdo/kernel/bufferPool.h +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/bufferPool.h#1 $ - */ -#ifndef BUFFERPOOL_H -#define BUFFERPOOL_H - -/* - * We need bug.h because in 3.10, kernel.h (indirectly) defines - * ARRAY_SIZE as a macro which (indirectly and conditionally) uses - * BUILD_BUG_ON_ZERO, which is defined in bug.h, which is *not* - * included. In earlier versions like 3.2 it Just Worked. - */ -#include -#include -#include - -typedef struct bufferPool BufferPool; - -typedef int BufferAllocateFunction(void *poolData, void **dataPtr); -typedef void BufferFreeFunction(void *poolData, void *data); -typedef void BufferDumpFunction(void *poolData, void *data); - -/** - * Creates a generic pool of buffer data. The elements in the pool are - * allocated up front and placed on a free list, which manages the - * reuse of the individual buffers in the pool. - * - * @param [in] poolName Name of the pool - * @param [in] size The number of elements to create for this pool - * @param [in] allocateFunction The function to call to create the actual data - * for each element - * @param [in] freeFunction The function to call to free the actual data - * for each element - * @param [in] dumpFunction The function to call to dump the actual data - * for each element into the log - * @param [in] poolData A pointer to the pool's associated data - * @param [out] poolPtr A pointer to hold the pool that was created - * - * @return a success or error code - */ -int makeBufferPool(const char *poolName, - unsigned int size, - BufferAllocateFunction *allocateFunction, - BufferFreeFunction *freeFunction, - BufferDumpFunction *dumpFunction, - void *poolData, - BufferPool **poolPtr) - __attribute__((warn_unused_result)); - -/** - * Free a buffer pool and null out the reference to it. This will free - * all the elements of the pool as well. - * - * @param [in] poolPtr The reference to the pool to free - **/ -void freeBufferPool(BufferPool **poolPtr); - -/** - * Dump a buffer pool to the log. - * - * @param [in] pool The buffer pool to allocate from - * @param [in] dumpElements True for complete output, or false for a - * one-line summary - **/ -void dumpBufferPool(BufferPool *pool, bool dumpElements); - -/** - * Acquires a free buffer from the free list of the pool and - * returns it's associated data. - * - * @param [in] pool The buffer pool to allocate from - * @param [out] dataPtr A pointer to hold the buffer data - * - * @return a success or error code - */ -int allocBufferFromPool(BufferPool *pool, void **dataPtr) - __attribute__((warn_unused_result)); - -/** - * Returns a buffer to the free list of a pool - * - * @param [in] pool The buffer pool to return the buffer to - * @param [in] data The buffer data to return - */ -void freeBufferToPool(BufferPool *pool, void *data); - -/** - * Returns a set of buffers to the free list of a pool - * - * @param [in] pool The buffer pool to return the buffer to - * @param [in] data The buffer data to return - * @param [in] count Number of entries in the data array - */ -void freeBuffersToPool(BufferPool *pool, void **data, int count); - -/** - * Control structure for freeing (releasing back to the pool) pointers - * in batches. - * - * Since the objects stored in a buffer pool are completely opaque, - * some external data structure is needed to manage a collection of - * them. This is a simple helper for doing that, since we're freeing - * batches of objects in a couple different places. 
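/*
 * Sketch (not from the original source): the pool life cycle described above,
 * with minimal caller-supplied callbacks matching BufferAllocateFunction,
 * BufferFreeFunction and BufferDumpFunction.  The 256-byte element size is an
 * arbitrary example and error handling is abbreviated.
 */
static int allocBuf(void *poolData, void **dataPtr)
{
  *dataPtr = kmalloc(256, GFP_KERNEL);
  return (*dataPtr != NULL) ? VDO_SUCCESS : -ENOMEM;
}

static void freeBuf(void *poolData, void *data)
{
  kfree(data);
}

static void dumpBuf(void *poolData, void *data)
{
  // Nothing worth logging for this example.
}

static int exercisePool(void)
{
  BufferPool *pool = NULL;
  int result = makeBufferPool("example pool", 16, allocBuf, freeBuf, dumpBuf,
                              NULL /* poolData */, &pool);
  if (result != VDO_SUCCESS) {
    return result;
  }

  void *buffer = NULL;
  result = allocBufferFromPool(pool, &buffer);
  if (result == VDO_SUCCESS) {
    // ... use the buffer ...
    freeBufferToPool(pool, buffer);
  }

  freeBufferPool(&pool);  // frees every element and nulls the pointer
  return result;
}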
Within the pool - * itself there's a pair of linked lists, but getting at them requires - * the locking that we're trying to minimize. - * - * We collect pointers until the array is full or until there are no - * more available, and we call freeBuffersToPool to release a batch - * all at once. - **/ -typedef struct freeBufferPointers { - BufferPool *pool; - int index; - void *pointers[30]; // size is arbitrary -} FreeBufferPointers; - -/** - * Initialize the control structure for batching buffer pointers to be - * released to their pool. - * - * @param [out] fbp The (caller-allocated) control structure - * @param [in] pool The buffer pool to return objects to. - **/ -static inline void initFreeBufferPointers(FreeBufferPointers *fbp, - BufferPool *pool) -{ - fbp->index = 0; - fbp->pool = pool; -} - -/** - * Release any buffers left in the collection. - * - * @param [in] fbp The control structure - **/ -static inline void freeBufferPointers(FreeBufferPointers *fbp) -{ - freeBuffersToPool(fbp->pool, fbp->pointers, fbp->index); - fbp->index = 0; -} - -/** - * Add another buffer pointer to the collection, and if we're full, - * release the whole batch to the pool. - * - * @param [in] fbp The control structure - * @param [in] pointer The buffer pointer to release - **/ -static inline void addFreeBufferPointer(FreeBufferPointers *fbp, - void *pointer) -{ - fbp->pointers[fbp->index] = pointer; - fbp->index++; - if (fbp->index == ARRAY_SIZE(fbp->pointers)) { - freeBufferPointers(fbp); - } -} - -#endif /* BUFFERPOOL_H */ diff --git a/vdo/kernel/dataKVIO.c b/vdo/kernel/dataKVIO.c deleted file mode 100644 index ba9c8e8..0000000 --- a/vdo/kernel/dataKVIO.c +++ /dev/null @@ -1,1192 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dataKVIO.c#18 $ - */ - -#include "dataKVIO.h" - - -#include "logger.h" -#include "memoryAlloc.h" -#include "murmur/MurmurHash3.h" - -#include "dataVIO.h" -#include "compressedBlock.h" -#include "hashLock.h" -#include "lz4.h" - -#include "bio.h" -#include "dedupeIndex.h" -#include "kvdoFlush.h" -#include "kvio.h" -#include "ioSubmitter.h" -#include "vdoCommon.h" -#include "verify.h" - -static void dumpPooledDataKVIO(void *poolData, void *data); - -enum { - WRITE_PROTECT_FREE_POOL = 0, - WP_DATA_KVIO_SIZE = (sizeof(DataKVIO) + PAGE_SIZE - 1 - - ((sizeof(DataKVIO) + PAGE_SIZE - 1) - % PAGE_SIZE)) -}; - -/** - * Alter the write-access permission to a page of memory, so that - * objects in the free pool may no longer be modified. - * - * To do: Deny read access as well. 
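/*
 * Sketch (not from the original source): batching returns through
 * FreeBufferPointers so the pool spin lock is taken once per batch rather
 * than once per object.  returnDataKVIOBatchToPool() in dataKVIO.c below
 * uses the same pattern.
 */
static void releaseAll(BufferPool *pool, void **objects, int count)
{
  FreeBufferPointers fbp;
  initFreeBufferPointers(&fbp, pool);
  for (int i = 0; i < count; i++) {
    addFreeBufferPointer(&fbp, objects[i]);  // flushes automatically when full
  }
  if (fbp.index > 0) {
    freeBufferPointers(&fbp);                // release the final partial batch
  }
}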
- * - * @param address The starting address to protect, which must be on a - * page boundary - * @param byteCount The number of bytes to protect, which must be a multiple - * of the page size - * @param mode The write protection mode (true means read-only) - **/ -static __always_inline void -setWriteProtect(void *address, - size_t byteCount, - bool mode __attribute__((unused))) -{ - BUG_ON((((long) address) % PAGE_SIZE) != 0); - BUG_ON((byteCount % PAGE_SIZE) != 0); - BUG(); // only works in internal code, sorry -} - -/**********************************************************************/ -static void maybeLogDataKVIOTrace(DataKVIO *dataKVIO) -{ - if (dataKVIO->kvio.layer->traceLogging) { - logKvioTrace(&dataKVIO->kvio); - } -} - -/** - * First tracing hook for VIO completion. - * - * If the SystemTap script vdotrace.stp is in use, it does stage 1 of - * its processing here. We must not call addTraceRecord between the - * two tap functions. - * - * @param dataKVIO The VIO we're finishing up - **/ -static void kvioCompletionTap1(DataKVIO *dataKVIO) -{ - /* - * Ensure that dataKVIO doesn't get optimized out, even under inline - * expansion. Also, make sure the compiler has to emit debug info - * for baseTraceLocation, which some of our SystemTap scripts will - * use here. - * - * First, make it look as though all memory could be clobbered; then - * require that a value be read into a register. That'll force at - * least one instruction to exist (so SystemTap can hook in) where - * dataKVIO is live. We use a field that the caller would've - * accessed recently anyway, so it may be cached. - */ - barrier(); - __asm__ __volatile__("" - : - : "g" (dataKVIO), "g" (baseTraceLocation), - "r" (dataKVIO->kvio.layer)); -} - -/** - * Second tracing hook for VIO completion. - * - * The SystemTap script vdotrace.stp splits its VIO-completion work - * into two stages, to reduce lock contention for script variables. - * Hence, it needs two hooks in the code. - * - * @param dataKVIO The VIO we're finishing up - **/ -static void kvioCompletionTap2(DataKVIO *dataKVIO) -{ - // Hack to ensure variable doesn't get optimized out. 
- barrier(); - __asm__ __volatile__("" : : "g" (dataKVIO), "r" (dataKVIO->kvio.layer)); -} - -/**********************************************************************/ -static void kvdoAcknowledgeDataKVIO(DataKVIO *dataKVIO) -{ - KernelLayer *layer = dataKVIO->kvio.layer; - ExternalIORequest *externalIORequest = &dataKVIO->externalIORequest; - BIO *bio = externalIORequest->bio; - if (bio == NULL) { - return; - } - - externalIORequest->bio = NULL; - - int error - = mapToSystemError(dataVIOAsCompletion(&dataKVIO->dataVIO)->result); - bio->bi_end_io = externalIORequest->endIO; - bio->bi_private = externalIORequest->private; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) - bio->bi_opf = externalIORequest->rw; -#else - bio->bi_rw = externalIORequest->rw; -#endif - - countBios(&layer->biosAcknowledged, bio); - if (dataKVIO->isPartial) { - countBios(&layer->biosAcknowledgedPartial, bio); - } - - - dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); - completeBio(bio, error); -} - -/**********************************************************************/ -static noinline void cleanDataKVIO(DataKVIO *dataKVIO, FreeBufferPointers *fbp) -{ - dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); - kvdoAcknowledgeDataKVIO(dataKVIO); - - KVIO *kvio = dataKVIOAsKVIO(dataKVIO); - kvio->bio = NULL; - - if (unlikely(kvio->vio->trace != NULL)) { - maybeLogDataKVIOTrace(dataKVIO); - kvioCompletionTap1(dataKVIO); - kvioCompletionTap2(dataKVIO); - freeTraceToPool(kvio->layer, kvio->vio->trace); - } - - addFreeBufferPointer(fbp, dataKVIO); -} - -/**********************************************************************/ -void returnDataKVIOBatchToPool(BatchProcessor *batch, void *closure) -{ - KernelLayer *layer = closure; - uint32_t count = 0; - ASSERT_LOG_ONLY(batch != NULL, "batch not null"); - ASSERT_LOG_ONLY(layer != NULL, "layer not null"); - - FreeBufferPointers fbp; - initFreeBufferPointers(&fbp, layer->dataKVIOPool); - - KvdoWorkItem *item; - while ((item = nextBatchItem(batch)) != NULL) { - cleanDataKVIO(workItemAsDataKVIO(item), &fbp); - condReschedBatchProcessor(batch); - count++; - } - - if (fbp.index > 0) { - freeBufferPointers(&fbp); - } - - completeManyRequests(layer, count); -} - -/**********************************************************************/ -static void kvdoAcknowledgeThenCompleteDataKVIO(KvdoWorkItem *item) -{ - DataKVIO *dataKVIO = workItemAsDataKVIO(item); - kvdoAcknowledgeDataKVIO(dataKVIO); - addToBatchProcessor(dataKVIO->kvio.layer->dataKVIOReleaser, item); -} - -/**********************************************************************/ -void kvdoCompleteDataKVIO(VDOCompletion *completion) -{ - DataKVIO *dataKVIO = dataVIOAsDataKVIO(asDataVIO(completion)); - dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); - - KernelLayer *layer = getLayerFromDataKVIO(dataKVIO); - if (useBioAckQueue(layer) && USE_BIO_ACK_QUEUE_FOR_READ - && (dataKVIO->externalIORequest.bio != NULL)) { - launchDataKVIOOnBIOAckQueue(dataKVIO, kvdoAcknowledgeThenCompleteDataKVIO, - NULL, BIO_ACK_Q_ACTION_ACK); - } else { - addToBatchProcessor(layer->dataKVIOReleaser, - workItemFromDataKVIO(dataKVIO)); - } -} - -/** - * Copy the uncompressed data from a compressed block read into the user - * bio which requested the read. 
- * - * @param workItem The DataKVIO which requested the read - **/ -static void copyReadBlockData(KvdoWorkItem *workItem) -{ - DataKVIO *dataKVIO = workItemAsDataKVIO(workItem); - - // For a read-modify-write, copy the data into the dataBlock buffer so it - // will be set up for the write phase. - if (isReadModifyWriteVIO(dataKVIO->kvio.vio)) { - bioCopyDataOut(getBIOFromDataKVIO(dataKVIO), dataKVIO->readBlock.data); - kvdoEnqueueDataVIOCallback(dataKVIO); - return; - } - - // For a partial read, the callback will copy the requested data from the - // read block. - if (dataKVIO->isPartial) { - kvdoEnqueueDataVIOCallback(dataKVIO); - return; - } - - // For a full block read, copy the data to the bio and acknowledge. - bioCopyDataOut(getBIOFromDataKVIO(dataKVIO), dataKVIO->readBlock.data); - kvdoAcknowledgeDataVIO(&dataKVIO->dataVIO); -} - -/** - * Finish reading data for a compressed block. - * - * @param dataKVIO The DataKVIO which requested the read - **/ -static void readDataKVIOReadBlockCallback(DataKVIO *dataKVIO) -{ - if (dataKVIO->readBlock.status != VDO_SUCCESS) { - setCompletionResult(dataVIOAsCompletion(&dataKVIO->dataVIO), - dataKVIO->readBlock.status); - kvdoEnqueueDataVIOCallback(dataKVIO); - return; - } - - launchDataKVIOOnCPUQueue(dataKVIO, copyReadBlockData, NULL, - CPU_Q_ACTION_COMPRESS_BLOCK); -} - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) -/** - * Complete and reset a bio that was supplied by the user and then used for a - * read (so that we can complete it with the user's callback). - * - * @param bio The bio to complete - **/ -static void resetUserBio(BIO *bio) -#else -/** - * Complete and reset a bio that was supplied by the user and then used for a - * read (so that we can complete it with the user's callback). - * - * @param bio The bio to complete - * @param error Possible error from underlying block device - **/ -static void resetUserBio(BIO *bio, int error) -#endif -{ -#if ((LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0)) \ - && (LINUX_VERSION_CODE < KERNEL_VERSION(4,2,0))) - // This is a user bio, and the device just called bio_endio() on it, so - // we need to re-increment bi_remaining so we too can call bio_endio(). - atomic_inc(&bio->bi_remaining); -#endif - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) - completeAsyncBio(bio); -#else - completeAsyncBio(bio, error); -#endif -} - -/** - * Uncompress the data that's just been read and then call back the requesting - * DataKVIO. - * - * @param workItem The DataKVIO requesting the data - **/ -static void uncompressReadBlock(KvdoWorkItem *workItem) -{ - DataKVIO *dataKVIO = workItemAsDataKVIO(workItem); - ReadBlock *readBlock = &dataKVIO->readBlock; - BlockSize blockSize = VDO_BLOCK_SIZE; - - // The DataKVIO's scratch block will be used to contain the - // uncompressed data. 
- uint16_t fragmentOffset, fragmentSize; - char *compressedData = readBlock->data; - int result = getCompressedBlockFragment(readBlock->mappingState, - compressedData, blockSize, - &fragmentOffset, - &fragmentSize); - if (result != VDO_SUCCESS) { - logDebug("%s: frag err %d", __func__, result); - readBlock->status = result; - readBlock->callback(dataKVIO); - return; - } - - char *fragment = compressedData + fragmentOffset; - int size = LZ4_uncompress_unknownOutputSize(fragment, dataKVIO->scratchBlock, - fragmentSize, blockSize); - if (size == blockSize) { - readBlock->data = dataKVIO->scratchBlock; - } else { - logDebug("%s: lz4 error", __func__); - readBlock->status = VDO_INVALID_FRAGMENT; - } - - readBlock->callback(dataKVIO); -} - -/** - * Now that we have gotten the data from storage, uncompress the data if - * necessary and then call back the requesting DataKVIO. - * - * @param dataKVIO The DataKVIO requesting the data - * @param result The result of the read operation - **/ -static void completeRead(DataKVIO *dataKVIO, int result) -{ - ReadBlock *readBlock = &dataKVIO->readBlock; - readBlock->status = result; - - if ((result == VDO_SUCCESS) && isCompressed(readBlock->mappingState)) { - launchDataKVIOOnCPUQueue(dataKVIO, uncompressReadBlock, NULL, - CPU_Q_ACTION_COMPRESS_BLOCK); - return; - } - - readBlock->callback(dataKVIO); -} - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) -/** - * Callback for a bio doing a read. - * - * @param bio The bio - */ -static void readBioCallback(BIO *bio) -#else -/** - * Callback for a bio doing a read. - * - * @param bio The bio - * @param result The result of the read operation - */ -static void readBioCallback(BIO *bio, int result) -#endif -{ - KVIO *kvio = (KVIO *) bio->bi_private; - DataKVIO *dataKVIO = kvioAsDataKVIO(kvio); - dataKVIO->readBlock.data = dataKVIO->readBlock.buffer; - dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); - countCompletedBios(bio); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) - completeRead(dataKVIO, getBioResult(bio)); -#else - completeRead(dataKVIO, result); -#endif -} - -/**********************************************************************/ -void kvdoReadBlock(DataVIO *dataVIO, - PhysicalBlockNumber location, - BlockMappingState mappingState, - BioQAction action, - DataKVIOCallback callback) -{ - dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); - - DataKVIO *dataKVIO = dataVIOAsDataKVIO(dataVIO); - ReadBlock *readBlock = &dataKVIO->readBlock; - KernelLayer *layer = getLayerFromDataKVIO(dataKVIO); - - readBlock->callback = callback; - readBlock->status = VDO_SUCCESS; - readBlock->mappingState = mappingState; - - BUG_ON(getBIOFromDataKVIO(dataKVIO)->bi_private != &dataKVIO->kvio); - // Read the data directly from the device using the read bio. 
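/*
 * Sketch (not from the original source): the fragment-decompression step of
 * uncompressReadBlock() above, written as user-space C against the current
 * LZ4 API.  LZ4_decompress_safe() is used here in place of the older
 * LZ4_uncompress_unknownOutputSize() call; the fragment offset and size would
 * come from getCompressedBlockFragment().
 */
#include <stdint.h>
#include <lz4.h>

enum { SKETCH_BLOCK_SIZE = 4096 };

// Returns 0 on success, -1 if the fragment does not inflate to a full block.
static int inflateFragment(const char *compressedBlock,
                           uint16_t    fragmentOffset,
                           uint16_t    fragmentSize,
                           char       *scratch /* SKETCH_BLOCK_SIZE bytes */)
{
  int size = LZ4_decompress_safe(compressedBlock + fragmentOffset, scratch,
                                 fragmentSize, SKETCH_BLOCK_SIZE);
  return (size == SKETCH_BLOCK_SIZE) ? 0 : -1;
}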
- BIO *bio = readBlock->bio; - resetBio(bio, layer); - setBioSector(bio, blockToSector(layer, location)); - setBioOperationRead(bio); - bio->bi_end_io = readBioCallback; - submitBio(bio, action); -} - -/**********************************************************************/ -void kvdoReadDataVIO(DataVIO *dataVIO) -{ - ASSERT_LOG_ONLY(!isWriteVIO(dataVIOAsVIO(dataVIO)), - "operation set correctly for data read"); - dataVIOAddTraceRecord(dataVIO, THIS_LOCATION("$F;io=readData")); - - if (isCompressed(dataVIO->mapped.state)) { - kvdoReadBlock(dataVIO, dataVIO->mapped.pbn, dataVIO->mapped.state, - BIO_Q_ACTION_COMPRESSED_DATA, readDataKVIOReadBlockCallback); - return; - } - - KVIO *kvio = dataVIOAsKVIO(dataVIO); - BIO *bio = kvio->bio; - bio->bi_end_io = resetUserBio; - setBioSector(bio, blockToSector(kvio->layer, dataVIO->mapped.pbn)); - submitBio(bio, BIO_Q_ACTION_DATA); -} - -/**********************************************************************/ -static void kvdoAcknowledgeDataKVIOThenContinue(KvdoWorkItem *item) -{ - DataKVIO *dataKVIO = workItemAsDataKVIO(item); - dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); - kvdoAcknowledgeDataKVIO(dataKVIO); - // Even if we're not using bio-ack threads, we may be in the wrong - // base-code thread. - kvdoEnqueueDataVIOCallback(dataKVIO); -} - -/**********************************************************************/ -void kvdoAcknowledgeDataVIO(DataVIO *dataVIO) -{ - DataKVIO *dataKVIO = dataVIOAsDataKVIO(dataVIO); - KernelLayer *layer = getLayerFromDataKVIO(dataKVIO); - - // If the remaining discard work is not completely processed by this VIO, - // don't acknowledge it yet. - if (isDiscardBio(dataKVIO->externalIORequest.bio) - && (dataKVIO->remainingDiscard - > (VDO_BLOCK_SIZE - dataKVIO->offset))) { - invokeCallback(dataVIOAsCompletion(dataVIO)); - return; - } - - // We've finished with the KVIO; acknowledge completion of the bio to the - // kernel. 
- if (useBioAckQueue(layer)) { - dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); - launchDataKVIOOnBIOAckQueue(dataKVIO, kvdoAcknowledgeDataKVIOThenContinue, - NULL, BIO_ACK_Q_ACTION_ACK); - } else { - kvdoAcknowledgeDataKVIOThenContinue(workItemFromDataKVIO(dataKVIO)); - } -} - -/**********************************************************************/ -void kvdoWriteDataVIO(DataVIO *dataVIO) -{ - ASSERT_LOG_ONLY(isWriteVIO(dataVIOAsVIO(dataVIO)), - "kvdoWriteDataVIO() called on write DataVIO"); - dataVIOAddTraceRecord(dataVIO, THIS_LOCATION("$F;io=writeData;j=normal")); - - KVIO *kvio = dataVIOAsKVIO(dataVIO); - BIO *bio = kvio->bio; - setBioOperationWrite(bio); - setBioSector(bio, blockToSector(kvio->layer, dataVIO->newMapped.pbn)); - submitBio(bio, BIO_Q_ACTION_DATA); -} - -/**********************************************************************/ -void kvdoModifyWriteDataVIO(DataVIO *dataVIO) -{ - dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); - DataKVIO *dataKVIO = dataVIOAsDataKVIO(dataVIO); - BIO *bio = dataKVIO->externalIORequest.bio; - KernelLayer *layer = getLayerFromDataKVIO(dataKVIO); - resetBio(dataKVIO->dataBlockBio, layer); - - if (!isDiscardBio(bio)) { - bioCopyDataIn(bio, dataKVIO->dataBlock + dataKVIO->offset); - } else { - memset(dataKVIO->dataBlock + dataKVIO->offset, '\0', - min(dataKVIO->remainingDiscard, - (DiscardSize) (VDO_BLOCK_SIZE - dataKVIO->offset))); - } - - dataVIO->isZeroBlock = bioIsZeroData(dataKVIO->dataBlockBio); - dataKVIO->dataBlockBio->bi_private = &dataKVIO->kvio; - copyBioOperationAndFlags(dataKVIO->dataBlockBio, bio); - // Make the bio a write, not (potentially) a discard. - setBioOperationWrite(dataKVIO->dataBlockBio); -} - -/**********************************************************************/ -void kvdoZeroDataVIO(DataVIO *dataVIO) -{ - dataVIOAddTraceRecord(dataVIO, THIS_LOCATION("zeroDataVIO;io=readData")); - bioZeroData(dataVIOAsKVIO(dataVIO)->bio); -} - -/**********************************************************************/ -void kvdoCopyDataVIO(DataVIO *source, DataVIO *destination) -{ - dataVIOAddTraceRecord(destination, THIS_LOCATION(NULL)); - bioCopyDataOut(dataVIOAsKVIO(destination)->bio, - dataVIOAsDataKVIO(source)->dataBlock); -} - -/**********************************************************************/ -static void kvdoCompressWork(KvdoWorkItem *item) -{ - DataKVIO *dataKVIO = workItemAsDataKVIO(item); - KernelLayer *layer = getLayerFromDataKVIO(dataKVIO); - dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); - - char *context = getWorkQueuePrivateData(); - if (unlikely(context == NULL)) { - uint32_t index = atomicAdd32(&layer->compressionContextIndex, 1) - 1; - BUG_ON(index >= layer->deviceConfig->threadCounts.cpuThreads); - context = layer->compressionContext[index]; - setWorkQueuePrivateData(context); - } - - int size = LZ4_compress_ctx_limitedOutput(context, dataKVIO->dataBlock, - dataKVIO->scratchBlock, - VDO_BLOCK_SIZE, - VDO_BLOCK_SIZE); - DataVIO *dataVIO = &dataKVIO->dataVIO; - if (size > 0) { - // The scratch block will be used to contain the compressed data. - dataVIO->compression.data = dataKVIO->scratchBlock; - dataVIO->compression.size = size; - } else { - // Use block size plus one as an indicator for uncompressible data. 
- dataVIO->compression.size = VDO_BLOCK_SIZE + 1; - } - - kvdoEnqueueDataVIOCallback(dataKVIO); -} - -/**********************************************************************/ -void kvdoCompressDataVIO(DataVIO *dataVIO) -{ - dataVIOAddTraceRecord(dataVIO, - THIS_LOCATION("compressDataVIO;" - "io=compress;cb=compress")); - - /* - * If the orignal bio was a discard, but we got this far because the discard - * was a partial one (r/m/w), and it is part of a larger discard, we cannot - * compress this VIO. We need to make sure the VIO completes ASAP. - */ - DataKVIO *dataKVIO = dataVIOAsDataKVIO(dataVIO); - if (isDiscardBio(dataKVIO->externalIORequest.bio) - && (dataKVIO->remainingDiscard > 0)) { - dataVIO->compression.size = VDO_BLOCK_SIZE + 1; - kvdoEnqueueDataVIOCallback(dataKVIO); - return; - } - - launchDataKVIOOnCPUQueue(dataKVIO, kvdoCompressWork, NULL, - CPU_Q_ACTION_COMPRESS_BLOCK); -} - -/** - * Construct a DataKVIO. - * - * @param [in] layer The physical layer - * @param [in] bio The bio to associate with this DataKVIO - * @param [out] dataKVIOPtr A pointer to hold the new DataKVIO - * - * @return VDO_SUCCESS or an error - **/ -__attribute__((warn_unused_result)) -static int makeDataKVIO(KernelLayer *layer, BIO *bio, DataKVIO **dataKVIOPtr) -{ - DataKVIO *dataKVIO; - int result = allocBufferFromPool(layer->dataKVIOPool, (void **) &dataKVIO); - if (result != VDO_SUCCESS) { - return logErrorWithStringError(result, "data kvio allocation failure"); - } - - if (WRITE_PROTECT_FREE_POOL) { - setWriteProtect(dataKVIO, WP_DATA_KVIO_SIZE, false); - } - - KVIO *kvio = &dataKVIO->kvio; - kvio->vio = dataVIOAsVIO(&dataKVIO->dataVIO); - memset(&kvio->enqueueable, 0, sizeof(KvdoEnqueueable)); - memset(&dataKVIO->dedupeContext.pendingList, 0, sizeof(struct list_head)); - memset(&dataKVIO->dataVIO, 0, sizeof(DataVIO)); - kvio->bioToSubmit = NULL; - bio_list_init(&kvio->biosMerged); - - // The dataBlock is only needed for writes and some partial reads. - if (isWriteBio(bio) || (getBioSize(bio) < VDO_BLOCK_SIZE)) { - resetBio(dataKVIO->dataBlockBio, layer); - } - - initializeKVIO(kvio, layer, VIO_TYPE_DATA, VIO_PRIORITY_DATA, NULL, bio); - *dataKVIOPtr = dataKVIO; - return VDO_SUCCESS; -} - -/** - * Creates a new DataVIO structure. A DataVIO represents a single logical - * block of data. It is what most VDO operations work with. This function also - * creates a wrapping DataKVIO structure that is used when we want to - * physically read or write the data associated with the DataVIO. - * - * @param [in] layer The physical layer - * @param [in] bio The BIO from the request the new DataKVIO will - * service - * @param [in] arrivalTime The arrival time of the BIO - * @param [out] dataKVIOPtr A pointer to hold the new DataKVIO - * - * @return VDO_SUCCESS or an error - **/ -static int kvdoCreateKVIOFromBio(KernelLayer *layer, - BIO *bio, - Jiffies arrivalTime, - DataKVIO **dataKVIOPtr) -{ - ExternalIORequest externalIORequest = { - .bio = bio, - .private = bio->bi_private, - .endIO = bio->bi_end_io, -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) - .rw = bio->bi_opf, -#else - .rw = bio->bi_rw, -#endif - }; - - // We will handle FUA at the end of the request (after we restore the - // bi_rw field from externalIORequest.rw). 
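/*
 * Sketch (not from the original source): the compression step of
 * kvdoCompressWork() above, using the one-shot LZ4_compress_default() in
 * place of the context-based LZ4_compress_ctx_limitedOutput().  With the
 * output capacity capped at one block, LZ4 returns 0 when the data will not
 * fit, which VDO records as VDO_BLOCK_SIZE + 1 ("incompressible").
 */
#include <lz4.h>

static unsigned int compressedSizeOrMarker(const char  *dataBlock,
                                           char        *scratchBlock,
                                           unsigned int blockSize)
{
  int size = LZ4_compress_default(dataBlock, scratchBlock,
                                  (int) blockSize, (int) blockSize);
  return (size > 0) ? (unsigned int) size : blockSize + 1;
}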
- clearBioOperationFlagFua(bio); - - DataKVIO *dataKVIO = NULL; - int result = makeDataKVIO(layer, bio, &dataKVIO); - if (result != VDO_SUCCESS) { - return result; - } - - dataKVIO->externalIORequest = externalIORequest; - dataKVIO->offset = sectorToBlockOffset(layer, getBioSector(bio)); - dataKVIO->isPartial = ((getBioSize(bio) < VDO_BLOCK_SIZE) - || (dataKVIO->offset != 0)); - - if (dataKVIO->isPartial) { - countBios(&layer->biosInPartial, bio); - } else { - /* - * Note that we unconditionally fill in the dataBlock array for - * non-read operations. There are places like kvdoCopyVIO that may - * look at kvio->dataBlock for a zero block (and maybe for - * discards?). We could skip filling in dataBlock for such cases, - * but only once we're sure all such places are fixed to check the - * isZeroBlock flag first. - */ - if (isDiscardBio(bio)) { - /* - * This is a discard/trim operation. This is treated much like the zero - * block, but we keep different stats and distinguish it in the block - * map. - */ - memset(dataKVIO->dataBlock, 0, VDO_BLOCK_SIZE); - } else if (bio_data_dir(bio) == WRITE) { - dataKVIO->dataVIO.isZeroBlock = bioIsZeroData(bio); - // Copy the bio data to a char array so that we can continue to use - // the data after we acknowledge the bio. - bioCopyDataIn(bio, dataKVIO->dataBlock); - } - } - - if (dataKVIO->isPartial || isWriteBio(bio)) { - /* - * dataKVIO->bio will point at kvio->dataBlockBio for all writes and - * partial block I/O so the rest of the kernel code doesn't need to - * make a decision as to what to use. - */ - dataKVIO->dataBlockBio->bi_private = &dataKVIO->kvio; - if (dataKVIO->isPartial && isWriteBio(bio)) { - clearBioOperationAndFlags(dataKVIO->dataBlockBio); - setBioOperationRead(dataKVIO->dataBlockBio); - } else { - copyBioOperationAndFlags(dataKVIO->dataBlockBio, bio); - } - dataKVIOAsKVIO(dataKVIO)->bio = dataKVIO->dataBlockBio; - dataKVIO->readBlock.data = dataKVIO->dataBlock; - } - - setBioBlockDevice(bio, getKernelLayerBdev(layer)); - bio->bi_end_io = completeAsyncBio; - *dataKVIOPtr = dataKVIO; - return VDO_SUCCESS; -} - -/**********************************************************************/ -static void launchDataKVIOWork(KvdoWorkItem *item) -{ - runCallback(vioAsCompletion(workItemAsKVIO(item)->vio)); -} - -/** - * Continue discard processing for requests that span multiple physical blocks. - * If all have been processed the KVIO is completed. If we have already seen - * an error, we skip the rest of the discard and fail immediately. - * - *
Invoked in a request-queue thread after the discard of a block has - * completed. - * - * @param completion A completion representing the discard KVIO - **/ -static void kvdoContinueDiscardKVIO(VDOCompletion *completion) -{ - DataVIO *dataVIO = asDataVIO(completion); - DataKVIO *dataKVIO = dataVIOAsDataKVIO(dataVIO); - KernelLayer *layer = getLayerFromDataKVIO(dataKVIO); - dataKVIO->remainingDiscard - -= min(dataKVIO->remainingDiscard, - (DiscardSize) (VDO_BLOCK_SIZE - dataKVIO->offset)); - if ((completion->result != VDO_SUCCESS) - || (dataKVIO->remainingDiscard == 0)) { - if (dataKVIO->hasDiscardPermit) { - limiterRelease(&layer->discardLimiter); - dataKVIO->hasDiscardPermit = false; - } - kvdoCompleteDataKVIO(completion); - return; - } - - BIO *bio = getBIOFromDataKVIO(dataKVIO); - resetBio(bio, layer); - dataKVIO->isPartial = (dataKVIO->remainingDiscard < VDO_BLOCK_SIZE); - dataKVIO->offset = 0; - - VIOOperation operation; - if (dataKVIO->isPartial) { - operation = VIO_READ_MODIFY_WRITE; - setBioOperationRead(bio); - } else { - operation = VIO_WRITE; - } - - if (requestorSetFUA(dataKVIO)) { - operation |= VIO_FLUSH_AFTER; - } - - prepareDataVIO(dataVIO, dataVIO->logical.lbn + 1, operation, - !dataKVIO->isPartial, kvdoContinueDiscardKVIO); - enqueueDataKVIO(dataKVIO, launchDataKVIOWork, completion->callback, - REQ_Q_ACTION_MAP_BIO); -} - -/** - * Finish a partial read. - * - * @param completion The partial read KVIO - **/ -static void kvdoCompletePartialRead(VDOCompletion *completion) -{ - DataKVIO *dataKVIO = dataVIOAsDataKVIO(asDataVIO(completion)); - dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); - - bioCopyDataOut(dataKVIO->externalIORequest.bio, - dataKVIO->readBlock.data + dataKVIO->offset); - kvdoCompleteDataKVIO(completion); - return; -} - -/**********************************************************************/ -int kvdoLaunchDataKVIOFromBio(KernelLayer *layer, - BIO *bio, - uint64_t arrivalTime, - bool hasDiscardPermit) -{ - - DataKVIO *dataKVIO = NULL; - int result = kvdoCreateKVIOFromBio(layer, bio, arrivalTime, &dataKVIO); - if (unlikely(result != VDO_SUCCESS)) { - logInfo("%s: KVIO allocation failure", __func__); - if (hasDiscardPermit) { - limiterRelease(&layer->discardLimiter); - } - limiterRelease(&layer->requestLimiter); - return mapToSystemError(result); - } - - /* - * Discards behave very differently than other requests when coming - * in from device-mapper. We have to be able to handle any size discards - * and with various sector offsets within a block. 
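/*
 * Sketch (not from the original source): the partial-I/O test made in
 * kvdoCreateKVIOFromBio() above, assuming the usual 512-byte sectors and
 * 4 KiB VDO blocks (eight sectors per block).  A request is partial when it
 * is shorter than a block or does not start on a block boundary; discards may
 * additionally span many blocks and are re-issued one block at a time by
 * kvdoContinueDiscardKVIO().
 */
#include <stdbool.h>
#include <stdint.h>

enum { SKETCH_SECTORS_PER_BLOCK = 8, SKETCH_VDO_BLOCK_SIZE = 4096 };

static bool isPartialRequest(uint64_t startSector, unsigned int byteCount)
{
  unsigned int byteOffsetInBlock =
    (unsigned int) (startSector % SKETCH_SECTORS_PER_BLOCK) * 512;
  return (byteCount < SKETCH_VDO_BLOCK_SIZE) || (byteOffsetInBlock != 0);
}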
- */ - KVIO *kvio = &dataKVIO->kvio; - VDOAction *callback = kvdoCompleteDataKVIO; - VIOOperation operation = VIO_WRITE; - bool isTrim = false; - if (isDiscardBio(bio)) { - dataKVIO->hasDiscardPermit = hasDiscardPermit; - dataKVIO->remainingDiscard = getBioSize(bio); - callback = kvdoContinueDiscardKVIO; - if (dataKVIO->isPartial) { - operation = VIO_READ_MODIFY_WRITE; - } else { - isTrim = true; - } - } else if (dataKVIO->isPartial) { - if (bio_data_dir(bio) == READ) { - callback = kvdoCompletePartialRead; - operation = VIO_READ; - } else { - operation = VIO_READ_MODIFY_WRITE; - } - } else if (bio_data_dir(bio) == READ) { - operation = VIO_READ; - } - - if (requestorSetFUA(dataKVIO)) { - operation |= VIO_FLUSH_AFTER; - } - - LogicalBlockNumber lbn - = sectorToBlock(layer, getBioSector(bio) - layer->startingSectorOffset); - prepareDataVIO(&dataKVIO->dataVIO, lbn, operation, isTrim, callback); - enqueueKVIO(kvio, launchDataKVIOWork, vioAsCompletion(kvio->vio)->callback, - REQ_Q_ACTION_MAP_BIO); - return VDO_SUCCESS; -} - -/** - * Hash a DataKVIO and set its chunk name. - * - * @param item The DataKVIO to be hashed - **/ -static void kvdoHashDataWork(KvdoWorkItem *item) -{ - DataKVIO *dataKVIO = workItemAsDataKVIO(item); - DataVIO *dataVIO = &dataKVIO->dataVIO; - dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); - - MurmurHash3_x64_128(dataKVIO->dataBlock, VDO_BLOCK_SIZE, 0x62ea60be, - &dataVIO->chunkName); - dataKVIO->dedupeContext.chunkName = &dataVIO->chunkName; - - kvdoEnqueueDataVIOCallback(dataKVIO); -} - -/**********************************************************************/ -void kvdoHashDataVIO(DataVIO *dataVIO) -{ - dataVIOAddTraceRecord(dataVIO, THIS_LOCATION(NULL)); - launchDataKVIOOnCPUQueue(dataVIOAsDataKVIO(dataVIO), kvdoHashDataWork, NULL, - CPU_Q_ACTION_HASH_BLOCK); -} - -/**********************************************************************/ -void kvdoCheckForDuplication(DataVIO *dataVIO) -{ - dataVIOAddTraceRecord(dataVIO, - THIS_LOCATION("checkForDuplication;dup=post")); - ASSERT_LOG_ONLY(!dataVIO->isZeroBlock, - "zero block not checked for duplication"); - ASSERT_LOG_ONLY(dataVIO->newMapped.state != MAPPING_STATE_UNMAPPED, - "discard not checked for duplication"); - - DataKVIO *dataKVIO = dataVIOAsDataKVIO(dataVIO); - if (hasAllocation(dataVIO)) { - postDedupeAdvice(dataKVIO); - } else { - // This block has not actually been written (presumably because we are - // full), so attempt to dedupe without posting bogus advice. - queryDedupeAdvice(dataKVIO); - } -} - -/**********************************************************************/ -void kvdoUpdateDedupeAdvice(DataVIO *dataVIO) -{ - updateDedupeAdvice(dataVIOAsDataKVIO(dataVIO)); -} - -/** - * Implements BufferFreeFunction. - **/ -static void freePooledDataKVIO(void *poolData, void *data) -{ - if (data == NULL) { - return; - } - - DataKVIO *dataKVIO = (DataKVIO *) data; - KernelLayer *layer = (KernelLayer *) poolData; - if (WRITE_PROTECT_FREE_POOL) { - setWriteProtect(dataKVIO, WP_DATA_KVIO_SIZE, false); - } - - if (dataKVIO->dataBlockBio != NULL) { - freeBio(dataKVIO->dataBlockBio, layer); - } - - if (dataKVIO->readBlock.bio != NULL) { - freeBio(dataKVIO->readBlock.bio, layer); - } - - FREE(dataKVIO->readBlock.buffer); - FREE(dataKVIO->dataBlock); - FREE(dataKVIO->scratchBlock); - FREE(dataKVIO); -} - -/** - * Allocate a DataKVIO. This function is the internals of makePooledDataKVIO(). 
- * - * @param [in] layer The layer in which the DataKVIO will operate - * @param [out] dataKVIOPtr A pointer to hold the newly allocated DataKVIO - * - * @return VDO_SUCCESS or an error - **/ -static int allocatePooledDataKVIO(KernelLayer *layer, DataKVIO **dataKVIOPtr) -{ - DataKVIO *dataKVIO; - int result; - if (WRITE_PROTECT_FREE_POOL) { - STATIC_ASSERT(WP_DATA_KVIO_SIZE >= sizeof(DataKVIO)); - result = allocateMemory(WP_DATA_KVIO_SIZE, 0, __func__, &dataKVIO); - if (result == VDO_SUCCESS) { - BUG_ON((((size_t) dataKVIO) & (PAGE_SIZE - 1)) != 0); - } - } else { - result = ALLOCATE(1, DataKVIO, __func__, &dataKVIO); - } - - if (result != VDO_SUCCESS) { - return logErrorWithStringError(result, "DataKVIO allocation failure"); - } - - STATIC_ASSERT(VDO_BLOCK_SIZE <= PAGE_SIZE); - result = allocateMemory(VDO_BLOCK_SIZE, 0, "kvio data", - &dataKVIO->dataBlock); - if (result != VDO_SUCCESS) { - freePooledDataKVIO(layer, dataKVIO); - return logErrorWithStringError(result, "DataKVIO data allocation failure"); - } - - result = createBio(layer, dataKVIO->dataBlock, &dataKVIO->dataBlockBio); - if (result != VDO_SUCCESS) { - freePooledDataKVIO(layer, dataKVIO); - return logErrorWithStringError(result, - "DataKVIO data bio allocation failure"); - } - - result = allocateMemory(VDO_BLOCK_SIZE, 0, "kvio read buffer", - &dataKVIO->readBlock.buffer); - if (result != VDO_SUCCESS) { - freePooledDataKVIO(layer, dataKVIO); - return logErrorWithStringError(result, - "DataKVIO read allocation failure"); - } - - result = createBio(layer, dataKVIO->readBlock.buffer, - &dataKVIO->readBlock.bio); - if (result != VDO_SUCCESS) { - freePooledDataKVIO(layer, dataKVIO); - return logErrorWithStringError(result, - "DataKVIO read bio allocation failure"); - } - - dataKVIO->readBlock.bio->bi_private = &dataKVIO->kvio; - - result = allocateMemory(VDO_BLOCK_SIZE, 0, "kvio scratch", - &dataKVIO->scratchBlock); - if (result != VDO_SUCCESS) { - freePooledDataKVIO(layer, dataKVIO); - return logErrorWithStringError(result, - "DataKVIO scratch allocation failure"); - } - - *dataKVIOPtr = dataKVIO; - return VDO_SUCCESS; -} - -/** - * Implements BufferAllocateFunction. - **/ -static int makePooledDataKVIO(void *poolData, void **dataPtr) -{ - DataKVIO *dataKVIO = NULL; - int result = allocatePooledDataKVIO((KernelLayer *) poolData, &dataKVIO); - if (result != VDO_SUCCESS) { - freePooledDataKVIO(poolData, dataKVIO); - return result; - } - - *dataPtr = dataKVIO; - return VDO_SUCCESS; -} - -/** - * Dump out the waiters on each DataVIO in the DataVIO buffer pool. - * - * @param queue The queue to check (logical or physical) - * @param waitOn The label to print for queue (logical or physical) - **/ -static void dumpVIOWaiters(WaitQueue *queue, char *waitOn) -{ - Waiter *first = getFirstWaiter(queue); - if (first == NULL) { - return; - } - - DataVIO *dataVIO = waiterAsDataVIO(first); - logInfo(" %s is locked. Waited on by: VIO %" PRIptr " pbn %" PRIu64 - " lbn %llu d-pbn %llu lastOp %s", - waitOn, dataVIO, getDataVIOAllocation(dataVIO), - dataVIO->logical.lbn, dataVIO->duplicate.pbn, - getOperationName(dataVIO)); - - Waiter *waiter; - for (waiter = first->nextWaiter; - waiter != first; - waiter = waiter->nextWaiter) { - dataVIO = waiterAsDataVIO(waiter); - logInfo(" ... 
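/*
 * Editor's sketch of the error-handling pattern used by
 * allocatePooledDataKVIO()/freePooledDataKVIO() above: the destructor
 * tolerates a partially built object, so every failure path can simply
 * call it and return. Types and names here are illustrative only.
 */
#include <stdlib.h>

struct widget {
  char *bufA;
  char *bufB;
};

static void freeWidget(struct widget *w)
{
  if (w == NULL) {
    return;
  }
  free(w->bufA);          /* free(NULL) is a no-op, like the FREE() calls above */
  free(w->bufB);
  free(w);
}

static struct widget *allocateWidget(size_t size)
{
  struct widget *w = calloc(1, sizeof(*w));
  if (w == NULL) {
    return NULL;
  }
  w->bufA = malloc(size);
  if (w->bufA == NULL) {
    freeWidget(w);        /* safe: bufB is still NULL */
    return NULL;
  }
  w->bufB = malloc(size);
  if (w->bufB == NULL) {
    freeWidget(w);
    return NULL;
  }
  return w;
}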
and : VIO %" PRIptr " pbn %llu lbn %" - PRIu64 " d-pbn %llu lastOp %s", - dataVIO, getDataVIOAllocation(dataVIO), dataVIO->logical.lbn, - dataVIO->duplicate.pbn, getOperationName(dataVIO)); - } -} - -/** - * Encode various attributes of a VIO as a string of one-character flags for - * dump logging. This encoding is for logging brevity: - * - * R => VIO completion result not VDO_SUCCESS - * W => VIO is on a wait queue - * D => VIO is a duplicate - * - *
The common case of no flags set will result in an empty, null-terminated - * buffer. If any flags are encoded, the first character in the string will be - * a space character. - * - * @param dataVIO The VIO to encode - * @param buffer The buffer to receive a null-terminated string of encoded - * flag character - **/ -static void encodeVIODumpFlags(DataVIO *dataVIO, char buffer[8]) -{ - char *pFlag = buffer; - *pFlag++ = ' '; - if (dataVIOAsCompletion(dataVIO)->result != VDO_SUCCESS) { - *pFlag++ = 'R'; - } - if (dataVIOAsAllocatingVIO(dataVIO)->waiter.nextWaiter != NULL) { - *pFlag++ = 'W'; - } - if (dataVIO->isDuplicate) { - *pFlag++ = 'D'; - } - if (pFlag == &buffer[1]) { - // No flags, so remove the blank space. - pFlag = buffer; - } - *pFlag = '\0'; -} - -/** - * Dump out info on a DataKVIO from the DataKVIO pool. - * - *
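/*
 * Editor's standalone sketch of the flag-string encoding performed by
 * encodeVIODumpFlags() above: write a leading space plus one character per
 * set flag, or an empty string when nothing is set. The boolean parameters
 * stand in for the DataVIO fields and are assumptions of this example.
 */
#include <stdio.h>

static void encodeFlags(char buffer[8], int hadError, int isWaiting, int isDuplicate)
{
  char *p = buffer;
  *p++ = ' ';
  if (hadError)    { *p++ = 'R'; }
  if (isWaiting)   { *p++ = 'W'; }
  if (isDuplicate) { *p++ = 'D'; }
  if (p == &buffer[1]) {
    p = buffer;                  /* no flags set: drop the leading space */
  }
  *p = '\0';
}

int main(void)
{
  char flags[8];
  encodeFlags(flags, 1, 0, 1);
  printf("flags='%s'\n", flags); /* prints "flags=' RD'" */
  return 0;
}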
Implements BufferDumpFunction. - * - * @param poolData The pool data - * @param data The DataKVIO to dump - **/ -static void dumpPooledDataKVIO(void *poolData __attribute__((unused)), - void *data) -{ - DataKVIO *dataKVIO = (DataKVIO *) data; - DataVIO *dataVIO = &dataKVIO->dataVIO; - - /* - * This just needs to be big enough to hold a queue (thread) name - * and a function name (plus a separator character and NUL). The - * latter is limited only by taste. - * - * In making this static, we're assuming only one "dump" will run at - * a time. If more than one does run, the log output will be garbled - * anyway. - */ - static char vioWorkItemDumpBuffer[100 + MAX_QUEUE_NAME_LEN]; - /* - * We're likely to be logging a couple thousand of these lines, and - * in some circumstances syslogd may have trouble keeping up, so - * keep it BRIEF rather than user-friendly. - */ - dumpWorkItemToBuffer(&dataKVIO->kvio.enqueueable.workItem, - vioWorkItemDumpBuffer, sizeof(vioWorkItemDumpBuffer)); - // Another static buffer... - // log10(256) = 2.408+, round up: - enum { DECIMAL_DIGITS_PER_UINT64_T = (int) (1 + 2.41 * sizeof(uint64_t)) }; - static char vioBlockNumberDumpBuffer[sizeof("P L D") - + 3 * DECIMAL_DIGITS_PER_UINT64_T]; - if (dataVIO->isDuplicate) { - snprintf(vioBlockNumberDumpBuffer, sizeof(vioBlockNumberDumpBuffer), - "P%llu L%llu D%llu", - getDataVIOAllocation(dataVIO), dataVIO->logical.lbn, - dataVIO->duplicate.pbn); - } else if (hasAllocation(dataVIO)) { - snprintf(vioBlockNumberDumpBuffer, sizeof(vioBlockNumberDumpBuffer), - "P%llu L%llu", - getDataVIOAllocation(dataVIO), dataVIO->logical.lbn); - } else { - snprintf(vioBlockNumberDumpBuffer, sizeof(vioBlockNumberDumpBuffer), - "L%llu", - dataVIO->logical.lbn); - } - - static char vioFlushGenerationBuffer[sizeof(" FG") - + DECIMAL_DIGITS_PER_UINT64_T] = ""; - if (dataVIO->flushGeneration != 0) { - snprintf(vioFlushGenerationBuffer, sizeof(vioFlushGenerationBuffer), - " FG%llu", dataVIO->flushGeneration); - } - - // Encode VIO attributes as a string of one-character flags, usually empty. 
- static char flagsDumpBuffer[8]; - encodeVIODumpFlags(dataVIO, flagsDumpBuffer); - - logInfo(" kvio %" PRIptr " %s%s %s %s%s", - dataKVIO, vioBlockNumberDumpBuffer, vioFlushGenerationBuffer, - getOperationName(dataVIO), vioWorkItemDumpBuffer, flagsDumpBuffer); - // might want info on: wantAlbireoAnswer / operation / status - // might want info on: bio / bioToSubmit / biosMerged - - dumpVIOWaiters(&dataVIO->logical.waiters, "lbn"); - - // might want to dump more info from VIO here -} - -/**********************************************************************/ -int makeDataKVIOBufferPool(KernelLayer *layer, - uint32_t poolSize, - BufferPool **bufferPoolPtr) -{ - return makeBufferPool("DataKVIO Pool", poolSize, - makePooledDataKVIO, freePooledDataKVIO, - dumpPooledDataKVIO, layer, bufferPoolPtr); -} - -/**********************************************************************/ -DataLocation getDedupeAdvice(const DedupeContext *context) -{ - DataKVIO *dataKVIO = container_of(context, DataKVIO, dedupeContext); - return (DataLocation) { - .state = dataKVIO->dataVIO.newMapped.state, - .pbn = dataKVIO->dataVIO.newMapped.pbn, - }; -} - -/**********************************************************************/ -void setDedupeAdvice(DedupeContext *context, const DataLocation *advice) -{ - DataKVIO *dataKVIO = container_of(context, DataKVIO, dedupeContext); - receiveDedupeAdvice(&dataKVIO->dataVIO, advice); -} diff --git a/vdo/kernel/dataKVIO.h b/vdo/kernel/dataKVIO.h deleted file mode 100644 index c3989f4..0000000 --- a/vdo/kernel/dataKVIO.h +++ /dev/null @@ -1,468 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dataKVIO.h#5 $ - */ - -#ifndef DATA_KVIO_H -#define DATA_KVIO_H - -#include "dataVIO.h" -#include "kvio.h" -#include "uds-block.h" - -typedef struct { - /* - * The BIO which was received from the device mapper to initiate an I/O - * request. This field will be non-NULL only until the request is - * acknowledged. - */ - BIO *bio; - // Cached copies of fields from the bio which will need to be reset after - // we're done. - void *private; - void *endIO; - // This is a copy of the bi_rw field of the BIO which sadly is not just - // a boolean read-write flag, but also includes other flag bits. - unsigned long rw; -} ExternalIORequest; - -/* Dedupe support */ -struct dedupeContext { - UdsRequest udsRequest; - struct list_head pendingList; - Jiffies submissionTime; - Atomic32 requestState; - int status; - bool isPending; - /** Hash of the associated VIO (NULL if not calculated) */ - const UdsChunkName *chunkName; -}; - -typedef struct { - /** - * A pointer to a block that holds the data from the last read operation. - **/ - char *data; - /** - * Temporary storage for doing reads from the underlying device. 
- **/ - char *buffer; - /** - * A bio structure wrapping the buffer. - **/ - BIO *bio; - /** - * Callback to invoke after completing the read I/O operation. - **/ - DataKVIOCallback callback; - /** - * Mapping state passed to kvdoReadBlock(), used to determine whether - * the data must be uncompressed. - **/ - BlockMappingState mappingState; - /** - * The result code of the read attempt. - **/ - int status; -} ReadBlock; - -struct dataKVIO { - /* The embedded base code's DataVIO */ - DataVIO dataVIO; - /* The embedded KVIO */ - KVIO kvio; - /* The BIO from the request which is being serviced by this KVIO. */ - ExternalIORequest externalIORequest; - /* Dedupe */ - DedupeContext dedupeContext; - /* Read cache */ - ReadBlock readBlock; - /* partial block support */ - BlockSize offset; - bool isPartial; - /* discard support */ - bool hasDiscardPermit; - DiscardSize remainingDiscard; - /** - * A copy of user data written, so we can do additional processing - * (dedupe, compression) after acknowledging the I/O operation and - * thus losing access to the original data. - * - * Also used as buffer space for read-modify-write cycles when - * emulating smaller-than-blockSize I/O operations. - **/ - char *dataBlock; - /** A bio structure describing the #dataBlock buffer. */ - BIO *dataBlockBio; - /** A block used as output during compression or uncompression. */ - char *scratchBlock; -}; - -/** - * Convert a KVIO to a DataKVIO. - * - * @param kvio The KVIO to convert - * - * @return The KVIO as a DataKVIO - **/ -static inline DataKVIO *kvioAsDataKVIO(KVIO *kvio) -{ - ASSERT_LOG_ONLY(isData(kvio), "KVIO is a DataKVIO"); - return container_of(kvio, DataKVIO, kvio); -} - -/** - * Convert a DataKVIO to a KVIO. - * - * @param dataKVIO The DataKVIO to convert - * - * @return The DataKVIO as a KVIO - **/ -static inline KVIO *dataKVIOAsKVIO(DataKVIO *dataKVIO) -{ - return &dataKVIO->kvio; -} - -/** - * Returns a pointer to the DataKVIO wrapping a DataVIO. - * - * @param dataVIO the DataVIO - * - * @return the DataKVIO - **/ -static inline DataKVIO *dataVIOAsDataKVIO(DataVIO *dataVIO) -{ - return container_of(dataVIO, DataKVIO, dataVIO); -} - -/** - * Returns a pointer to the KVIO associated with a DataVIO. - * - * @param dataVIO the DataVIO - * - * @return the KVIO - **/ -static inline KVIO *dataVIOAsKVIO(DataVIO *dataVIO) -{ - return dataKVIOAsKVIO(dataVIOAsDataKVIO(dataVIO)); -} - -/** - * Returns a pointer to the DataKVIO wrapping a work item. - * - * @param item the work item - * - * @return the DataKVIO - **/ -static inline DataKVIO *workItemAsDataKVIO(KvdoWorkItem *item) -{ - return kvioAsDataKVIO(workItemAsKVIO(item)); -} - -/** - * Get the WorkItem from a DataKVIO. - * - * @param dataKVIO The DataKVIO - * - * @return the DataKVIO's work item - **/ -static inline KvdoWorkItem *workItemFromDataKVIO(DataKVIO *dataKVIO) -{ - return &dataKVIOAsKVIO(dataKVIO)->enqueueable.workItem; -} - -/** - * Get the BIO from a DataKVIO. - * - * @param dataKVIO The DataKVIO from which to get the BIO - * - * @return The DataKVIO's BIO - **/ -static inline BIO *getBIOFromDataKVIO(DataKVIO *dataKVIO) -{ - return dataKVIOAsKVIO(dataKVIO)->bio; -} - -/** - * Get the KernelLayer from a DataKVIO. - * - * @param dataKVIO The DataKVIO from which to get the KernelLayer - * - * @return The DataKVIO's KernelLayer - **/ -static inline KernelLayer *getLayerFromDataKVIO(DataKVIO *dataKVIO) -{ - return dataKVIOAsKVIO(dataKVIO)->layer; -} - -/** - * Set up and enqueue a DataKVIO's work item to be processed in the base code - * context. 
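/*
 * Editor's sketch of the container_of() idiom behind the conversion helpers
 * above (kvioAsDataKVIO, dataVIOAsDataKVIO, ...): given a pointer to an
 * embedded member, recover the enclosing structure by subtracting the
 * member's offset. The outer/inner type names are illustrative only.
 */
#include <stddef.h>

struct inner {
  int value;
};

struct outer {
  int          id;
  struct inner member;     /* embedded, like the DataVIO inside a DataKVIO */
};

static struct outer *outerFromInner(struct inner *ptr)
{
  /* Equivalent to the kernel's container_of(ptr, struct outer, member). */
  return (struct outer *) ((char *) ptr - offsetof(struct outer, member));
}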
- * - * @param dataKVIO The DataKVIO with the work item to be run - * @param work The function pointer to execute - * @param statsFunction A function pointer to record for stats, or NULL - * @param action Action code, mapping to a relative priority - **/ -static inline void enqueueDataKVIO(DataKVIO *dataKVIO, - KvdoWorkFunction work, - void *statsFunction, - unsigned int action) -{ - enqueueKVIO(dataKVIOAsKVIO(dataKVIO), work, statsFunction, action); -} - -/** - * Enqueue a DataKVIO on a work queue. - * - * @param queue The queue - * @param dataKVIO The DataKVIO - **/ -static inline void enqueueDataKVIOWork(KvdoWorkQueue *queue, - DataKVIO *dataKVIO) -{ - enqueueKVIOWork(queue, dataKVIOAsKVIO(dataKVIO)); -} - -/** - * Add a trace record for the current source location. - * - * @param dataKVIO The DataKVIO structure to be updated - * @param location The source-location descriptor to be recorded - **/ -static inline void dataKVIOAddTraceRecord(DataKVIO *dataKVIO, - TraceLocation location) -{ - dataVIOAddTraceRecord(&dataKVIO->dataVIO, location); -} - -/** - * Set up and enqueue a DataKVIO on the CPU queue. - * - * @param dataKVIO The DataKVIO to set up - * @param work The function pointer to execute - * @param statsFunction A function pointer to record for stats, or NULL - * @param action Action code, mapping to a relative priority - **/ -static inline void launchDataKVIOOnCPUQueue(DataKVIO *dataKVIO, - KvdoWorkFunction work, - void *statsFunction, - unsigned int action) -{ - KVIO *kvio = dataKVIOAsKVIO(dataKVIO); - launchKVIO(kvio, work, statsFunction, action, kvio->layer->cpuQueue); -} - -/** - * Set up and enqueue a DataKVIO on the BIO Ack queue. - * - * @param dataKVIO The DataKVIO to set up - * @param work The function pointer to execute - * @param statsFunction A function pointer to record for stats, or NULL - * @param action Action code, mapping to a relative priority - **/ -static inline void launchDataKVIOOnBIOAckQueue(DataKVIO *dataKVIO, - KvdoWorkFunction work, - void *statsFunction, - unsigned int action) -{ - KVIO *kvio = dataKVIOAsKVIO(dataKVIO); - launchKVIO(kvio, work, statsFunction, action, kvio->layer->bioAckQueue); -} - -/** - * Move a DataKVIO back to the base threads. - * - * @param dataKVIO The DataKVIO to enqueue - **/ -static inline void kvdoEnqueueDataVIOCallback(DataKVIO *dataKVIO) -{ - kvdoEnqueueVIOCallback(dataKVIOAsKVIO(dataKVIO)); -} - -/** - * Check whether the external request bio had FUA set. - * - * @param dataKVIO The DataKVIO to check - * - * @return true if the external request bio had FUA set - **/ -static inline bool requestorSetFUA(DataKVIO *dataKVIO) -{ - return ((dataKVIO->externalIORequest.rw & REQ_FUA) == REQ_FUA); -} - -/** - * Associate a KVIO with a BIO passed in from the block layer, and start - * processing the KVIO. - * - * If setting up a KVIO fails, a message is logged, and the limiter permits - * (request and maybe discard) released, but the caller is responsible for - * disposing of the bio. - * - * @param layer The physical layer - * @param bio The bio for which to create KVIO - * @param arrivalTime The time (in jiffies) when the external request - * entered the device mapbio function - * @param hasDiscardPermit Whether we got a permit from the discardLimiter - * of the kernel layer - * - * @return VDO_SUCCESS or a system error code - **/ -int kvdoLaunchDataKVIOFromBio(KernelLayer *layer, - BIO *bio, - Jiffies arrivalTime, - bool hasDiscardPermit) - __attribute__((warn_unused_result)); - -/** - * Return a batch of DataKVIOs to the pool. 
- * - *
Implements BatchProcessorCallback. - * - * @param batch The batch processor - * @param closure The kernal layer - **/ -void returnDataKVIOBatchToPool(BatchProcessor *batch, void *closure); - -/** - * Implements DataVIOZeroer. - * - * @param dataVIO The DataVIO to zero - **/ -void kvdoZeroDataVIO(DataVIO *dataVIO); - -/** - * Implements DataCopier. - * - * @param source The DataVIO to copy from - * @param destination The DataVIO to copy to - **/ -void kvdoCopyDataVIO(DataVIO *source, DataVIO *destination); - -/** - * Fetch the data for a block from storage. The fetched data will be - * uncompressed when the callback is called, and the result of the read - * operation will be stored in the ReadBlock's status field. On success, - * the data will be in the ReadBlock's data pointer. - * - * @param dataVIO The DataVIO to read a block in for - * @param location The physical block number to read from - * @param mappingState The mapping state of the block to read - * @param action The bio queue action - * @param callback The function to call when the read is done - **/ -void kvdoReadBlock(DataVIO *dataVIO, - PhysicalBlockNumber location, - BlockMappingState mappingState, - BioQAction action, - DataKVIOCallback callback); - -/** - * Implements DataReader. - * - * @param dataVIO The DataVIO to read - **/ -void kvdoReadDataVIO(DataVIO *dataVIO); - -/** - * Implements DataWriter. - * - * @param dataVIO The DataVIO to write - **/ -void kvdoWriteDataVIO(DataVIO *dataVIO); - -/** - * Implements DataModifier. - * - * @param dataVIO The DataVIO to modify - **/ -void kvdoModifyWriteDataVIO(DataVIO *dataVIO); - -/** - * Implements DataHasher. - * - * @param dataVIO The DataVIO to hash - **/ -void kvdoHashDataVIO(DataVIO *dataVIO); - -/** - * Implements DuplicationChecker. - * - * @param dataVIO The DataVIO containing the block to check - **/ -void kvdoCheckForDuplication(DataVIO *dataVIO); - -/** - * Implements DataAcknowledger. - * - * @param dataVIO The DataVIO to acknowledge - **/ -void kvdoAcknowledgeDataVIO(DataVIO *dataVIO); - -/** - * Implements DataCompressor. - * - * @param dataVIO The DataVIO to compress - **/ -void kvdoCompressDataVIO(DataVIO *dataVIO); - -/** - * Implements AlbireoUpdater. - * - * @param dataVIO The DataVIO which needs to change the entry for its data - **/ -void kvdoUpdateDedupeAdvice(DataVIO *dataVIO); - -/** - * Allocate a buffer pool of DataKVIOs. - * - * @param [in] layer The layer in which the DataKVIOs will operate - * @param [in] poolSize The number of DataKVIOs in the pool - * @param [out] bufferPoolPtr A pointer to hold the new buffer pool - * - * @return VDO_SUCCESS or an error - **/ -int makeDataKVIOBufferPool(KernelLayer *layer, - uint32_t poolSize, - BufferPool **bufferPoolPtr) - __attribute__((warn_unused_result)); - -/** - * Get the state needed to generate UDS metadata from the DataKVIO - * associated with a DedupeContext. - * - * @param context The DedupeContext - * - * @return the advice to store in the UDS index - **/ -DataLocation getDedupeAdvice(const DedupeContext *context) - __attribute__((warn_unused_result)); - -/** - * Set the result of a dedupe query for the DataKVIO associated with a - * DedupeContext. 
- * - * @param context The context receiving advice - * @param advice A data location at which the chunk named in the context - * might be stored (will be NULL if no advice was found) - **/ -void setDedupeAdvice(DedupeContext *context, const DataLocation *advice); - -#endif /* DATA_KVIO_H */ diff --git a/vdo/kernel/deadlockQueue.c b/vdo/kernel/deadlockQueue.c deleted file mode 100644 index 2350b35..0000000 --- a/vdo/kernel/deadlockQueue.c +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/deadlockQueue.c#1 $ - */ - -#include "deadlockQueue.h" - -/**********************************************************************/ -void initializeDeadlockQueue(DeadlockQueue *queue) -{ - spin_lock_init(&queue->lock); - bio_list_init(&queue->list); -} - -/**********************************************************************/ -void addToDeadlockQueue(DeadlockQueue *queue, BIO *bio, Jiffies arrivalTime) -{ - spin_lock(&queue->lock); - if (bio_list_empty(&queue->list)) { - /* - * If we get more than one pending at once, this will be inaccurate for - * some of them. Oh well. If we've gotten here, we're trying to avoid a - * deadlock; stats are a secondary concern. - */ - queue->arrivalTime = arrivalTime; - } - bio_list_add(&queue->list, bio); - spin_unlock(&queue->lock); -} diff --git a/vdo/kernel/deadlockQueue.h b/vdo/kernel/deadlockQueue.h deleted file mode 100644 index 85e0b46..0000000 --- a/vdo/kernel/deadlockQueue.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/deadlockQueue.h#1 $ - */ - -#ifndef DEADLOCK_QUEUE_H -#define DEADLOCK_QUEUE_H - -#include - -#include "bio.h" - -/** - * A holding space for incoming bios if we're not able to block until VIOs - * become available to process them. - **/ -typedef struct deadlockQueue { - /* Protection for the other fields. */ - spinlock_t lock; - /* List of bios we had to accept but don't have VIOs for. 
*/ - struct bio_list list; - /* - * Arrival time to use for statistics tracking for the above bios, since we - * haven't the space to store individual arrival times for each. - */ - Jiffies arrivalTime; -} DeadlockQueue; - -/** - * Initialize the DeadlockQueue structure. - * - * @param queue The structure to initialize - **/ -void initializeDeadlockQueue(DeadlockQueue *queue); - -/** - * Add an incoming bio to the list of saved-up bios we're not ready to start - * processing yet. - * - * This excess buffering on top of what the caller implements is generally a - * bad idea, and should be used only when necessary, such as to avoid a - * possible deadlock situation. - * - * @param queue The incoming-bio queue structure - * @param bio The new incoming bio to save - * @param arrivalTime The arrival time of this new bio - **/ -void addToDeadlockQueue(DeadlockQueue *queue, BIO *bio, Jiffies arrivalTime); - -/** - * Pull an incoming bio off the queue. - * - * The arrival time returned may be incorrect if multiple bios were saved, as - * there is no per-bio storage used, only one saved arrival time for the whole - * queue. - * - * @param [in] queue The incoming-bio queue - * @param [out] arrivalTime The arrival time to use for this bio - * - * @return a BIO pointer, or NULL if none were queued - **/ -static inline BIO *pollDeadlockQueue(DeadlockQueue *queue, - Jiffies *arrivalTime) -{ - spin_lock(&queue->lock); - BIO *bio = bio_list_pop(&queue->list); - if (unlikely(bio != NULL)) { - *arrivalTime = queue->arrivalTime; - } - spin_unlock(&queue->lock); - return bio; -} - -#endif // DEADLOCK_QUEUE_H diff --git a/vdo/kernel/dedupeIndex.c b/vdo/kernel/dedupeIndex.c deleted file mode 100644 index 811cd93..0000000 --- a/vdo/kernel/dedupeIndex.c +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
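/*
 * Editor's usage sketch (hypothetical caller, not from the deleted file):
 * a submission path that parks a bio in the DeadlockQueue when no VIO can be
 * obtained without blocking, and a worker that later drains it. Assumes the
 * addToDeadlockQueue()/pollDeadlockQueue() API declared above.
 */
static void exampleSubmit(DeadlockQueue *queue, BIO *bio, bool canBlock)
{
  if (!canBlock) {
    /* Park the bio; statistics will reuse the queue-wide arrival time. */
    addToDeadlockQueue(queue, bio, jiffies);
    return;
  }
  /* ... normal blocking submission would go here ... */
}

static void exampleDrain(DeadlockQueue *queue)
{
  Jiffies arrival;
  BIO    *bio;
  while ((bio = pollDeadlockQueue(queue, &arrival)) != NULL) {
    /* ... launch a DataKVIO for this bio, using 'arrival' for stats ... */
  }
}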
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dedupeIndex.c#1 $ - */ - -#include "dedupeIndex.h" - -#include "numeric.h" - -#include "udsIndex.h" - -// These times are in milliseconds -unsigned int albireoTimeoutInterval = 5000; -unsigned int minAlbireoTimerInterval = 100; - -// These times are in jiffies -Jiffies albireoTimeoutJiffies = 0; -static Jiffies minAlbireoTimerJiffies = 0; - -/**********************************************************************/ -Jiffies getAlbireoTimeout(Jiffies startJiffies) -{ - return maxULong(startJiffies + albireoTimeoutJiffies, - jiffies + minAlbireoTimerJiffies); -} - -/**********************************************************************/ -void setAlbireoTimeoutInterval(unsigned int value) -{ - // Arbitrary maximum value is two minutes - if (value > 120000) { - value = 120000; - } - // Arbitrary minimum value is 2 jiffies - Jiffies albJiffies = msecs_to_jiffies(value); - if (albJiffies < 2) { - albJiffies = 2; - value = jiffies_to_msecs(albJiffies); - } - albireoTimeoutInterval = value; - albireoTimeoutJiffies = albJiffies; -} - -/**********************************************************************/ -void setMinAlbireoTimerInterval(unsigned int value) -{ - // Arbitrary maximum value is one second - if (value > 1000) { - value = 1000; - } - - // Arbitrary minimum value is 2 jiffies - Jiffies minJiffies = msecs_to_jiffies(value); - if (minJiffies < 2) { - minJiffies = 2; - value = jiffies_to_msecs(minJiffies); - } - - minAlbireoTimerInterval = value; - minAlbireoTimerJiffies = minJiffies; -} - -/**********************************************************************/ -int makeDedupeIndex(DedupeIndex **indexPtr, KernelLayer *layer) -{ - if (albireoTimeoutJiffies == 0) { - setAlbireoTimeoutInterval(albireoTimeoutInterval); - } - - if (minAlbireoTimerJiffies == 0) { - setMinAlbireoTimerInterval(minAlbireoTimerInterval); - } - - return makeUDSIndex(layer, indexPtr); -} diff --git a/vdo/kernel/dedupeIndex.h b/vdo/kernel/dedupeIndex.h deleted file mode 100644 index 31d7631..0000000 --- a/vdo/kernel/dedupeIndex.h +++ /dev/null @@ -1,372 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dedupeIndex.h#5 $ - */ - -#ifndef DEDUPE_INDEX_H -#define DEDUPE_INDEX_H - -#include "dataKVIO.h" - -struct dedupeIndex { - - /** - * Do the dedupe section of dmsetup message vdo0 0 dump ... - * - * @param index The dedupe index - * @param showQueue true to dump a dedupe work queue - **/ - void (*dump)(DedupeIndex *index, bool showQueue); - - /** - * Free a dedupe index. The "finish" method must have been called - * first. 
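/*
 * Editor's sketch of the clamping done by setAlbireoTimeoutInterval() above,
 * in plain userspace C: cap the interval at two minutes, then enforce a
 * floor of two timer ticks and report the value actually in effect. The
 * 250 Hz tick rate is an assumption of this example (HZ varies by kernel).
 */
static unsigned int clampTimeoutMsec(unsigned int msec)
{
  const unsigned int hz = 250;                     /* assumed ticks per second */
  if (msec > 120000) {
    msec = 120000;                                 /* arbitrary two-minute cap */
  }
  unsigned long ticks = ((unsigned long) msec * hz + 999) / 1000;  /* round up */
  if (ticks < 2) {
    ticks = 2;                                     /* arbitrary two-tick floor */
    msec  = (unsigned int) (ticks * 1000 / hz);    /* report effective value */
  }
  return msec;
}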
- * - * @param index The dedupe index - **/ - void (*free)(DedupeIndex *index); - - /** - * Get the name of the deduplication state - * - * @param index The dedupe index - * - * @return the dedupe state name - **/ - const char *(*getDedupeStateName)(DedupeIndex *index); - - /** - * Get the index statistics - * - * @param index The dedupe index - * @param stats The index statistics - **/ - void (*getStatistics)(DedupeIndex *index, IndexStatistics *stats); - - /** - * Process a dmsetup message directed to the index. - * - * @param index The dedupe index - * @param name The message name - * - * @return 0 or an error code - **/ - int (*message)(DedupeIndex *index, const char *name); - - /** - * Look up the chunkname of the DataKVIO. If found, return the PBN - * previously associated with the name. If not found, associate the - * new PBN with the name. - * - * @param dataKVIO The DataKVIO - **/ - void (*post)(DataKVIO *dataKVIO); - - /** - * Look up the chunkname of the DataKVIO. If found, return the PBN - * previously associated with the name. If not found, do nothing. - * - * @param dataKVIO The DataKVIO - **/ - void (*query)(DataKVIO *dataKVIO); - - /** - * Start the dedupe index. - * - * @param index The dedupe index - * @param createFlag If true, create a new index without first attempting - * to load an existing index - **/ - void (*start)(DedupeIndex *index, bool createFlag); - - /** - * Stop the dedupe index. May be called by any thread, but will wait for - * the shutdown to be completed. - * - * @param index The dedupe index - **/ - void (*stop)(DedupeIndex *index); - - /** - * Suspend the dedupe index. If there are any outstanding index - * requests, wait for them to finish. If the index is doing any - * asynchronous writing, wait for the I/O to complete. If the index - * is not open yet and we are doing a rebuild of the master index, - * pause the rebuild so that it can be resumed later. May be called - * from any thread. - * - * @param index The dedupe index - * @param saveFlag True if we should save the index - **/ - void (*suspend)(DedupeIndex *index, bool saveFlag); - - /** - * Resume a suspended dedupe index. May be called from any thread. - * - * @param index The dedupe index - **/ - void (*resume)(DedupeIndex *index); - - /** - * Finish the dedupe index; shuts it down for good and prepares to - * free resources. After this point, no more requests may be sent to - * it. - * - * @param index The dedupe index - **/ - void (*finish)(DedupeIndex *index); - - /** - * Look up the chunkname of the DataKVIO and associate the new PBN with the - * name. - * - * @param dataKVIO The DataKVIO - **/ - void (*update)(DataKVIO *dataKVIO); -}; - -/** - * Make a dedupe index - * - * @param indexPtr dedupe index returned here - * @param layer the kernel layer - * - * @return VDO_SUCCESS or an error code - **/ -int makeDedupeIndex(DedupeIndex **indexPtr, KernelLayer *layer) - __attribute__((warn_unused_result)); - - -/** - * Do the dedupe section of dmsetup message vdo0 0 dump ... 
- * - * @param index The dedupe index - * @param showQueue true to dump a dedupe work queue - **/ -static inline void dumpDedupeIndex(DedupeIndex *index, bool showQueue) -{ - index->dump(index, showQueue); -} - -/** - * Free the dedupe index - * - * @param index The dedupe index - **/ -static inline void freeDedupeIndex(DedupeIndex **index) -{ - if (*index != NULL) { - (*index)->free(*index); - *index = NULL; - } -} - -/** - * Get the name of the deduplication state - * - * @param index The dedupe index - * - * @return the dedupe state name - **/ -static inline const char *getDedupeStateName(DedupeIndex *index) -{ - return index->getDedupeStateName(index); -} - -/** - * Get the index statistics - * - * @param index The dedupe index - * @param stats The index statistics - **/ -static inline void getIndexStatistics(DedupeIndex *index, - IndexStatistics *stats) -{ - return index->getStatistics(index, stats); -} - -/** - * Return from a dedupe operation by invoking the callback function - * - * @param dataKVIO The DataKVIO - **/ -static inline void invokeDedupeCallback(DataKVIO *dataKVIO) -{ - - dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION("$F($dup);cb=dedupe($dup)")); - kvdoEnqueueDataVIOCallback(dataKVIO); -} - -/** - * Process a dmsetup message directed to the index. - * - * @param index The dedupe index - * @param name The message name - * - * @return 0 or an error code - **/ -static inline int messageDedupeIndex(DedupeIndex *index, const char *name) -{ - return index->message(index, name); -} - -/** - * Look up the chunkname of the DataKVIO and identify duplicated chunks. - * - * @param dataKVIO The DataKVIO. These fields are used: - * dedupeContext.chunkName is the chunk name. - * The advice to offer to the index will be obtained - * via getDedupeAdvice(). The advice found in the index - * (or NULL if none) will be returned via setDedupeAdvice(). - * dedupeContext.status is set to the return status code of - * any asynchronous index processing. - **/ -static inline void postDedupeAdvice(DataKVIO *dataKVIO) -{ - KernelLayer *layer = dataKVIOAsKVIO(dataKVIO)->layer; - layer->dedupeIndex->post(dataKVIO); -} - -/** - * Look up the chunkname of the DataKVIO and identify duplicated chunks. - * - * @param dataKVIO The DataKVIO. These fields are used: - * dedupeContext.chunkName is the chunk name. - * The advice found in the index (or NULL if none) will - * be returned via setDedupeAdvice(). - * dedupeContext.status is set to the return status code of - * any asynchronous index processing. - **/ -static inline void queryDedupeAdvice(DataKVIO *dataKVIO) -{ - KernelLayer *layer = dataKVIOAsKVIO(dataKVIO)->layer; - layer->dedupeIndex->query(dataKVIO); -} - -/** - * Start the dedupe index. - * - * @param index The dedupe index - * @param createFlag If true, create a new index without first attempting - * to load an existing index - **/ -static inline void startDedupeIndex(DedupeIndex *index, bool createFlag) -{ - index->start(index, createFlag); -} - -/** - * Stop the dedupe index. May be called by any thread, but will wait for - * the shutdown to be completed. - * - * @param index The dedupe index - **/ -static inline void stopDedupeIndex(DedupeIndex *index) -{ - return index->stop(index); -} - -/** - * Suspend the dedupe index. If there are any outstanding index - * requests, wait for them to finish. If the index is doing any - * asynchronous writing, wait for the I/O to complete. 
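/*
 * Editor's sketch of the function-pointer "vtable" pattern used by struct
 * dedupeIndex above: the interface is a struct of operations, and thin
 * inline wrappers hide the double indirection from callers. The Counter
 * example type and its single operation are illustrative only.
 */
typedef struct counter {
  void (*increment)(struct counter *counter);
  int   value;
} Counter;

static inline void incrementCounter(Counter *counter)
{
  counter->increment(counter);       /* dispatch through the operation table */
}

static void incrementByOne(Counter *counter)
{
  counter->value++;
}

static Counter makeSimpleCounter(void)
{
  Counter counter = { .increment = incrementByOne, .value = 0 };
  return counter;
}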
If the index is - * not open yet and we are doing a rebuild of the master index, pause - * the rebuild so that it can be resumed later. May be called from any - * thread. - * - * @param index The dedupe index - * @param saveFlag True if we should save the index - **/ -static inline void suspendDedupeIndex(DedupeIndex *index, bool saveFlag) -{ - index->suspend(index, saveFlag); -} - -/** - * Resume a suspended dedupe index. May be called from any thread. - * - * @param index The dedupe index - **/ -static inline void resumeDedupeIndex(DedupeIndex *index) -{ - index->resume(index); -} - -/** - * Finish the dedupe index. - * - * @param index The dedupe index - **/ -static inline void finishDedupeIndex(DedupeIndex *index) -{ - return index->finish(index); -} - -/** - * Look up the chunkname of the DataKVIO and associate the new PBN with the - * name. - * - * @param dataKVIO The DataKVIO. These fields are used: - * dedupeContext.chunkName is the chunk name. - * The advice to offer to the index will be obtained - * via getDedupeAdvice(). dedupeContext.status is set to the - * return status code of any asynchronous index processing. - **/ -static inline void updateDedupeAdvice(DataKVIO *dataKVIO) -{ - KernelLayer *layer = dataKVIOAsKVIO(dataKVIO)->layer; - layer->dedupeIndex->update(dataKVIO); -} - -// Interval (in milliseconds or jiffies) from submission until switching to -// fast path and skipping Albireo. -extern unsigned int albireoTimeoutInterval; -extern Jiffies albireoTimeoutJiffies; - -// Minimum time interval (in milliseconds) between timer invocations to -// check for requests waiting for Albireo that should now time out. -extern unsigned int minAlbireoTimerInterval; - -/** - * Calculate the actual end of a timer, taking into account the absolute - * start time and the present time. - * - * @param startJiffies The absolute start time, in jiffies - * - * @return the absolute end time for the timer, in jiffies - **/ -Jiffies getAlbireoTimeout(Jiffies startJiffies); - -/** - * Set the interval from submission until switching to fast path and - * skipping Albireo. - * - * @param value The number of milliseconds - **/ -void setAlbireoTimeoutInterval(unsigned int value); - -/** - * Set the minimum time interval between timer invocations to check for - * requests waiting for Albireo that should now time out. - * - * @param value The number of milliseconds - **/ -void setMinAlbireoTimerInterval(unsigned int value); - -#endif /* DEDUPE_INDEX_H */ diff --git a/vdo/kernel/deviceConfig.c b/vdo/kernel/deviceConfig.c deleted file mode 100644 index 08e864c..0000000 --- a/vdo/kernel/deviceConfig.c +++ /dev/null @@ -1,769 +0,0 @@ -/** - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/deviceConfig.c#14 $ - */ - -#include "deviceConfig.h" - -#include - -#include "logger.h" -#include "memoryAlloc.h" -#include "stringUtils.h" - -#include "kernelLayer.h" -#include "vdoStringUtils.h" - -#include "constants.h" - -enum { - // If we bump this, update the arrays below - TABLE_VERSION = 2, - // Limits used when parsing thread-count config spec strings - BIO_ROTATION_INTERVAL_LIMIT = 1024, - LOGICAL_THREAD_COUNT_LIMIT = 60, - PHYSICAL_THREAD_COUNT_LIMIT = 16, - THREAD_COUNT_LIMIT = 100, - // XXX The bio-submission queue configuration defaults are temporarily - // still being defined here until the new runtime-based thread - // configuration has been fully implemented for managed VDO devices. - - // How many bio submission work queues to use - DEFAULT_NUM_BIO_SUBMIT_QUEUES = 4, - // How often to rotate between bio submission work queues - DEFAULT_BIO_SUBMIT_QUEUE_ROTATE_INTERVAL = 64, -}; - -// arrays for handling different table versions -static const uint8_t REQUIRED_ARGC[] = {10, 12, 9}; -static const uint8_t POOL_NAME_ARG_INDEX[] = {8, 10, 8}; - -/** - * Decide the version number from argv. - * - * @param [in] argc The number of table values - * @param [in] argv The array of table values - * @param [out] errorPtr A pointer to return a error string in - * @param [out] versionPtr A pointer to return the version - * - * @return VDO_SUCCESS or an error code - **/ -static int getVersionNumber(int argc, - char **argv, - char **errorPtr, - TableVersion *versionPtr) -{ - // version, if it exists, is in a form of V - if (sscanf(argv[0], "V%u", versionPtr) == 1) { - if (*versionPtr < 1 || *versionPtr > TABLE_VERSION) { - *errorPtr = "Unknown version number detected"; - return VDO_BAD_CONFIGURATION; - } - } else { - // V0 actually has no version number in the table string - *versionPtr = 0; - } - - // V0 and V1 have no optional parameters. There will always be - // a parameter for thread config, even if its a "." to show - // its an empty list. - if (*versionPtr <= 1) { - if (argc != REQUIRED_ARGC[*versionPtr]) { - *errorPtr = "Incorrect number of arguments for version"; - return VDO_BAD_CONFIGURATION; - } - } else if (argc < REQUIRED_ARGC[*versionPtr]) { - *errorPtr = "Incorrect number of arguments for version"; - return VDO_BAD_CONFIGURATION; - } - - if (*versionPtr != TABLE_VERSION) { - logWarning("Detected version mismatch between kernel module and tools " - " kernel: %d, tool: %d", TABLE_VERSION, *versionPtr); - logWarning("Please consider upgrading management tools to match kernel."); - } - return VDO_SUCCESS; -} - -/**********************************************************************/ -int getPoolNameFromArgv(int argc, - char **argv, - char **errorPtr, - char **poolNamePtr) -{ - TableVersion version; - int result = getVersionNumber(argc, argv, errorPtr, &version); - if (result != VDO_SUCCESS) { - return result; - } - *poolNamePtr = argv[POOL_NAME_ARG_INDEX[version]]; - return VDO_SUCCESS; -} - -/** - * Resolve the config with write policy, physical size, and other unspecified - * fields based on the device, if needed. 
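/*
 * Editor's sketch of the table-version detection performed by
 * getVersionNumber() above: a leading "V<n>" token selects the version,
 * anything else is treated as the unversioned V0 format. Standalone
 * userspace example; the sample arguments are assumptions here.
 */
#include <stdio.h>

static unsigned int detectTableVersion(const char *firstArg)
{
  unsigned int version = 0;
  if (sscanf(firstArg, "V%u", &version) != 1) {
    version = 0;                     /* V0 tables carry no version token */
  }
  return version;
}

int main(void)
{
  printf("%u\n", detectTableVersion("V2"));         /* prints 2 */
  printf("%u\n", detectTableVersion("/dev/sdb"));   /* prints 0 */
  return 0;
}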
- * - * @param [in,out] config The config possibly missing values - * @param [in] verbose Whether to log about the underlying device - **/ -static void resolveConfigWithDevice(DeviceConfig *config, - bool verbose) -{ - struct dm_dev *dev = config->ownedDevice; - struct request_queue *requestQueue = bdev_get_queue(dev->bdev); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,7,0) - bool flushSupported - = ((requestQueue->queue_flags & (1ULL << QUEUE_FLAG_WC)) != 0); - bool fuaSupported - = ((requestQueue->queue_flags & (1ULL << QUEUE_FLAG_FUA)) != 0); -#else - bool flushSupported = ((requestQueue->flush_flags & REQ_FLUSH) == REQ_FLUSH); - bool fuaSupported = ((requestQueue->flush_flags & REQ_FUA) == REQ_FUA); -#endif - if (verbose) { - logInfo("underlying device, REQ_FLUSH: %s, REQ_FUA: %s", - (flushSupported ? "supported" : "not supported"), - (fuaSupported ? "supported" : "not supported")); - } else { - // We should probably always log, but need to make sure that makes sense - // before changing behavior. - } - - if (config->writePolicy == WRITE_POLICY_AUTO) { - config->writePolicy - = (flushSupported ? WRITE_POLICY_ASYNC : WRITE_POLICY_SYNC); - logInfo("Using write policy %s automatically.", - getConfigWritePolicyString(config)); - } else { - logInfo("Using write policy %s.", getConfigWritePolicyString(config)); - } - - if (flushSupported && (config->writePolicy == WRITE_POLICY_SYNC)) { - logWarning("WARNING: Running in sync mode atop a device supporting flushes" - " is dangerous!"); - } - - if (config->version == 0) { - uint64_t deviceSize = i_size_read(dev->bdev->bd_inode); - config->physicalBlocks = deviceSize / VDO_BLOCK_SIZE; - } -} - -/** - * Parse a two-valued option into a bool. - * - * @param [in] boolStr The string value to convert to a bool - * @param [in] trueStr The string value which should be converted to true - * @param [in] falseStr The string value which should be converted to false - * @param [out] boolPtr A pointer to return the bool value in - * - * @return VDO_SUCCESS or an error if boolStr is neither trueStr nor falseStr - **/ -__attribute__((warn_unused_result)) -static inline int parseBool(const char *boolStr, - const char *trueStr, - const char *falseStr, - bool *boolPtr) -{ - bool value = false; - if (strcmp(boolStr, trueStr) == 0) { - value = true; - } else if (strcmp(boolStr, falseStr) == 0) { - value = false; - } else { - return VDO_BAD_CONFIGURATION; - } - - *boolPtr = value; - return VDO_SUCCESS; -} - -/** - * Process one component of a thread parameter configuration string and - * update the configuration data structure. - * - * If the thread count requested is invalid, a message is logged and - * -EINVAL returned. If the thread name is unknown, a message is logged - * but no error is returned. 
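/*
 * Editor's sketch of the WRITE_POLICY_AUTO resolution in
 * resolveConfigWithDevice() above: "auto" becomes async when the underlying
 * device advertises flush support and sync otherwise, while explicit
 * settings are left alone. The enum and helper names are illustrative only.
 */
typedef enum {
  EXAMPLE_POLICY_AUTO,
  EXAMPLE_POLICY_SYNC,
  EXAMPLE_POLICY_ASYNC,
} ExamplePolicy;

static ExamplePolicy resolveExamplePolicy(ExamplePolicy requested, int flushSupported)
{
  if (requested != EXAMPLE_POLICY_AUTO) {
    return requested;                 /* explicit sync/async is respected */
  }
  return flushSupported ? EXAMPLE_POLICY_ASYNC : EXAMPLE_POLICY_SYNC;
}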
- * - * @param threadParamType The type of thread specified - * @param count The thread count requested - * @param config The configuration data structure to update - * - * @return VDO_SUCCESS or -EINVAL - **/ -static int processOneThreadConfigSpec(const char *threadParamType, - unsigned int count, - ThreadCountConfig *config) -{ - // Handle limited thread parameters - if (strcmp(threadParamType, "bioRotationInterval") == 0) { - if (count == 0) { - logError("thread config string error:" - " 'bioRotationInterval' of at least 1 is required"); - return -EINVAL; - } else if (count > BIO_ROTATION_INTERVAL_LIMIT) { - logError("thread config string error:" - " 'bioRotationInterval' cannot be higher than %d", - BIO_ROTATION_INTERVAL_LIMIT); - return -EINVAL; - } - config->bioRotationInterval = count; - return VDO_SUCCESS; - } else if (strcmp(threadParamType, "logical") == 0) { - if (count > LOGICAL_THREAD_COUNT_LIMIT) { - logError("thread config string error: at most %d 'logical' threads" - " are allowed", - LOGICAL_THREAD_COUNT_LIMIT); - return -EINVAL; - } - config->logicalZones = count; - return VDO_SUCCESS; - } else if (strcmp(threadParamType, "physical") == 0) { - if (count > PHYSICAL_THREAD_COUNT_LIMIT) { - logError("thread config string error: at most %d 'physical' threads" - " are allowed", - PHYSICAL_THREAD_COUNT_LIMIT); - return -EINVAL; - } - config->physicalZones = count; - return VDO_SUCCESS; - } else { - // Handle other thread count parameters - if (count > THREAD_COUNT_LIMIT) { - logError("thread config string error: at most %d '%s' threads" - " are allowed", - THREAD_COUNT_LIMIT, threadParamType); - return -EINVAL; - } - - if (strcmp(threadParamType, "hash") == 0) { - config->hashZones = count; - return VDO_SUCCESS; - } else if (strcmp(threadParamType, "cpu") == 0) { - if (count == 0) { - logError("thread config string error:" - " at least one 'cpu' thread required"); - return -EINVAL; - } - config->cpuThreads = count; - return VDO_SUCCESS; - } else if (strcmp(threadParamType, "ack") == 0) { - config->bioAckThreads = count; - return VDO_SUCCESS; - } else if (strcmp(threadParamType, "bio") == 0) { - if (count == 0) { - logError("thread config string error:" - " at least one 'bio' thread required"); - return -EINVAL; - } - config->bioThreads = count; - return VDO_SUCCESS; - } - } - - // Don't fail, just log. This will handle version mismatches between - // user mode tools and kernel. - logInfo("unknown thread parameter type \"%s\"", threadParamType); - return VDO_SUCCESS; -} - -/** - * Parse one component of a thread parameter configuration string and - * update the configuration data structure. 
- * - * @param spec The thread parameter specification string - * @param config The configuration data to be updated - **/ -static int parseOneThreadConfigSpec(const char *spec, - ThreadCountConfig *config) -{ - char **fields; - int result = splitString(spec, '=', &fields); - if (result != UDS_SUCCESS) { - return result; - } - if ((fields[0] == NULL) || (fields[1] == NULL) || (fields[2] != NULL)) { - logError("thread config string error:" - " expected thread parameter assignment, saw \"%s\"", - spec); - freeStringArray(fields); - return -EINVAL; - } - - unsigned int count; - result = stringToUInt(fields[1], &count); - if (result != UDS_SUCCESS) { - logError("thread config string error: integer value needed, found \"%s\"", - fields[1]); - freeStringArray(fields); - return result; - } - - result = processOneThreadConfigSpec(fields[0], count, config); - freeStringArray(fields); - return result; -} - -/** - * Parse the configuration string passed and update the specified - * counts and other parameters of various types of threads to be created. - * - * The configuration string should contain one or more comma-separated specs - * of the form "typename=number"; the supported type names are "cpu", "ack", - * "bio", "bioRotationInterval", "logical", "physical", and "hash". - * - * If an error occurs during parsing of a single key/value pair, we deem - * it serious enough to stop further parsing. - * - * This function can't set the "reason" value the caller wants to pass - * back, because we'd want to format it to say which field was - * invalid, and we can't allocate the "reason" strings dynamically. So - * if an error occurs, we'll log the details and pass back an error. - * - * @param string Thread parameter configuration string - * @param config The thread configuration data to update - * - * @return VDO_SUCCESS or -EINVAL or -ENOMEM - **/ -static int parseThreadConfigString(const char *string, - ThreadCountConfig *config) -{ - int result = VDO_SUCCESS; - - char **specs; - if (strcmp(".", string) != 0) { - result = splitString(string, ',', &specs); - if (result != UDS_SUCCESS) { - return result; - } - for (unsigned int i = 0; specs[i] != NULL; i++) { - result = parseOneThreadConfigSpec(specs[i], config); - if (result != VDO_SUCCESS) { - break; - } - } - freeStringArray(specs); - } - return result; -} - -/** - * Process one component of an optional parameter string and - * update the configuration data structure. - * - * If the value requested is invalid, a message is logged and - * -EINVAL returned. If the key is unknown, a message is logged - * but no error is returned. 
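/*
 * Editor's standalone sketch of the "name=count[,name=count...]" format
 * accepted by parseThreadConfigString() above ("." meaning an empty list).
 * The fixed-size copy limit and the printf reporting are assumptions of
 * this example; the real code dispatches each pair into ThreadCountConfig.
 */
#include <stdio.h>
#include <string.h>

static int parseThreadSpecs(const char *string)
{
  if (strcmp(string, ".") == 0) {
    return 0;                                  /* explicitly empty list */
  }

  char copy[128];
  if (strlen(string) >= sizeof(copy)) {
    return -1;                                 /* too long for this sketch */
  }
  strcpy(copy, string);

  char *saveptr = NULL;
  for (char *spec = strtok_r(copy, ",", &saveptr);
       spec != NULL;
       spec = strtok_r(NULL, ",", &saveptr)) {
    char         name[32];
    unsigned int count;
    if (sscanf(spec, "%31[^=]=%u", name, &count) != 2) {
      return -1;                               /* not a name=count pair */
    }
    printf("thread type '%s' -> %u\n", name, count);
  }
  return 0;
}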
- * - * @param key The optional parameter key name - * @param value The optional parameter value - * @param config The configuration data structure to update - * - * @return VDO_SUCCESS or -EINVAL - **/ -static int processOneKeyValuePair(const char *key, - unsigned int value, - DeviceConfig *config) -{ - // Non thread optional parameters - if (strcmp(key, "maxDiscard") == 0) { - if (value == 0) { - logError("optional parameter error:" - " at least one max discard block required"); - return -EINVAL; - } - // Max discard sectors in blkdev_issue_discard is UINT_MAX >> 9 - if (value > (UINT_MAX / VDO_BLOCK_SIZE)) { - logError("optional parameter error: at most %d max discard" - " blocks are allowed", UINT_MAX / VDO_BLOCK_SIZE); - return -EINVAL; - } - config->maxDiscardBlocks = value; - return VDO_SUCCESS; - } - // Handles unknown key names - return processOneThreadConfigSpec(key, value, &config->threadCounts); -} - -/** - * Parse one key/value pair and update the configuration - * data structure. - * - * @param key The optional key name - * @param value The optional value - * @param config The configuration data to be updated - * - * @return VDO_SUCCESS or error - **/ -static int parseOneKeyValuePair(const char *key, - const char *value, - DeviceConfig *config) -{ - if (strcmp(key, "deduplication") == 0) { - return parseBool(value, "on", "off", &config->deduplication); - } - - // The remaining arguments must have integral values. - unsigned int count; - int result = stringToUInt(value, &count); - if (result != UDS_SUCCESS) { - logError("optional config string error: integer value needed, found \"%s\"", - value); - return result; - } - return processOneKeyValuePair(key, count, config); -} - -/** - * Parse all key/value pairs from a list of arguments. - * - * If an error occurs during parsing of a single key/value pair, we deem - * it serious enough to stop further parsing. - * - * This function can't set the "reason" value the caller wants to pass - * back, because we'd want to format it to say which field was - * invalid, and we can't allocate the "reason" strings dynamically. So - * if an error occurs, we'll log the details and return the error. - * - * @param argc The total number of arguments in list - * @param argv The list of key/value pairs - * @param config The device configuration data to update - * - * @return VDO_SUCCESS or error - **/ -static int parseKeyValuePairs(int argc, - char **argv, - DeviceConfig *config) -{ - int result = VDO_SUCCESS; - while (argc) { - result = parseOneKeyValuePair(argv[0], argv[1], config); - if (result != VDO_SUCCESS) { - break; - } - - argc -= 2; - argv += 2; - } - - return result; -} - -/** - * Parse the configuration string passed in for optional arguments. - * - * For V0/V1 configurations, there will only be one optional parameter; - * the thread configuration. The configuration string should contain - * one or more comma-separated specs of the form "typename=number"; the - * supported type names are "cpu", "ack", "bio", "bioRotationInterval", - * "logical", "physical", and "hash". - * - * For V2 configurations and beyond, there could be any number of - * arguments. They should contain one or more key/value pairs - * separated by a space. 
- * - * @param argSet The structure holding the arguments to parse - * @param errorPtr Pointer to a buffer to hold the error string - * @param config Pointer to device configuration data to update - * - * @return VDO_SUCCESS or error - */ -int parseOptionalArguments(struct dm_arg_set *argSet, - char **errorPtr, - DeviceConfig *config) -{ - int result = VDO_SUCCESS; - - if (config->version == 0 || config->version == 1) { - result = parseThreadConfigString(argSet->argv[0], - &config->threadCounts); - if (result != VDO_SUCCESS) { - *errorPtr = "Invalid thread-count configuration"; - return VDO_BAD_CONFIGURATION; - } - } else { - if ((argSet->argc % 2) != 0) { - *errorPtr = "Odd number of optional arguments given but they" - " should be pairs"; - return VDO_BAD_CONFIGURATION; - } - result = parseKeyValuePairs(argSet->argc, argSet->argv, config); - if (result != VDO_SUCCESS) { - *errorPtr = "Invalid optional argument configuration"; - return VDO_BAD_CONFIGURATION; - } - } - return result; -} - -/** - * Handle a parsing error. - * - * @param configPtr A pointer to the config to free - * @param errorPtr A place to store a constant string about the error - * @param errorStr A constant string to store in errorPtr - **/ -static void handleParseError(DeviceConfig **configPtr, - char **errorPtr, - char *errorStr) -{ - freeDeviceConfig(configPtr); - *errorPtr = errorStr; -} - -/**********************************************************************/ -int parseDeviceConfig(int argc, - char **argv, - struct dm_target *ti, - bool verbose, - DeviceConfig **configPtr) -{ - char **errorPtr = &ti->error; - DeviceConfig *config = NULL; - int result = ALLOCATE(1, DeviceConfig, "DeviceConfig", &config); - if (result != VDO_SUCCESS) { - handleParseError(&config, errorPtr, "Could not allocate config structure"); - return VDO_BAD_CONFIGURATION; - } - - config->owningTarget = ti; - initializeRing(&config->configNode); - - // Save the original string. - result = joinStrings(argv, argc, ' ', &config->originalString); - if (result != VDO_SUCCESS) { - handleParseError(&config, errorPtr, "Could not populate string"); - return VDO_BAD_CONFIGURATION; - } - - // Set defaults. - // - // XXX Defaults for bioThreads and bioRotationInterval are currently defined - // using the old configuration scheme of constants. These values are relied - // upon for performance testing on MGH machines currently. - // This should be replaced with the normally used testing defaults being - // defined in the file-based thread-configuration settings. The values used - // as defaults internally should really be those needed for VDO in its - // default shipped-product state. - config->threadCounts = (ThreadCountConfig) { - .bioAckThreads = 1, - .bioThreads = DEFAULT_NUM_BIO_SUBMIT_QUEUES, - .bioRotationInterval = DEFAULT_BIO_SUBMIT_QUEUE_ROTATE_INTERVAL, - .cpuThreads = 1, - .logicalZones = 0, - .physicalZones = 0, - .hashZones = 0, - }; - config->maxDiscardBlocks = 1; - config->deduplication = true; - - struct dm_arg_set argSet; - - argSet.argc = argc; - argSet.argv = argv; - - result = getVersionNumber(argc, argv, errorPtr, &config->version); - if (result != VDO_SUCCESS) { - // getVersionNumber sets errorPtr itself. - handleParseError(&config, errorPtr, *errorPtr); - return result; - } - // Move the arg pointer forward only if the argument was there. 
- if (config->version >= 1) { - dm_shift_arg(&argSet); - } - - result = duplicateString(dm_shift_arg(&argSet), "parent device name", - &config->parentDeviceName); - if (result != VDO_SUCCESS) { - handleParseError(&config, errorPtr, "Could not copy parent device name"); - return VDO_BAD_CONFIGURATION; - } - - // Get the physical blocks, if known. - if (config->version >= 1) { - result = kstrtoull(dm_shift_arg(&argSet), 10, &config->physicalBlocks); - if (result != VDO_SUCCESS) { - handleParseError(&config, errorPtr, "Invalid physical block count"); - return VDO_BAD_CONFIGURATION; - } - } - - // Get the logical block size and validate - bool enable512e; - result = parseBool(dm_shift_arg(&argSet), "512", "4096", &enable512e); - if (result != VDO_SUCCESS) { - handleParseError(&config, errorPtr, "Invalid logical block size"); - return VDO_BAD_CONFIGURATION; - } - config->logicalBlockSize = (enable512e ? 512 : 4096); - - // Skip past the two no longer used read cache options. - if (config->version <= 1) { - dm_consume_args(&argSet, 2); - } - - // Get the page cache size. - result = stringToUInt(dm_shift_arg(&argSet), &config->cacheSize); - if (result != VDO_SUCCESS) { - handleParseError(&config, errorPtr, "Invalid block map page cache size"); - return VDO_BAD_CONFIGURATION; - } - - // Get the block map era length. - result = stringToUInt(dm_shift_arg(&argSet), &config->blockMapMaximumAge); - if (result != VDO_SUCCESS) { - handleParseError(&config, errorPtr, "Invalid block map maximum age"); - return VDO_BAD_CONFIGURATION; - } - - // Get the MD RAID5 optimization mode and validate - result = parseBool(dm_shift_arg(&argSet), "on", "off", - &config->mdRaid5ModeEnabled); - if (result != VDO_SUCCESS) { - handleParseError(&config, errorPtr, "Invalid MD RAID5 mode"); - return VDO_BAD_CONFIGURATION; - } - - // Get the write policy and validate. - if (strcmp(argSet.argv[0], "async") == 0) { - config->writePolicy = WRITE_POLICY_ASYNC; - } else if (strcmp(argSet.argv[0], "async-unsafe") == 0) { - config->writePolicy = WRITE_POLICY_ASYNC_UNSAFE; - } else if (strcmp(argSet.argv[0], "sync") == 0) { - config->writePolicy = WRITE_POLICY_SYNC; - } else if (strcmp(argSet.argv[0], "auto") == 0) { - config->writePolicy = WRITE_POLICY_AUTO; - } else { - handleParseError(&config, errorPtr, "Invalid write policy"); - return VDO_BAD_CONFIGURATION; - } - dm_shift_arg(&argSet); - - // Make sure the enum to get the pool name from argv directly is still in - // sync with the parsing of the table line. - if (&argSet.argv[0] != &argv[POOL_NAME_ARG_INDEX[config->version]]) { - handleParseError(&config, errorPtr, "Pool name not in expected location"); - return VDO_BAD_CONFIGURATION; - } - - // Get the address where the albserver is running. Check for validation - // is done in dedupe.c code during startKernelLayer call - result = duplicateString(dm_shift_arg(&argSet), "pool name", - &config->poolName); - if (result != VDO_SUCCESS) { - handleParseError(&config, errorPtr, "Could not copy pool name"); - return VDO_BAD_CONFIGURATION; - } - - // Get the optional arguments and validate. - result = parseOptionalArguments(&argSet, errorPtr, config); - if (result != VDO_SUCCESS) { - // parseOptionalArguments sets errorPtr itself. - handleParseError(&config, errorPtr, *errorPtr); - return result; - } - - /* - * Logical, physical, and hash zone counts can all be zero; then we get one - * thread doing everything, our older configuration. If any zone count is - * non-zero, the others must be as well. 
- */ - if (((config->threadCounts.logicalZones == 0) - != (config->threadCounts.physicalZones == 0)) - || ((config->threadCounts.physicalZones == 0) - != (config->threadCounts.hashZones == 0)) - ) { - handleParseError(&config, errorPtr, - "Logical, physical, and hash zones counts must all be" - " zero or all non-zero"); - return VDO_BAD_CONFIGURATION; - } - - result = dm_get_device(ti, config->parentDeviceName, - dm_table_get_mode(ti->table), &config->ownedDevice); - if (result != 0) { - logError("couldn't open device \"%s\": error %d", - config->parentDeviceName, result); - handleParseError(&config, errorPtr, "Unable to open storage device"); - return VDO_BAD_CONFIGURATION; - } - - resolveConfigWithDevice(config, verbose); - - *configPtr = config; - return result; -} - -/**********************************************************************/ -void freeDeviceConfig(DeviceConfig **configPtr) -{ - if (configPtr == NULL) { - return; - } - - DeviceConfig *config = *configPtr; - if (config == NULL) { - *configPtr = NULL; - return; - } - - if (config->ownedDevice != NULL) { - dm_put_device(config->owningTarget, config->ownedDevice); - } - - FREE(config->poolName); - FREE(config->parentDeviceName); - FREE(config->originalString); - - // Reduce the chance a use-after-free (as in BZ 1669960) happens to work. - memset(config, 0, sizeof(*config)); - - FREE(config); - *configPtr = NULL; -} - -/**********************************************************************/ -const char *getConfigWritePolicyString(DeviceConfig *config) -{ - switch (config->writePolicy) { - case WRITE_POLICY_AUTO: - return "auto"; - case WRITE_POLICY_ASYNC: - return "async"; - case WRITE_POLICY_ASYNC_UNSAFE: - return "async-unsafe"; - case WRITE_POLICY_SYNC: - return "sync"; - default: - return "unknown"; - } -} - -/**********************************************************************/ -void setDeviceConfigLayer(DeviceConfig *config, KernelLayer *layer) -{ - unspliceRingNode(&config->configNode); - if (layer != NULL) { - pushRingNode(&layer->deviceConfigRing, &config->configNode); - } - config->layer = layer; -} diff --git a/vdo/kernel/deviceConfig.h b/vdo/kernel/deviceConfig.h deleted file mode 100644 index 36199dd..0000000 --- a/vdo/kernel/deviceConfig.h +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/deviceConfig.h#11 $ - */ -#ifndef DEVICE_CONFIG_H -#define DEVICE_CONFIG_H - -#include - -#include "ringNode.h" - -#include "kernelTypes.h" - -// This structure is memcmp'd for equality. Keep it -// packed and don't add any fields that are not -// properly set in both extant and parsed configs. 
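/*
 * Illustrative note: the equality check referred to above amounts to a raw
 * byte comparison along the lines of
 *
 *   memcmp(&extant->threadCounts, &parsed->threadCounts,
 *          sizeof(ThreadCountConfig)) == 0
 *
 * (the actual comparison site is not part of this hunk), so compiler-inserted
 * padding or a field left unset in either copy would make otherwise identical
 * configurations compare unequal; hence the packed attribute and the warning
 * above.
 */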
-typedef struct { - int bioAckThreads; - int bioThreads; - int bioRotationInterval; - int cpuThreads; - int logicalZones; - int physicalZones; - int hashZones; -} __attribute__((packed)) ThreadCountConfig; - -typedef uint32_t TableVersion; - -typedef struct { - struct dm_target *owningTarget; - struct dm_dev *ownedDevice; - KernelLayer *layer; - /** All configs referencing a layer are kept on a ring in the layer */ - RingNode configNode; - char *originalString; - TableVersion version; - char *parentDeviceName; - BlockCount physicalBlocks; - unsigned int logicalBlockSize; - WritePolicy writePolicy; - unsigned int cacheSize; - unsigned int blockMapMaximumAge; - bool mdRaid5ModeEnabled; - bool deduplication; - char *poolName; - ThreadCountConfig threadCounts; - BlockCount maxDiscardBlocks; -} DeviceConfig; - -/** - * Convert a RingNode to the DeviceConfig that contains it. - * - * @param node The RingNode to convert - * - * @return The DeviceConfig wrapping the RingNode - **/ -static inline DeviceConfig *asDeviceConfig(RingNode *node) -{ - if (node == NULL) { - return NULL; - } - return (DeviceConfig *) ((byte *) node - offsetof(DeviceConfig, configNode)); -} - -/** - * Grab a pointer to the pool name out of argv. - * - * @param [in] argc The number of table values - * @param [in] argv The array of table values - * @param [out] errorPtr A pointer to return a error string in - * @param [out] poolNamePtr A pointer to return the pool name - * - * @return VDO_SUCCESS or an error code - **/ -int getPoolNameFromArgv(int argc, - char **argv, - char **errorPtr, - char **poolNamePtr) - __attribute__((warn_unused_result)); - -/** - * Convert the dmsetup table into a DeviceConfig. - * - * @param [in] argc The number of table values - * @param [in] argv The array of table values - * @param [in] ti The target structure for this table - * @param [in] verbose Whether to log about the underlying device - * @param [out] configPtr A pointer to return the allocated config - * - * @return VDO_SUCCESS or an error code - **/ -int parseDeviceConfig(int argc, - char **argv, - struct dm_target *ti, - bool verbose, - DeviceConfig **configPtr) - __attribute__((warn_unused_result)); - -/** - * Free a device config created by parseDeviceConfig(). - * - * @param configPtr The pointer holding the config, which will be nulled - **/ -void freeDeviceConfig(DeviceConfig **configPtr); - -/** - * Get the text describing the write policy. - * - * @param config The device config - * - * @returns a pointer to a string describing the write policy - **/ -const char *getConfigWritePolicyString(DeviceConfig *config) - __attribute__((warn_unused_result)); - -/** - * Acquire or release a reference from the config to a kernel layer. - * - * @param config The config in question - * @param layer The kernel layer in question - **/ -void setDeviceConfigLayer(DeviceConfig *config, KernelLayer *layer); - -#endif // DEVICE_CONFIG_H diff --git a/vdo/kernel/deviceRegistry.c b/vdo/kernel/deviceRegistry.c deleted file mode 100644 index 13764b4..0000000 --- a/vdo/kernel/deviceRegistry.c +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/deviceRegistry.c#3 $ - */ - -#include "deviceRegistry.h" - -#include -#include -#include - -#include "memoryAlloc.h" - -/* - * We don't expect this set to ever get really large, so a linked list - * is adequate. We can use a PointerMap if we need to later. - */ -typedef struct { - struct list_head links; - rwlock_t lock; -} DeviceRegistry; - -typedef struct { - struct list_head links; - KernelLayer *layer; -} RegisteredDevice; - -static DeviceRegistry registry; - -/**********************************************************************/ -void initializeDeviceRegistryOnce(void) -{ - INIT_LIST_HEAD(®istry.links); - rwlock_init(®istry.lock); -} - -/** - * Implements LayerFilter. - **/ -static bool layerIsEqual(KernelLayer *layer, void *context) -{ - return ((void *) layer == context); -} - -/** - * Find a layer in the registry if it exists there. Must be called holding - * the lock. - * - * @param filter The filter function to apply to devices - * @param context A bit of context to provide the filter. - * - * @return the layer object found, if any - **/ -__attribute__((warn_unused_result)) -static KernelLayer *filterLayersLocked(LayerFilter *filter, void *context) -{ - RegisteredDevice *device; - list_for_each_entry(device, ®istry.links, links) { - if (filter(device->layer, context)) { - return device->layer; - } - } - return NULL; -} - -/**********************************************************************/ -int addLayerToDeviceRegistry(KernelLayer *layer) -{ - RegisteredDevice *newDevice; - int result = ALLOCATE(1, RegisteredDevice, __func__, &newDevice); - if (result != VDO_SUCCESS) { - return result; - } - - INIT_LIST_HEAD(&newDevice->links); - newDevice->layer = layer; - - write_lock(®istry.lock); - KernelLayer *oldLayer = filterLayersLocked(layerIsEqual, layer); - result = ASSERT(oldLayer == NULL, "Layer not already registered"); - if (result == VDO_SUCCESS) { - list_add_tail(&newDevice->links, ®istry.links); - } - write_unlock(®istry.lock); - - return result; -} - -/**********************************************************************/ -void removeLayerFromDeviceRegistry(KernelLayer *layer) -{ - write_lock(®istry.lock); - RegisteredDevice *device = NULL; - list_for_each_entry(device, ®istry.links, links) { - if (device->layer == layer) { - list_del_init(&device->links); - FREE(device); - break; - } - } - write_unlock(®istry.lock); -} - -/**********************************************************************/ -KernelLayer *findLayerMatching(LayerFilter *filter, void *context) -{ - read_lock(®istry.lock); - KernelLayer *layer = filterLayersLocked(filter, context); - read_unlock(®istry.lock); - return layer; -} diff --git a/vdo/kernel/deviceRegistry.h b/vdo/kernel/deviceRegistry.h deleted file mode 100644 index 94c1635..0000000 --- a/vdo/kernel/deviceRegistry.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
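As a usage sketch of the registry's filter interface, mirroring the layerIsNamed() call made from dmvdo.c, the following hypothetical filter looks a layer up by pool name; it assumes the layer's deviceConfig pointer is valid, and both function names are invented for the example.

static bool layerHasPoolName(KernelLayer *layer, void *context)
{
  return strcmp(layer->deviceConfig->poolName, (const char *) context) == 0;
}

static KernelLayer *findLayerByPoolName(const char *poolName)
{
  return findLayerMatching(layerHasPoolName, (void *) poolName);
}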
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/deviceRegistry.h#2 $ - */ - -#ifndef DEVICE_REGISTRY_H -#define DEVICE_REGISTRY_H - -#include "kernelTypes.h" - -/** - * Initialize the necessary structures for the device registry. - **/ -void initializeDeviceRegistryOnce(void); - -/** - * Add a layer to the device registry. The layer must not already exist in the - * registry. - * - * @param layer The layer to add - * - * @return VDO_SUCCESS or an error - **/ -int addLayerToDeviceRegistry(KernelLayer *layer) - __attribute__((warn_unused_result)); - -/** - * Remove a layer from the device registry. - * - * @param layer The layer to remove - **/ -void removeLayerFromDeviceRegistry(KernelLayer *layer); - -/** - * Find and return the first (if any) layer matching a given filter function. - * - * @param filter The filter function to apply to layers - * @param context A bit of context to provide the filter. - **/ -KernelLayer *findLayerMatching(LayerFilter *filter, void *context); - -#endif // DEVICE_REGISTRY_H diff --git a/vdo/kernel/dmvdo.c b/vdo/kernel/dmvdo.c deleted file mode 100644 index a6c7b98..0000000 --- a/vdo/kernel/dmvdo.c +++ /dev/null @@ -1,889 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dmvdo.c#42 $ - */ - -#include "dmvdo.h" - -#include - -#include "logger.h" -#include "memoryAlloc.h" - -#include "constants.h" -#include "ringNode.h" -#include "threadConfig.h" -#include "vdo.h" - -#include "dedupeIndex.h" -#include "deviceRegistry.h" -#include "dump.h" -#include "instanceNumber.h" -#include "ioSubmitter.h" -#include "kernelLayer.h" -#include "kvdoFlush.h" -#include "memoryUsage.h" -#include "statusProcfs.h" -#include "stringUtils.h" -#include "sysfs.h" -#include "threadDevice.h" -#include "threadRegistry.h" - -struct kvdoDevice kvdoDevice; // global driver state (poorly named) - -/* - * Pre kernel version 4.3, we use the functionality in blkdev_issue_discard - * and the value in max_discard_sectors to split large discards into smaller - * ones. 
4.3 to 4.18 kernels have removed the code in blkdev_issue_discard - * and so in place of that, we use the code in device mapper itself to - * split the discards. Unfortunately, it uses the same value to split large - * discards as it does to split large data bios. - * - * In kernel version 4.18, support for splitting discards was added - * back into blkdev_issue_discard. Since this mode of splitting - * (based on max_discard_sectors) is preferable to splitting always - * on 4k, we are turning off the device mapper splitting from 4.18 - * on. - */ -#define HAS_NO_BLKDEV_SPLIT LINUX_VERSION_CODE >= KERNEL_VERSION(4,3,0) \ - && LINUX_VERSION_CODE < KERNEL_VERSION(4,18,0) - -/**********************************************************************/ - -/** - * Get the kernel layer associated with a dm target structure. - * - * @param ti The dm target structure - * - * @return The kernel layer, or NULL. - **/ -static KernelLayer *getKernelLayerForTarget(struct dm_target *ti) -{ - return ((DeviceConfig *) ti->private)->layer; -} - -/** - * Begin VDO processing of a bio. This is called by the device mapper - * through the "map" function, and has resulted from a call to either - * submit_bio or generic_make_request. - * - * @param ti The dm_target. We only need the "private" member to give - * us the KernelLayer. - * @param bio The bio. - * - * @return One of these values: - * - * negative A negative value is an error code. - * Usually -EIO. - * - * DM_MAPIO_SUBMITTED VDO will take care of this I/O, either - * processing it completely and calling - * bio_endio, or forwarding it onward by - * calling generic_make_request. - * - * DM_MAPIO_REMAPPED VDO has modified the bio and the device - * mapper will immediately forward the bio - * onward using generic_make_request. - * - * DM_MAPIO_REQUEUE We do not use this. It is used by device - * mapper devices to defer an I/O request - * during suspend/resume processing. - **/ -static int vdoMapBio(struct dm_target *ti, BIO *bio) -{ - KernelLayer *layer = getKernelLayerForTarget(ti); - return kvdoMapBio(layer, bio); -} - -/**********************************************************************/ -static void vdoIoHints(struct dm_target *ti, struct queue_limits *limits) -{ - KernelLayer *layer = getKernelLayerForTarget(ti); - - limits->logical_block_size = layer->deviceConfig->logicalBlockSize; - limits->physical_block_size = VDO_BLOCK_SIZE; - - // The minimum io size for random io - blk_limits_io_min(limits, VDO_BLOCK_SIZE); - // The optimal io size for streamed/sequential io - blk_limits_io_opt(limits, VDO_BLOCK_SIZE); - - /* - * Sets the maximum discard size that will be passed into VDO. This value - * comes from a table line value passed in during dmsetup create. - * - * The value 1024 is the largest usable value on HD systems. A 2048 sector - * discard on a busy HD system takes 31 seconds. We should use a value no - * higher than 1024, which takes 15 to 16 seconds on a busy HD system. - * - * But using large values results in 120 second blocked task warnings in - * /var/log/kern.log. In order to avoid these warnings, we choose to use the - * smallest reasonable value. See VDO-3062 and VDO-3087. - * - * We allow setting of the value for max_discard_sectors even in situations - * where we only split on 4k (see comments for HAS_NO_BLKDEV_SPLIT) as the - * value is still used in other code, like sysfs display of queue limits and - * most especially in dm-thin to determine whether to pass down discards. 
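 *
 * As a worked example (assuming 4 KB VDO blocks and 512-byte sectors, so
 * 8 sectors per block): the default maxDiscardBlocks of 1 yields a
 * max_discard_sectors of 8 (one 4 KB block), while a hypothetical table
 * value of 128 blocks would yield 1024 sectors (512 KB).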
- */ - limits->max_discard_sectors - = layer->deviceConfig->maxDiscardBlocks * VDO_SECTORS_PER_BLOCK; - - limits->discard_granularity = VDO_BLOCK_SIZE; -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,11,0) - limits->discard_zeroes_data = 1; -#endif -} - -/**********************************************************************/ -static int vdoIterateDevices(struct dm_target *ti, - iterate_devices_callout_fn fn, - void *data) -{ - KernelLayer *layer = getKernelLayerForTarget(ti); - sector_t len = blockToSector(layer, layer->deviceConfig->physicalBlocks); - - return fn(ti, layer->deviceConfig->ownedDevice, 0, len, data); -} - -/* - * Status line is: - * - * - */ - -/**********************************************************************/ -static void vdoStatus(struct dm_target *ti, - status_type_t status_type, - unsigned int status_flags, - char *result, - unsigned int maxlen) -{ - KernelLayer *layer = getKernelLayerForTarget(ti); - char nameBuffer[BDEVNAME_SIZE]; - // N.B.: The DMEMIT macro uses the variables named "sz", "result", "maxlen". - int sz = 0; - - switch (status_type) { - case STATUSTYPE_INFO: - // Report info for dmsetup status - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - VDOStatistics *stats = &layer->vdoStatsStorage; - DMEMIT("/dev/%s %s %s %s %s %llu %llu", - bdevname(getKernelLayerBdev(layer), nameBuffer), - stats->mode, - stats->inRecoveryMode ? "recovering" : "-", - getDedupeStateName(layer->dedupeIndex), - getKVDOCompressing(&layer->kvdo) ? "online" : "offline", - stats->dataBlocksUsed + stats->overheadBlocksUsed, - stats->physicalBlocks); - mutex_unlock(&layer->statsMutex); - break; - - case STATUSTYPE_TABLE: - // Report the string actually specified in the beginning. - DMEMIT("%s", ((DeviceConfig *) ti->private)->originalString); - break; - } - -// spin_unlock_irqrestore(&layer->lock, flags); -} - - -/** - * Get the size of the underlying device, in blocks. - * - * @param [in] layer The layer - * - * @return The size in blocks - **/ -static BlockCount getUnderlyingDeviceBlockCount(KernelLayer *layer) -{ - uint64_t physicalSize = i_size_read(getKernelLayerBdev(layer)->bd_inode); - return physicalSize / VDO_BLOCK_SIZE; -} - -/**********************************************************************/ -static int vdoPrepareToGrowLogical(KernelLayer *layer, char *sizeString) -{ - BlockCount logicalCount; - if (sscanf(sizeString, "%llu", &logicalCount) != 1) { - logWarning("Logical block count \"%s\" is not a number", sizeString); - return -EINVAL; - } - - if (logicalCount > MAXIMUM_LOGICAL_BLOCKS) { - logWarning("Logical block count \"%llu\" exceeds the maximum (%" - PRIu64 ")", logicalCount, MAXIMUM_LOGICAL_BLOCKS); - return -EINVAL; - } - - return prepareToResizeLogical(layer, logicalCount); -} - -/** - * Process a dmsetup message now that we know no other message is being - * processed. - * - * @param layer The layer to which the message was sent - * @param argc The argument count of the message - * @param argv The arguments to the message - * - * @return -EINVAL if the message is unrecognized or the result of processing - * the message - **/ -__attribute__((warn_unused_result)) -static int processVDOMessageLocked(KernelLayer *layer, - unsigned int argc, - char **argv) -{ - // Messages with variable numbers of arguments. 
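/*
 * Usage illustration only (the device name "vdo0" is hypothetical): these
 * messages arrive through the device-mapper message interface, e.g.
 *
 *   dmsetup message vdo0 0 compression off
 *   dmsetup message vdo0 0 prepareToGrowLogical 4194304
 */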
- if (strncasecmp(argv[0], "x-", 2) == 0) { - int result = performKVDOExtendedCommand(&layer->kvdo, argc, argv); - if (result == VDO_UNKNOWN_COMMAND) { - logWarning("unknown extended command '%s' to dmsetup message", argv[0]); - result = -EINVAL; - } - - return result; - } - - // Messages with fixed numbers of arguments. - switch (argc) { - case 1: - if (strcasecmp(argv[0], "sync-dedupe") == 0) { - waitForNoRequestsActive(layer); - return 0; - } - - if (strcasecmp(argv[0], "trace-on") == 0) { - logInfo("Tracing on"); - layer->traceLogging = true; - return 0; - } - - if (strcasecmp(argv[0], "trace-off") == 0) { - logInfo("Tracing off"); - layer->traceLogging = false; - return 0; - } - - if (strcasecmp(argv[0], "prepareToGrowPhysical") == 0) { - return prepareToResizePhysical(layer, - getUnderlyingDeviceBlockCount(layer)); - } - - if (strcasecmp(argv[0], "growPhysical") == 0) { - // The actual growPhysical will happen when the device is resumed. - - if (layer->deviceConfig->version != 0) { - // XXX Uncomment this branch when new VDO manager is updated to not - // send this message. - - // Old style message on new style table is unexpected; it means the - // user started the VDO with new manager and is growing with old. - // logInfo("Mismatch between growPhysical method and table version."); - // return -EINVAL; - } else { - layer->deviceConfig->physicalBlocks - = getUnderlyingDeviceBlockCount(layer); - } - return 0; - } - - break; - - case 2: - if (strcasecmp(argv[0], "compression") == 0) { - if (strcasecmp(argv[1], "on") == 0) { - setKVDOCompressing(&layer->kvdo, true); - return 0; - } - - if (strcasecmp(argv[1], "off") == 0) { - setKVDOCompressing(&layer->kvdo, false); - return 0; - } - - logWarning("invalid argument '%s' to dmsetup compression message", - argv[1]); - return -EINVAL; - } - - if (strcasecmp(argv[0], "prepareToGrowLogical") == 0) { - return vdoPrepareToGrowLogical(layer, argv[1]); - } - - break; - - - default: - break; - } - - logWarning("unrecognized dmsetup message '%s' received", argv[0]); - return -EINVAL; -} - -/** - * Process a dmsetup message. If the message is a dump, just do it. Otherwise, - * check that no other message is being processed, and only proceed if so. - * - * @param layer The layer to which the message was sent - * @param argc The argument count of the message - * @param argv The arguments to the message - * - * @return -EBUSY if another message is being processed or the result of - * processsing the message - **/ -__attribute__((warn_unused_result)) -static int processVDOMessage(KernelLayer *layer, - unsigned int argc, - char **argv) -{ - /* - * All messages which may be processed in parallel with other messages should - * be handled here before the atomic check below. Messages which should be - * exclusive should be processed in processVDOMessageLocked(). - */ - - // Dump messages should always be processed - if (strcasecmp(argv[0], "dump") == 0) { - return vdoDump(layer, argc, argv, "dmsetup message"); - } - - if (argc == 1) { - if (strcasecmp(argv[0], "dump-on-shutdown") == 0) { - layer->dumpOnShutdown = true; - return 0; - } - - // Index messages should always be processed - if ((strcasecmp(argv[0], "index-close") == 0) - || (strcasecmp(argv[0], "index-create") == 0) - || (strcasecmp(argv[0], "index-disable") == 0) - || (strcasecmp(argv[0], "index-enable") == 0)) { - return messageDedupeIndex(layer->dedupeIndex, argv[0]); - } - - // XXX - the "connect" messages are misnamed for the kernel index. 
These - // messages should go away when all callers have been fixed to use - // "index-enable" or "index-disable". - if (strcasecmp(argv[0], "reconnect") == 0) { - return messageDedupeIndex(layer->dedupeIndex, "index-enable"); - } - - if (strcasecmp(argv[0], "connect") == 0) { - return messageDedupeIndex(layer->dedupeIndex, "index-enable"); - } - - if (strcasecmp(argv[0], "disconnect") == 0) { - return messageDedupeIndex(layer->dedupeIndex, "index-disable"); - } - } - - if (!compareAndSwapBool(&layer->processingMessage, false, true)) { - return -EBUSY; - } - - int result = processVDOMessageLocked(layer, argc, argv); - atomicStoreBool(&layer->processingMessage, false); - return result; -} - -/**********************************************************************/ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,17,0) -static int vdoMessage(struct dm_target *ti, - unsigned int argc, - char **argv, - char *resultBuffer, - unsigned int maxlen) -#else -static int vdoMessage(struct dm_target *ti, unsigned int argc, char **argv) -#endif -{ - if (argc == 0) { - logWarning("unspecified dmsetup message"); - return -EINVAL; - } - - KernelLayer *layer = getKernelLayerForTarget(ti); - RegisteredThread allocatingThread, instanceThread; - registerAllocatingThread(&allocatingThread, NULL); - registerThreadDevice(&instanceThread, layer); - int result = processVDOMessage(layer, argc, argv); - unregisterThreadDeviceID(); - unregisterAllocatingThread(); - return mapToSystemError(result); -} - -/** - * Configure the dm_target with our capabilities. - * - * @param ti The device mapper target representing our device - * @param layer The kernel layer to get the write policy from - **/ -static void configureTargetCapabilities(struct dm_target *ti, - KernelLayer *layer) -{ - ti->discards_supported = 1; - - /** - * This may appear to indicate we don't support flushes in sync mode. - * However, dm will set up the request queue to accept flushes if any - * device in the stack accepts flushes. Hence if the device under VDO - * accepts flushes, we will receive flushes. - **/ - ti->flush_supported = shouldProcessFlush(layer); - ti->num_discard_bios = 1; - ti->num_flush_bios = 1; - - // If this value changes, please make sure to update the - // value for maxDiscardSectors accordingly. - BUG_ON(dm_set_target_max_io_len(ti, VDO_SECTORS_PER_BLOCK) != 0); - -/* - * Please see comments above where the macro is defined. - */ -#if HAS_NO_BLKDEV_SPLIT - ti->split_discard_bios = 1; -#endif -} - -/** - * Handle a vdoInitialize failure, freeing all appropriate structures. - * - * @param ti The device mapper target representing our device - * @param threadConfig The thread config (possibly NULL) - * @param layer The kernel layer (possibly NULL) - * @param instance The instance number to be released - * @param why The reason for failure - **/ -static void cleanupInitialize(struct dm_target *ti, - ThreadConfig *threadConfig, - KernelLayer *layer, - unsigned int instance, - char *why) -{ - if (threadConfig != NULL) { - freeThreadConfig(&threadConfig); - } - if (layer != NULL) { - // This releases the instance number too. - freeKernelLayer(layer); - } else { - // With no KernelLayer taking ownership we have to release explicitly. 
- releaseKVDOInstance(instance); - } - - ti->error = why; -} - -/** - * Initializes a single VDO instance and loads the data from disk - * - * @param ti The device mapper target representing our device - * @param instance The device instantiation counter - * @param config The parsed config for the instance - * - * @return VDO_SUCCESS or an error code - * - **/ -static int vdoInitialize(struct dm_target *ti, - unsigned int instance, - DeviceConfig *config) -{ - logInfo("loading device '%s'", config->poolName); - - uint64_t blockSize = VDO_BLOCK_SIZE; - uint64_t logicalSize = to_bytes(ti->len); - BlockCount logicalBlocks = logicalSize / blockSize; - - logDebug("Logical block size = %llu", - (uint64_t) config->logicalBlockSize); - logDebug("Logical blocks = %llu", logicalBlocks); - logDebug("Physical block size = %llu", (uint64_t) blockSize); - logDebug("Physical blocks = %llu", config->physicalBlocks); - logDebug("Block map cache blocks = %u", config->cacheSize); - logDebug("Block map maximum age = %u", config->blockMapMaximumAge); - logDebug("MD RAID5 mode = %s", (config->mdRaid5ModeEnabled - ? "on" : "off")); - logDebug("Write policy = %s", getConfigWritePolicyString(config)); - logDebug("Deduplication = %s", (config->deduplication - ? "on" : "off")); - - // The threadConfig will be copied by the VDO if it's successfully - // created. - VDOLoadConfig loadConfig = { - .cacheSize = config->cacheSize, - .threadConfig = NULL, - .writePolicy = config->writePolicy, - .maximumAge = config->blockMapMaximumAge, - }; - - char *failureReason; - KernelLayer *layer; - int result = makeKernelLayer(ti->begin, instance, config, - &kvdoDevice.kobj, &loadConfig.threadConfig, - &failureReason, &layer); - if (result != VDO_SUCCESS) { - logError("Could not create kernel physical layer. (VDO error %d," - " message %s)", result, failureReason); - cleanupInitialize(ti, loadConfig.threadConfig, NULL, instance, - failureReason); - return result; - } - - // Now that we have read the geometry, we can finish setting up the - // VDOLoadConfig. - setLoadConfigFromGeometry(&layer->geometry, &loadConfig); - - if (config->cacheSize < (2 * MAXIMUM_USER_VIOS - * loadConfig.threadConfig->logicalZoneCount)) { - logWarning("Insufficient block map cache for logical zones"); - cleanupInitialize(ti, loadConfig.threadConfig, layer, instance, - "Insufficient block map cache for logical zones"); - return VDO_BAD_CONFIGURATION; - } - - // Henceforth it is the kernel layer's responsibility to clean up the - // ThreadConfig. - result = preloadKernelLayer(layer, &loadConfig, &failureReason); - if (result != VDO_SUCCESS) { - logError("Could not start kernel physical layer. 
(VDO error %d," - " message %s)", result, failureReason); - cleanupInitialize(ti, NULL, layer, instance, failureReason); - return result; - } - - setDeviceConfigLayer(config, layer); - setKernelLayerActiveConfig(layer, config); - ti->private = config; - configureTargetCapabilities(ti, layer); - return VDO_SUCCESS; -} - -/**********************************************************************/ -static int vdoCtr(struct dm_target *ti, unsigned int argc, char **argv) -{ - int result = VDO_SUCCESS; - - RegisteredThread allocatingThread; - registerAllocatingThread(&allocatingThread, NULL); - - const char *deviceName = dm_device_name(dm_table_get_md(ti->table)); - KernelLayer *oldLayer = findLayerMatching(layerIsNamed, (void *)deviceName); - unsigned int instance; - if (oldLayer == NULL) { - result = allocateKVDOInstance(&instance); - if (result != VDO_SUCCESS) { - unregisterAllocatingThread(); - return -ENOMEM; - } - } else { - instance = oldLayer->instance; - } - - RegisteredThread instanceThread; - registerThreadDeviceID(&instanceThread, &instance); - - bool verbose = (oldLayer == NULL); - DeviceConfig *config = NULL; - result = parseDeviceConfig(argc, argv, ti, verbose, &config); - if (result != VDO_SUCCESS) { - unregisterThreadDeviceID(); - unregisterAllocatingThread(); - if (oldLayer == NULL) { - releaseKVDOInstance(instance); - } - return -EINVAL; - } - - // Is there already a device of this name? - if (oldLayer != NULL) { - /* - * To preserve backward compatibility with old VDO Managers, we need to - * allow this to happen when either suspended or not. We could assert - * that if the config is version 0, we are suspended, and if not, we - * are not, but we can't do that till new VDO Manager does the right - * order. - */ - logInfo("preparing to modify device '%s'", config->poolName); - result = prepareToModifyKernelLayer(oldLayer, config, &ti->error); - if (result != VDO_SUCCESS) { - result = mapToSystemError(result); - freeDeviceConfig(&config); - } else { - setDeviceConfigLayer(config, oldLayer); - ti->private = config; - configureTargetCapabilities(ti, oldLayer); - } - unregisterThreadDeviceID(); - unregisterAllocatingThread(); - return result; - } - - result = vdoInitialize(ti, instance, config); - if (result != VDO_SUCCESS) { - // vdoInitialize calls into various VDO routines, so map error - result = mapToSystemError(result); - freeDeviceConfig(&config); - } - - unregisterThreadDeviceID(); - unregisterAllocatingThread(); - return result; -} - -/**********************************************************************/ -static void vdoDtr(struct dm_target *ti) -{ - DeviceConfig *config = ti->private; - KernelLayer *layer = config->layer; - - setDeviceConfigLayer(config, NULL); - - if (isRingEmpty(&layer->deviceConfigRing)) { - // This was the last config referencing the layer. Free it. - unsigned int instance = layer->instance; - RegisteredThread allocatingThread, instanceThread; - registerThreadDeviceID(&instanceThread, &instance); - registerAllocatingThread(&allocatingThread, NULL); - - waitForNoRequestsActive(layer); - logInfo("stopping device '%s'", config->poolName); - - if (layer->dumpOnShutdown) { - vdoDumpAll(layer, "device shutdown"); - } - - freeKernelLayer(layer); - logInfo("device '%s' stopped", config->poolName); - unregisterThreadDeviceID(); - unregisterAllocatingThread(); - } else if (config == layer->deviceConfig) { - // The layer still references this config. Give it a reference to a - // config that isn't being destroyed. 
- layer->deviceConfig = asDeviceConfig(layer->deviceConfigRing.next); - } - - freeDeviceConfig(&config); - ti->private = NULL; -} - -/**********************************************************************/ -static void vdoPresuspend(struct dm_target *ti) -{ - KernelLayer *layer = getKernelLayerForTarget(ti); - RegisteredThread instanceThread; - registerThreadDevice(&instanceThread, layer); - if (dm_noflush_suspending(ti)) { - layer->noFlushSuspend = true; - } - unregisterThreadDeviceID(); -} - -/**********************************************************************/ -static void vdoPostsuspend(struct dm_target *ti) -{ - KernelLayer *layer = getKernelLayerForTarget(ti); - RegisteredThread instanceThread; - registerThreadDevice(&instanceThread, layer); - const char *poolName = layer->deviceConfig->poolName; - logInfo("suspending device '%s'", poolName); - int result = suspendKernelLayer(layer); - if (result == VDO_SUCCESS) { - logInfo("device '%s' suspended", poolName); - } else { - logError("suspend of device '%s' failed with error: %d", poolName, result); - } - layer->noFlushSuspend = false; - unregisterThreadDeviceID(); -} - -/**********************************************************************/ -static int vdoPreresume(struct dm_target *ti) -{ - KernelLayer *layer = getKernelLayerForTarget(ti); - DeviceConfig *config = ti->private; - RegisteredThread instanceThread; - - BlockCount backingBlocks = getUnderlyingDeviceBlockCount(layer); - if (backingBlocks < config->physicalBlocks) { - logError("resume of device '%s' failed: backing device has %" PRIu64 - " blocks but VDO physical size is %llu blocks", - config->poolName, backingBlocks, config->physicalBlocks); - return -EINVAL; - } - - registerThreadDevice(&instanceThread, layer); - - if (getKernelLayerState(layer) == LAYER_STARTING) { - // This is the first time this device has been resumed, so run it. - logInfo("starting device '%s'", config->poolName); - char *failureReason; - int result = startKernelLayer(layer, &failureReason); - if (result != VDO_SUCCESS) { - logError("Could not run kernel physical layer. (VDO error %d," - " message %s)", result, failureReason); - setKVDOReadOnly(&layer->kvdo, result); - unregisterThreadDeviceID(); - return mapToSystemError(result); - } - - logInfo("device '%s' started", config->poolName); - } - - logInfo("resuming device '%s'", config->poolName); - - // This is a noop if nothing has changed, and by calling it every time - // we capture old-style growPhysicals, which change the config in place. 
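/*
 * Usage illustration only (device name and table contents hypothetical):
 * configuration changes normally reach this callback through a table reload
 * followed by a resume, e.g.
 *
 *   dmsetup reload vdo0 --table "<new table line>"
 *   dmsetup resume vdo0
 *
 * at which point the modified config is committed below.
 */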
- int result = modifyKernelLayer(layer, config); - if (result != VDO_SUCCESS) { - logErrorWithStringError(result, "Commit of modifications to device '%s'" - " failed", config->poolName); - setKernelLayerActiveConfig(layer, config); - setKVDOReadOnly(&layer->kvdo, result); - } else { - setKernelLayerActiveConfig(layer, config); - result = resumeKernelLayer(layer); - if (result != VDO_SUCCESS) { - logError("resume of device '%s' failed with error: %d", - layer->deviceConfig->poolName, result); - } - } - unregisterThreadDeviceID(); - return mapToSystemError(result); -} - -/**********************************************************************/ -static void vdoResume(struct dm_target *ti) -{ - KernelLayer *layer = getKernelLayerForTarget(ti); - RegisteredThread instanceThread; - registerThreadDevice(&instanceThread, layer); - logInfo("device '%s' resumed", layer->deviceConfig->poolName); - unregisterThreadDeviceID(); -} - -/* - * If anything changes that affects how user tools will interact - * with vdo, update the version number and make sure - * documentation about the change is complete so tools can - * properly update their management code. - */ -static struct target_type vdoTargetBio = { - .features = DM_TARGET_SINGLETON, - .name = "vdo", - .version = {6, 2, 3}, - .module = THIS_MODULE, - .ctr = vdoCtr, - .dtr = vdoDtr, - .io_hints = vdoIoHints, - .iterate_devices = vdoIterateDevices, - .map = vdoMapBio, - .message = vdoMessage, - .status = vdoStatus, - .presuspend = vdoPresuspend, - .postsuspend = vdoPostsuspend, - .preresume = vdoPreresume, - .resume = vdoResume, -}; - -static bool dmRegistered = false; -static bool sysfsInitialized = false; - -/**********************************************************************/ -static void vdoDestroy(void) -{ - logDebug("in %s", __func__); - - kvdoDevice.status = SHUTTING_DOWN; - - if (sysfsInitialized) { - vdoPutSysfs(&kvdoDevice.kobj); - } - vdoDestroyProcfs(); - - kvdoDevice.status = UNINITIALIZED; - - if (dmRegistered) { - dm_unregister_target(&vdoTargetBio); - } - - cleanUpInstanceNumberTracking(); - - logInfo("unloaded version %s", CURRENT_VERSION); -} - -/**********************************************************************/ -static int __init vdoInit(void) -{ - int result = 0; - - initializeThreadDeviceRegistry(); - initializeStandardErrorBlocks(); - initializeDeviceRegistryOnce(); - logInfo("loaded version %s", CURRENT_VERSION); - - result = dm_register_target(&vdoTargetBio); - if (result < 0) { - logError("dm_register_target failed %d", result); - vdoDestroy(); - return result; - } - dmRegistered = true; - - kvdoDevice.status = UNINITIALIZED; - - vdoInitProcfs(); - /* - * Set up global sysfs stuff - */ - result = vdoInitSysfs(&kvdoDevice.kobj); - if (result < 0) { - logError("sysfs initialization failed %d", result); - vdoDestroy(); - // vdoInitSysfs only returns system error codes - return result; - } - sysfsInitialized = true; - - initWorkQueueOnce(); - initializeTraceLoggingOnce(); - initKernelVDOOnce(); - initializeInstanceNumberTracking(); - - kvdoDevice.status = READY; - return result; -} - -/**********************************************************************/ -static void __exit vdoExit(void) -{ - vdoDestroy(); -} - -module_init(vdoInit); -module_exit(vdoExit); - -MODULE_DESCRIPTION(DM_NAME " target for transparent deduplication"); -MODULE_AUTHOR("Red Hat, Inc."); -MODULE_LICENSE("GPL"); -MODULE_VERSION(CURRENT_VERSION); diff --git a/vdo/kernel/dmvdo.h b/vdo/kernel/dmvdo.h deleted file mode 100644 index a71e39d..0000000 --- 
a/vdo/kernel/dmvdo.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dmvdo.h#2 $ - */ - -#ifndef DMVDO_H -#define DMVDO_H - -#include -#include -#include - -#include "kernelLayer.h" - -typedef enum { - UNINITIALIZED = 0, - READY, - SHUTTING_DOWN, -} KVDOStatus; - -/* - * The internal representation of our device. - */ -struct kvdoDevice { - KVDOStatus status; - struct kobject kobj; -}; - -extern struct kvdoDevice kvdoDevice; - -#endif /* DMVDO_H */ diff --git a/vdo/kernel/dump.c b/vdo/kernel/dump.c deleted file mode 100644 index b9b02e2..0000000 --- a/vdo/kernel/dump.c +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dump.c#2 $ - */ - -#include "dump.h" - -#include - -#include "memoryAlloc.h" -#include "typeDefs.h" - -#include "constants.h" -#include "vdo.h" - -#include "dedupeIndex.h" -#include "histogram.h" -#include "ioSubmitter.h" -#include "logger.h" - -enum dumpOptions { - // WorkQueues - SHOW_ALBIREO_QUEUE, - SHOW_BIO_ACK_QUEUE, - SHOW_BIO_QUEUE, - SHOW_CPU_QUEUES, - SHOW_REQUEST_QUEUE, - // MemoryPools - SHOW_VIO_POOL, - // Others - SHOW_VDO_STATUS, - // This one means an option overrides the "default" choices, instead - // of altering them. 
- SKIP_DEFAULT -}; - -enum dumpOptionFlags { - // WorkQueues - FLAG_SHOW_ALBIREO_QUEUE = (1 << SHOW_ALBIREO_QUEUE), - FLAG_SHOW_BIO_ACK_QUEUE = (1 << SHOW_BIO_ACK_QUEUE), - FLAG_SHOW_BIO_QUEUE = (1 << SHOW_BIO_QUEUE), - FLAG_SHOW_CPU_QUEUES = (1 << SHOW_CPU_QUEUES), - FLAG_SHOW_REQUEST_QUEUE = (1 << SHOW_REQUEST_QUEUE), - // MemoryPools - FLAG_SHOW_VIO_POOL = (1 << SHOW_VIO_POOL), - // Others - FLAG_SHOW_VDO_STATUS = (1 << SHOW_VDO_STATUS), - // Special - FLAG_SKIP_DEFAULT = (1 << SKIP_DEFAULT) - }; - -enum { - FLAGS_ALL_POOLS = (FLAG_SHOW_VIO_POOL), - FLAGS_ALL_QUEUES = (FLAG_SHOW_REQUEST_QUEUE - | FLAG_SHOW_ALBIREO_QUEUE - | FLAG_SHOW_BIO_ACK_QUEUE - | FLAG_SHOW_BIO_QUEUE - | FLAG_SHOW_CPU_QUEUES), - FLAGS_ALL_THREADS = (FLAGS_ALL_QUEUES), - DEFAULT_DUMP_FLAGS = (FLAGS_ALL_THREADS | FLAG_SHOW_VDO_STATUS) -}; - -/**********************************************************************/ -static inline bool isArgString(const char *arg, const char *thisOption) -{ - // device-mapper convention seems to be case-independent options - return strncasecmp(arg, thisOption, strlen(thisOption)) == 0; -} - -/**********************************************************************/ -static void doDump(KernelLayer *layer, - unsigned int dumpOptionsRequested, - const char *why) -{ - logInfo("%s dump triggered via %s", THIS_MODULE->name, why); - // XXX Add in number of outstanding requests being processed by vdo - uint32_t active, maximum; - getLimiterValuesAtomically(&layer->requestLimiter, &active, &maximum); - int64_t outstanding = atomic64_read(&layer->biosSubmitted) - - atomic64_read(&layer->biosCompleted); - logInfo("%" PRIu32 " device requests outstanding (max %" PRIu32 "), " - "%" PRId64 " bio requests outstanding, poolName '%s'", - active, maximum, outstanding, layer->deviceConfig->poolName); - if ((dumpOptionsRequested & FLAG_SHOW_REQUEST_QUEUE) != 0) { - dumpKVDOWorkQueue(&layer->kvdo); - } - if ((dumpOptionsRequested & FLAG_SHOW_BIO_QUEUE) != 0) { - dumpBioWorkQueue(layer->ioSubmitter); - } - if (useBioAckQueue(layer) - && ((dumpOptionsRequested & FLAG_SHOW_BIO_ACK_QUEUE) != 0)) { - dumpWorkQueue(layer->bioAckQueue); - } - if ((dumpOptionsRequested & FLAG_SHOW_CPU_QUEUES) != 0) { - dumpWorkQueue(layer->cpuQueue); - } - dumpDedupeIndex(layer->dedupeIndex, - (dumpOptionsRequested & FLAG_SHOW_ALBIREO_QUEUE) != 0); - dumpBufferPool(layer->dataKVIOPool, - (dumpOptionsRequested & FLAG_SHOW_VIO_POOL) != 0); - if ((dumpOptionsRequested & FLAG_SHOW_VDO_STATUS) != 0) { - // Options should become more fine-grained when we have more to - // display here. - dumpKVDOStatus(&layer->kvdo); - } - reportMemoryUsage(); - logInfo("end of %s dump", THIS_MODULE->name); -} - -/**********************************************************************/ -static int parseDumpOptions(unsigned int argc, - char * const *argv, - unsigned int *dumpOptionsRequestedPtr) -{ - unsigned int dumpOptionsRequested = 0; - - static const struct { - const char *name; - unsigned int flags; - } optionNames[] = { - // Should "albireo" mean sending queue + receiving thread + outstanding? 
- { "dedupe", FLAG_SKIP_DEFAULT | FLAG_SHOW_ALBIREO_QUEUE }, - { "dedupeq", FLAG_SKIP_DEFAULT | FLAG_SHOW_ALBIREO_QUEUE }, - { "kvdodedupeq", FLAG_SKIP_DEFAULT | FLAG_SHOW_ALBIREO_QUEUE }, - { "bioack", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_ACK_QUEUE }, - { "kvdobioackq", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_ACK_QUEUE }, - { "bioackq", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_ACK_QUEUE }, - { "bio", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_QUEUE }, - { "kvdobioq", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_QUEUE }, - { "bioq", FLAG_SKIP_DEFAULT | FLAG_SHOW_BIO_QUEUE }, - { "cpu", FLAG_SKIP_DEFAULT | FLAG_SHOW_CPU_QUEUES }, - { "kvdocpuq", FLAG_SKIP_DEFAULT | FLAG_SHOW_CPU_QUEUES }, - { "cpuq", FLAG_SKIP_DEFAULT | FLAG_SHOW_CPU_QUEUES }, - { "request", FLAG_SKIP_DEFAULT | FLAG_SHOW_REQUEST_QUEUE }, - { "kvdoreqq", FLAG_SKIP_DEFAULT | FLAG_SHOW_REQUEST_QUEUE }, - { "reqq", FLAG_SKIP_DEFAULT | FLAG_SHOW_REQUEST_QUEUE }, - { "viopool", FLAG_SKIP_DEFAULT | FLAG_SHOW_VIO_POOL }, - { "vdo", FLAG_SKIP_DEFAULT | FLAG_SHOW_VDO_STATUS }, - - { "pools", FLAG_SKIP_DEFAULT | FLAGS_ALL_POOLS }, - { "queues", FLAG_SKIP_DEFAULT | FLAGS_ALL_QUEUES }, - { "threads", FLAG_SKIP_DEFAULT | FLAGS_ALL_THREADS }, - { "default", FLAG_SKIP_DEFAULT | DEFAULT_DUMP_FLAGS }, - { "all", ~0 }, - }; - - bool optionsOkay = true; - for (int i = 1; i < argc; i++) { - int j; - for (j = 0; j < COUNT_OF(optionNames); j++) { - if (isArgString(argv[i], optionNames[j].name)) { - dumpOptionsRequested |= optionNames[j].flags; - break; - } - } - if (j == COUNT_OF(optionNames)) { - logWarning("dump option name '%s' unknown", argv[i]); - optionsOkay = false; - } - } - if (!optionsOkay) { - return -EINVAL; - } - if ((dumpOptionsRequested & FLAG_SKIP_DEFAULT) == 0) { - dumpOptionsRequested |= DEFAULT_DUMP_FLAGS; - } - *dumpOptionsRequestedPtr = dumpOptionsRequested; - return 0; -} - -/**********************************************************************/ -int vdoDump(KernelLayer *layer, - unsigned int argc, - char * const *argv, - const char *why) -{ - unsigned int dumpOptionsRequested = 0; - int result = parseDumpOptions(argc, argv, &dumpOptionsRequested); - if (result != 0) { - return result; - } - doDump(layer, dumpOptionsRequested, why); - return 0; -} - -/**********************************************************************/ -void vdoDumpAll(KernelLayer *layer, const char *why) -{ - doDump(layer, ~0, why); -} diff --git a/vdo/kernel/dump.h b/vdo/kernel/dump.h deleted file mode 100644 index 5187d4f..0000000 --- a/vdo/kernel/dump.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
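/*
 * Usage illustration only (device name hypothetical): the dump facility
 * above is driven through the device-mapper message interface, e.g.
 *
 *   dmsetup message vdo0 0 dump threads vdo
 *   dmsetup message vdo0 0 dump all
 *
 * where the option names map to the flags handled in parseDumpOptions().
 */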
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/dump.h#1 $ - */ - -#ifndef DUMP_H -#define DUMP_H - -#include "kernelLayer.h" - -/** - * Dump internal state and/or statistics to the kernel log, as - * specified by zero or more string arguments. - * - * @param layer The kernel layer - * @param argc Number of arguments - * @param argv The argument list - * @param why Reason for doing the dump - **/ -int vdoDump(KernelLayer *layer, - unsigned int argc, - char * const *argv, - const char *why); - -/** - * Dump lots of internal state and statistics to the kernel log. - * Identical to "dump all", without each caller needing to set up the - * argument list. - * - * @param layer The kernel layer - * @param why Reason for doing the dump - **/ -void vdoDumpAll(KernelLayer *layer, const char *why); - -#endif // DUMP_H diff --git a/vdo/kernel/errors.c b/vdo/kernel/errors.c deleted file mode 100644 index dc9303e..0000000 --- a/vdo/kernel/errors.c +++ /dev/null @@ -1,293 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/errors.c#2 $ - */ - -#include "errors.h" - -#include -#include -#include - -#include "permassert.h" -#include "statusCodes.h" - -static const struct errorInfo errorList[] = { - { "UDS_UNINITIALIZED", "UDS library is not initialized" }, - { "UDS_SHUTTINGDOWN", "UDS library is shutting down" }, - { "UDS_EMODULE_LOAD", "Could not load modules" }, - { "UDS_ENOTHREADS", "Could not create a new thread" }, - { "UDS_NOCONTEXT", "Could not find the requested library context" }, - { "UDS_DISABLED", "UDS library context is disabled" }, - { "UDS_CORRUPT_FILE", "Corrupt file" }, - { "UDS_UNKNOWN_ERROR", "Unknown error" }, - { "UDS_GRID_NO_SERVERS", "No servers in grid configuration" }, - { "UDS_GRID_CONFIG_INCONSISTENT", "Grid configuration inconsistent" }, - { "UDS_UNSUPPORTED_VERSION", "Unsupported version" }, - { "UDS_NO_INDEXSESSION", "Index session not known" }, - { "UDS_CORRUPT_DATA", "Index data in memory is corrupt" }, - { "UDS_SHORT_READ", "Could not read requested number of bytes" }, - { "UDS_AI_ERROR", "Network address and service translation error" }, - { "UDS_RESOURCE_LIMIT_EXCEEDED", "Internal resource limits exceeded" }, - { "UDS_WRONG_CONTEXT_TYPE", "Context type mismatch" }, - { "UDS_BLOCK_ADDRESS_REQUIRED", "A block address is required" }, - { "UDS_CHUNK_DATA_REQUIRED", "Block data is required" }, - { "UDS_CHUNK_NAME_REQUIRED", "A chunk name is required" }, - { "UDS_CONF_PTR_REQUIRED", "A configuration pointer is required" }, - { "UDS_INDEX_STATS_PTR_REQUIRED", "An index stats pointer is required" }, - { "UDS_CONTEXT_STATS_PTR_REQUIRED", "A context stats pointer is required" }, - { "UDS_CONTEXT_PTR_REQUIRED", "A context pointer is required" }, - { "UDS_FILEID_REQUIRED", "A file ID is 
required" }, - { "UDS_STREAM_REQUIRED", "A stream is required" }, - { "UDS_STREAMID_REQUIRED", "A stream ID is required" }, - { "UDS_STREAM_PTR_REQUIRED", "A stream pointer is required" }, - { "UDS_INVALID_MEMORY_SIZE", - "Configured memory too small or unsupported size" }, - { "UDS_INVALID_METADATA_SIZE", "Invalid metadata size" }, - { "UDS_INDEX_NAME_REQUIRED", "An index name is required" }, - { "UDS_CONF_REQUIRED", "A configuration is required" }, - { "UDS_BAD_FILE_DESCRIPTOR", "Bad file descriptor" }, - { "UDS_INDEX_EXISTS", "Index already exists" }, - { "UDS_REQUESTS_OUT_OF_RANGE", "Maximum request value out of range" }, - { "UDS_BAD_NAMESPACE", "Bad namespace" }, - { "UDS_MIGRATOR_MISMATCH", - "Migrator arguments do not match reader arguments" }, - { "UDS_NO_INDEX", "No index found" }, - { "UDS_BAD_CHECKPOINT_FREQUENCY", "Checkpoint frequency out of range" }, - { "UDS_WRONG_INDEX_CONFIG", "Wrong type of index configuration" }, - { "UDS_INDEX_PATH_NOT_DIR", "Index path does not point to a directory" }, - { "UDS_ALREADY_OPEN", "Open invoked on already opened connection" }, - { "UDS_CALLBACK_ALREADY_REGISTERED", "Callback already registered" }, - { "UDS_INDEX_PATH_TOO_LONG", "Index path too long" }, - { "UDS_END_OF_FILE", "Unexpected end of file" }, - { "UDS_INDEX_NOT_SAVED_CLEANLY", "Index not saved cleanly" }, -}; - -static const struct errorInfo internalErrorList[] = { - { "UDS_PROTOCOL_ERROR", "Client/server protocol error" }, - { "UDS_OVERFLOW", "Index overflow" }, - { "UDS_FILLDONE", "Fill phase done" }, - { "UDS_INVALID_ARGUMENT", "Invalid argument passed to internal routine" }, - { "UDS_BAD_STATE", "UDS data structures are in an invalid state" }, - { "UDS_DUPLICATE_NAME", - "Attempt to enter the same name into a delta index twice" }, - { "UDS_UNEXPECTED_RESULT", "Unexpected result from internal routine" }, - { "UDS_INJECTED_ERROR", "Injected error" }, - { "UDS_ASSERTION_FAILED", "Assertion failed" }, - { "UDS_UNSCANNABLE", "Unscannable" }, - { "UDS_QUEUED", "Request queued" }, - { "UDS_QUEUE_ALREADY_CONNECTED", "Queue already connected" }, - { "UDS_BAD_FILL_PHASE", "Fill phase not supported" }, - { "UDS_BUFFER_ERROR", "Buffer error" }, - { "UDS_CONNECTION_LOST", "Lost connection to peer" }, - { "UDS_TIMEOUT", "A time out has occurred" }, - { "UDS_NO_DIRECTORY", "Expected directory is missing" }, - { "UDS_CHECKPOINT_INCOMPLETE", "Checkpoint not completed" }, - { "UDS_INVALID_RUN_ID", "Invalid albGenTest server run ID" }, - { "UDS_RUN_CANCELED", "albGenTest server run canceled" }, - { "UDS_ALREADY_REGISTERED", "error range already registered" }, -}; - -/** Error attributes - or into top half of error code */ -enum { - UDS_UNRECOVERABLE = (1 << 17) -}; - -typedef struct errorBlock { - const char *name; - int base; - int last; - int max; - const ErrorInfo *infos; -} ErrorBlock; - -enum { - MAX_ERROR_BLOCKS = 6 // needed for testing -}; - -static struct errorInformation { - int allocated; - int count; - ErrorBlock blocks[MAX_ERROR_BLOCKS]; -} registeredErrors; - -/**********************************************************************/ -void initializeStandardErrorBlocks(void) -{ - registeredErrors.allocated = MAX_ERROR_BLOCKS; - registeredErrors.count = 0; - - - registeredErrors.blocks[registeredErrors.count++] = (ErrorBlock) { - .name = "UDS Error", - .base = UDS_ERROR_CODE_BASE, - .last = UDS_ERROR_CODE_LAST, - .max = UDS_ERROR_CODE_BLOCK_END, - .infos = errorList, - }; - - registeredErrors.blocks[registeredErrors.count++] = (ErrorBlock) { - .name = "UDS Internal Error", - .base = 
UDS_INTERNAL_ERROR_CODE_BASE, - .last = UDS_INTERNAL_ERROR_CODE_LAST, - .max = UDS_INTERNAL_ERROR_CODE_BLOCK_END, - .infos = internalErrorList, - }; - - registeredErrors.blocks[registeredErrors.count++] = (ErrorBlock) { - .name = THIS_MODULE->name, - .base = VDO_BLOCK_START, - .last = VDO_STATUS_CODE_LAST, - .max = VDO_BLOCK_END, - .infos = vdoStatusList, - }; -} - -/** - * Fetch the error info (if any) for the error number. - * - * @param errnum the error number - * @param infoPtr the place to store the info for this error (if known), - * otherwise set to NULL - * - * @return the name of the error block (if known), NULL otherwise - **/ -static const char *getErrorInfo(int errnum, const ErrorInfo **infoPtr) -{ - for (ErrorBlock *block = registeredErrors.blocks; - block < registeredErrors.blocks + registeredErrors.count; - ++block) { - if ((errnum >= block->base) && (errnum < block->last)) { - if (infoPtr != NULL) { - *infoPtr = block->infos + (errnum - block->base); - } - return block->name; - } else if ((errnum >= block->last) && (errnum < block->max)) { - if (infoPtr != NULL) { - *infoPtr = NULL; - } - return block->name; - } - } - if (infoPtr != NULL) { - *infoPtr = NULL; - } - return NULL; -} - -/*****************************************************************************/ -const char *stringError(int errnum, char *buf, size_t buflen) -{ - if (buf == NULL) { - return NULL; - } - - const ErrorInfo *info = NULL; - const char *blockName = getErrorInfo(errnum, &info); - - if (blockName != NULL) { - if (info != NULL) { - snprintf(buf, buflen, "%s: %s", blockName, info->message); - } else { - snprintf(buf, buflen, "Unknown %s %d", blockName, errnum); - } - } else { - snprintf(buf, buflen, "System error %d", errnum); - } - return buf; -} - -/*****************************************************************************/ -const char *stringErrorName(int errnum, char *buf, size_t buflen) -{ - const ErrorInfo *info = NULL; - const char *blockName = getErrorInfo(errnum, &info); - - if (blockName != NULL) { - if (info != NULL) { - snprintf(buf, buflen, "%s: %s", blockName, info->name); - } else { - snprintf(buf, buflen, "Unknown %s %d", blockName, errnum); - } - } else { - snprintf(buf, buflen, "System error %d", errnum); - } - return buf; -} - -/*****************************************************************************/ -int makeUnrecoverable(int resultCode) -{ - return ((resultCode == UDS_SUCCESS) - ? 
resultCode - : (resultCode | UDS_UNRECOVERABLE)); -} - -/*****************************************************************************/ -int sansUnrecoverable(int resultCode) -{ - return resultCode & ~UDS_UNRECOVERABLE; -} - -/*****************************************************************************/ -bool isUnrecoverable(int resultCode) -{ - return (bool)(resultCode & UDS_UNRECOVERABLE); -} - -/*****************************************************************************/ -int registerErrorBlock(const char *blockName, - int firstError, - int lastReservedError, - const ErrorInfo *infos, - size_t infoSize) -{ - int result = ASSERT(firstError < lastReservedError, - "bad error block range"); - if (result != UDS_SUCCESS) { - return result; - } - - if (registeredErrors.count == registeredErrors.allocated) { - // could reallocate and grow, but should never happen - return UDS_OVERFLOW; - } - - for (ErrorBlock *block = registeredErrors.blocks; - block < registeredErrors.blocks + registeredErrors.count; - ++block) { - if (strcmp(blockName, block->name) == 0) { - return UDS_DUPLICATE_NAME; - } - // check for overlap in error ranges - if ((firstError < block->max) && (lastReservedError > block->base)) { - return UDS_ALREADY_REGISTERED; - } - } - - registeredErrors.blocks[registeredErrors.count++] = (ErrorBlock) { - .name = blockName, - .base = firstError, - .last = firstError + (infoSize / sizeof(ErrorInfo)), - .max = lastReservedError, - .infos = infos - }; - - return UDS_SUCCESS; -} diff --git a/vdo/kernel/errors.h b/vdo/kernel/errors.h deleted file mode 100644 index acfb777..0000000 --- a/vdo/kernel/errors.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
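/*
 * A minimal standalone sketch of the lookup behind getErrorInfo() and
 * stringError(): codes in [base, last) have an entry in the block's info
 * array, codes in [last, max) belong to the block but carry no message, and
 * anything else is reported as a system error.  The Demo* types, the
 * two-entry table, and main() are invented for the example; only the range
 * logic and the UDS_INTERNAL_ERROR_CODE_BASE value come from the code above.
 */
#include <stddef.h>
#include <stdio.h>

typedef struct { const char *name; const char *message; } DemoErrorInfo;
typedef struct { const char *name; int base; int last; int max;
                 const DemoErrorInfo *infos; } DemoErrorBlock;

static const DemoErrorInfo demoInfos[] = {
  { "UDS_PROTOCOL_ERROR", "Client/server protocol error" },
  { "UDS_OVERFLOW", "Index overflow" },
};

static const DemoErrorBlock demoBlock = {
  .name = "UDS Internal Error", .base = 66560, .last = 66562, .max = 67000,
  .infos = demoInfos,
};

/* Same shape as stringError(): known message, known block, or system error. */
static const char *demoStringError(int errnum, char *buf, size_t buflen)
{
  if ((errnum >= demoBlock.base) && (errnum < demoBlock.last)) {
    snprintf(buf, buflen, "%s: %s", demoBlock.name,
             demoBlock.infos[errnum - demoBlock.base].message);
  } else if ((errnum >= demoBlock.last) && (errnum < demoBlock.max)) {
    snprintf(buf, buflen, "Unknown %s %d", demoBlock.name, errnum);
  } else {
    snprintf(buf, buflen, "System error %d", errnum);
  }
  return buf;
}

int main(void)
{
  char buf[128]; /* matches the 128-byte default used with stringError() */
  printf("%s\n", demoStringError(66561, buf, sizeof(buf)));
  printf("%s\n", demoStringError(66900, buf, sizeof(buf)));
  printf("%s\n", demoStringError(5, buf, sizeof(buf)));
  return 0;
}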
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/errors.h#1 $ - */ - -#ifndef ERRORS_H -#define ERRORS_H - -#include -#include "uds-error.h" - -enum udsInternalErrorCodes { - /** Used as a base value for reporting internal errors */ - UDS_INTERNAL_ERROR_CODE_BASE = 66560, - /** Client/server protocol framing error */ - UDS_PROTOCOL_ERROR = UDS_INTERNAL_ERROR_CODE_BASE + 0, - /** Index overflow */ - UDS_OVERFLOW = UDS_INTERNAL_ERROR_CODE_BASE + 1, - /** Fill phase done (intended for albfill only) */ - UDS_FILLDONE = UDS_INTERNAL_ERROR_CODE_BASE + 2, - /** Invalid argument passed to internal routine */ - UDS_INVALID_ARGUMENT = UDS_INTERNAL_ERROR_CODE_BASE + 3, - /** UDS data structures are in an invalid state */ - UDS_BAD_STATE = UDS_INTERNAL_ERROR_CODE_BASE + 4, - /** Attempt to enter the same name into an internal structure twice */ - UDS_DUPLICATE_NAME = UDS_INTERNAL_ERROR_CODE_BASE + 5, - /** An internal protocol violation between system components */ - UDS_UNEXPECTED_RESULT = UDS_INTERNAL_ERROR_CODE_BASE + 6, - /** An error created by test case processing */ - UDS_INJECTED_ERROR = UDS_INTERNAL_ERROR_CODE_BASE + 7, - /** An assertion failed */ - UDS_ASSERTION_FAILED = UDS_INTERNAL_ERROR_CODE_BASE + 8, - /** A file or stream is not scannable with the current scanner */ - UDS_UNSCANNABLE = UDS_INTERNAL_ERROR_CODE_BASE + 9, - /** Not an actual error, but reporting that the result will be delayed */ - UDS_QUEUED = UDS_INTERNAL_ERROR_CODE_BASE + 10, - /** Queue already connected */ - UDS_QUEUE_ALREADY_CONNECTED = UDS_INTERNAL_ERROR_CODE_BASE + 11, - /** Fill phase not supported */ - UDS_BAD_FILL_PHASE = UDS_INTERNAL_ERROR_CODE_BASE + 12, - /** A problem has occurred with a Buffer */ - UDS_BUFFER_ERROR = UDS_INTERNAL_ERROR_CODE_BASE + 13, - /** A network connection was lost */ - UDS_CONNECTION_LOST = UDS_INTERNAL_ERROR_CODE_BASE + 14, - /** A time out has occurred */ - UDS_TIMEOUT = UDS_INTERNAL_ERROR_CODE_BASE + 15, - /** No directory was found where one was expected */ - UDS_NO_DIRECTORY = UDS_INTERNAL_ERROR_CODE_BASE + 16, - /** Checkpoint not completed */ - UDS_CHECKPOINT_INCOMPLETE = UDS_INTERNAL_ERROR_CODE_BASE + 17, - /** Invalid albGenTest server run ID */ - UDS_INVALID_RUN_ID = UDS_INTERNAL_ERROR_CODE_BASE + 18, - /** albGenTest server run canceled */ - UDS_RUN_CANCELED = UDS_INTERNAL_ERROR_CODE_BASE + 19, - /** this error range has already been registered */ - UDS_ALREADY_REGISTERED = UDS_INTERNAL_ERROR_CODE_BASE + 20, - /** One more than the last UDS_INTERNAL error code */ - UDS_INTERNAL_ERROR_CODE_LAST, - /** One more than the last error this block will ever use */ - UDS_INTERNAL_ERROR_CODE_BLOCK_END = UDS_INTERNAL_ERROR_CODE_BASE + 440 -}; - -enum { - ERRBUF_SIZE = 128 // default size for buffer passed to stringError -}; - -const char *stringError(int errnum, char *buf, size_t buflen); -const char *stringErrorName(int errnum, char *buf, size_t buflen); - -int makeUnrecoverable(int resultCode) __attribute__((warn_unused_result)); -bool isUnrecoverable(int resultCode) __attribute__((warn_unused_result)); -int sansUnrecoverable(int resultCode) __attribute__((warn_unused_result)); - -typedef struct errorInfo { - const char *name; - const char *message; -} ErrorInfo; - -/** - * Initialize UDS error code blocks. - * - * @note Must be called once, before any of the other routines in this - * file. - **/ -void initializeStandardErrorBlocks(void); - -/** - * Register an error code block for stringError and stringErrorName. 
- * - * @param blockName the name of the block of error codes - * @param firstError the first error code in the block - * @param lastReservedError one past the highest possible error in the block - * @param infos a pointer to the error info array for the block - * @param infoSize the size of the error info array, which determines - * the last actual error for which information is - * available - * - * @return a success or error code, particularly UDS_DUPLICATE_NAME if the - * block name is already present, or UDS_ALREADY_REGISTERED if a - * block with the specified error code is present - **/ -int registerErrorBlock(const char *blockName, - int firstError, - int lastReservedError, - const ErrorInfo *infos, - size_t infoSize); - -#endif /* ERRORS_H */ diff --git a/vdo/kernel/histogram.c b/vdo/kernel/histogram.c deleted file mode 100644 index 0e1a6ae..0000000 --- a/vdo/kernel/histogram.c +++ /dev/null @@ -1,665 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/histogram.c#2 $ - */ - -#include - -#include "memoryAlloc.h" -#include "typeDefs.h" - -#include "histogram.h" -#include "logger.h" -#include "numUtils.h" - -/* - * Set NO_BUCKETS to streamline the histogram code by reducing it to - * tracking just minimum, maximum, mean, etc. Only one bucket counter - * (the final one for "bigger" values) will be used, no range checking - * is needed to find the right bucket, and no histogram will be - * reported. With newer compilers, the histogram output code will be - * optimized out. - */ -enum { - NO_BUCKETS = 1 -}; - -/* - * Support histogramming in the VDO code. - * - * This is not a complete and general histogram package. It follows the XP - * practice of implementing the "customer" requirements, and no more. We can - * support other requirements after we know what they are. - * - * The code was originally borrowed from Albireo, and includes both linear and - * logarithmic histograms. VDO only uses the logarithmic histograms. - * - * All samples are uint64_t values. - * - * A unit conversion option is supported internally to allow sample values to - * be supplied in "jiffies" and results to be reported via /sys in - * milliseconds. Depending on the system configuration, this could mean a - * factor of four (a bucket for values of 1 jiffy is reported as 4-7 - * milliseconds). In theory it could be a non-integer ratio (including less - * than one), but as the x86-64 platforms we've encountered appear to use 1 or - * 4 milliseconds per jiffy, we don't support non-integer values yet. - * - * All internal processing uses the values as passed to enterHistogramSample. - * Conversions only affect the values seen or input through the /sys interface, - * including possibly rounding a "limit" value entered. 
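/*
 * A small sketch of the unit conversion described above, assuming HZ == 250
 * so that one jiffy reports as 4 milliseconds.  It prints the reported range
 * for the first few buckets the same way the histogram reporting code does
 * (lower = factor * bottom[i], upper = factor * bottom[i + 1] - 1), so the
 * bucket recording 1-jiffy samples shows up as "4 - 7" milliseconds.
 */
#include <stdio.h>

int main(void)
{
  const unsigned long bottomValue[] = { 0, 1, 2, 3, 4, 5 };
  const unsigned int conversionFactor = 4; /* jiffies_to_msecs(1) when HZ=250 */

  for (int i = 0; i < 5; i++) {
    unsigned long lower = conversionFactor * bottomValue[i];
    unsigned long upper = conversionFactor * bottomValue[i + 1] - 1;
    printf("bucket %d: %lu - %lu ms\n", i, lower, upper);
  }
  return 0;
}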
- */ - -struct histogram { - // These fields are ordered so that enterHistogramSample touches - // only the first cache line. - atomic64_t *counters; // Counter for each bucket - uint64_t limit; // We want to know how many samples are larger - atomic64_t sum; // Sum of all the samples - atomic64_t count; // Number of samples - atomic64_t minimum; // Minimum value - atomic64_t maximum; // Maximum value - atomic64_t unacceptable; // Number of samples that exceed the limit - int numBuckets; // The number of buckets - bool logFlag; // True if the y scale should be logarithmic - // These fields are used only when reporting results. - const char *label; // Histogram label - const char *countedItems; // Name for things being counted - const char *metric; // Term for value used to divide into buckets - const char *sampleUnits; // Unit for measuring metric; NULL for count - unsigned int conversionFactor; // Converts input units to reporting units - struct kobject kobj; -}; - -/* - * Fixed table defining the top value for each bucket of a logarithmic - * histogram. We arbitrarily limit the histogram to 12 orders of magnitude. - */ -enum { MAX_LOG_SIZE = 12 }; -static const uint64_t bottomValue[1 + 10 * MAX_LOG_SIZE] = { - // 0 to 10 - The first 10 buckets are linear - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, - // 10 to 100 - From this point on, the Nth entry of the table is - // floor(exp10((double)N/10.0)). - 12, 15, 19, 25, 31, 39, 50, 63, 79, 100, - // 100 to 1K - 125, 158, 199, 251, 316, 398, 501, 630, 794, 1000, - // 1K to 10K - 1258, 1584, 1995, 2511, 3162, 3981, 5011, 6309, 7943, 10000, - // 10K to 100K - 12589, 15848, 19952, 25118, 31622, 39810, 50118, 63095, 79432, 100000, - // 100K to 1M - 125892, 158489, 199526, 251188, 316227, - 398107, 501187, 630957, 794328, 1000000, - // 1M to 10M - 1258925, 1584893, 1995262, 2511886, 3162277, - 3981071, 5011872, 6309573, 7943282, 10000000, - // 10M to 100M - 12589254, 15848931, 19952623, 25118864, 31622776, - 39810717, 50118723, 63095734, 79432823, 100000000, - // 100M to 1G - 125892541, 158489319, 199526231, 251188643, 316227766, - 398107170, 501187233, 630957344, 794328234, 1000000000, - // 1G to 10G - 1258925411L, 1584893192L, 1995262314L, 2511886431L, 3162277660L, - 3981071705L, 5011872336L, 6309573444L, 7943282347L, 10000000000L, - // 10G to 100G - 12589254117L, 15848931924L, 19952623149L, 25118864315L, 31622776601L, - 39810717055L, 50118723362L, 63095734448L, 79432823472L, 100000000000L, - // 100G to 1T - 125892541179L, 158489319246L, 199526231496L, 251188643150L, 316227766016L, - 398107170553L, 501187233627L, 630957344480L, 794328234724L, 1000000000000L, -}; - -/***********************************************************************/ -static unsigned int divideRoundingToNearest(uint64_t number, uint64_t divisor) -{ - number += divisor / 2; - return number / divisor; -} - -/***********************************************************************/ -static int maxBucket(Histogram *h) -{ - int max = h->numBuckets; - while ((max >= 0) && (atomic64_read(&h->counters[max]) == 0)) { - max--; - } - // max == -1 means that there were no samples - return max; -} - -/***********************************************************************/ - -typedef struct { - struct attribute attr; - ssize_t (*show)(Histogram *h, char *buf); - ssize_t (*store)(Histogram *h, const char *buf, size_t length); -} HistogramAttribute; - -/***********************************************************************/ -static void histogramKobjRelease(struct kobject *kobj) -{ - Histogram *h = 
container_of(kobj, Histogram, kobj); - FREE(h->counters); - FREE(h); -} - -/***********************************************************************/ -static ssize_t histogramShow(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - HistogramAttribute *ha = container_of(attr, HistogramAttribute, attr); - if (ha->show == NULL) { - return -EINVAL; - } - Histogram *h = container_of(kobj, Histogram, kobj); - return ha->show(h, buf); -} - -/***********************************************************************/ -static ssize_t histogramStore(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t length) -{ - HistogramAttribute *ha = container_of(attr, HistogramAttribute, attr); - if (ha->show == NULL) { - return -EINVAL; - } - Histogram *h = container_of(kobj, Histogram, kobj); - return ha->store(h, buf, length); -} - -/***********************************************************************/ -static ssize_t histogramShowCount(Histogram *h, char *buf) -{ - int64_t count = atomic64_read(&h->count); - return sprintf(buf, "%" PRId64 "\n", count); -} - -/***********************************************************************/ -static ssize_t histogramShowHistogram(Histogram *h, char *buffer) -{ - /* - * We're given one page in which to write. The caller logs a complaint if we - * report that we've written too much, so we'll truncate to PAGE_SIZE-1. - */ - size_t bufferSize = PAGE_SIZE; - bool bars = true; - ssize_t length = 0; - int max = maxBucket(h); - // If max is -1, we'll fall through to reporting the total of zero. - - enum { BAR_SIZE = 50 }; - char bar[BAR_SIZE + 2]; - bar[0] = ' '; - memset(bar + 1, '=', BAR_SIZE); - bar[BAR_SIZE + 1] = '\0'; - - uint64_t total = 0; - for (int i = 0; i <= max; i++) { - total += atomic64_read(&h->counters[i]); - } - - length += snprintf(buffer, bufferSize, "%s Histogram - number of %s by %s", - h->label, h->countedItems, h->metric); - if (length >= (bufferSize - 1)) { - return bufferSize - 1; - } - if (h->sampleUnits != NULL) { - length += snprintf(buffer + length, bufferSize - length, " (%s)", - h->sampleUnits); - if (length >= (bufferSize - 1)) { - return bufferSize - 1; - } - } - length += snprintf(buffer + length, bufferSize - length, "\n"); - if (length >= (bufferSize - 1)) { - return bufferSize - 1; - } - for (int i = 0; i <= max; i++) { - uint64_t value = atomic64_read(&h->counters[i]); - - unsigned int barLength; - if (bars && (total != 0)) { - // +1 for the space at the beginning - barLength = (divideRoundingToNearest(value * BAR_SIZE, total) + 1); - if (barLength == 1) { - // Don't bother printing just the initial space. 
- barLength = 0; - } - } else { - // 0 means skip the space and the bar - barLength = 0; - } - - if (h->logFlag) { - if (i == h->numBuckets) { - length += snprintf(buffer + length, bufferSize - length, "%-16s", - "Bigger"); - } else { - unsigned int lower = h->conversionFactor * bottomValue[i]; - unsigned int upper = h->conversionFactor * bottomValue[i + 1] - 1; - length += snprintf(buffer + length, bufferSize - length, "%6u - %7u", - lower, upper); - } - } else { - if (i == h->numBuckets) { - length += snprintf(buffer + length, bufferSize - length, "%6s", - "Bigger"); - } else { - length += snprintf(buffer + length, bufferSize - length, "%6d", i); - } - } - if (length >= (bufferSize - 1)) { - return bufferSize - 1; - } - length += snprintf(buffer + length, bufferSize - length, - " : %12llu%.*s\n", value, barLength, bar); - if (length >= (bufferSize - 1)) { - return bufferSize - 1; - } - } - - length += snprintf(buffer + length, bufferSize - length, - "total %llu\n", total); - return minSizeT(bufferSize - 1, length); -} - -/***********************************************************************/ -static ssize_t histogramShowMaximum(Histogram *h, char *buf) -{ - // Maximum is initialized to 0. - unsigned long value = atomic64_read(&h->maximum); - return sprintf(buf, "%lu\n", h->conversionFactor * value); -} - -/***********************************************************************/ -static ssize_t histogramShowMinimum(Histogram *h, char *buf) -{ - // Minimum is initialized to -1. - unsigned long value = ((atomic64_read(&h->count) > 0) - ? atomic64_read(&h->minimum) - : 0); - return sprintf(buf, "%lu\n", h->conversionFactor * value); -} - -/***********************************************************************/ -static ssize_t histogramShowLimit(Histogram *h, char *buf) -{ - // Display the limit in the reporting units - return sprintf(buf, "%u\n", (unsigned int)(h->conversionFactor * h->limit)); -} - -/***********************************************************************/ -static ssize_t histogramStoreLimit(Histogram *h, - const char *buf, - size_t length) -{ - unsigned int value; - if ((length > 12) || (sscanf(buf, "%u", &value) != 1)) { - return -EINVAL; - } - /* - * Convert input from reporting units (e.g., milliseconds) to internal - * recording units (e.g., jiffies). - * - * computeBucketCount could also be called "divideRoundingUp". 
- */ - h->limit = computeBucketCount(value, h->conversionFactor); - atomic64_set(&h->unacceptable, 0); - return length; -} - -/***********************************************************************/ -static ssize_t histogramShowMean(Histogram *h, char *buf) -{ - uint64_t count = atomic64_read(&h->count); - if (count == 0) { - return sprintf(buf, "0/0\n"); - } - // Compute mean, scaled up by 1000, in reporting units - unsigned long sumTimes1000InReportingUnits - = h->conversionFactor * atomic64_read(&h->sum) * 1000; - unsigned int meanTimes1000 - = divideRoundingToNearest(sumTimes1000InReportingUnits, count); - // Print mean with fractional part - return sprintf(buf, "%u.%03u\n", meanTimes1000 / 1000, - meanTimes1000 % 1000); -} - -/***********************************************************************/ -static ssize_t histogramShowUnacceptable(Histogram *h, char *buf) -{ - int64_t count = atomic64_read(&h->unacceptable); - return sprintf(buf, "%" PRId64 "\n", count); -} - -/***********************************************************************/ -static ssize_t histogramShowLabel(Histogram *h, char *buf) -{ - return sprintf(buf, "%s\n", h->label); -} - -/***********************************************************************/ -static ssize_t histogramShowUnit(Histogram *h, char *buf) -{ - if (h->sampleUnits != NULL) { - return sprintf(buf, "%s\n", h->sampleUnits); - } else { - *buf = 0; - return 0; - } -} - -/***********************************************************************/ - -static struct sysfs_ops histogramSysfsOps = { - .show = histogramShow, - .store = histogramStore, -}; - -static HistogramAttribute countAttribute = { - .attr = { .name = "count", .mode = 0444, }, - .show = histogramShowCount, -}; - -static HistogramAttribute histogramAttribute = { - .attr = { .name = "histogram", .mode = 0444, }, - .show = histogramShowHistogram, -}; - -static HistogramAttribute labelAttribute = { - .attr = { .name = "label", .mode = 0444, }, - .show = histogramShowLabel, -}; - -static HistogramAttribute maximumAttribute = { - .attr = { .name = "maximum", .mode = 0444, }, - .show = histogramShowMaximum, -}; - -static HistogramAttribute minimumAttribute = { - .attr = { .name = "minimum", .mode = 0444, }, - .show = histogramShowMinimum, -}; - -static HistogramAttribute limitAttribute = { - .attr = { .name = "limit", .mode = 0644, }, - .show = histogramShowLimit, - .store = histogramStoreLimit, -}; - -static HistogramAttribute meanAttribute = { - .attr = { .name = "mean", .mode = 0444, }, - .show = histogramShowMean, -}; - -static HistogramAttribute unacceptableAttribute = { - .attr = { .name = "unacceptable", .mode = 0444, }, - .show = histogramShowUnacceptable, -}; - -static HistogramAttribute unitAttribute = { - .attr = { .name = "unit", .mode = 0444, }, - .show = histogramShowUnit, -}; - -// "Real" histogram plotting. 
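/*
 * A minimal sketch of the two rounding divisions the histogram code leans
 * on.  divide_rounding_to_nearest() mirrors divideRoundingToNearest() above;
 * divide_rounding_up() is a local stand-in for the round-up behaviour the
 * comments ascribe to computeBucketCount(), which is defined elsewhere.
 * The sample values are invented.
 */
#include <stdint.h>
#include <stdio.h>

static unsigned int divide_rounding_to_nearest(uint64_t number, uint64_t divisor)
{
  return (number + divisor / 2) / divisor;
}

static uint64_t divide_rounding_up(uint64_t number, uint64_t divisor)
{
  return (number + divisor - 1) / divisor;
}

int main(void)
{
  /* Mean printed with three fractional digits, as histogramShowMean() does:
   * scale the sum by 1000 before dividing by the sample count. */
  uint64_t sum = 7, count = 3;
  unsigned int meanTimes1000 = divide_rounding_to_nearest(sum * 1000, count);
  printf("mean = %u.%03u\n", meanTimes1000 / 1000, meanTimes1000 % 1000);

  /* A 10 ms limit entered through /sys with 4 ms jiffies rounds up to 3 jiffies. */
  printf("limit = %llu jiffies\n",
         (unsigned long long)divide_rounding_up(10, 4));
  return 0;
}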
-static struct attribute *histogramAttributes[] = { - &countAttribute.attr, - &histogramAttribute.attr, - &labelAttribute.attr, - &limitAttribute.attr, - &maximumAttribute.attr, - &meanAttribute.attr, - &minimumAttribute.attr, - &unacceptableAttribute.attr, - &unitAttribute.attr, - NULL, -}; - -static struct kobj_type histogramKobjType = { - .release = histogramKobjRelease, - .sysfs_ops = &histogramSysfsOps, - .default_attrs = histogramAttributes, -}; - -static struct attribute *bucketlessHistogramAttributes[] = { - &countAttribute.attr, - &labelAttribute.attr, - &maximumAttribute.attr, - &meanAttribute.attr, - &minimumAttribute.attr, - &unitAttribute.attr, - NULL, -}; - -static struct kobj_type bucketlessHistogramKobjType = { - .release = histogramKobjRelease, - .sysfs_ops = &histogramSysfsOps, - .default_attrs = bucketlessHistogramAttributes, -}; - -/***********************************************************************/ -static Histogram *makeHistogram(struct kobject *parent, - const char *name, - const char *label, - const char *countedItems, - const char *metric, - const char *sampleUnits, - int numBuckets, - unsigned long conversionFactor, - bool logFlag) -{ - Histogram *h; - if (ALLOCATE(1, Histogram, "histogram", &h) != UDS_SUCCESS) { - return NULL; - } - - if (NO_BUCKETS) { - numBuckets = 0; // plus 1 for "bigger" bucket - } - - if (numBuckets <= 10) { - /* - * The first buckets in a "logarithmic" histogram are still - * linear, but the bucket-search mechanism is a wee bit slower - * than for linear, so change the type. - */ - logFlag = false; - } - - h->label = label; - h->countedItems = countedItems; - h->metric = metric; - h->sampleUnits = sampleUnits; - h->logFlag = logFlag; - h->numBuckets = numBuckets; - h->conversionFactor = conversionFactor; - atomic64_set(&h->minimum, -1UL); - - if (ALLOCATE(h->numBuckets + 1, atomic64_t, "histogram counters", - &h->counters) != UDS_SUCCESS) { - histogramKobjRelease(&h->kobj); - return NULL; - } - - kobject_init(&h->kobj, - ((numBuckets > 0) - ? &histogramKobjType - : &bucketlessHistogramKobjType)); - if (kobject_add(&h->kobj, parent, name) != 0) { - histogramKobjRelease(&h->kobj); - return NULL; - } - return h; -} - -/***********************************************************************/ -Histogram *makeLinearHistogram(struct kobject *parent, - const char *name, - const char *initLabel, - const char *countedItems, - const char *metric, - const char *sampleUnits, - int size) -{ - return makeHistogram(parent, name, initLabel, countedItems, - metric, sampleUnits, size, 1, false); -} - - -/** - * Intermediate routine for creating logarithmic histograms. - * - * Limits the histogram size, and computes the bucket count from the - * orders-of-magnitude count. - * - * @param parent The parent kobject. - * @param name The short name of the histogram. This label is - * used for the sysfs node. - * @param initLabel The label for the sampled data. This label is used - * when we plot the data. - * @param countedItems A name (plural) for the things being counted. - * @param metric The measure being used to divide samples into - * buckets. - * @param sampleUnits The units (plural) for the metric, or NULL if it's - * a simple counter. - * @param logSize The number of buckets. There are buckets for a - * range of sizes up to 10^logSize, and an extra - * bucket for larger samples. - * @param conversionFactor Unit conversion factor for reporting. 
- * - * @return the histogram - **/ -static Histogram * -makeLogarithmicHistogramWithConversionFactor(struct kobject *parent, - const char *name, - const char *initLabel, - const char *countedItems, - const char *metric, - const char *sampleUnits, - int logSize, - uint64_t conversionFactor) -{ - if (logSize > MAX_LOG_SIZE) { - logSize = MAX_LOG_SIZE; - } - return makeHistogram(parent, name, - initLabel, countedItems, metric, sampleUnits, - 10 * logSize, conversionFactor, true); -} - -/***********************************************************************/ -Histogram *makeLogarithmicHistogram(struct kobject *parent, - const char *name, - const char *initLabel, - const char *countedItems, - const char *metric, - const char *sampleUnits, - int logSize) -{ - return makeLogarithmicHistogramWithConversionFactor(parent, name, initLabel, - countedItems, - metric, sampleUnits, - logSize, 1); -} - -/***********************************************************************/ -Histogram *makeLogarithmicJiffiesHistogram(struct kobject *parent, - const char *name, - const char *initLabel, - const char *countedItems, - const char *metric, - int logSize) -{ - /* - * If these fail, we have a jiffy duration that is not an integral number of - * milliseconds, and the unit conversion code needs updating. - */ - STATIC_ASSERT(HZ <= MSEC_PER_SEC); - STATIC_ASSERT((MSEC_PER_SEC % HZ) == 0); - return makeLogarithmicHistogramWithConversionFactor(parent, name, initLabel, - countedItems, - metric, "milliseconds", - logSize, - jiffies_to_msecs(1)); -} - -/***********************************************************************/ -void enterHistogramSample(Histogram *h, uint64_t sample) -{ - int bucket; - if (h->logFlag) { - int lo = 0; - int hi = h->numBuckets; - while (lo < hi) { - int middle = (lo + hi) / 2; - if (sample < bottomValue[middle + 1]) { - hi = middle; - } else { - lo = middle + 1; - } - } - bucket = lo; - } else { - bucket = sample < h->numBuckets ? sample : h->numBuckets; - } - atomic64_inc(&h->counters[bucket]); - atomic64_inc(&h->count); - atomic64_add(sample, &h->sum); - if ((h->limit > 0) && (sample > h->limit)) { - atomic64_inc(&h->unacceptable); - } - - /* - * Theoretically this could loop a lot; in practice it should rarely - * do more than a single read, with no memory barrier, from a cache - * line we've already referenced above. - */ - uint64_t oldMaximum = atomic64_read(&h->maximum); - while (oldMaximum < sample) { - uint64_t readValue = atomic64_cmpxchg(&h->maximum, oldMaximum, sample); - if (readValue == oldMaximum) { - break; - } - oldMaximum = readValue; - } - - uint64_t oldMinimum = atomic64_read(&h->minimum); - while (oldMinimum > sample) { - uint64_t readValue = atomic64_cmpxchg(&h->minimum, oldMinimum, sample); - if (readValue == oldMinimum) { - break; - } - oldMinimum = readValue; - } -} - -/***********************************************************************/ -void freeHistogram(Histogram **hp) -{ - if (*hp != NULL) { - Histogram *h = *hp; - kobject_put(&h->kobj); - *hp = NULL; - } -} diff --git a/vdo/kernel/histogram.h b/vdo/kernel/histogram.h deleted file mode 100644 index a177e0a..0000000 --- a/vdo/kernel/histogram.h +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
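/*
 * A standalone sketch of the logarithmic bucketing: boundaries are generated
 * the way the bottomValue[] comment describes (floor(10^(N/10)) after the
 * first ten linear entries), and a sample is placed with the same
 * lower-bound binary search enterHistogramSample() uses.  DEMO_BUCKETS and
 * the sample value are invented; compile with -lm.
 */
#include <math.h>
#include <stdint.h>
#include <stdio.h>

enum { DEMO_BUCKETS = 30 }; /* three decades is plenty for the demo */

int main(void)
{
  uint64_t bottom[DEMO_BUCKETS + 1];
  for (int n = 0; n <= DEMO_BUCKETS; n++) {
    bottom[n] = (n <= 10) ? (uint64_t)n : (uint64_t)floor(pow(10.0, n / 10.0));
  }

  /* Find the bucket containing the sample: the smallest index whose upper
   * boundary (bottom[index + 1]) is still above the sample. */
  uint64_t sample = 42;
  int lo = 0, hi = DEMO_BUCKETS;
  while (lo < hi) {
    int middle = (lo + hi) / 2;
    if (sample < bottom[middle + 1]) {
      hi = middle;
    } else {
      lo = middle + 1;
    }
  }
  printf("sample %llu lands in bucket %d (%llu - %llu)\n",
         (unsigned long long)sample, lo,
         (unsigned long long)bottom[lo],
         (unsigned long long)(bottom[lo + 1] - 1));
  return 0;
}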
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/histogram.h#1 $ - */ - -#ifndef HISTOGRAM_H -#define HISTOGRAM_H - -#include - -typedef struct histogram Histogram; - -/** - * Allocate and initialize a histogram that uses linearly sized buckets. - * - * The histogram label reported via /sys is constructed from several of the - * values passed here; it will be something like "Init Label Histogram - number - * of countedItems grouped by metric (sampleUnits)", e.g., "Flush Forwarding - * Histogram - number of flushes grouped by latency (milliseconds)". Thus - * countedItems and sampleUnits should be plural. - * - * The sampleUnits string will also be reported separately via another /sys - * entry to aid in programmatic processing of the results, so the strings used - * should be consistent (e.g., always "milliseconds" and not "ms" for - * milliseconds). - * - * @param parent The parent kobject. - * @param name The short name of the histogram. This label is used - * for the sysfs node. - * @param initLabel The label for the sampled data. This label is used - * when we plot the data. - * @param countedItems A name (plural) for the things being counted. - * @param metric The measure being used to divide samples into buckets. - * @param sampleUnits The unit (plural) for the metric, or NULL if it's a - * simple counter. - * @param size The number of buckets. There are buckets for every - * value from 0 up to size (but not including) size. - * There is an extra bucket for larger samples. - * - * @return the histogram - **/ -Histogram *makeLinearHistogram(struct kobject *parent, - const char *name, - const char *initLabel, - const char *countedItems, - const char *metric, - const char *sampleUnits, - int size); - -/** - * Allocate and initialize a histogram that uses logarithmically sized - * buckets. - * - * @param parent The parent kobject. - * @param name The short name of the histogram. This label is used - * for the sysfs node. - * @param initLabel The label for the sampled data. This label is used - * when we plot the data. - * @param countedItems A name (plural) for the things being counted. - * @param metric The measure being used to divide samples into buckets. - * @param sampleUnits The unit (plural) for the metric, or NULL if it's a - * simple counter. - * @param logSize The number of buckets. There are buckets for a range - * of sizes up to 10^logSize, and an extra bucket for - * larger samples. - * - * @return the histogram - **/ -Histogram *makeLogarithmicHistogram(struct kobject *parent, - const char *name, - const char *initLabel, - const char *countedItems, - const char *metric, - const char *sampleUnits, - int logSize); - -/** - * Allocate and initialize a histogram that uses logarithmically sized - * buckets. Values are entered that count in jiffies, and they are - * reported in milliseconds. - * - * @param parent The parent kobject. - * @param name The short name of the histogram. This label is used - * for the sysfs node. - * @param initLabel The label for the sampled data. 
This label is used - * when we plot the data. - * @param countedItems A name (plural) for the things being counted. - * @param metric The measure being used to divide samples into buckets. - * @param logSize The number of buckets. There are buckets for a range - * of sizes up to 10^logSize, and an extra bucket for - * larger samples. - * - * @return the histogram - **/ -Histogram *makeLogarithmicJiffiesHistogram(struct kobject *parent, - const char *name, - const char *initLabel, - const char *countedItems, - const char *metric, - int logSize); - -/** - * Enter a sample into a histogram - * - * @param h The histogram - * @param sample The sample - **/ -void enterHistogramSample(Histogram *h, uint64_t sample); - -/** - * Free a histogram and null out the reference to it. - * - * @param hp The reference to the histogram. - **/ -void freeHistogram(Histogram **hp); - -#endif /* HISTOGRAM_H */ diff --git a/vdo/kernel/instanceNumber.c b/vdo/kernel/instanceNumber.c deleted file mode 100644 index 178fd92..0000000 --- a/vdo/kernel/instanceNumber.c +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/instanceNumber.c#1 $ - */ - -#include "instanceNumber.h" - -#include -#include - -#include "memoryAlloc.h" -#include "numUtils.h" -#include "permassert.h" - -/* - * Track in-use instance numbers using a flat bit array. - * - * O(n) run time isn't ideal, but if we have 1000 VDO devices in use - * simultaneously we still only need to scan 16 words, so it's not - * likely to be a big deal compared to other resource usage. - */ - -enum { - /** - * This minimum size for the bit array creates a numbering space of 0-999, - * which allows successive starts of the same volume to have different - * instance numbers in any reasonably-sized test. Changing instances on - * restart allows vdoMonReport to detect that the ephemeral stats have reset - * to zero. - **/ - BIT_COUNT_MINIMUM = 1000, - /** Grow the bit array by this many bits when needed */ - BIT_COUNT_INCREMENT = 100, -}; - -static struct mutex instanceNumberLock; -static unsigned int bitCount; -static unsigned long *words; -static unsigned int instanceCount; -static unsigned int nextInstance; - -/** - * Return the number of bytes needed to store a bit array of the specified - * capacity in an array of unsigned longs. - * - * @param bitCount The number of bits the array must hold - * - * @return the number of bytes needed for the array reperesentation - **/ -static size_t getBitArraySize(unsigned int bitCount) -{ - // Round up to a multiple of the word size and convert to a byte count. 
- return (computeBucketCount(bitCount, BITS_PER_LONG) * sizeof(unsigned long)); -} - -/** - * Re-allocate the bitmap word array so there will more instance numbers that - * can be allocated. Since the array is initially NULL, this also initializes - * the array the first time we allocate an instance number. - * - * @return UDS_SUCCESS or an error code from the allocation - **/ -static int growBitArray(void) -{ - unsigned int newCount = maxUInt(bitCount + BIT_COUNT_INCREMENT, - BIT_COUNT_MINIMUM); - unsigned long *newWords; - int result = reallocateMemory(words, - getBitArraySize(bitCount), - getBitArraySize(newCount), - "instance number bit array", - &newWords); - if (result != UDS_SUCCESS) { - return result; - } - - bitCount = newCount; - words = newWords; - return UDS_SUCCESS; -} - -/**********************************************************************/ -static int allocateKVDOInstanceLocked(unsigned int *instancePtr) -{ - // If there are no unallocated instances, grow the bit array. - if (instanceCount >= bitCount) { - int result = growBitArray(); - if (result != UDS_SUCCESS) { - return result; - } - } - - // There must be a zero bit somewhere now. Find it, starting just after the - // last instance allocated. - unsigned int instance = find_next_zero_bit(words, bitCount, nextInstance); - if (instance >= bitCount) { - // Nothing free after nextInstance, so wrap around to instance zero. - instance = find_first_zero_bit(words, bitCount); - int result = ASSERT(instance < bitCount, "impossibly, no zero bit found"); - if (result != UDS_SUCCESS) { - return result; - } - } - - __set_bit(instance, words); - instanceCount += 1; - nextInstance = instance + 1; - *instancePtr = instance; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int allocateKVDOInstance(unsigned int *instancePtr) -{ - mutex_lock(&instanceNumberLock); - int result = allocateKVDOInstanceLocked(instancePtr); - mutex_unlock(&instanceNumberLock); - return result; -} - -/**********************************************************************/ -void releaseKVDOInstance(unsigned int instance) -{ - mutex_lock(&instanceNumberLock); - if (instance >= bitCount) { - ASSERT_LOG_ONLY(false, "instance number %u must be less than bit count %u", - instance, bitCount); - } else if (test_bit(instance, words) == 0) { - ASSERT_LOG_ONLY(false, "instance number %u must be allocated", instance); - } else { - __clear_bit(instance, words); - instanceCount -= 1; - } - mutex_unlock(&instanceNumberLock); -} - -/**********************************************************************/ -void initializeInstanceNumberTracking(void) -{ - mutex_init(&instanceNumberLock); -} - -/**********************************************************************/ -void cleanUpInstanceNumberTracking(void) -{ - ASSERT_LOG_ONLY(instanceCount == 0, - "should have no instance numbers still in use, but have %u", - instanceCount); - FREE(words); - words = NULL; - bitCount = 0; - instanceCount = 0; - nextInstance = 0; - mutex_destroy(&instanceNumberLock); -} diff --git a/vdo/kernel/instanceNumber.h b/vdo/kernel/instanceNumber.h deleted file mode 100644 index 6d96bad..0000000 --- a/vdo/kernel/instanceNumber.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
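/*
 * A userspace sketch of the allocation policy in allocateKVDOInstanceLocked():
 * scan forward from the slot after the last number handed out, wrap back to
 * zero if nothing is free, and mark the winner used.  A plain bool array
 * stands in for the kernel bitmap helpers, and the real code grows its array
 * rather than failing when full.
 */
#include <stdbool.h>
#include <stdio.h>

enum { DEMO_SLOTS = 8 };
static bool used[DEMO_SLOTS];
static unsigned int nextInstance;

static int allocate_instance(unsigned int *instancePtr)
{
  /* Forward scan from the rotor, then wrap around to slot zero. */
  for (unsigned int offset = 0; offset < DEMO_SLOTS; offset++) {
    unsigned int candidate = (nextInstance + offset) % DEMO_SLOTS;
    if (!used[candidate]) {
      used[candidate] = true;
      nextInstance = candidate + 1;
      *instancePtr = candidate;
      return 0;
    }
  }
  return -1; /* the real code grows the bit array instead of failing */
}

int main(void)
{
  unsigned int a, b, c;
  allocate_instance(&a);      /* 0 */
  allocate_instance(&b);      /* 1 */
  used[a] = false;            /* release the first instance */
  allocate_instance(&c);      /* 2, not 0: the rotor keeps moving forward */
  printf("allocated %u, %u, then %u\n", a, b, c);
  return 0;
}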
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/instanceNumber.h#1 $ - */ - -/** - * Allocate an instance number. - * - * @param [out] instancePtr An integer to hold the allocated instance number - * - * @result UDS_SUCCESS or an error code - **/ -int allocateKVDOInstance(unsigned int *instancePtr); - -/** - * Release an instance number previously allocated. - * - * @param instance The instance number to release - **/ -void releaseKVDOInstance(unsigned int instance); - -/** - * Initialize the instance-number tracking data structures. - **/ -void initializeInstanceNumberTracking(void); - -/** - * Free up the instance-number tracking data structures. - **/ -void cleanUpInstanceNumberTracking(void); diff --git a/vdo/kernel/ioSubmitter.c b/vdo/kernel/ioSubmitter.c deleted file mode 100644 index 036bf25..0000000 --- a/vdo/kernel/ioSubmitter.c +++ /dev/null @@ -1,668 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/ioSubmitter.c#8 $ - */ - -#include "ioSubmitter.h" - -#include - -#include "memoryAlloc.h" - -#include "bio.h" -#include "dataKVIO.h" -#include "kernelLayer.h" -#include "logger.h" - -enum { - /* - * Whether to use bio merging code. - * - * Merging I/O requests in the request queue below us is helpful for - * many devices, and VDO does a good job sometimes of shuffling up - * the I/O order (too much for some simple I/O schedulers to sort - * out) as we deal with dedupe advice etc. The bio map tracks the - * yet-to-be-submitted I/O requests by block number so that we can - * collect together and submit sequential I/O operations that should - * be easy to merge. (So we don't actually *merge* them here, we - * just arrange them so that merging can happen.) - * - * For some devices, merging may not help, and we may want to turn - * off this code and save compute/spinlock cycles. - */ - USE_BIOMAP = 1, -}; - -/* - * Submission of bio operations to the underlying storage device will - * go through a separate work queue thread (or more than one) to - * prevent blocking in other threads if the storage device has a full - * queue. The plug structure allows that thread to do better batching - * of requests to make the I/O more efficient. 
- * - * When multiple worker threads are used, a thread is chosen for a - * I/O operation submission based on the PBN, so a given PBN will - * consistently wind up on the same thread. Flush operations are - * assigned round-robin. - * - * The map (protected by the mutex) collects pending I/O operations so - * that the worker thread can reorder them to try to encourage I/O - * request merging in the request queue underneath. - */ -typedef struct bioQueueData { - KvdoWorkQueue *queue; - struct blk_plug plug; - IntMap *map; - struct mutex lock; - unsigned int queueNumber; -} BioQueueData; - -struct ioSubmitter { - unsigned int numBioQueuesUsed; - unsigned int bioQueueRotationInterval; - unsigned int bioQueueRotor; - BioQueueData bioQueueData[]; -}; - -/**********************************************************************/ -static void startBioQueue(void *ptr) -{ - BioQueueData *bioQueueData = (BioQueueData *)ptr; - blk_start_plug(&bioQueueData->plug); -} - -/**********************************************************************/ -static void finishBioQueue(void *ptr) -{ - BioQueueData *bioQueueData = (BioQueueData *)ptr; - blk_finish_plug(&bioQueueData->plug); -} - -static const KvdoWorkQueueType bioQueueType = { - .start = startBioQueue, - .finish = finishBioQueue, - .actionTable = { - { .name = "bio_compressed_data", - .code = BIO_Q_ACTION_COMPRESSED_DATA, - .priority = 0 }, - { .name = "bio_data", - .code = BIO_Q_ACTION_DATA, - .priority = 0 }, - { .name = "bio_flush", - .code = BIO_Q_ACTION_FLUSH, - .priority = 2 }, - { .name = "bio_high", - .code = BIO_Q_ACTION_HIGH, - .priority = 2 }, - { .name = "bio_metadata", - .code = BIO_Q_ACTION_METADATA, - .priority = 1 }, - { .name = "bio_readcache", - .code = BIO_Q_ACTION_READCACHE, - .priority = 0 }, - { .name = "bio_verify", - .code = BIO_Q_ACTION_VERIFY, - .priority = 1 }, - }, -}; - -/** - * Check that we're running normally (i.e., not in an - * interrupt-servicing context) in an IOSubmitter bio thread. - **/ -static void assertRunningInBioQueue(void) -{ - ASSERT_LOG_ONLY(!in_interrupt(), "not in interrupt context"); - ASSERT_LOG_ONLY(strnstr(current->comm, "bioQ", TASK_COMM_LEN) != NULL, - "running in bio submission work queue thread"); -} - -/** - * Returns the BioQueueData pointer associated with the current thread. - * Results are undefined if called from any other thread. - * - * @return the BioQueueData pointer - **/ -static inline BioQueueData *getCurrentBioQueueData(void) -{ - BioQueueData *bioQueueData = (BioQueueData *) getWorkQueuePrivateData(); - // Does it look like a bio queue thread? - BUG_ON(bioQueueData == NULL); - BUG_ON(bioQueueData->queue != getCurrentWorkQueue()); - return bioQueueData; -} - -/**********************************************************************/ -static inline IOSubmitter *bioQueueToSubmitter(BioQueueData *bioQueue) -{ - BioQueueData *firstBioQueue = bioQueue - bioQueue->queueNumber; - IOSubmitter *submitter = container_of(firstBioQueue, IOSubmitter, - bioQueueData[0]); - return submitter; -} - -/** - * Return the bio thread number handling the specified physical block - * number. 
- * - * @param ioSubmitter The I/O submitter data - * @param pbn The physical block number - * - * @return read cache zone number - **/ -static unsigned int bioQueueNumberForPBN(IOSubmitter *ioSubmitter, - PhysicalBlockNumber pbn) -{ - unsigned int bioQueueIndex - = ((pbn - % (ioSubmitter->numBioQueuesUsed - * ioSubmitter->bioQueueRotationInterval)) - / ioSubmitter->bioQueueRotationInterval); - - return bioQueueIndex; -} - -/** - * Check that we're running normally (i.e., not in an - * interrupt-servicing context) in an IOSubmitter bio thread. Also - * require that the thread we're running on is the correct one for the - * supplied physical block number. - * - * @param pbn The PBN that should have been used in thread selection - **/ -static void assertRunningInBioQueueForPBN(PhysicalBlockNumber pbn) -{ - assertRunningInBioQueue(); - - BioQueueData *thisQueue = getCurrentBioQueueData(); - IOSubmitter *submitter = bioQueueToSubmitter(thisQueue); - unsigned int computedQueueNumber = bioQueueNumberForPBN(submitter, pbn); - ASSERT_LOG_ONLY(thisQueue->queueNumber == computedQueueNumber, - "running in correct bio queue (%u vs %u) for PBN %llu", - thisQueue->queueNumber, computedQueueNumber, pbn); -} - -/** - * Increments appropriate counters for bio completions - * - * @param kvio the kvio associated with the bio - * @param bio the bio to count - */ -static void countAllBiosCompleted(KVIO *kvio, BIO *bio) -{ - KernelLayer *layer = kvio->layer; - if (isData(kvio)) { - countBios(&layer->biosOutCompleted, bio); - return; - } - - countBios(&layer->biosMetaCompleted, bio); - if (kvio->vio->type == VIO_TYPE_RECOVERY_JOURNAL) { - countBios(&layer->biosJournalCompleted, bio); - } else if (kvio->vio->type == VIO_TYPE_BLOCK_MAP) { - countBios(&layer->biosPageCacheCompleted, bio); - } -} - -/**********************************************************************/ -void countCompletedBios(BIO *bio) -{ - KVIO *kvio = (KVIO *)bio->bi_private; - KernelLayer *layer = kvio->layer; - atomic64_inc(&layer->biosCompleted); - countAllBiosCompleted(kvio, bio); -} - -/**********************************************************************/ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) -void completeAsyncBio(BIO *bio) -#else -void completeAsyncBio(BIO *bio, int error) -#endif -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) - int error = getBioResult(bio); -#endif - KVIO *kvio = (KVIO *) bio->bi_private; - kvioAddTraceRecord(kvio, THIS_LOCATION("$F($io);cb=io($io)")); - countCompletedBios(bio); - if ((error == 0) && isData(kvio) && isReadVIO(kvio->vio)) { - DataKVIO *dataKVIO = kvioAsDataKVIO(kvio); - if (!isCompressed(dataKVIO->dataVIO.mapped.state) - && !dataKVIO->isPartial) { - kvdoAcknowledgeDataVIO(&dataKVIO->dataVIO); - return; - } - } - kvdoContinueKvio(kvio, error); -} - -/** - * Determines which bio counter to use - * - * @param kvio the kvio associated with the bio - * @param bio the bio to count - */ -static void countAllBios(KVIO *kvio, BIO *bio) -{ - KernelLayer *layer = kvio->layer; - if (isData(kvio)) { - countBios(&layer->biosOut, bio); - return; - } - - countBios(&layer->biosMeta, bio); - if (kvio->vio->type == VIO_TYPE_RECOVERY_JOURNAL) { - countBios(&layer->biosJournal, bio); - } else if (kvio->vio->type == VIO_TYPE_BLOCK_MAP) { - countBios(&layer->biosPageCache, bio); - } -} - -/** - * Update stats and tracing info, then submit the supplied bio to the - * OS for processing. 
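/*
 * A small sketch of the PBN-to-queue mapping above: each run of
 * rotationInterval consecutive blocks lands on one bio thread before the
 * mapping rotates to the next, which keeps sequential I/O mergeable on a
 * single queue.  The queue count and interval used here are made up.
 */
#include <stdint.h>
#include <stdio.h>

static unsigned int queue_for_pbn(uint64_t pbn,
                                  unsigned int numQueues,
                                  unsigned int rotationInterval)
{
  return (pbn % ((uint64_t)numQueues * rotationInterval)) / rotationInterval;
}

int main(void)
{
  const unsigned int queues = 4, interval = 64;
  /* Blocks 0..63 -> queue 0, 64..127 -> queue 1, ..., 256 wraps to queue 0. */
  uint64_t samples[] = { 0, 63, 64, 127, 128, 255, 256 };
  for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
    printf("pbn %3llu -> bioQ%u\n", (unsigned long long)samples[i],
           queue_for_pbn(samples[i], queues, interval));
  }
  return 0;
}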
- * - * @param kvio The KVIO associated with the bio - * @param bio The bio to submit to the OS - * @param location Call site location for tracing - **/ -static void sendBioToDevice(KVIO *kvio, BIO *bio, TraceLocation location) -{ - assertRunningInBioQueueForPBN(kvio->vio->physical); - - atomic64_inc(&kvio->layer->biosSubmitted); - countAllBios(kvio, bio); - kvioAddTraceRecord(kvio, location); - bio->bi_next = NULL; - generic_make_request(bio); -} - -/** - * Submits a bio to the underlying block device. May block if the - * device is busy. - * - * For metadata or if USE_BIOMAP is disabled, kvio->bioToSubmit holds - * the BIO pointer to submit to the target device. For normal - * data when USE_BIOMAP is enabled, kvio->biosMerged is the list of - * all bios collected together in this group; all of them get - * submitted. In both cases, the bi_end_io callback is invoked when - * each I/O operation completes. - * - * @param item The work item in the KVIO "owning" either the bio to - * submit, or the head of the bio_list to be submitted. - **/ -static void processBioMap(KvdoWorkItem *item) -{ - assertRunningInBioQueue(); - KVIO *kvio = workItemAsKVIO(item); - /* - * XXX Make these paths more regular: Should bi_bdev be set here, or - * in the caller, or in the callback function? Should we call - * finishBioQueue for the biomap case on old kernels? - */ - if (USE_BIOMAP && isData(kvio)) { - // We need to make sure to do two things here: - // 1. Use each bio's kvio when submitting. Any other kvio is not safe - // 2. Detach the bio list from the kvio before submitting, because it - // could get reused/free'd up before all bios are submitted. - BioQueueData *bioQueueData = getWorkQueuePrivateData(); - BIO *bio = NULL; - mutex_lock(&bioQueueData->lock); - if (!bio_list_empty(&kvio->biosMerged)) { - intMapRemove(bioQueueData->map, getBioSector(kvio->biosMerged.head)); - intMapRemove(bioQueueData->map, getBioSector(kvio->biosMerged.tail)); - } - bio = kvio->biosMerged.head; - bio_list_init(&kvio->biosMerged); - mutex_unlock(&bioQueueData->lock); - // Somewhere in the list we'll be submitting the current "kvio", - // so drop our handle on it now. - kvio = NULL; - - while (bio != NULL) { - KVIO *kvioBio = bio->bi_private; - BIO *next = bio->bi_next; - bio->bi_next = NULL; - setBioBlockDevice(bio, getKernelLayerBdev(kvioBio->layer)); - sendBioToDevice(kvioBio, bio, THIS_LOCATION("$F($io)")); - bio = next; - } - } else { - sendBioToDevice(kvio, kvio->bioToSubmit, THIS_LOCATION("$F($io)")); - } -} - -/** - * This function will attempt to find an already queued bio that the current - * bio can be merged with. There are two types of merging possible, forward - * and backward, which are distinguished by a flag that uses kernel - * elevator terminology. 
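/*
 * A sketch of the detach-then-walk pattern processBioMap() relies on: take
 * the merged chain while holding the lock, reset the shared list, drop the
 * lock, and only then submit each entry, since the owning structure may be
 * reused as soon as its own entry goes out.  pthreads and a toy linked list
 * stand in for the kernel primitives.
 */
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct demo_bio { int id; struct demo_bio *next; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct demo_bio *merged_head;   /* shared, protected by lock */

static void submit_all(void)
{
  pthread_mutex_lock(&lock);
  struct demo_bio *bio = merged_head;  /* detach the whole chain ... */
  merged_head = NULL;                  /* ... and reset the shared list */
  pthread_mutex_unlock(&lock);

  while (bio != NULL) {                /* submit outside the lock */
    struct demo_bio *next = bio->next;
    bio->next = NULL;
    printf("submitting bio %d\n", bio->id);
    bio = next;
  }
}

int main(void)
{
  struct demo_bio b2 = { 2, NULL }, b1 = { 1, &b2 };
  merged_head = &b1;
  submit_all();
  return 0;
}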
- * - * @param map The bio map to use for merging - * @param kvio The kvio we want to merge - * @param mergeType The type of merging we want to try - * - * @return the kvio to merge to, NULL if no merging is possible - */ -static KVIO *getMergeableLocked(IntMap *map, - KVIO *kvio, - unsigned int mergeType) -{ - BIO *bio = kvio->bioToSubmit; - sector_t mergeSector = getBioSector(bio); - switch (mergeType) { - case ELEVATOR_BACK_MERGE: - mergeSector -= VDO_SECTORS_PER_BLOCK; - break; - case ELEVATOR_FRONT_MERGE: - mergeSector += VDO_SECTORS_PER_BLOCK; - break; - } - - KVIO *kvioMerge = intMapGet(map, mergeSector); - - if (kvioMerge != NULL) { - if (!areWorkItemActionsEqual(&kvio->enqueueable.workItem, - &kvioMerge->enqueueable.workItem)) { - return NULL; - } else if (bio_data_dir(bio) != bio_data_dir(kvioMerge->bioToSubmit)) { - return NULL; - } else if (bio_list_empty(&kvioMerge->biosMerged)) { - return NULL; - } else { - switch (mergeType) { - case ELEVATOR_BACK_MERGE: - if (getBioSector(kvioMerge->biosMerged.tail) != mergeSector) { - return NULL; - } - break; - case ELEVATOR_FRONT_MERGE: - if (getBioSector(kvioMerge->biosMerged.head) != mergeSector) { - return NULL; - } - break; - } - } - } - - return kvioMerge; -} - -/**********************************************************************/ -static inline unsigned int advanceBioRotor(IOSubmitter *bioData) -{ - unsigned int index = bioData->bioQueueRotor++ - % (bioData->numBioQueuesUsed - * bioData->bioQueueRotationInterval); - index /= bioData->bioQueueRotationInterval; - return index; -} - -/**********************************************************************/ -static bool tryBioMapMerge(BioQueueData *bioQueueData, KVIO *kvio, BIO *bio) -{ - bool merged = false; - - mutex_lock(&bioQueueData->lock); - KVIO *prevKvio = getMergeableLocked(bioQueueData->map, kvio, - ELEVATOR_BACK_MERGE); - KVIO *nextKvio = getMergeableLocked(bioQueueData->map, kvio, - ELEVATOR_FRONT_MERGE); - if (prevKvio == nextKvio) { - nextKvio = NULL; - } - int result; - if ((prevKvio == NULL) && (nextKvio == NULL)) { - // no merge. just add to bioQueue - result = intMapPut(bioQueueData->map, getBioSector(bio), kvio, true, NULL); - // We don't care about failure of intMapPut in this case. - result = result; - mutex_unlock(&bioQueueData->lock); - } else { - if (nextKvio == NULL) { - // Only prev. merge to prev's tail - intMapRemove(bioQueueData->map, getBioSector(prevKvio->biosMerged.tail)); - bio_list_merge(&prevKvio->biosMerged, &kvio->biosMerged); - result = intMapPut(bioQueueData->map, - getBioSector(prevKvio->biosMerged.head), - prevKvio, true, NULL); - result = intMapPut(bioQueueData->map, - getBioSector(prevKvio->biosMerged.tail), - prevKvio, true, NULL); - } else { - // Only next. merge to next's head - // - // Handle "next merge" and "gap fill" cases the same way so as to - // reorder bios in a way that's compatible with using funnel queues - // in work queues. This avoids removing an existing work item. - intMapRemove(bioQueueData->map, getBioSector(nextKvio->biosMerged.head)); - bio_list_merge_head(&nextKvio->biosMerged, &kvio->biosMerged); - result = intMapPut(bioQueueData->map, - getBioSector(nextKvio->biosMerged.head), - nextKvio, true, NULL); - result = intMapPut(bioQueueData->map, - getBioSector(nextKvio->biosMerged.tail), - nextKvio, true, NULL); - } - - // We don't care about failure of intMapPut in this case. 
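/*
 * A sketch of the neighbor probes behind getMergeableLocked(): a back merge
 * looks for a pending I/O one block before the new one, a front merge for
 * one block after.  A flat array stands in for the IntMap, and 8 sectors per
 * 4 KB VDO block is an assumption of the example.
 */
#include <stdio.h>

enum { SECTORS_PER_BLOCK = 8, MAP_SLOTS = 64 };
static int pending[MAP_SLOTS]; /* pending[sector] != 0: an I/O is queued there */

int main(void)
{
  pending[8] = 1;                       /* an I/O already queued at sector 8 */

  unsigned int newSector = 16;          /* new I/O one block after it */
  unsigned int backKey  = newSector - SECTORS_PER_BLOCK;
  unsigned int frontKey = newSector + SECTORS_PER_BLOCK;

  if (pending[backKey]) {
    printf("back merge: append after the I/O at sector %u\n", backKey);
  } else if (pending[frontKey]) {
    printf("front merge: prepend before the I/O at sector %u\n", frontKey);
  } else {
    printf("no neighbor; queue sector %u on its own\n", newSector);
  }
  return 0;
}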
- result = result; - mutex_unlock(&bioQueueData->lock); - merged = true; - } - return merged; -} - -/**********************************************************************/ -static BioQueueData *bioQueueDataForPBN(IOSubmitter *ioSubmitter, - PhysicalBlockNumber pbn) -{ - unsigned int bioQueueIndex = bioQueueNumberForPBN(ioSubmitter, pbn); - return &ioSubmitter->bioQueueData[bioQueueIndex]; -} - -/**********************************************************************/ -void submitBio(BIO *bio, BioQAction action) -{ - KVIO *kvio = bio->bi_private; - kvio->bioToSubmit = bio; - setupKVIOWork(kvio, processBioMap, (KvdoWorkFunction) bio->bi_end_io, - action); - - KernelLayer *layer = kvio->layer; - BioQueueData *bioQueueData - = bioQueueDataForPBN(layer->ioSubmitter, kvio->vio->physical); - - kvioAddTraceRecord(kvio, THIS_LOCATION("$F($io)")); - - bio->bi_next = NULL; - bio_list_init(&kvio->biosMerged); - bio_list_add(&kvio->biosMerged, bio); - - /* - * Enabling of MD RAID5 mode optimizes performance for MD RAID5 storage - * configurations. It clears the bits for sync I/O RW flags on data block - * bios and sets the bits for sync I/O RW flags on all journal-related - * bios. - * - * This increases the frequency of full-stripe writes by altering flags of - * submitted bios. For workloads with write requests this increases the - * likelihood that the MD RAID5 device will update a full stripe instead of - * a partial stripe, thereby avoiding making read requests to the underlying - * physical storage for purposes of parity chunk calculations. - * - * Setting the sync-flag on journal-related bios is expected to reduce - * latency on journal updates submitted to an MD RAID5 device. - */ - if (layer->deviceConfig->mdRaid5ModeEnabled) { - if (isData(kvio)) { - // Clear the bits for sync I/O RW flags on data block bios. - clearBioOperationFlagSync(bio); - } else if ((kvio->vio->type == VIO_TYPE_RECOVERY_JOURNAL) - || (kvio->vio->type == VIO_TYPE_SLAB_JOURNAL)) { - // Set the bits for sync I/O RW flags on all journal-related and - // slab-journal-related bios. - setBioOperationFlagSync(bio); - } - } - - /* - * Try to use the bio map to submit this bio earlier if we're already sending - * IO for an adjacent block. If we can't use an existing pending bio, enqueue - * an operation to run in a bio submission thread appropriate to the - * indicated physical block number. 
- */ - - bool merged = false; - if (USE_BIOMAP && isData(kvio)) { - merged = tryBioMapMerge(bioQueueData, kvio, bio); - } - if (!merged) { - enqueueKVIOWork(bioQueueData->queue, kvio); - } -} - -/**********************************************************************/ -static int initializeBioQueue(BioQueueData *bioQueueData, - const char *threadNamePrefix, - const char *queueName, - unsigned int queueNumber, - KernelLayer *layer) -{ -#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,38) - bioQueueData->bdev = layer->dev->bdev; -#endif - bioQueueData->queueNumber = queueNumber; - - return makeWorkQueue(threadNamePrefix, queueName, &layer->wqDirectory, - layer, bioQueueData, &bioQueueType, 1, - &bioQueueData->queue); -} - -/**********************************************************************/ -int makeIOSubmitter(const char *threadNamePrefix, - unsigned int threadCount, - unsigned int rotationInterval, - unsigned int maxRequestsActive, - KernelLayer *layer, - IOSubmitter **ioSubmitterPtr) -{ - IOSubmitter *ioSubmitter; - int result = ALLOCATE_EXTENDED(IOSubmitter, - threadCount, - BioQueueData, - "bio submission data", - &ioSubmitter); - if (result != UDS_SUCCESS) { - return result; - } - - // Setup for each bio-submission work queue - char queueName[MAX_QUEUE_NAME_LEN]; - ioSubmitter->bioQueueRotationInterval = rotationInterval; - for (unsigned int i=0; i < threadCount; i++) { - BioQueueData *bioQueueData = &ioSubmitter->bioQueueData[i]; - snprintf(queueName, sizeof(queueName), "bioQ%u", i); - - if (USE_BIOMAP) { - mutex_init(&bioQueueData->lock); - /* - * One I/O operation per request, but both first & last sector numbers. - * - * If requests are assigned to threads round-robin, they should - * be distributed quite evenly. But if they're assigned based on - * PBN, things can sometimes be very uneven. So for now, we'll - * assume that all requests *may* wind up on one thread, and - * thus all in the same map. - */ - result = makeIntMap(maxRequestsActive * 2, 0, &bioQueueData->map); - if (result != 0) { - // Clean up the partially initialized bio-queue entirely and - // indicate that initialization failed. - logError("bio map initialization failed %d", result); - cleanupIOSubmitter(ioSubmitter); - freeIOSubmitter(ioSubmitter); - return result; - } - } - - result = initializeBioQueue(bioQueueData, - threadNamePrefix, - queueName, - i, - layer); - if (result != VDO_SUCCESS) { - // Clean up the partially initialized bio-queue entirely and - // indicate that initialization failed. 
- if (USE_BIOMAP) { - freeIntMap(&ioSubmitter->bioQueueData[i].map); - } - logError("bio queue initialization failed %d", result); - cleanupIOSubmitter(ioSubmitter); - freeIOSubmitter(ioSubmitter); - return result; - } - - ioSubmitter->numBioQueuesUsed++; - } - - *ioSubmitterPtr = ioSubmitter; - - return VDO_SUCCESS; -} - -/**********************************************************************/ -void cleanupIOSubmitter(IOSubmitter *ioSubmitter) -{ - for (int i=ioSubmitter->numBioQueuesUsed - 1; i >= 0; i--) { - finishWorkQueue(ioSubmitter->bioQueueData[i].queue); - } -} - -/**********************************************************************/ -void freeIOSubmitter(IOSubmitter *ioSubmitter) -{ - for (int i = ioSubmitter->numBioQueuesUsed - 1; i >= 0; i--) { - ioSubmitter->numBioQueuesUsed--; - freeWorkQueue(&ioSubmitter->bioQueueData[i].queue); - if (USE_BIOMAP) { - freeIntMap(&ioSubmitter->bioQueueData[i].map); - } - } - FREE(ioSubmitter); -} - -/**********************************************************************/ -void dumpBioWorkQueue(IOSubmitter *ioSubmitter) -{ - for (int i=0; i < ioSubmitter->numBioQueuesUsed; i++) { - dumpWorkQueue(ioSubmitter->bioQueueData[i].queue); - } -} - - -/**********************************************************************/ -void enqueueBioWorkItem(IOSubmitter *ioSubmitter, KvdoWorkItem *workItem) -{ - unsigned int bioQueueIndex = advanceBioRotor(ioSubmitter); - enqueueWorkQueue(ioSubmitter->bioQueueData[bioQueueIndex].queue, - workItem); -} - diff --git a/vdo/kernel/ioSubmitter.h b/vdo/kernel/ioSubmitter.h deleted file mode 100644 index c4fb5ce..0000000 --- a/vdo/kernel/ioSubmitter.h +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/ioSubmitter.h#4 $ - */ - -#ifndef IOSUBMITTER_H -#define IOSUBMITTER_H - -#include - -#include "kernelLayer.h" -#include "kvio.h" - -/** - * Does all the appropriate accounting for bio completions - * - * @param bio the bio to count - **/ -void countCompletedBios(BIO *bio); - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) -/** - * Completes a bio relating to a kvio, causing the completion callback - * to be invoked. - * - * This is used as the bi_end_io function for most of the bios created - * within VDO and submitted to the storage device. Exceptions are the - * flush code and the read-block code, both of which need to regain - * control in the kernel layer after the I/O is completed. - * - * @param bio The bio to complete - **/ -void completeAsyncBio(BIO *bio); -#else -/** - * Completes a bio relating to a kvio, causing the completion callback - * to be invoked. - * - * This is used as the bi_end_io function for most of the bios created - * within VDO and submitted to the storage device. 
Exceptions are the - * flush code and the read-block code, both of which need to regain - * control in the kernel layer after the I/O is completed. - * - * @param bio The bio to complete - * @param error Possible error from underlying block device - **/ -void completeAsyncBio(BIO *bio, int error); -#endif - -/** - * Create a IOSubmitter structure for a new physical layer. - * - * @param [in] threadNamePrefix The per-device prefix to use in process names - * @param [in] threadCount Number of bio-submission threads to set up - * @param [in] rotationInterval Interval to use when rotating between - * bio-submission threads when enqueuing work - * items - * @param [in] maxRequestsActive Number of bios for merge tracking - * @param [in] layer The kernel layer - * @param [out] ioSubmitter Pointer to the new data structure - * - * @return VDO_SUCCESS or an error - **/ -int makeIOSubmitter(const char *threadNamePrefix, - unsigned int threadCount, - unsigned int rotationInterval, - unsigned int maxRequestsActive, - KernelLayer *layer, - IOSubmitter **ioSubmitter); - -/** - * Tear down the IOSubmitter fields as needed for a physical layer. - * - * @param [in] ioSubmitter The I/O submitter data to tear down - **/ -void cleanupIOSubmitter(IOSubmitter *ioSubmitter); - -/** - * Free the IOSubmitter fields and structure as needed for a - * physical layer. This must be called after - * cleanupIOSubmitter(). It is used to release resources late in - * the shutdown process to avoid or reduce the chance of race - * conditions. - * - * @param [in] ioSubmitter The I/O submitter data to destroy - **/ -void freeIOSubmitter(IOSubmitter *ioSubmitter); - -/** - * Dump info to the kernel log about the work queue used by the - * physical layer. For debugging only. - * - * @param [in] ioSubmitter The I/O submitter data - **/ -void dumpBioWorkQueue(IOSubmitter *ioSubmitter); - - -/** - * Enqueue a work item to run in the work queue(s) used for bio - * submissions from the physical layer. - * - * Outside of IOSubmitter, used only for finishing processing of empty - * flush bios by sending them to the storage device. - * - * @param ioSubmitter The I/O submitter data to update - * @param workItem The new work item to run - **/ -void enqueueBioWorkItem(IOSubmitter *ioSubmitter, KvdoWorkItem *workItem); - -/** - * Submit bio but don't block. - * - * Submits the bio to a helper work queue which sits in a loop - * submitting bios. The worker thread may block if the target device - * is busy, which is why we don't want to do the submission in the - * original calling thread. - * - * The bi_private field of the bio must point to a KVIO associated - * with the operation. The bi_end_io callback is invoked when the I/O - * operation completes. - * - * @param bio the block I/O operation descriptor to submit - * @param action the action code specifying the priority for the operation - **/ -void submitBio(BIO *bio, BioQAction action); - -#endif // IOSUBMITTER_H diff --git a/vdo/kernel/kernelLayer.c b/vdo/kernel/kernelLayer.c deleted file mode 100644 index 8d4d4ed..0000000 --- a/vdo/kernel/kernelLayer.c +++ /dev/null @@ -1,1409 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelLayer.c#38 $ - */ - -#include "kernelLayer.h" - -#include -#include -#include -#include -#include - -#include "logger.h" -#include "memoryAlloc.h" -#include "murmur/MurmurHash3.h" - -#include "lz4.h" -#include "releaseVersions.h" -#include "volumeGeometry.h" -#include "statistics.h" -#include "vdo.h" - -#include "bio.h" -#include "dataKVIO.h" -#include "dedupeIndex.h" -#include "deviceConfig.h" -#include "deviceRegistry.h" -#include "instanceNumber.h" -#include "ioSubmitter.h" -#include "kvdoFlush.h" -#include "kvio.h" -#include "poolSysfs.h" -#include "statusProcfs.h" -#include "stringUtils.h" -#include "verify.h" - -enum { - DEDUPE_TIMEOUT_REPORT_INTERVAL = 1000, -}; - -static const KvdoWorkQueueType bioAckQType = { - .actionTable = { - { .name = "bio_ack", - .code = BIO_ACK_Q_ACTION_ACK, - .priority = 0 }, - }, -}; - -static const KvdoWorkQueueType cpuQType = { - .actionTable = { - { .name = "cpu_complete_kvio", - .code = CPU_Q_ACTION_COMPLETE_KVIO, - .priority = 0 }, - { .name = "cpu_compress_block", - .code = CPU_Q_ACTION_COMPRESS_BLOCK, - .priority = 0 }, - { .name = "cpu_hash_block", - .code = CPU_Q_ACTION_HASH_BLOCK, - .priority = 0 }, - { .name = "cpu_event_reporter", - .code = CPU_Q_ACTION_EVENT_REPORTER, - .priority = 0 }, - }, -}; - -// 2000 is half the number of entries currently in our page cache, -// to allow for each in-progress operation to update two pages. -int defaultMaxRequestsActive = 2000; - -/**********************************************************************/ -static CRC32Checksum kvdoUpdateCRC32(CRC32Checksum crc, - const byte *buffer, - size_t length) -{ - /* - * The kernel's CRC 32 implementation does not do pre- and post- - * conditioning, so do it ourselves. - */ - return crc32(crc ^ 0xffffffff, buffer, length) ^ 0xffffffff; -} - -/**********************************************************************/ -static BlockCount kvdoGetBlockCount(PhysicalLayer *header) -{ - return asKernelLayer(header)->deviceConfig->physicalBlocks; -} - -/**********************************************************************/ -bool layerIsNamed(KernelLayer *layer, void *context) -{ - struct dm_target *ti = layer->deviceConfig->owningTarget; - const char *deviceName = dm_device_name(dm_table_get_md(ti->table)); - return (strcmp(deviceName, (const char *) context) == 0); -} - -/** - * Implements LayerFilter. 
- **/ -static bool layerUsesDevice(KernelLayer *layer, void *context) -{ - DeviceConfig *config = context; - return (layer->deviceConfig->ownedDevice->bdev->bd_dev - == config->ownedDevice->bdev->bd_dev); -} - -int mapToSystemError(int error) -{ - // 0 is success, negative a system error code - if (likely(error <= 0)) { - return error; - } - if (error < 1024) { - // errno macro used without negating - may be a minor bug - return -error; - } - // VDO or UDS error - char errorName[80], errorMessage[ERRBUF_SIZE]; - switch (sansUnrecoverable(error)) { - case VDO_NO_SPACE: - return -ENOSPC; - case VDO_READ_ONLY: - return -EIO; - default: - logInfo("%s: mapping internal status code %d (%s: %s) to EIO", - __func__, error, - stringErrorName(error, errorName, sizeof(errorName)), - stringError(error, errorMessage, sizeof(errorMessage))); - return -EIO; - } -} - -/**********************************************************************/ -static void setKernelLayerState(KernelLayer *layer, KernelLayerState newState) -{ - atomicStore32(&layer->state, newState); -} - -/**********************************************************************/ -void waitForNoRequestsActive(KernelLayer *layer) -{ - // Do nothing if there are no requests active. This check is not necessary - // for correctness but does reduce log message traffic. - if (limiterIsIdle(&layer->requestLimiter)) { - return; - } - - // We have to make sure to flush the packer before waiting. We do this - // by turning off compression, which also means no new entries coming in - // while waiting will end up in the packer. - bool wasCompressing = setKVDOCompressing(&layer->kvdo, false); - // Now wait for there to be no active requests - limiterWaitForIdle(&layer->requestLimiter); - // Reset the compression state after all requests are done - if (wasCompressing) { - setKVDOCompressing(&layer->kvdo, true); - } -} - -/** - * Start processing a new data KVIO based on the supplied bio, but from within - * a VDO thread context, when we're not allowed to block. Using this path at - * all suggests a bug or erroneous usage, but we special-case it to avoid a - * deadlock that can apparently result. Message will be logged to alert the - * administrator that something has gone wrong, while we attempt to continue - * processing other requests. - * - * If a request permit can be acquired immediately, kvdoLaunchDataKVIOFromBio - * will be called. (If the bio is a discard operation, a permit from the - * discard limiter will be requested but the call will be made with or without - * it.) If the request permit is not available, the bio will be saved on a list - * to be launched later. Either way, this function will not block, and will - * take responsibility for processing the bio. - * - * @param layer The kernel layer - * @param bio The bio to launch - * @param arrivalTime The arrival time of the bio - * - * @return DM_MAPIO_SUBMITTED or a system error code - **/ -static int launchDataKVIOFromVDOThread(KernelLayer *layer, - BIO *bio, - Jiffies arrivalTime) -{ - logWarning("kvdoMapBio called from within a VDO thread!"); - /* - * We're not yet entirely sure what circumstances are causing this situation - * in [ESC-638], but it does appear to be happening and causing VDO to - * deadlock. 
- * - * Somehow kvdoMapBio is being called from generic_make_request which is - * being called from the VDO code to pass a flush on down to the underlying - * storage system; we've got 2000 requests in progress, so we have to wait - * for one to complete, but none can complete while the bio thread is blocked - * from passing more I/O requests down. Near as we can tell, the flush bio - * should always have gotten updated to point to the storage system, so we - * shouldn't be calling back into VDO unless something's gotten messed up - * somewhere. - * - * To side-step this case, if the limiter says we're busy *and* we're running - * on one of VDO's own threads, we'll drop the I/O request in a special queue - * for processing as soon as KVIOs become free. - * - * We don't want to do this in general because it leads to unbounded - * buffering, arbitrarily high latencies, inability to push back in a way the - * caller can take advantage of, etc. If someone wants huge amounts of - * buffering on top of VDO, they're welcome to access it through the kernel - * page cache or roll their own. - */ - if (!limiterPoll(&layer->requestLimiter)) { - addToDeadlockQueue(&layer->deadlockQueue, bio, arrivalTime); - logWarning("queued an I/O request to avoid deadlock!"); - - return DM_MAPIO_SUBMITTED; - } - - bool hasDiscardPermit - = (isDiscardBio(bio) && limiterPoll(&layer->discardLimiter)); - int result = kvdoLaunchDataKVIOFromBio(layer, bio, arrivalTime, - hasDiscardPermit); - // Succeed or fail, kvdoLaunchDataKVIOFromBio owns the permit(s) now. - if (result != VDO_SUCCESS) { - return result; - } - - return DM_MAPIO_SUBMITTED; -} - -/**********************************************************************/ -int kvdoMapBio(KernelLayer *layer, BIO *bio) -{ - Jiffies arrivalTime = jiffies; - KernelLayerState state = getKernelLayerState(layer); - ASSERT_LOG_ONLY(state == LAYER_RUNNING, - "kvdoMapBio should not be called while in state %d", state); - - // Count all incoming bios. - countBios(&layer->biosIn, bio); - - // Handle empty bios. Empty flush bios are not associated with a VIO. - if (isFlushBio(bio)) { - if (ASSERT(getBioSize(bio) == 0, "Flush bio is size 0") != VDO_SUCCESS) { - // We expect flushes to be of size 0. - return -EINVAL; - } - if (shouldProcessFlush(layer)) { - launchKVDOFlush(layer, bio); - return DM_MAPIO_SUBMITTED; - } else { - // We're not acknowledging this bio now, but we'll never touch it - // again, so this is the last chance to account for it. - countBios(&layer->biosAcknowledged, bio); - atomic64_inc(&layer->flushOut); - setBioBlockDevice(bio, getKernelLayerBdev(layer)); - return DM_MAPIO_REMAPPED; - } - } - - if (ASSERT(getBioSize(bio) != 0, "Data bio is not size 0") != VDO_SUCCESS) { - // We expect non-flushes to be non-zero in size. - return -EINVAL; - } - - if (isDiscardBio(bio) && isReadBio(bio)) { - // Read and Discard should never occur together - return -EIO; - } - - KvdoWorkQueue *currentWorkQueue = getCurrentWorkQueue(); - if ((currentWorkQueue != NULL) - && (layer == getWorkQueueOwner(currentWorkQueue))) { - /* - * This prohibits sleeping during I/O submission to VDO from its own - * thread. 
- */ - return launchDataKVIOFromVDOThread(layer, bio, arrivalTime); - } - bool hasDiscardPermit = false; - if (isDiscardBio(bio)) { - limiterWaitForOneFree(&layer->discardLimiter); - hasDiscardPermit = true; - } - limiterWaitForOneFree(&layer->requestLimiter); - - int result = kvdoLaunchDataKVIOFromBio(layer, bio, arrivalTime, - hasDiscardPermit); - // Succeed or fail, kvdoLaunchDataKVIOFromBio owns the permit(s) now. - if (result != VDO_SUCCESS) { - return result; - } - - return DM_MAPIO_SUBMITTED; -} - -/**********************************************************************/ -struct block_device *getKernelLayerBdev(const KernelLayer *layer) -{ - return layer->deviceConfig->ownedDevice->bdev; -} - -/**********************************************************************/ -void completeManyRequests(KernelLayer *layer, uint32_t count) -{ - // If we had to buffer some requests to avoid deadlock, release them now. - while (count > 0) { - Jiffies arrivalTime = 0; - BIO *bio = pollDeadlockQueue(&layer->deadlockQueue, &arrivalTime); - if (likely(bio == NULL)) { - break; - } - - bool hasDiscardPermit - = (isDiscardBio(bio) && limiterPoll(&layer->discardLimiter)); - int result = kvdoLaunchDataKVIOFromBio(layer, bio, arrivalTime, - hasDiscardPermit); - if (result != VDO_SUCCESS) { - completeBio(bio, result); - } - // Succeed or fail, kvdoLaunchDataKVIOFromBio owns the permit(s) now. - count--; - } - // Notify the limiter, so it can wake any blocked processes. - if (count > 0) { - limiterReleaseMany(&layer->requestLimiter, count); - } -} - -/**********************************************************************/ -static void reportEvents(PeriodicEventReporter *reporter) -{ - atomic_set(&reporter->workItemQueued, 0); - uint64_t newValue = atomic64_read(&reporter->value); - uint64_t difference = newValue - reporter->lastReportedValue; - if (difference != 0) { - logDebug(reporter->format, difference); - reporter->lastReportedValue = newValue; - } -} - -/**********************************************************************/ -static void reportEventsWork(KvdoWorkItem *item) -{ - PeriodicEventReporter *reporter = container_of(item, PeriodicEventReporter, - workItem); - reportEvents(reporter); -} - -/**********************************************************************/ -static void initPeriodicEventReporter(PeriodicEventReporter *reporter, - const char *format, - unsigned long reportingInterval, - KernelLayer *layer) -{ - setupWorkItem(&reporter->workItem, reportEventsWork, NULL, - CPU_Q_ACTION_EVENT_REPORTER); - reporter->format = format; - reporter->reportingInterval = msecs_to_jiffies(reportingInterval); - reporter->layer = layer; -} - -/**********************************************************************/ -static void addEventCount(PeriodicEventReporter *reporter, unsigned int count) -{ - if (count > 0) { - atomic64_add(count, &reporter->value); - int oldWorkItemQueued = atomic_xchg(&reporter->workItemQueued, 1); - if (oldWorkItemQueued == 0) { - enqueueWorkQueueDelayed(reporter->layer->cpuQueue, - &reporter->workItem, - jiffies + reporter->reportingInterval); - } - } -} - -/**********************************************************************/ -static void stopPeriodicEventReporter(PeriodicEventReporter *reporter) -{ - reportEvents(reporter); -} - -/**********************************************************************/ -void kvdoReportDedupeTimeout(KernelLayer *layer, unsigned int expiredCount) -{ - addEventCount(&layer->albireoTimeoutReporter, expiredCount); -} - 
-/**********************************************************************/ -static int kvdoCreateEnqueueable(VDOCompletion *completion) -{ - KvdoEnqueueable *kvdoEnqueueable; - int result = ALLOCATE(1, KvdoEnqueueable, "kvdoEnqueueable", - &kvdoEnqueueable); - if (result != VDO_SUCCESS) { - logError("kvdoEnqueueable allocation failure %d", result); - return result; - } - kvdoEnqueueable->enqueueable.completion = completion; - completion->enqueueable = &kvdoEnqueueable->enqueueable; - return VDO_SUCCESS; -} - -/**********************************************************************/ -static void kvdoDestroyEnqueueable(Enqueueable **enqueueablePtr) -{ - Enqueueable *enqueueable = *enqueueablePtr; - if (enqueueable != NULL) { - KvdoEnqueueable *kvdoEnqueueable - = container_of(enqueueable, KvdoEnqueueable, enqueueable); - FREE(kvdoEnqueueable); - *enqueueablePtr = NULL; - } -} - -/** - * Implements BufferAllocator. - **/ -static int kvdoAllocateIOBuffer(PhysicalLayer *layer __attribute__((unused)), - size_t bytes, - const char *why, - char **bufferPtr) -{ - return ALLOCATE(bytes, char, why, bufferPtr); -} - -/** - * Implements ExtentReader. Exists only for the geometry block; is unset after - * it is read. - **/ -static int kvdoSynchronousRead(PhysicalLayer *layer, - PhysicalBlockNumber startBlock, - size_t blockCount, - char *buffer, - size_t *blocksRead) -{ - if (blockCount != 1) { - return VDO_NOT_IMPLEMENTED; - } - - KernelLayer *kernelLayer = asKernelLayer(layer); - - BIO *bio; - int result = createBio(kernelLayer, buffer, &bio); - if (result != VDO_SUCCESS) { - return result; - } - setBioBlockDevice(bio, getKernelLayerBdev(kernelLayer)); - setBioSector(bio, blockToSector(kernelLayer, startBlock)); - setBioOperationRead(bio); - result = submitBioAndWait(bio); - if (result != 0) { - logErrorWithStringError(result, "synchronous read failed"); - result = -EIO; - } - freeBio(bio, kernelLayer); - - if (result != VDO_SUCCESS) { - return result; - } - if (blocksRead != NULL) { - *blocksRead = blockCount; - } - return VDO_SUCCESS; -} - -/** - * Implements VIODestructor. - **/ -static void kvdoFreeVIO(VIO **vioPtr) -{ - VIO *vio = *vioPtr; - if (vio == NULL) { - return; - } - - BUG_ON(isDataVIO(vio)); - - if (isCompressedWriteVIO(vio)) { - CompressedWriteKVIO *compressedWriteKVIO - = allocatingVIOAsCompressedWriteKVIO(vioAsAllocatingVIO(vio)); - freeCompressedWriteKVIO(&compressedWriteKVIO); - } else { - MetadataKVIO *metadataKVIO = vioAsMetadataKVIO(vio); - freeMetadataKVIO(&metadataKVIO); - } - - *vioPtr = NULL; -} - -/**********************************************************************/ -static WritePolicy kvdoGetWritePolicy(PhysicalLayer *common) -{ - KernelLayer *layer = asKernelLayer(common); - return getKVDOWritePolicy(&layer->kvdo); -} - -/** - * Function that is called when a synchronous operation is completed. We let - * the waiting thread know it can continue. - * - *
Implements OperationComplete. - * - * @param common The kernel layer - **/ -static void kvdoCompleteSyncOperation(PhysicalLayer *common) -{ - KernelLayer *layer = asKernelLayer(common); - complete(&layer->callbackSync); -} - -/** - * Wait for a synchronous operation to complete. - * - *
Implements OperationWaiter. - * - * @param common The kernel layer - **/ -static void waitForSyncOperation(PhysicalLayer *common) -{ - KernelLayer *layer = asKernelLayer(common); - // Using the "interruptible" interface means that Linux will not log a - // message when we wait for more than 120 seconds. - while (wait_for_completion_interruptible(&layer->callbackSync) != 0) { - // However, if we get a signal in a user-mode process, we could - // spin... - msleep(1); - } -} - -/** - * Make the bio set for allocating new bios. - * - * @param layer The kernel layer - * - * @returns VDO_SUCCESS if bio set created, error code otherwise - **/ -static int makeDedupeBioSet(KernelLayer *layer) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,18,0) - int result = ALLOCATE(1, struct bio_set, "bio set", &layer->bioset); - if (result != VDO_SUCCESS) { - return result; - } - - result = bioset_init(layer->bioset, 0, 0, BIOSET_NEED_BVECS); - if (result != 0) { - return result; - } -#else -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,13,0) - layer->bioset = bioset_create(0, 0, BIOSET_NEED_BVECS); -#else - layer->bioset = bioset_create(0, 0); -#endif - if (layer->bioset == NULL) { - return -ENOMEM; - } -#endif - - return VDO_SUCCESS; -} - -/**********************************************************************/ -int makeKernelLayer(uint64_t startingSector, - unsigned int instance, - DeviceConfig *config, - struct kobject *parentKobject, - ThreadConfig **threadConfigPointer, - char **reason, - KernelLayer **layerPtr) -{ - // VDO-3769 - Set a generic reason so we don't ever return garbage. - *reason = "Unspecified error"; - - KernelLayer *oldLayer = findLayerMatching(layerUsesDevice, config); - if (oldLayer != NULL) { - logError("Existing layer named %s already uses device %s", - oldLayer->deviceConfig->poolName, - oldLayer->deviceConfig->parentDeviceName); - *reason = "Cannot share storage device with already-running VDO"; - return VDO_BAD_CONFIGURATION; - } - - /* - * Part 1 - Allocate the kernel layer, its essential parts, and setup up the - * sysfs node. These must come first so that the sysfs node works correctly - * through the freeing of the kernel layer. After this part you must use - * freeKernelLayer. - */ - KernelLayer *layer; - int result = ALLOCATE(1, KernelLayer, "VDO configuration", &layer); - if (result != UDS_SUCCESS) { - *reason = "Cannot allocate VDO configuration"; - return result; - } - - // Allow the base VDO to allocate buffers and construct or destroy - // enqueuables as part of its allocation. - layer->common.allocateIOBuffer = kvdoAllocateIOBuffer; - layer->common.createEnqueueable = kvdoCreateEnqueueable; - layer->common.destroyEnqueueable = kvdoDestroyEnqueueable; - - result = allocateVDO(&layer->common, &layer->kvdo.vdo); - if (result != VDO_SUCCESS) { - *reason = "Cannot allocate VDO"; - FREE(layer); - return result; - } - - // After this point, calling kobject_put on kobj will decrement its - // reference count, and when the count goes to 0 the KernelLayer will - // be freed. 
- kobject_init(&layer->kobj, &kernelLayerKobjType); - result = kobject_add(&layer->kobj, parentKobject, config->poolName); - if (result != 0) { - *reason = "Cannot add sysfs node"; - kobject_put(&layer->kobj); - return result; - } - kobject_init(&layer->wqDirectory, &workQueueDirectoryKobjType); - result = kobject_add(&layer->wqDirectory, &layer->kobj, "work_queues"); - if (result != 0) { - *reason = "Cannot add sysfs node"; - kobject_put(&layer->wqDirectory); - kobject_put(&layer->kobj); - return result; - } - - /* - * Part 2 - Do all the simple initialization. These initializations have no - * order dependencies and can be done in any order, but freeKernelLayer() - * cannot be called until all the simple layer properties are set. - * - * The KernelLayer structure starts as all zeros. Pointer initializations - * consist of replacing a NULL pointer with a non-NULL pointer, which can be - * easily undone by freeing all of the non-NULL pointers (using the proper - * free routine). - */ - setKernelLayerState(layer, LAYER_SIMPLE_THINGS_INITIALIZED); - - initializeDeadlockQueue(&layer->deadlockQueue); - - int requestLimit = defaultMaxRequestsActive; - initializeLimiter(&layer->requestLimiter, requestLimit); - initializeLimiter(&layer->discardLimiter, requestLimit * 3 / 4); - - layer->allocationsAllowed = true; - layer->instance = instance; - layer->deviceConfig = config; - layer->startingSectorOffset = startingSector; - initializeRing(&layer->deviceConfigRing); - - layer->common.updateCRC32 = kvdoUpdateCRC32; - layer->common.getBlockCount = kvdoGetBlockCount; - layer->common.getWritePolicy = kvdoGetWritePolicy; - layer->common.createMetadataVIO = kvdoCreateMetadataVIO; - layer->common.createCompressedWriteVIO = kvdoCreateCompressedWriteVIO; - layer->common.freeVIO = kvdoFreeVIO; - layer->common.completeFlush = kvdoCompleteFlush; - layer->common.enqueue = kvdoEnqueue; - layer->common.waitForAdminOperation = waitForSyncOperation; - layer->common.completeAdminOperation = kvdoCompleteSyncOperation; - layer->common.getCurrentThreadID = kvdoGetCurrentThreadID; - layer->common.zeroDataVIO = kvdoZeroDataVIO; - layer->common.compareDataVIOs = kvdoCompareDataVIOs; - layer->common.copyData = kvdoCopyDataVIO; - layer->common.readData = kvdoReadDataVIO; - layer->common.writeData = kvdoWriteDataVIO; - layer->common.writeCompressedBlock = kvdoWriteCompressedBlock; - layer->common.readMetadata = kvdoSubmitMetadataVIO; - layer->common.writeMetadata = kvdoSubmitMetadataVIO; - layer->common.applyPartialWrite = kvdoModifyWriteDataVIO; - layer->common.flush = kvdoFlushVIO; - layer->common.hashData = kvdoHashDataVIO; - layer->common.checkForDuplication = kvdoCheckForDuplication; - layer->common.verifyDuplication = kvdoVerifyDuplication; - layer->common.acknowledgeDataVIO = kvdoAcknowledgeDataVIO; - layer->common.compressDataVIO = kvdoCompressDataVIO; - layer->common.updateAlbireo = kvdoUpdateDedupeAdvice; - - spin_lock_init(&layer->flushLock); - mutex_init(&layer->statsMutex); - bio_list_init(&layer->waitingFlushes); - - result = addLayerToDeviceRegistry(layer); - if (result != VDO_SUCCESS) { - *reason = "Cannot add layer to device registry"; - freeKernelLayer(layer); - return result; - } - - snprintf(layer->threadNamePrefix, sizeof(layer->threadNamePrefix), "%s%u", - THIS_MODULE->name, instance); - - result = makeThreadConfig(config->threadCounts.logicalZones, - config->threadCounts.physicalZones, - config->threadCounts.hashZones, - threadConfigPointer); - if (result != VDO_SUCCESS) { - *reason = "Cannot create 
thread configuration"; - freeKernelLayer(layer); - return result; - } - - logInfo("zones: %d logical, %d physical, %d hash; base threads: %d", - config->threadCounts.logicalZones, - config->threadCounts.physicalZones, - config->threadCounts.hashZones, - (*threadConfigPointer)->baseThreadCount); - - result = makeBatchProcessor(layer, returnDataKVIOBatchToPool, layer, - &layer->dataKVIOReleaser); - if (result != UDS_SUCCESS) { - *reason = "Cannot allocate KVIO-freeing batch processor"; - freeKernelLayer(layer); - return result; - } - - // Spare KVDOFlush, so that we will always have at least one available - result = makeKVDOFlush(&layer->spareKVDOFlush); - if (result != UDS_SUCCESS) { - *reason = "Cannot allocate KVDOFlush record"; - freeKernelLayer(layer); - return result; - } - - // BIO pool (needed before the geometry block) - result = makeDedupeBioSet(layer); - if (result != VDO_SUCCESS) { - *reason = "Cannot allocate dedupe bioset"; - freeKernelLayer(layer); - return result; - } - - // Read the geometry block so we know how to set up the index. Allow it to - // do synchronous reads. - layer->common.reader = kvdoSynchronousRead; - result = loadVolumeGeometry(&layer->common, &layer->geometry); - layer->common.reader = NULL; - if (result != VDO_SUCCESS) { - *reason = "Could not load geometry block"; - freeKernelLayer(layer); - return result; - } - - // Albireo Timeout Reporter - initPeriodicEventReporter(&layer->albireoTimeoutReporter, - "Albireo timeout on %llu requests", - DEDUPE_TIMEOUT_REPORT_INTERVAL, layer); - - // Dedupe Index - BUG_ON(layer->threadNamePrefix[0] == '\0'); - result = makeDedupeIndex(&layer->dedupeIndex, layer); - if (result != UDS_SUCCESS) { - *reason = "Cannot initialize dedupe index"; - freeKernelLayer(layer); - return result; - } - - // Compression context storage - result = ALLOCATE(config->threadCounts.cpuThreads, char *, "LZ4 context", - &layer->compressionContext); - if (result != VDO_SUCCESS) { - *reason = "cannot allocate LZ4 context"; - freeKernelLayer(layer); - return result; - } - for (int i = 0; i < config->threadCounts.cpuThreads; i++) { - result = ALLOCATE(LZ4_context_size(), char, "LZ4 context", - &layer->compressionContext[i]); - if (result != VDO_SUCCESS) { - *reason = "cannot allocate LZ4 context"; - freeKernelLayer(layer); - return result; - } - } - - - /* - * Part 3 - Do initializations that depend upon other previous - * initializations, but have no order dependencies at freeing time. - * Order dependencies for initialization are identified using BUG_ON. - */ - setKernelLayerState(layer, LAYER_BUFFER_POOLS_INITIALIZED); - - // Trace pool - BUG_ON(layer->requestLimiter.limit <= 0); - result = traceKernelLayerInit(layer); - if (result != VDO_SUCCESS) { - *reason = "Cannot initialize trace data"; - freeKernelLayer(layer); - return result; - } - - // KVIO and VIO pool - BUG_ON(layer->deviceConfig->logicalBlockSize <= 0); - BUG_ON(layer->requestLimiter.limit <= 0); - BUG_ON(layer->bioset == NULL); - BUG_ON(layer->deviceConfig->ownedDevice == NULL); - result = makeDataKVIOBufferPool(layer, layer->requestLimiter.limit, - &layer->dataKVIOPool); - if (result != VDO_SUCCESS) { - *reason = "Cannot allocate vio data"; - freeKernelLayer(layer); - return result; - } - - /* - * Part 4 - Do initializations that depend upon other previous - * initialization, that may have order dependencies at freeing time. - * These are mostly starting up the workqueue threads. 
- */ - - // Base-code thread, etc - result = initializeKVDO(&layer->kvdo, *threadConfigPointer, reason); - if (result != VDO_SUCCESS) { - freeKernelLayer(layer); - return result; - } - - setKernelLayerState(layer, LAYER_REQUEST_QUEUE_INITIALIZED); - - // Bio queue - result = makeIOSubmitter(layer->threadNamePrefix, - config->threadCounts.bioThreads, - config->threadCounts.bioRotationInterval, - layer->requestLimiter.limit, - layer, - &layer->ioSubmitter); - if (result != VDO_SUCCESS) { - // If initialization of the bio-queues failed, they are cleaned - // up already, so just free the rest of the kernel layer. - freeKernelLayer(layer); - *reason = "bio submission initialization failed"; - return result; - } - setKernelLayerState(layer, LAYER_BIO_DATA_INITIALIZED); - - // Bio ack queue - if (useBioAckQueue(layer)) { - result = makeWorkQueue(layer->threadNamePrefix, "ackQ", - &layer->wqDirectory, layer, layer, &bioAckQType, - config->threadCounts.bioAckThreads, - &layer->bioAckQueue); - if (result != VDO_SUCCESS) { - *reason = "bio ack queue initialization failed"; - freeKernelLayer(layer); - return result; - } - } - - setKernelLayerState(layer, LAYER_BIO_ACK_QUEUE_INITIALIZED); - - // CPU Queues - result = makeWorkQueue(layer->threadNamePrefix, "cpuQ", &layer->wqDirectory, - layer, NULL, &cpuQType, - config->threadCounts.cpuThreads, &layer->cpuQueue); - if (result != VDO_SUCCESS) { - *reason = "Albireo CPU queue initialization failed"; - freeKernelLayer(layer); - return result; - } - - setKernelLayerState(layer, LAYER_CPU_QUEUE_INITIALIZED); - - *layerPtr = layer; - return VDO_SUCCESS; -} - -/**********************************************************************/ -int prepareToModifyKernelLayer(KernelLayer *layer, - DeviceConfig *config, - char **errorPtr) -{ - DeviceConfig *extantConfig = layer->deviceConfig; - if (config->owningTarget->begin != extantConfig->owningTarget->begin) { - *errorPtr = "Starting sector cannot change"; - return VDO_PARAMETER_MISMATCH; - } - - if (strcmp(config->parentDeviceName, extantConfig->parentDeviceName) != 0) { - *errorPtr = "Underlying device cannot change"; - return VDO_PARAMETER_MISMATCH; - } - - if (config->logicalBlockSize != extantConfig->logicalBlockSize) { - *errorPtr = "Logical block size cannot change"; - return VDO_PARAMETER_MISMATCH; - } - - if (config->cacheSize != extantConfig->cacheSize) { - *errorPtr = "Block map cache size cannot change"; - return VDO_PARAMETER_MISMATCH; - } - - if (config->blockMapMaximumAge != extantConfig->blockMapMaximumAge) { - *errorPtr = "Block map maximum age cannot change"; - return VDO_PARAMETER_MISMATCH; - } - - if (config->mdRaid5ModeEnabled != extantConfig->mdRaid5ModeEnabled) { - *errorPtr = "mdRaid5Mode cannot change"; - return VDO_PARAMETER_MISMATCH; - } - - if (memcmp(&config->threadCounts, &extantConfig->threadCounts, - sizeof(ThreadCountConfig)) != 0) { - *errorPtr = "Thread configuration cannot change"; - return VDO_PARAMETER_MISMATCH; - } - - // Below here are the actions to take when a non-immutable property changes. - - if (config->writePolicy != extantConfig->writePolicy) { - // Nothing needs doing right now for a write policy change. 
- } - - if (config->owningTarget->len != extantConfig->owningTarget->len) { - size_t logicalBytes = to_bytes(config->owningTarget->len); - if ((logicalBytes % VDO_BLOCK_SIZE) != 0) { - *errorPtr = "Logical size must be a multiple of 4096"; - return VDO_PARAMETER_MISMATCH; - } - - int result = prepareToResizeLogical(layer, logicalBytes / VDO_BLOCK_SIZE); - if (result != VDO_SUCCESS) { - *errorPtr = "Device prepareToGrowLogical failed"; - return result; - } - } - - if (config->physicalBlocks != extantConfig->physicalBlocks) { - int result = prepareToResizePhysical(layer, config->physicalBlocks); - if (result != VDO_SUCCESS) { - if (result == VDO_TOO_MANY_SLABS) { - *errorPtr = "Device prepareToGrowPhysical failed (specified physical" - " size too big based on formatted slab size)"; - } else { - *errorPtr = "Device prepareToGrowPhysical failed"; - } - return result; - } - } - - return VDO_SUCCESS; -} - -/********************************************************************** - * Modify the pool name of the device. - * - * @param layer The kernel layer - * @param oldName The old pool name - * @param newName The new pool name - * - * @return VDO_SUCCESS or an error - * - */ -int modifyPoolName(KernelLayer *layer, char *oldName, char *newName) -{ - // We use pool name for sysfs and procfs. Rename them accordingly - logInfo("Modify pool name from %s to %s", oldName, newName); - - void *procfsPrivate; - int result = vdoCreateProcfsEntry(layer, newName, &procfsPrivate); - if (result != VDO_SUCCESS) { - return result; - } - - result = kobject_rename(&layer->kobj, newName); - if (result != 0) { - vdoDestroyProcfsEntry(newName, procfsPrivate); - return result; - } - - void *tmpProcfs = layer->procfsPrivate; - layer->procfsPrivate = procfsPrivate; - - vdoDestroyProcfsEntry(oldName, tmpProcfs); - - return VDO_SUCCESS; -} - -/**********************************************************************/ -int modifyKernelLayer(KernelLayer *layer, - DeviceConfig *config) -{ - KernelLayerState state = getKernelLayerState(layer); - if (state == LAYER_RUNNING) { - return VDO_SUCCESS; - } else if (state != LAYER_SUSPENDED) { - logError("pre-resume invoked while in unexpected kernel layer state %d", - state); - return -EINVAL; - } - - setKernelLayerState(layer, LAYER_RESUMING); - - DeviceConfig *extantConfig = layer->deviceConfig; - - // A failure here is unrecoverable. So there is no problem if it happens. - - if (config->writePolicy != extantConfig->writePolicy) { - /* - * Ordinarily, when going from async to sync, we must flush any metadata - * written. However, because the underlying storage must have gone into - * sync mode before we suspend VDO, and suspending VDO concludes by - * issuing a flush, all metadata written before the suspend is flushed - * by the suspend and all metadata between the suspend and the write - * policy change is written to synchronous storage. - */ - logInfo("Modifying device '%s' write policy from %s to %s", - config->poolName, getConfigWritePolicyString(extantConfig), - getConfigWritePolicyString(config)); - setWritePolicy(layer->kvdo.vdo, config->writePolicy); - } - - if (config->owningTarget->len != extantConfig->owningTarget->len) { - size_t logicalBytes = to_bytes(config->owningTarget->len); - int result = resizeLogical(layer, logicalBytes / VDO_BLOCK_SIZE); - if (result != VDO_SUCCESS) { - return result; - } - } - - // Grow physical if the version is 0, so we can't tell if we - // got an old-style growPhysical command, or if size changed. 
- if ((config->physicalBlocks != extantConfig->physicalBlocks) - || (config->version == 0)) { - int result = resizePhysical(layer, config->physicalBlocks); - if (result != VDO_SUCCESS) { - return result; - } - } - - if (strcmp(config->poolName, extantConfig->poolName) != 0) { - logInfo("Modifying device '%s' pool name from %s to %s", - config->poolName, extantConfig->poolName, config->poolName); - int result = modifyPoolName(layer, extantConfig->poolName, - config->poolName); - if (result != VDO_SUCCESS) { - return result; - } - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -void freeKernelLayer(KernelLayer *layer) -{ - // This is not the cleanest implementation, but given the current timing - // uncertainties in the shutdown process for work queues, we need to - // store information to enable a late-in-process deallocation of - // funnel-queue data structures in work queues. - bool usedBioAckQueue = false; - bool usedCpuQueue = false; - bool usedKVDO = false; - bool releaseInstance = false; - - KernelLayerState state = getKernelLayerState(layer); - switch (state) { - case LAYER_STOPPING: - logError("re-entered freeKernelLayer while stopping"); - break; - - case LAYER_RUNNING: - suspendKernelLayer(layer); - // fall through - - case LAYER_STARTING: - case LAYER_RESUMING: - case LAYER_SUSPENDED: - stopKernelLayer(layer); - // fall through - - case LAYER_STOPPED: - case LAYER_CPU_QUEUE_INITIALIZED: - finishWorkQueue(layer->cpuQueue); - usedCpuQueue = true; - releaseInstance = true; - // fall through - - case LAYER_BIO_ACK_QUEUE_INITIALIZED: - if (useBioAckQueue(layer)) { - finishWorkQueue(layer->bioAckQueue); - usedBioAckQueue = true; - } - // fall through - - case LAYER_BIO_DATA_INITIALIZED: - cleanupIOSubmitter(layer->ioSubmitter); - // fall through - - case LAYER_REQUEST_QUEUE_INITIALIZED: - finishKVDO(&layer->kvdo); - usedKVDO = true; - // fall through - - case LAYER_BUFFER_POOLS_INITIALIZED: - freeBufferPool(&layer->dataKVIOPool); - freeBufferPool(&layer->traceBufferPool); - // fall through - - case LAYER_SIMPLE_THINGS_INITIALIZED: - if (layer->compressionContext != NULL) { - for (int i = 0; i < layer->deviceConfig->threadCounts.cpuThreads; i++) { - FREE(layer->compressionContext[i]); - } - FREE(layer->compressionContext); - } - if (layer->dedupeIndex != NULL) { - finishDedupeIndex(layer->dedupeIndex); - } - FREE(layer->spareKVDOFlush); - layer->spareKVDOFlush = NULL; - freeBatchProcessor(&layer->dataKVIOReleaser); - removeLayerFromDeviceRegistry(layer); - break; - - default: - logError("Unknown Kernel Layer state: %d", state); - } - - // Late deallocation of resources in work queues. - if (usedCpuQueue) { - freeWorkQueue(&layer->cpuQueue); - } - if (usedBioAckQueue) { - freeWorkQueue(&layer->bioAckQueue); - } - if (layer->ioSubmitter) { - freeIOSubmitter(layer->ioSubmitter); - } - if (usedKVDO) { - destroyKVDO(&layer->kvdo); - } - if (layer->bioset != NULL) { -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,18,0) - bioset_exit(layer->bioset); - FREE(layer->bioset); -#else - bioset_free(layer->bioset); -#endif - layer->bioset = NULL; - } - - freeDedupeIndex(&layer->dedupeIndex); - - stopPeriodicEventReporter(&layer->albireoTimeoutReporter); - if (releaseInstance) { - releaseKVDOInstance(layer->instance); - } - - // The call to kobject_put on the kobj sysfs node will decrement its - // reference count; when the count goes to zero the VDO object and - // the kernel layer object will be freed as a side effect. 
- kobject_put(&layer->wqDirectory); - kobject_put(&layer->kobj); -} - -/**********************************************************************/ -static void poolStatsRelease(struct kobject *kobj) -{ - KernelLayer *layer = container_of(kobj, KernelLayer, statsDirectory); - complete(&layer->statsShutdown); -} - -/**********************************************************************/ -int preloadKernelLayer(KernelLayer *layer, - const VDOLoadConfig *loadConfig, - char **reason) -{ - if (getKernelLayerState(layer) != LAYER_CPU_QUEUE_INITIALIZED) { - *reason = "preloadKernelLayer() may only be invoked after initialization"; - return UDS_BAD_STATE; - } - - setKernelLayerState(layer, LAYER_STARTING); - int result = preloadKVDO(&layer->kvdo, &layer->common, loadConfig, - layer->vioTraceRecording, reason); - if (result != VDO_SUCCESS) { - stopKernelLayer(layer); - return result; - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -int startKernelLayer(KernelLayer *layer, char **reason) -{ - if (getKernelLayerState(layer) != LAYER_STARTING) { - *reason = "Cannot start kernel from non-starting state"; - stopKernelLayer(layer); - return UDS_BAD_STATE; - } - - int result = startKVDO(&layer->kvdo, &layer->common, reason); - if (result != VDO_SUCCESS) { - stopKernelLayer(layer); - return result; - } - - setKernelLayerState(layer, LAYER_RUNNING); - static struct kobj_type statsDirectoryKobjType = { - .release = poolStatsRelease, - .sysfs_ops = &poolStatsSysfsOps, - .default_attrs = poolStatsAttrs, - }; - kobject_init(&layer->statsDirectory, &statsDirectoryKobjType); - result = kobject_add(&layer->statsDirectory, &layer->kobj, "statistics"); - if (result != 0) { - *reason = "Cannot add sysfs statistics node"; - stopKernelLayer(layer); - return result; - } - layer->statsAdded = true; - - if (layer->deviceConfig->deduplication) { - // Don't try to load or rebuild the index first (and log scary error - // messages) if this is known to be a newly-formatted volume. - startDedupeIndex(layer->dedupeIndex, wasNew(layer->kvdo.vdo)); - } - - result = vdoCreateProcfsEntry(layer, layer->deviceConfig->poolName, - &layer->procfsPrivate); - if (result != VDO_SUCCESS) { - *reason = "Could not create proc filesystem entry"; - stopKernelLayer(layer); - return result; - } - - layer->allocationsAllowed = false; - - return VDO_SUCCESS; -} - -/**********************************************************************/ -void stopKernelLayer(KernelLayer *layer) -{ - layer->allocationsAllowed = true; - - // Stop services that need to gather VDO statistics from the worker threads. - if (layer->statsAdded) { - layer->statsAdded = false; - init_completion(&layer->statsShutdown); - kobject_put(&layer->statsDirectory); - wait_for_completion(&layer->statsShutdown); - } - vdoDestroyProcfsEntry(layer->deviceConfig->poolName, layer->procfsPrivate); - - switch (getKernelLayerState(layer)) { - case LAYER_RUNNING: - suspendKernelLayer(layer); - // fall through - - case LAYER_SUSPENDED: - setKernelLayerState(layer, LAYER_STOPPING); - stopDedupeIndex(layer->dedupeIndex); - // fall through - - case LAYER_STOPPING: - case LAYER_STOPPED: - default: - setKernelLayerState(layer, LAYER_STOPPED); - } -} - -/**********************************************************************/ -int suspendKernelLayer(KernelLayer *layer) -{ - // It's important to note any error here does not actually stop device-mapper - // from suspending the device. All this work is done post suspend. 
- KernelLayerState state = getKernelLayerState(layer); - if (state == LAYER_SUSPENDED) { - return VDO_SUCCESS; - } - if (state != LAYER_RUNNING) { - logError("Suspend invoked while in unexpected kernel layer state %d", - state); - return -EINVAL; - } - - /* - * Attempt to flush all I/O before completing post suspend work. This is - * needed so that changing write policy upon resume is safe. Also, we think - * a suspended device is expected to have persisted all data written before - * the suspend, even if it hasn't been flushed yet. - */ - waitForNoRequestsActive(layer); - int result = synchronousFlush(layer); - if (result != VDO_SUCCESS) { - setKVDOReadOnly(&layer->kvdo, result); - } - - /* - * Suspend the VDO, writing out all dirty metadata if the no-flush flag - * was not set on the dmsetup suspend call. This will ensure that we don't - * have cause to write while suspended [VDO-4402]. - */ - int suspendResult = suspendKVDO(&layer->kvdo); - if (result == VDO_SUCCESS) { - result = suspendResult; - } - - suspendDedupeIndex(layer->dedupeIndex, !layer->noFlushSuspend); - setKernelLayerState(layer, LAYER_SUSPENDED); - return result; -} - -/**********************************************************************/ -int resumeKernelLayer(KernelLayer *layer) -{ - if (getKernelLayerState(layer) == LAYER_RUNNING) { - return VDO_SUCCESS; - } - - resumeDedupeIndex(layer->dedupeIndex); - int result = resumeKVDO(&layer->kvdo); - if (result != VDO_SUCCESS) { - return result; - } - - setKernelLayerState(layer, LAYER_RUNNING); - return VDO_SUCCESS; -} - -/***********************************************************************/ -int prepareToResizePhysical(KernelLayer *layer, BlockCount physicalCount) -{ - logInfo("Preparing to resize physical to %llu", physicalCount); - // Allocations are allowed and permissible through this non-VDO thread, - // since IO triggered by this allocation to VDO can finish just fine. - int result = kvdoPrepareToGrowPhysical(&layer->kvdo, physicalCount); - if (result != VDO_SUCCESS) { - // kvdoPrepareToGrowPhysical logs errors. - if (result == VDO_PARAMETER_MISMATCH) { - // If we don't trap this case, mapToSystemError() will remap it to -EIO, - // which is misleading and ahistorical. - return -EINVAL; - } else { - return result; - } - } - - logInfo("Done preparing to resize physical"); - return VDO_SUCCESS; -} - -/***********************************************************************/ -int resizePhysical(KernelLayer *layer, BlockCount physicalCount) -{ - // We must not mark the layer as allowing allocations when it is suspended - // lest an allocation attempt block on writing IO to the suspended VDO. - int result = kvdoResizePhysical(&layer->kvdo, physicalCount); - if (result != VDO_SUCCESS) { - // kvdoResizePhysical logs errors - return result; - } - return VDO_SUCCESS; -} - -/***********************************************************************/ -int prepareToResizeLogical(KernelLayer *layer, BlockCount logicalCount) -{ - logInfo("Preparing to resize logical to %llu", logicalCount); - // Allocations are allowed and permissible through this non-VDO thread, - // since IO triggered by this allocation to VDO can finish just fine. 
- int result = kvdoPrepareToGrowLogical(&layer->kvdo, logicalCount); - if (result != VDO_SUCCESS) { - // kvdoPrepareToGrowLogical logs errors - return result; - } - - logInfo("Done preparing to resize logical"); - return VDO_SUCCESS; -} - -/***********************************************************************/ -int resizeLogical(KernelLayer *layer, BlockCount logicalCount) -{ - logInfo("Resizing logical to %llu", logicalCount); - // We must not mark the layer as allowing allocations when it is suspended - // lest an allocation attempt block on writing IO to the suspended VDO. - int result = kvdoResizeLogical(&layer->kvdo, logicalCount); - if (result != VDO_SUCCESS) { - // kvdoResizeLogical logs errors - return result; - } - - logInfo("Logical blocks now %llu", logicalCount); - return VDO_SUCCESS; -} - diff --git a/vdo/kernel/kernelLayer.h b/vdo/kernel/kernelLayer.h deleted file mode 100644 index 4e0bf8c..0000000 --- a/vdo/kernel/kernelLayer.h +++ /dev/null @@ -1,583 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelLayer.h#18 $ - */ - -#ifndef KERNELLAYER_H -#define KERNELLAYER_H - -#include - -#include "atomic.h" -#include "constants.h" -#include "flush.h" -#include "intMap.h" -#include "physicalLayer.h" -#include "ringNode.h" -#include "volumeGeometry.h" -#include "waitQueue.h" - -#include "batchProcessor.h" -#include "bufferPool.h" -#include "deadlockQueue.h" -#include "deviceConfig.h" -#include "histogram.h" -#include "kernelStatistics.h" -#include "kernelTypes.h" -#include "kernelVDO.h" -#include "ktrace.h" -#include "limiter.h" -#include "statistics.h" -#include "workQueue.h" - -enum { - VDO_SECTORS_PER_BLOCK = (VDO_BLOCK_SIZE >> SECTOR_SHIFT) -}; - -typedef enum { - LAYER_SIMPLE_THINGS_INITIALIZED, - LAYER_BUFFER_POOLS_INITIALIZED, - LAYER_REQUEST_QUEUE_INITIALIZED, - LAYER_CPU_QUEUE_INITIALIZED, - LAYER_BIO_ACK_QUEUE_INITIALIZED, - LAYER_BIO_DATA_INITIALIZED, - LAYER_STARTING, - LAYER_RUNNING, - LAYER_SUSPENDED, - LAYER_STOPPING, - LAYER_STOPPED, - LAYER_RESUMING, -} KernelLayerState; - -/* Keep BIO statistics atomically */ -struct atomicBioStats { - atomic64_t read; // Number of not REQ_WRITE bios - atomic64_t write; // Number of REQ_WRITE bios - atomic64_t discard; // Number of REQ_DISCARD bios - atomic64_t flush; // Number of REQ_FLUSH bios - atomic64_t fua; // Number of REQ_FUA bios -}; - -// Data managing the reporting of Albireo timeouts -typedef struct periodicEventReporter { - uint64_t lastReportedValue; - const char *format; - atomic64_t value; - Jiffies reportingInterval; // jiffies - /* - * Just an approximation. 
If nonzero, then either the work item has - * been queued to run, or some other thread currently has - * responsibility for enqueueing it, or the reporter function is - * running but hasn't looked at the current value yet. - * - * If this is set, don't set the timer again, because we don't want - * the work item queued twice. Use an atomic xchg or cmpxchg to - * test-and-set it, and an atomic store to clear it. - */ - atomic_t workItemQueued; - KvdoWorkItem workItem; - KernelLayer *layer; -} PeriodicEventReporter; - -static inline uint64_t getEventCount(PeriodicEventReporter *reporter) -{ - return atomic64_read(&reporter->value); -} - -/** - * The VDO representation of the target device - **/ -struct kernelLayer { - PhysicalLayer common; - // Layer specific info - DeviceConfig *deviceConfig; - /** A ring of all DeviceConfigs referencing this layer */ - RingNode deviceConfigRing; - char threadNamePrefix[MAX_QUEUE_NAME_LEN]; - struct kobject kobj; - struct kobject wqDirectory; - struct kobject statsDirectory; - /** - * A counter value to attach to thread names and log messages to - * identify the individual device. - **/ - unsigned int instance; - /** Contains the current KernelLayerState, which rarely changes */ - Atomic32 state; - bool noFlushSuspend; - bool allocationsAllowed; - AtomicBool processingMessage; - /** Limit the number of requests that are being processed. */ - Limiter requestLimiter; - Limiter discardLimiter; - KVDO kvdo; - /** Incoming bios we've had to buffer to avoid deadlock. */ - DeadlockQueue deadlockQueue; - // for REQ_FLUSH processing - struct bio_list waitingFlushes; - KVDOFlush *spareKVDOFlush; - spinlock_t flushLock; - Jiffies flushArrivalTime; - /** - * Bio submission manager used for sending bios to the storage - * device. - **/ - IOSubmitter *ioSubmitter; - /** - * Work queue (possibly with multiple threads) for miscellaneous - * CPU-intensive, non-blocking work. - **/ - KvdoWorkQueue *cpuQueue; - /** N blobs of context data for LZ4 code, one per CPU thread. */ - char **compressionContext; - Atomic32 compressionContextIndex; - /** Optional work queue for calling bio_endio. */ - KvdoWorkQueue *bioAckQueue; - /** Underlying block device info. */ - uint64_t startingSectorOffset; - VolumeGeometry geometry; - // Memory allocation - BufferPool *dataKVIOPool; - struct bio_set *bioset; - // Albireo specific info - DedupeIndex *dedupeIndex; - // Statistics - atomic64_t biosSubmitted; - atomic64_t biosCompleted; - atomic64_t dedupeContextBusy; - atomic64_t flushOut; - AtomicBioStats biosIn; - AtomicBioStats biosInPartial; - AtomicBioStats biosOut; - AtomicBioStats biosOutCompleted; - AtomicBioStats biosAcknowledged; - AtomicBioStats biosAcknowledgedPartial; - AtomicBioStats biosMeta; - AtomicBioStats biosMetaCompleted; - AtomicBioStats biosJournal; - AtomicBioStats biosPageCache; - AtomicBioStats biosJournalCompleted; - AtomicBioStats biosPageCacheCompleted; - // for reporting Albireo timeouts - PeriodicEventReporter albireoTimeoutReporter; - // Debugging - /* Whether to dump VDO state on shutdown */ - bool dumpOnShutdown; - /** - * Whether we should collect tracing info. (Actually, this controls - * allocations; non-null record pointers cause recording.) - **/ - bool vioTraceRecording; - SampleCounter traceSampleCounter; - /* Should we log tracing info? */ - bool traceLogging; - /* Storage for trace data. */ - BufferPool *traceBufferPool; - /* Private storage for procfs. 
*/ - void *procfsPrivate; - /* For returning batches of DataKVIOs to their pool */ - BatchProcessor *dataKVIOReleaser; - - // Administrative operations - /* The object used to wait for administrative operations to complete */ - struct completion callbackSync; - - // Statistics reporting - /* Protects the *statsStorage structs */ - struct mutex statsMutex; - /* Used when shutting down the sysfs statistics */ - struct completion statsShutdown;; - /* true if sysfs statistics directory is set up */ - bool statsAdded; - /* Used to gather statistics without allocating memory */ - VDOStatistics vdoStatsStorage; - KernelStatistics kernelStatsStorage; -}; - -typedef enum bioQAction { - BIO_Q_ACTION_COMPRESSED_DATA, - BIO_Q_ACTION_DATA, - BIO_Q_ACTION_FLUSH, - BIO_Q_ACTION_HIGH, - BIO_Q_ACTION_METADATA, - BIO_Q_ACTION_READCACHE, - BIO_Q_ACTION_VERIFY -} BioQAction; - -typedef enum cpuQAction { - CPU_Q_ACTION_COMPLETE_KVIO, - CPU_Q_ACTION_COMPRESS_BLOCK, - CPU_Q_ACTION_EVENT_REPORTER, - CPU_Q_ACTION_HASH_BLOCK, -} CPUQAction; - -typedef enum bioAckQAction { - BIO_ACK_Q_ACTION_ACK, -} BioAckQAction; - -typedef void (*DedupeShutdownCallbackFunction)(KernelLayer *layer); - -/* - * Wrapper for the Enqueueable object, to associate it with a kernel - * layer work item. - */ -typedef struct kvdoEnqueueable { - KvdoWorkItem workItem; - Enqueueable enqueueable; -} KvdoEnqueueable; - -/** - * Implements LayerFilter. - **/ -bool layerIsNamed(KernelLayer *layer, void *context) - __attribute__((warn_unused_result)); - -/** - * Creates a kernel specific physical layer to be used by VDO - * - * @param startingSector The sector offset of our table entry in the - * DM device - * @param instance Device instantiation counter - * @param parentKobject The parent sysfs node - * @param config The device configuration - * @param threadConfigPointer Where to store the new threadConfig handle - * @param reason The reason for any failure during this call - * @param layerPtr A pointer to hold the created layer - * - * @return VDO_SUCCESS or an error - **/ -int makeKernelLayer(uint64_t startingSector, - unsigned int instance, - DeviceConfig *config, - struct kobject *parentKobject, - ThreadConfig **threadConfigPointer, - char **reason, - KernelLayer **layerPtr) - __attribute__((warn_unused_result)); - -/** - * Prepare to modify a kernel layer. - * - * @param layer The layer to modify - * @param config The new device configuration - * @param errorPtr A pointer to store the reason for any failure - * - * @return VDO_SUCCESS or an error - **/ -int prepareToModifyKernelLayer(KernelLayer *layer, - DeviceConfig *config, - char **errorPtr) - __attribute__((warn_unused_result)); - -/** - * Modify a kernel physical layer. - * - * @param layer The layer to modify - * @param config The new device configuration - * - * @return VDO_SUCCESS or an error - **/ -int modifyKernelLayer(KernelLayer *layer, - DeviceConfig *config) - __attribute__((warn_unused_result)); - -/** - * Free a kernel physical layer. - * - * @param layer The layer, which must have been created by - * makeKernelLayer - **/ -void freeKernelLayer(KernelLayer *layer); - -/** - * Make and configure a kernel layer. This method does not alter the VDO state - * on disk. It should be run from the VDO constructor for devices which have - * not been started. 
- * - * @param layer The kernel layer - * @param loadConfig Load-time parameters for the VDO - * @param reason The reason for any failure during this call - * - * @return VDO_SUCCESS or an error - * - * @note redundant starts are silently ignored - **/ -int preloadKernelLayer(KernelLayer *layer, - const VDOLoadConfig *loadConfig, - char **reason); - -/** - * Start the kernel layer. This method finishes bringing a VDO online now that - * a table is being resumed for the first time. - * - * @param layer The kernel layer - * @param reason The reason for any failure during this call - * - * @return VDO_SUCCESS or an error - **/ -int startKernelLayer(KernelLayer *layer, char **reason); - -/** - * Stop the kernel layer. - * - * @param layer The kernel layer - **/ -void stopKernelLayer(KernelLayer *layer); - -/** - * Suspend the kernel layer. - * - * @param layer The kernel layer - * - * @return VDO_SUCCESS or an error - **/ -int suspendKernelLayer(KernelLayer *layer); - -/** - * Resume the kernel layer. - * - * @param layer The kernel layer - * - * @return VDO_SUCCESS or an error - **/ -int resumeKernelLayer(KernelLayer *layer); - -/** - * Get the kernel layer state. - * - * @param layer The kernel layer - * - * @return the instantaneously correct kernel layer state - **/ -static inline KernelLayerState getKernelLayerState(const KernelLayer *layer) -{ - return atomicLoad32(&layer->state); -} - -/** - * Function call to begin processing a bio passed in from the block layer - * - * @param layer The physical layer - * @param bio The bio from the block layer - * - * @return value to return from the VDO map function. Either an error code - * or DM_MAPIO_REMAPPED or DM_MAPPED_SUBMITTED (see vdoMapBio for - * details). - **/ -int kvdoMapBio(KernelLayer *layer, BIO *bio); - -/** - * Convert a generic PhysicalLayer to a kernelLayer. - * - * @param layer The PhysicalLayer to convert - * - * @return The PhysicalLayer as a KernelLayer - **/ -static inline KernelLayer *asKernelLayer(PhysicalLayer *layer) -{ - return container_of(layer, KernelLayer, common); -} - -/** - * Convert a block number (or count) to a (512-byte-)sector number. - * - * The argument type is sector_t to force conversion to the type we - * want, although the actual values passed are of various integral - * types. It's just too easy to forget and do the multiplication - * without casting, resulting in 32-bit arithmetic that accidentally - * produces wrong results in devices over 2TB (2**32 sectors). - * - * @param [in] layer the physical layer - * @param [in] blockNumber the block number/count - * - * @return the sector number/count - **/ -static inline sector_t blockToSector(KernelLayer *layer, sector_t blockNumber) -{ - return (blockNumber * VDO_SECTORS_PER_BLOCK); -} - -/** - * Convert a sector number (or count) to a block number. Does not - * check to make sure the sector number is an integral number of - * blocks. - * - * @param [in] layer the physical layer - * @param [in] sectorNumber the sector number/count - * - * @return the block number/count - **/ -static inline sector_t sectorToBlock(KernelLayer *layer, sector_t sectorNumber) -{ - return (sectorNumber / VDO_SECTORS_PER_BLOCK); -} - -/** - * Convert a sector number to an offset within a block. 
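 * (Worked example, assuming the usual 4 KB VDO block size: there are then
 *  8 sectors per block, so sector 13 falls in block 13 / 8 = 1, and this
 *  helper returns the in-block byte offset (13 & 7) * 512 = 2560.)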
- * - * @param [in] layer the physical layer - * @param [in] sectorNumber the sector number - * - * @return the offset within the block - **/ -static inline BlockSize sectorToBlockOffset(KernelLayer *layer, - sector_t sectorNumber) -{ - unsigned int sectorsPerBlockMask = VDO_SECTORS_PER_BLOCK - 1; - return to_bytes(sectorNumber & sectorsPerBlockMask); -} - -/** - * Get the block device object currently underlying a kernel layer. - * - * @param layer The kernel layer in question - * - * @return The block device object under the layer - **/ -struct block_device *getKernelLayerBdev(const KernelLayer *layer) - __attribute__((warn_unused_result)); - -/** - * Set the layer's active config. - * - * @param layer The kernel layer in question - * @param config The config in question - **/ -static inline void setKernelLayerActiveConfig(KernelLayer *layer, - DeviceConfig *config) -{ - layer->deviceConfig = config; -} - -/** - * Given an error code, return a value we can return to the OS. The - * input error code may be a system-generated value (such as -EIO), an - * errno macro used in our code (such as EIO), or a UDS or VDO status - * code; the result must be something the rest of the OS can consume - * (negative errno values such as -EIO, in the case of the kernel). - * - * @param error the error code to convert - * - * @return a system error code value - **/ -int mapToSystemError(int error); - -/** - * Record and eventually report that some number of dedupe requests - * reached their expiration time without getting an answer, so we - * timed out on them. - * - * This is called in a timer context, so it shouldn't do the reporting - * directly. - * - * @param layer The kernel layer for the device - * @param expiredCount The number of expired requests we timed out on - **/ -void kvdoReportDedupeTimeout(KernelLayer *layer, unsigned int expiredCount); - -/** - * Wait until there are no requests in progress. - * - * @param layer The kernel layer for the device - **/ -void waitForNoRequestsActive(KernelLayer *layer); - -/** - * Enqueues an item on our internal "cpu queues". Since there is more than - * one, we rotate through them in hopes of creating some general balance. - * - * @param layer The kernel layer - * @param item The work item to enqueue - */ -static inline void enqueueCPUWorkQueue(KernelLayer *layer, KvdoWorkItem *item) -{ - enqueueWorkQueue(layer->cpuQueue, item); -} - -/** - * Adjust parameters to prepare to use a larger physical space. - * The size must be larger than the current size. - * - * @param layer the kernel layer - * @param physicalCount the new physical size in blocks - * - * @return VDO_SUCCESS or an error - */ -int prepareToResizePhysical(KernelLayer *layer, BlockCount physicalCount); - -/** - * Adjusts parameters to reflect resizing the underlying device. - * The size must be larger than the current size. - * - * @param layer the kernel layer - * @param physicalCount the new physical count in blocks - * - * @return VDO_SUCCESS or an error - */ -int resizePhysical(KernelLayer *layer, BlockCount physicalCount); - -/** - * Adjust parameters to prepare to present a larger logical space. - * The size must be larger than the current size. - * - * @param layer the kernel layer - * @param logicalCount the new logical size in blocks - * - * @return VDO_SUCCESS or an error - */ -int prepareToResizeLogical(KernelLayer *layer, BlockCount logicalCount); - -/** - * Adjust parameters to present a larger logical space. - * The size must be larger than the current size. 
- * - * @param layer the kernel layer - * @param logicalCount the new logical size in blocks - * - * @return VDO_SUCCESS or an error - */ -int resizeLogical(KernelLayer *layer, BlockCount logicalCount); - -/** - * Indicate whether the kernel layer is configured to use a separate - * work queue for acknowledging received and processed bios. - * - * Note that this directly controls handling of write operations, but - * the compile-time flag USE_BIO_ACK_QUEUE_FOR_READ is also checked - * for read operations. - * - * @param layer The kernel layer - * - * @return Whether a bio-acknowledgement work queue is in use - **/ -static inline bool useBioAckQueue(KernelLayer *layer) -{ - return layer->deviceConfig->threadCounts.bioAckThreads > 0; -} - -/** - * Update bookkeeping for the completion of some number of requests, so that - * more incoming requests can be accepted. - * - * @param layer The kernel layer - * @param count The number of completed requests - **/ -void completeManyRequests(KernelLayer *layer, uint32_t count); - -#endif /* KERNELLAYER_H */ diff --git a/vdo/kernel/kernelStatistics.h b/vdo/kernel/kernelStatistics.h deleted file mode 100644 index a5c1210..0000000 --- a/vdo/kernel/kernelStatistics.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - */ - -#ifndef KERNEL_STATISTICS_H -#define KERNEL_STATISTICS_H - -#include "header.h" -#include "types.h" - -typedef struct { - /** Number of not REQ_WRITE bios */ - uint64_t read; - /** Number of REQ_WRITE bios */ - uint64_t write; - /** Number of REQ_DISCARD bios */ - uint64_t discard; - /** Number of REQ_FLUSH bios */ - uint64_t flush; - /** Number of REQ_FUA bios */ - uint64_t fua; -} BioStats; - -typedef struct { - /** Tracked bytes currently allocated. */ - uint64_t bytesUsed; - /** Maximum tracked bytes allocated. 
*/ - uint64_t peakBytesUsed; -} MemoryUsage; - -/** UDS index statistics */ -typedef struct { - /** Number of chunk names stored in the index */ - uint64_t entriesIndexed; - /** Number of post calls that found an existing entry */ - uint64_t postsFound; - /** Number of post calls that added a new entry */ - uint64_t postsNotFound; - /** Number of query calls that found an existing entry */ - uint64_t queriesFound; - /** Number of query calls that added a new entry */ - uint64_t queriesNotFound; - /** Number of update calls that found an existing entry */ - uint64_t updatesFound; - /** Number of update calls that added a new entry */ - uint64_t updatesNotFound; - /** Current number of dedupe queries that are in flight */ - uint32_t currDedupeQueries; - /** Maximum number of dedupe queries that have been in flight */ - uint32_t maxDedupeQueries; -} IndexStatistics; - -typedef struct { - uint32_t version; - uint32_t releaseVersion; - /** The VDO instance */ - uint32_t instance; - /** Current number of active VIOs */ - uint32_t currentVIOsInProgress; - /** Maximum number of active VIOs */ - uint32_t maxVIOs; - /** Number of times the UDS index was too slow in responding */ - uint64_t dedupeAdviceTimeouts; - /** Number of flush requests submitted to the storage device */ - uint64_t flushOut; - /** Logical block size */ - uint64_t logicalBlockSize; - /** Bios submitted into VDO from above */ - BioStats biosIn; - BioStats biosInPartial; - /** Bios submitted onward for user data */ - BioStats biosOut; - /** Bios submitted onward for metadata */ - BioStats biosMeta; - BioStats biosJournal; - BioStats biosPageCache; - BioStats biosOutCompleted; - BioStats biosMetaCompleted; - BioStats biosJournalCompleted; - BioStats biosPageCacheCompleted; - BioStats biosAcknowledged; - BioStats biosAcknowledgedPartial; - /** Current number of bios in progress */ - BioStats biosInProgress; - /** Memory usage stats. */ - MemoryUsage memoryUsage; - /** The statistics for the UDS index */ - IndexStatistics index; -} KernelStatistics; - -/** - * Get the root for all stats proc files. - * - * @return The proc root - **/ -static inline const char *getProcRoot(void) { - return "vdo"; -} - -/** - * Get the proc file path for reading KernelStatistics. - * - * @return The proc file path - **/ -static inline const char *getKernelStatisticsProcFile(void) { - return "kernel_stats"; -} - -#endif /* not KERNEL_STATISTICS_H */ diff --git a/vdo/kernel/kernelTypes.h b/vdo/kernel/kernelTypes.h deleted file mode 100644 index b338440..0000000 --- a/vdo/kernel/kernelTypes.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelTypes.h#3 $ - */ - -#ifndef KERNEL_TYPES_H -#define KERNEL_TYPES_H - -#include "types.h" - -/** - * The size of a discard request in bytes. - **/ -typedef uint32_t DiscardSize; - -/** - * A time in jiffies. - **/ -typedef uint64_t Jiffies; - -/** - * A timeout in jiffies. - **/ -typedef int64_t TimeoutJiffies; - -typedef struct atomicBioStats AtomicBioStats; -typedef struct bio BIO; -typedef struct dataKVIO DataKVIO; -typedef struct dedupeContext DedupeContext; -typedef struct dedupeIndex DedupeIndex; -typedef struct ioSubmitter IOSubmitter; -typedef struct kernelLayer KernelLayer; -typedef struct kvdo KVDO; -typedef struct kvdoFlush KVDOFlush; -typedef struct kvdoWorkItem KvdoWorkItem; -typedef struct kvdoWorkQueue KvdoWorkQueue; -typedef struct kvio KVIO; - -typedef void (*KVIOCallback)(KVIO *kvio); -typedef void (*DataKVIOCallback)(DataKVIO *dataKVIO); -typedef void (*KvdoWorkFunction)(KvdoWorkItem *workItem); - -/** - * Method type for layer matching methods. - * - * A LayerFilter method returns false if the layer doesn't match. - **/ -typedef bool LayerFilter(KernelLayer *layer, void *context); - -#endif /* KERNEL_TYPES_H */ diff --git a/vdo/kernel/kernelVDO.c b/vdo/kernel/kernelVDO.c deleted file mode 100644 index 5e1a72e..0000000 --- a/vdo/kernel/kernelVDO.c +++ /dev/null @@ -1,578 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelVDO.c#7 $ - */ - -#include "kernelVDOInternals.h" - -#include - -#include "memoryAlloc.h" - -#include "statistics.h" -#include "threadConfig.h" -#include "vdo.h" -#include "vdoDebug.h" -#include "vdoLoad.h" -#include "vdoResize.h" -#include "vdoResizeLogical.h" -#include "vdoResume.h" -#include "vdoSuspend.h" - -#include "kernelLayer.h" -#include "kvio.h" -#include "logger.h" - -enum { PARANOID_THREAD_CONSISTENCY_CHECKS = 0 }; - -/**********************************************************************/ -static void startKVDORequestQueue(void *ptr) -{ - KVDOThread *thread = ptr; - KVDO *kvdo = thread->kvdo; - KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); - registerAllocatingThread(&thread->allocatingThread, - &layer->allocationsAllowed); - setWorkQueuePrivateData(thread); -} - -/**********************************************************************/ -static void finishKVDORequestQueue(void *ptr) -{ - unregisterAllocatingThread(); -} - -/**********************************************************************/ -static const KvdoWorkQueueType requestQueueType = { - .start = startKVDORequestQueue, - .finish = finishKVDORequestQueue, - .actionTable = { - { .name = "req_completion", - .code = REQ_Q_ACTION_COMPLETION, - .priority = 1 }, - { .name = "req_flush", - .code = REQ_Q_ACTION_FLUSH, - .priority = 2 }, - { .name = "req_map_bio", - .code = REQ_Q_ACTION_MAP_BIO, - .priority = 0 }, - { .name = "req_sync", - .code = REQ_Q_ACTION_SYNC, - .priority = 2 }, - { .name = "req_vio_callback", - .code = REQ_Q_ACTION_VIO_CALLBACK, - .priority = 1 }, - }, -}; - -/**********************************************************************/ -int initializeKVDO(KVDO *kvdo, - const ThreadConfig *threadConfig, - char **reason) -{ - unsigned int baseThreads = threadConfig->baseThreadCount; - int result = ALLOCATE(baseThreads, KVDOThread, - "request processing work queue", - &kvdo->threads); - if (result != VDO_SUCCESS) { - *reason = "Cannot allocation thread structures"; - return result; - } - KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); - for (kvdo->initializedThreadCount = 0; - kvdo->initializedThreadCount < baseThreads; - kvdo->initializedThreadCount++) { - KVDOThread *thread = &kvdo->threads[kvdo->initializedThreadCount]; - - thread->kvdo = kvdo; - thread->threadID = kvdo->initializedThreadCount; - - char queueName[MAX_QUEUE_NAME_LEN]; - // Copy only LEN - 1 bytes and ensure NULL termination. 
- getVDOThreadName(threadConfig, kvdo->initializedThreadCount, - queueName, sizeof(queueName)); - int result = makeWorkQueue(layer->threadNamePrefix, queueName, - &layer->wqDirectory, layer, thread, - &requestQueueType, 1, &thread->requestQueue); - if (result != VDO_SUCCESS) { - *reason = "Cannot initialize request queue"; - while (kvdo->initializedThreadCount > 0) { - unsigned int threadToDestroy = kvdo->initializedThreadCount - 1; - thread = &kvdo->threads[threadToDestroy]; - finishWorkQueue(thread->requestQueue); - freeWorkQueue(&thread->requestQueue); - kvdo->initializedThreadCount--; - } - FREE(kvdo->threads); - return result; - } - - } - return VDO_SUCCESS; -} - -/**********************************************************************/ -int preloadKVDO(KVDO *kvdo, - PhysicalLayer *common, - const VDOLoadConfig *loadConfig, - bool vioTraceRecording, - char **reason) -{ - KernelLayer *layer = asKernelLayer(common); - init_completion(&layer->callbackSync); - int result = prepareToLoadVDO(kvdo->vdo, loadConfig); - if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) { - *reason = "Cannot load metadata from device"; - return result; - } - - setVDOTracingFlags(kvdo->vdo, vioTraceRecording); - return VDO_SUCCESS; -} - -/**********************************************************************/ -int startKVDO(KVDO *kvdo, PhysicalLayer *common, char **reason) -{ - KernelLayer *layer = asKernelLayer(common); - init_completion(&layer->callbackSync); - int result = performVDOLoad(kvdo->vdo); - if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) { - *reason = "Cannot load metadata from device"; - return result; - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -int suspendKVDO(KVDO *kvdo) -{ - if (kvdo->vdo == NULL) { - return VDO_SUCCESS; - } - - KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); - init_completion(&layer->callbackSync); - int result = performVDOSuspend(kvdo->vdo, !layer->noFlushSuspend); - if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) { - char errorName[80] = ""; - char errorMessage[ERRBUF_SIZE] = ""; - logError("%s: Suspend device failed %d (%s: %s)", - __func__, result, - stringErrorName(result, errorName, sizeof(errorName)), - stringError(result, errorMessage, sizeof(errorMessage))); - return result; - } - - // Convert VDO_READ_ONLY to VDO_SUCCESS since a read-only suspension still - // leaves the VDO suspended. 
- return VDO_SUCCESS; -} - -/**********************************************************************/ -int resumeKVDO(KVDO *kvdo) -{ - if (kvdo->vdo == NULL) { - return VDO_SUCCESS; - } - - KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); - init_completion(&layer->callbackSync); - return performVDOResume(kvdo->vdo); -} - -/**********************************************************************/ -void finishKVDO(KVDO *kvdo) -{ - for (int i = 0; i < kvdo->initializedThreadCount; i++) { - finishWorkQueue(kvdo->threads[i].requestQueue); - } -} - -/**********************************************************************/ -void destroyKVDO(KVDO *kvdo) -{ - destroyVDO(kvdo->vdo); - for (int i = 0; i < kvdo->initializedThreadCount; i++) { - freeWorkQueue(&kvdo->threads[i].requestQueue); - } - FREE(kvdo->threads); - kvdo->threads = NULL; -} - - -/**********************************************************************/ -void dumpKVDOWorkQueue(KVDO *kvdo) -{ - for (int i = 0; i < kvdo->initializedThreadCount; i++) { - dumpWorkQueue(kvdo->threads[i].requestQueue); - } -} - -/**********************************************************************/ -typedef struct { - KvdoWorkItem workItem; - KVDO *kvdo; - void *data; - struct completion *completion; -} SyncQueueWork; - -/** - * Initiate an arbitrary asynchronous base-code operation and wait for - * it. - * - * An async queue operation is performed and we wait for completion. - * - * @param kvdo The kvdo data handle - * @param action The operation to perform - * @param data Unique data that can be used by the operation - * @param threadID The thread on which to perform the operation - * @param completion The completion to wait on - * - * @return VDO_SUCCESS of an error code - **/ -static void performKVDOOperation(KVDO *kvdo, - KvdoWorkFunction action, - void *data, - ThreadID threadID, - struct completion *completion) -{ - SyncQueueWork sync; - - memset(&sync, 0, sizeof(sync)); - setupWorkItem(&sync.workItem, action, NULL, REQ_Q_ACTION_SYNC); - sync.kvdo = kvdo; - sync.data = data; - sync.completion = completion; - - init_completion(completion); - enqueueKVDOWork(kvdo, &sync.workItem, threadID); - wait_for_completion(completion); -} - -/**********************************************************************/ -typedef struct { - bool enable; - bool wasEnabled; -} VDOCompressData; - -/** - * Does the work of calling the base code to set compress state, then - * tells the function waiting on completion to go ahead. 
- * - * @param item The work item - **/ -static void setCompressingWork(KvdoWorkItem *item) -{ - SyncQueueWork *work = container_of(item, SyncQueueWork, workItem); - VDOCompressData *data = (VDOCompressData *)work->data; - data->wasEnabled = setVDOCompressing(getVDO(work->kvdo), data->enable); - complete(work->completion); -} - -/***********************************************************************/ -bool setKVDOCompressing(KVDO *kvdo, bool enableCompression) -{ - struct completion compressWait; - VDOCompressData data; - data.enable = enableCompression; - performKVDOOperation(kvdo, setCompressingWork, &data, - getPackerZoneThread(getThreadConfig(kvdo->vdo)), - &compressWait); - return data.wasEnabled; -} - -/**********************************************************************/ -typedef struct { - int result; -} VDOReadOnlyData; - -/**********************************************************************/ -static void enterReadOnlyModeWork(KvdoWorkItem *item) -{ - SyncQueueWork *work = container_of(item, SyncQueueWork, workItem); - VDOReadOnlyData *data = work->data; - makeVDOReadOnly(getVDO(work->kvdo), data->result); - complete(work->completion); -} - -/***********************************************************************/ -void setKVDOReadOnly(KVDO *kvdo, int result) -{ - struct completion readOnlyWait; - VDOReadOnlyData data; - data.result = result; - performKVDOOperation(kvdo, enterReadOnlyModeWork, &data, - getAdminThread(getThreadConfig(kvdo->vdo)), - &readOnlyWait); -} - -/** - * Does the work of calling the vdo statistics gathering tool - * - * @param item The work item - **/ -static void getVDOStatisticsWork(KvdoWorkItem *item) -{ - SyncQueueWork *work = container_of(item, SyncQueueWork, workItem); - VDOStatistics *stats = (VDOStatistics *)work->data; - getVDOStatistics(getVDO(work->kvdo), stats); - complete(work->completion); -} - -/***********************************************************************/ -void getKVDOStatistics(KVDO *kvdo, VDOStatistics *stats) -{ - struct completion statsWait; - memset(stats, 0, sizeof(VDOStatistics)); - performKVDOOperation(kvdo, getVDOStatisticsWork, stats, - getAdminThread(getThreadConfig(kvdo->vdo)), - &statsWait); -} - -/** - * A structure to invoke an arbitrary VDO action. - **/ -typedef struct vdoActionData { - VDOAction *action; - VDOCompletion *vdoCompletion; - struct completion waiter; -} VDOActionData; - -/** - * Initialize a VDOActionData structure so that the specified action - * can be invoked on the specified completion. - * - * @param data A VDOActionData. - * @param action The VDOAction to execute. - * @param vdoCompletion The VDO completion upon which the action acts. - **/ -static void initializeVDOActionData(VDOActionData *data, - VDOAction *action, - VDOCompletion *vdoCompletion) -{ - *data = (VDOActionData) { - .action = action, - .vdoCompletion = vdoCompletion, - }; -} - -/** - * The VDO callback that completes the KVDO completion. - * - * @param vdoCompletion The VDO completion which was acted upon. - **/ -static void finishVDOAction(VDOCompletion *vdoCompletion) -{ - SyncQueueWork *work = vdoCompletion->parent; - complete(work->completion); -} - -/** - * Perform a VDO base code action as specified by a VDOActionData. - * - * Sets the completion callback and parent inside the VDOActionData - * so that the corresponding kernel completion is completed when - * the VDO completion is. - * - * @param item A KVDO work queue item. 
- **/ -static void performVDOActionWork(KvdoWorkItem *item) -{ - SyncQueueWork *work = container_of(item, SyncQueueWork, workItem); - VDOActionData *data = work->data; - ThreadID id = getPhysicalLayer()->getCurrentThreadID(); - - setCallbackWithParent(data->vdoCompletion, finishVDOAction, id, work); - data->action(data->vdoCompletion); -} - -/**********************************************************************/ -int performKVDOExtendedCommand(KVDO *kvdo, int argc, char **argv) -{ - VDOActionData data; - VDOCommandCompletion cmd; - - int result = initializeVDOCommandCompletion(&cmd, getVDO(kvdo), argc, argv); - if (result != VDO_SUCCESS) { - return result; - } - - initializeVDOActionData(&data, executeVDOExtendedCommand, &cmd.completion); - performKVDOOperation(kvdo, performVDOActionWork, &data, - getAdminThread(getThreadConfig(kvdo->vdo)), - &data.waiter); - - return destroyVDOCommandCompletion(&cmd); -} - -/**********************************************************************/ -void dumpKVDOStatus(KVDO *kvdo) -{ - dumpVDOStatus(kvdo->vdo); -} - -/**********************************************************************/ -bool getKVDOCompressing(KVDO *kvdo) -{ - return getVDOCompressing(kvdo->vdo); -} - -/**********************************************************************/ -int kvdoPrepareToGrowPhysical(KVDO *kvdo, BlockCount physicalCount) -{ - VDO *vdo = kvdo->vdo; - return prepareToGrowPhysical(vdo, physicalCount); -} - -/**********************************************************************/ -int kvdoResizePhysical(KVDO *kvdo, BlockCount physicalCount) -{ - KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); - init_completion(&layer->callbackSync); - int result = performGrowPhysical(kvdo->vdo, physicalCount); - if (result != VDO_SUCCESS) { - logError("resize operation failed, result = %d", result); - return result; - } - - return VDO_SUCCESS; -} - -/**********************************************************************/ -int kvdoPrepareToGrowLogical(KVDO *kvdo, BlockCount logicalCount) -{ - VDO *vdo = kvdo->vdo; - return prepareToGrowLogical(vdo, logicalCount); -} - -/**********************************************************************/ -int kvdoResizeLogical(KVDO *kvdo, BlockCount logicalCount) -{ - KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); - init_completion(&layer->callbackSync); - int result = performGrowLogical(kvdo->vdo, logicalCount); - if (result != VDO_SUCCESS) { - logError("grow logical operation failed, result = %d", result); - } - - return result; -} - -/**********************************************************************/ -WritePolicy getKVDOWritePolicy(KVDO *kvdo) -{ - return getWritePolicy(kvdo->vdo); -} - -/**********************************************************************/ -void enqueueKVDOThreadWork(KVDOThread *thread, - KvdoWorkItem *item) -{ - enqueueWorkQueue(thread->requestQueue, item); -} - -/**********************************************************************/ -void enqueueKVDOWork(KVDO *kvdo, KvdoWorkItem *item, ThreadID threadID) -{ - enqueueKVDOThreadWork(&kvdo->threads[threadID], item); -} - -/**********************************************************************/ -void enqueueKVIO(KVIO *kvio, - KvdoWorkFunction work, - void *statsFunction, - unsigned int action) -{ - ThreadID threadID = vioAsCompletion(kvio->vio)->callbackThreadID; - BUG_ON(threadID >= kvio->layer->kvdo.initializedThreadCount); - launchKVIO(kvio, work, statsFunction, action, - kvio->layer->kvdo.threads[threadID].requestQueue); -} - 
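/*
 * Illustrative sketch (not part of the original source): the same work-item
 * machinery used by performKVDOOperation() above can also be used without
 * waiting, by initializing an item and handing it straight to
 * enqueueKVDOWork(). The helper name runOnBaseThread and the choice of
 * REQ_Q_ACTION_COMPLETION are assumptions made only for illustration.
 */
static void runOnBaseThread(KVDO             *kvdo,
                            KvdoWorkItem     *item,
                            KvdoWorkFunction  work,
                            ThreadID          threadID)
{
  // Attach the work function to the item, then queue it on the request
  // queue of the chosen base thread; it runs there asynchronously.
  setupWorkItem(item, work, NULL, REQ_Q_ACTION_COMPLETION);
  enqueueKVDOWork(kvdo, item, threadID);
}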
-/**********************************************************************/ -static void kvdoEnqueueWork(KvdoWorkItem *workItem) -{ - KvdoEnqueueable *kvdoEnqueueable = container_of(workItem, - KvdoEnqueueable, - workItem); - runCallback(kvdoEnqueueable->enqueueable.completion); -} - -/**********************************************************************/ -void kvdoEnqueue(Enqueueable *enqueueable) -{ - KvdoEnqueueable *kvdoEnqueueable = container_of(enqueueable, - KvdoEnqueueable, - enqueueable); - KernelLayer *layer = asKernelLayer(enqueueable->completion->layer); - ThreadID threadID = enqueueable->completion->callbackThreadID; - if (ASSERT(threadID < layer->kvdo.initializedThreadCount, - "threadID %u (completion type %d) is less than thread count %u", - threadID, enqueueable->completion->type, - layer->kvdo.initializedThreadCount) != UDS_SUCCESS) { - BUG(); - } - - if (enqueueable->completion->type == VIO_COMPLETION) { - vioAddTraceRecord(asVIO(enqueueable->completion), - THIS_LOCATION("$F($cb)")); - } - setupWorkItem(&kvdoEnqueueable->workItem, kvdoEnqueueWork, - (KvdoWorkFunction) enqueueable->completion->callback, - REQ_Q_ACTION_COMPLETION); - enqueueKVDOThreadWork(&layer->kvdo.threads[threadID], - &kvdoEnqueueable->workItem); -} - -/**********************************************************************/ -ThreadID kvdoGetCurrentThreadID(void) -{ - KVDOThread *thread = getWorkQueuePrivateData(); - if (thread == NULL) { - return INVALID_THREAD_ID; - } - - ThreadID threadID = thread->threadID; - if (PARANOID_THREAD_CONSISTENCY_CHECKS) { - KVDO *kvdo = thread->kvdo; - KernelLayer *kernelLayer = asKernelLayer(getPhysicalLayer()); - BUG_ON(&kernelLayer->kvdo != kvdo); - BUG_ON(threadID >= kvdo->initializedThreadCount); - BUG_ON(thread != &kvdo->threads[threadID]); - } - return threadID; -} - -/**********************************************************************/ -static PhysicalLayer *getKernelPhysicalLayer(void) -{ - KVDOThread *thread = getWorkQueuePrivateData(); - if (thread == NULL) { - return NULL; - } - KVDO *kvdo = thread->kvdo; - KernelLayer *layer = container_of(kvdo, KernelLayer, kvdo); - return &layer->common; -} - -void initKernelVDOOnce(void) -{ - registerPhysicalLayerGetter(getKernelPhysicalLayer); -} diff --git a/vdo/kernel/kernelVDO.h b/vdo/kernel/kernelVDO.h deleted file mode 100644 index b65534d..0000000 --- a/vdo/kernel/kernelVDO.h +++ /dev/null @@ -1,299 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelVDO.h#4 $ - */ - -#ifndef KERNEL_VDO_H -#define KERNEL_VDO_H - -#include "completion.h" -#include "kernelTypes.h" -#include "threadRegistry.h" -#include "workQueue.h" - -typedef struct { - KVDO *kvdo; - ThreadID threadID; - KvdoWorkQueue *requestQueue; - RegisteredThread allocatingThread; -} KVDOThread; - -struct kvdo { - KVDOThread *threads; - ThreadID initializedThreadCount; - KvdoWorkItem workItem; - VDOAction *action; - VDOCompletion *completion; - // Base-code device info - VDO *vdo; -}; - -typedef enum reqQAction { - REQ_Q_ACTION_COMPLETION, - REQ_Q_ACTION_FLUSH, - REQ_Q_ACTION_MAP_BIO, - REQ_Q_ACTION_SYNC, - REQ_Q_ACTION_VIO_CALLBACK -} ReqQAction; - -/** - * Initialize the base code interface. - * - * @param [in] kvdo The KVDO to be initialized - * @param [in] threadConfig The base-code thread configuration - * @param [out] reason The reason for failure - * - * @return VDO_SUCCESS or an error code - **/ -int initializeKVDO(KVDO *kvdo, - const ThreadConfig *threadConfig, - char **reason); - -/** - * Load the VDO state from disk but don't alter the on-disk state. This method - * is ultimately called from the constructor for devices which have not been - * resumed. - * - * @param [in] kvdo The KVDO to be started - * @param [in] common The physical layer pointer - * @param [in] loadConfig Load-time parameters for the VDO - * @param [in] vioTraceRecording Debug flag to store - * @param [out] reason The reason for failure - **/ -int preloadKVDO(KVDO *kvdo, - PhysicalLayer *common, - const VDOLoadConfig *loadConfig, - bool vioTraceRecording, - char **reason); - -/** - * Starts the base VDO instance associated with the kernel layer. This method - * is ultimately called from preresume the first time an instance is resumed. - * - * @param [in] kvdo The KVDO to be started - * @param [in] common The physical layer pointer - * @param [out] reason The reason for failure - * - * @return VDO_SUCCESS if started, otherwise error - */ -int startKVDO(KVDO *kvdo, PhysicalLayer *common, char **reason); - -/** - * Suspend the base VDO instance associated with the kernel layer. - * - * @param kvdo The KVDO to be suspended - * - * @return VDO_SUCCESS if stopped, otherwise error - **/ -int suspendKVDO(KVDO *kvdo); - -/** - * Resume the base VDO instance associated with the kernel layer. - * - * @param kvdo The KVDO to be resumed - * - * @return VDO_SUCCESS or an error - **/ -int resumeKVDO(KVDO *kvdo); - -/** - * Shut down the base code interface. The kvdo object must first be - * stopped. - * - * @param kvdo The KVDO to be shut down - **/ -void finishKVDO(KVDO *kvdo); - -/** - * Free up storage of the base code interface. The KVDO object must - * first have been "finished". - * - * @param kvdo The KVDO object to be destroyed - **/ -void destroyKVDO(KVDO *kvdo); - - -/** - * Dump to the kernel log any work-queue info associated with the base - * code. - * - * @param kvdo The KVDO object to be examined - **/ -void dumpKVDOWorkQueue(KVDO *kvdo); - -/** - * Get the VDO pointer for a kvdo object - * - * @param kvdo The KVDO object - * - * @return the VDO pointer - */ -static inline VDO *getVDO(KVDO *kvdo) -{ - return kvdo->vdo; -} - -/** - * Set whether compression is enabled. 
- * - * @param kvdo The KVDO object - * @param enableCompression The new compression mode - * - * @return state of compression before new value is set - **/ -bool setKVDOCompressing(KVDO *kvdo, bool enableCompression); - -/** - * Get the current compression mode - * - * @param kvdo The KVDO object to be queried - * - * @return whether compression is currently enabled - */ -bool getKVDOCompressing(KVDO *kvdo); - -/** - * Gets the latest statistics gathered by the base code. - * - * @param kvdo the KVDO object - * @param stats the statistics struct to fill in - */ -void getKVDOStatistics(KVDO *kvdo, VDOStatistics *stats); - -/** - * Get the current write policy - * - * @param kvdo The KVDO to be queried - * - * @return the write policy in effect - */ -WritePolicy getKVDOWritePolicy(KVDO *kvdo); - -/** - * Dump base code status information to the kernel log for debugging. - * - * @param kvdo The KVDO to be examined - */ -void dumpKVDOStatus(KVDO *kvdo); - -/** - * Request the base code prepare to grow the physical space. - * - * @param kvdo The KVDO to be updated - * @param physicalCount The new size - * - * @return VDO_SUCCESS or error - */ -int kvdoPrepareToGrowPhysical(KVDO *kvdo, BlockCount physicalCount); - -/** - * Notify the base code of resized physical storage. - * - * @param kvdo The KVDO to be updated - * @param physicalCount The new size - * - * @return VDO_SUCCESS or error - */ -int kvdoResizePhysical(KVDO *kvdo, BlockCount physicalCount); - -/** - * Request the base code prepare to grow the logical space. - * - * @param kvdo The KVDO to be updated - * @param logicalCount The new size - * - * @return VDO_SUCCESS or error - */ -int kvdoPrepareToGrowLogical(KVDO *kvdo, BlockCount logicalCount); - -/** - * Request the base code grow the logical space. - * - * @param kvdo The KVDO to be updated - * @param logicalCount The new size - * - * @return VDO_SUCCESS or error - */ -int kvdoResizeLogical(KVDO *kvdo, BlockCount logicalCount); - -/** - * Request the base code go read-only. - * - * @param kvdo The KVDO to be updated - * @param result The error code causing the read only - */ -void setKVDOReadOnly(KVDO *kvdo, int result); - -/** - * Perform an extended base-code command - * - * @param kvdo The KVDO upon which to perform the operation. - * @param argc The number of arguments to the command. - * @param argv The command arguments. Note that all extended - * command argv[0] strings start with "x-". - * - * @return VDO_SUCCESS or an error code - **/ -int performKVDOExtendedCommand(KVDO *kvdo, int argc, char **argv); - -/** - * Enqueue a work item to be processed in the base code context. - * - * @param kvdo The KVDO object in which to run the work item - * @param item The work item to be run - * @param threadID The thread on which to run the work item - **/ -void enqueueKVDOWork(KVDO *kvdo, KvdoWorkItem *item, ThreadID threadID); - -/** - * Set up and enqueue a VIO's work item to be processed in the base code - * context. - * - * @param kvio The VIO with the work item to be run - * @param work The function pointer to execute - * @param statsFunction A function pointer to record for stats, or NULL - * @param action Action code, mapping to a relative priority - **/ -void enqueueKVIO(KVIO *kvio, - KvdoWorkFunction work, - void *statsFunction, - unsigned int action); - -/** - * Enqueue an arbitrary completion for execution on its indicated - * thread. 
- * - * @param enqueueable The Enqueueable object containing the completion pointer - **/ -void kvdoEnqueue(Enqueueable *enqueueable); - -/** - * Get the base-code thread index for the current execution context. - * - * @return The thread ID, or (ThreadID)-1 if the current thread is - * not a base-code thread, or in an interrupt context. - **/ -ThreadID kvdoGetCurrentThreadID(void); - -/** - * Do one-time initialization of kernelVDO interface. - **/ -void initKernelVDOOnce(void); - -#endif // KERNEL_VDO_H diff --git a/vdo/kernel/kernelVDOInternals.h b/vdo/kernel/kernelVDOInternals.h deleted file mode 100644 index aefe05a..0000000 --- a/vdo/kernel/kernelVDOInternals.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kernelVDOInternals.h#1 $ - */ - -#ifndef KERNEL_VDO_INTERNALS_H -#define KERNEL_VDO_INTERNALS_H - -#include "kernelVDO.h" - -/** - * Enqueue a work item to be performed in the base code in a - * particular thread. - * - * @param thread The KVDO thread on which to run the work item - * @param item The work item to be run - **/ -void enqueueKVDOThreadWork(KVDOThread *thread, KvdoWorkItem *item); - -#endif // KERNEL_VDO_INTERNALS_H diff --git a/vdo/kernel/ktrace.c b/vdo/kernel/ktrace.c deleted file mode 100644 index ebc654a..0000000 --- a/vdo/kernel/ktrace.c +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/ktrace.c#2 $ - */ - -#include "ktrace.h" - -#include "memoryAlloc.h" - -#include "dataVIO.h" - -#include "kvio.h" -#include "logger.h" - -enum { - // How much data from a trace can we log in one call without messing - // up the log or losing data? 
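  // (820 is presumably chosen so that each logInfo() line, prefix included,
  //  stays comfortably below the kernel's per-message printk limit.)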
- TRACE_LOG_MAX = 820, - - // What fraction (1 out of TRACE_SAMPLE_INTERVAL VIOs) to trace - TRACE_SAMPLE_INTERVAL = 3, -}; - -bool traceRecording = false; - -static struct { - char buffer[2000]; - unsigned int counter; - struct mutex lock; -} traceLoggingState; - -/** - * Initialize a SampleCounter structure with the given sampling interval. - * - * @param counter The counter to initialize - * @param interval The desired sampling interval - **/ -static void initializeSampleCounter(SampleCounter *counter, - unsigned int interval) -{ - spin_lock_init(&counter->lock); - counter->tick = 0; - counter->interval = interval; -} - -/*************************************************************************/ -bool sampleThisOne(SampleCounter *counter) -{ - bool wantTracing = false; - spin_lock(&counter->lock); - counter->tick++; - if (counter->tick >= counter->interval) { - counter->tick = 0; - wantTracing = true; - } - spin_unlock(&counter->lock); - return wantTracing; -} - -/*************************************************************************/ -static void freeTraceDataBuffer(void *poolData, void *data) -{ - Trace *trace = (Trace *) data; - FREE(trace); -} - -/*************************************************************************/ -static int allocTraceDataBuffer(void *poolData, void **dataPtr) -{ - Trace *trace; - int result = ALLOCATE(1, Trace, __func__, &trace); - if (result != VDO_SUCCESS) { - logError("trace data allocation failure %d", result); - return result; - } - - *dataPtr = trace; - return VDO_SUCCESS; -} - -/*************************************************************************/ -int allocTraceFromPool(KernelLayer *layer, Trace **tracePointer) -{ - int result = allocBufferFromPool(layer->traceBufferPool, - (void **) tracePointer); - if (result == VDO_SUCCESS) { - (*tracePointer)->used = 0; - } - return result; -} - -/*************************************************************************/ -void freeTraceToPool(KernelLayer *layer, Trace *trace) -{ - freeBufferToPool(layer->traceBufferPool, trace); -} - -/*************************************************************************/ -int traceKernelLayerInit(KernelLayer *layer) -{ - layer->vioTraceRecording = traceRecording; - initializeSampleCounter(&layer->traceSampleCounter, TRACE_SAMPLE_INTERVAL); - unsigned int traceRecordsNeeded = 0; - if (layer->vioTraceRecording) { - traceRecordsNeeded += layer->requestLimiter.limit; - } - if (traceRecordsNeeded > 0) { - return makeBufferPool("KVDO Trace Data Pool", traceRecordsNeeded, - allocTraceDataBuffer, freeTraceDataBuffer, NULL, - layer, &layer->traceBufferPool); - } - return VDO_SUCCESS; -} - -/*************************************************************************/ -void initializeTraceLoggingOnce(void) -{ - mutex_init(&traceLoggingState.lock); -} - -/*************************************************************************/ -void logKvioTrace(KVIO *kvio) -{ - KernelLayer *layer = kvio->layer; - - mutex_lock(&traceLoggingState.lock); - traceLoggingState.counter++; - // Log about 0.1% to avoid spewing data faster than syslog can keep up - // (on certain of Permabit's test machines). - // Yes, the 37 is arbitrary and meaningless. 
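  // (1 out of every 1024 invocations is roughly 0.1%.)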
- - if (layer->traceLogging && ((traceLoggingState.counter % 1024) == 37)) { - kvioAddTraceRecord(kvio, THIS_LOCATION(NULL)); - size_t traceLen = 0; - formatTrace(kvio->vio->trace, traceLoggingState.buffer, - sizeof(traceLoggingState.buffer), &traceLen); - - if (isMetadata(kvio)) { - logInfo("finishing kvio %s meta @%" PRIptr " %s", - (isWriteVIO(kvio->vio) ? "read" : "write"), - kvio, traceLoggingState.buffer); - } else if (isCompressedWriter(kvio)) { - logInfo("finishing kvio write comp @%" PRIptr " %s", - kvio, traceLoggingState.buffer); - } else { - const char *dupeLabel = ""; - if (isWriteVIO(kvio->vio)) { - DataVIO *dataVIO = vioAsDataVIO(kvio->vio); - if (isTrimDataVIO(dataVIO)) { - dupeLabel = "trim "; - } else if (dataVIO->isZeroBlock) { - dupeLabel = "zero "; - } else if (dataVIO->isDuplicate) { - dupeLabel = "dupe "; - } else { - dupeLabel = "new "; - } - } - - logInfo("finishing kvio %s data %s@%" PRIptr " %.*s", - (isWriteVIO(kvio->vio) ? "read" : "write"), - dupeLabel, kvio, TRACE_LOG_MAX, traceLoggingState.buffer); - char *buf = traceLoggingState.buffer; - while (traceLen > TRACE_LOG_MAX) { - traceLen -= TRACE_LOG_MAX; - buf += TRACE_LOG_MAX; - logInfo("more kvio %" PRIptr " path: %.*s", kvio, TRACE_LOG_MAX, buf); - } - } - } - - mutex_unlock(&traceLoggingState.lock); -} diff --git a/vdo/kernel/ktrace.h b/vdo/kernel/ktrace.h deleted file mode 100644 index 99cda7a..0000000 --- a/vdo/kernel/ktrace.h +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/ktrace.h#1 $ - */ - -#ifndef KTRACE_H -#define KTRACE_H - -#include - -#include "common.h" -#include "trace.h" - -struct kernelLayer; -struct kvio; - -// Implement event sampling once per N. -typedef struct { - unsigned int interval; - unsigned int tick; - spinlock_t lock; -} SampleCounter; - -/** - * Flag indicating whether newly created VDO devices should record trace info. - **/ -extern bool traceRecording; - -/** - * Updates the counter state and returns true once each time the - * sampling interval is reached. - * - * @param counter The sampling counter info - * - * @return whether to do sampling on this invocation - **/ -bool sampleThisOne(SampleCounter *counter); - -/** - * Initialize trace data in the KernelLayer - * - * @param layer The KernelLayer - * - * @return VDO_SUCCESS, or an error code - **/ -int traceKernelLayerInit(struct kernelLayer *layer); - -/** - * Initialize the mutex used when logging latency tracing data. 
- **/ -void initializeTraceLoggingOnce(void); - -/** - * Allocate a trace buffer - * - * @param layer The KernelLayer - * @param tracePointer The trace buffer is returned here - * - * @return VDO_SUCCESS or an error code - **/ -int allocTraceFromPool(struct kernelLayer *layer, Trace **tracePointer); - -/** - * Free a trace buffer - * - * @param layer The KernelLayer - * @param trace The trace buffer - **/ -void freeTraceToPool(struct kernelLayer *layer, Trace *trace); - -/** - * Log the trace at kvio freeing time - * - * @param kvio The kvio structure - **/ -void logKvioTrace(struct kvio *kvio); - -#endif /* KTRACE_H */ diff --git a/vdo/kernel/kvdoFlush.c b/vdo/kernel/kvdoFlush.c deleted file mode 100644 index 7b38af1..0000000 --- a/vdo/kernel/kvdoFlush.c +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kvdoFlush.c#6 $ - */ - -#include "kvdoFlush.h" - -#include "logger.h" -#include "memoryAlloc.h" - -#include "threadConfig.h" - -#include "bio.h" -#include "ioSubmitter.h" - -/** - * A specific (concrete) encapsulation of flush requests. - * - *
We attempt to allocate a KVDOFlush object for each incoming flush bio. - * In case the allocation fails, a spare object is pre-allocated and stored - * in the kernel layer. The first time an allocation fails, the spare is used. - * If another allocation fails while the spare is in use, it will merely be - * queued for later processing. - * - *
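 * In outline, the flow below (a condensed sketch of launchKVDOFlush() and
 * releaseKVDOFlush(); the functions themselves are the authoritative logic)
 * is:
 *
 *   kvdoFlush = ALLOCATE_NOWAIT(KVDOFlush, __func__);    // may fail
 *   if (kvdoFlush == NULL) {
 *     take layer->spareKVDOFlush, or leave the bio on waitingFlushes
 *     if the spare is already in use;
 *   }
 *   // ... later, when the flush completes ...
 *   if (flushes are waiting)                 re-use the object at once;
 *   else if (layer->spareKVDOFlush == NULL)  it becomes the new spare;
 *   else                                     FREE(kvdoFlush);
 *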
When a KVDOFlush is complete, it will either be freed, immediately - * re-used for queued flushes, or stashed in the kernel layer as the new spare - * object. This ensures that we will always make forward progress. - **/ -struct kvdoFlush { - KvdoWorkItem workItem; - KernelLayer *layer; - struct bio_list bios; - Jiffies arrivalTime; // Time when earliest bio appeared - VDOFlush vdoFlush; -}; - -/**********************************************************************/ -int makeKVDOFlush(KVDOFlush **flushPtr) -{ - return ALLOCATE(1, KVDOFlush, __func__, flushPtr); -} - -/**********************************************************************/ -bool shouldProcessFlush(KernelLayer *layer) -{ - return (getKVDOWritePolicy(&layer->kvdo) != WRITE_POLICY_SYNC); -} - -/** - * Function call to handle an empty flush request from the request queue. - * - * @param item The work item representing the flush request - **/ -static void kvdoFlushWork(KvdoWorkItem *item) -{ - KVDOFlush *kvdoFlush = container_of(item, KVDOFlush, workItem); - flush(kvdoFlush->layer->kvdo.vdo, &kvdoFlush->vdoFlush); -} - -/** - * Initialize a KVDOFlush object, transferring all the bios in the kernel - * layer's waitingFlushes list to it. The caller MUST already hold the layer's - * flushLock. - * - * @param kvdoFlush The flush to initialize - * @param layer The kernel layer on which the flushLock is held - **/ -static void initializeKVDOFlush(KVDOFlush *kvdoFlush, KernelLayer *layer) -{ - kvdoFlush->layer = layer; - bio_list_init(&kvdoFlush->bios); - bio_list_merge(&kvdoFlush->bios, &layer->waitingFlushes); - bio_list_init(&layer->waitingFlushes); - kvdoFlush->arrivalTime = layer->flushArrivalTime; -} - -/**********************************************************************/ -static void enqueueKVDOFlush(KVDOFlush *kvdoFlush) -{ - setupWorkItem(&kvdoFlush->workItem, kvdoFlushWork, NULL, REQ_Q_ACTION_FLUSH); - KVDO *kvdo = &kvdoFlush->layer->kvdo; - enqueueKVDOWork(kvdo, &kvdoFlush->workItem, - getPackerZoneThread(getThreadConfig(kvdo->vdo))); -} - -/**********************************************************************/ -void launchKVDOFlush(KernelLayer *layer, BIO *bio) -{ - // Try to allocate a KVDOFlush to represent the flush request. If the - // allocation fails, we'll deal with it later. - KVDOFlush *kvdoFlush = ALLOCATE_NOWAIT(KVDOFlush, __func__); - - spin_lock(&layer->flushLock); - - // We have a new bio to start. Add it to the list. If it becomes the - // only entry on the list, record the time. - if (bio_list_empty(&layer->waitingFlushes)) { - layer->flushArrivalTime = jiffies; - } - bio_list_add(&layer->waitingFlushes, bio); - - if (kvdoFlush == NULL) { - // The KVDOFlush allocation failed. Try to use the spare KVDOFlush object. - if (layer->spareKVDOFlush == NULL) { - // The spare is already in use. This bio is on waitingFlushes and it - // will be handled by a flush completion or by a bio that can allocate. - spin_unlock(&layer->flushLock); - return; - } - - // Take and use the spare KVDOFlush object. - kvdoFlush = layer->spareKVDOFlush; - layer->spareKVDOFlush = NULL; - } - - // We have flushes to start. Capture them in the KVDOFlush object. - initializeKVDOFlush(kvdoFlush, layer); - - spin_unlock(&layer->flushLock); - - // Finish launching the flushes. - enqueueKVDOFlush(kvdoFlush); -} - -/** - * Release a KVDOFlush object that has completed its work. If there are any - * pending flush requests whose KVDOFlush allocation failed, they will be - * launched by immediately re-using the released KVDOFlush. 
If there is no - * spare KVDOFlush, the released object will become the spare. Otherwise, the - * KVDOFlush will be freed. - * - * @param kvdoFlush The completed flush object to re-use or free - **/ -static void releaseKVDOFlush(KVDOFlush *kvdoFlush) -{ - KernelLayer *layer = kvdoFlush->layer; - bool relaunchFlush = false; - bool freeFlush = false; - - spin_lock(&layer->flushLock); - if (bio_list_empty(&layer->waitingFlushes)) { - // Nothing needs to be started. Save one spare KVDOFlush object. - if (layer->spareKVDOFlush == NULL) { - // Make the new spare all zero, just like a newly allocated one. - memset(kvdoFlush, 0, sizeof(*kvdoFlush)); - layer->spareKVDOFlush = kvdoFlush; - } else { - freeFlush = true; - } - } else { - // We have flushes to start. Capture them in the KVDOFlush object. - initializeKVDOFlush(kvdoFlush, layer); - relaunchFlush = true; - } - spin_unlock(&layer->flushLock); - - if (relaunchFlush) { - // Finish launching the flushes. - enqueueKVDOFlush(kvdoFlush); - } else if (freeFlush) { - FREE(kvdoFlush); - } -} - -/** - * Function called to complete and free a flush request - * - * @param item The flush-request work item - **/ -static void kvdoCompleteFlushWork(KvdoWorkItem *item) -{ - KVDOFlush *kvdoFlush = container_of(item, KVDOFlush, workItem); - KernelLayer *layer = kvdoFlush->layer; - - BIO *bio; - while ((bio = bio_list_pop(&kvdoFlush->bios)) != NULL) { - // We're not acknowledging this bio now, but we'll never touch it - // again, so this is the last chance to account for it. - countBios(&layer->biosAcknowledged, bio); - - // Make sure the bio is a empty flush bio. - prepareFlushBIO(bio, bio->bi_private, getKernelLayerBdev(layer), - bio->bi_end_io); - atomic64_inc(&layer->flushOut); - generic_make_request(bio); - } - - - // Release the KVDOFlush object, freeing it, re-using it as the spare, or - // using it to launch any flushes that had to wait when allocations failed. - releaseKVDOFlush(kvdoFlush); -} - -/**********************************************************************/ -void kvdoCompleteFlush(VDOFlush **kfp) -{ - if (*kfp != NULL) { - KVDOFlush *kvdoFlush = container_of(*kfp, KVDOFlush, vdoFlush); - setupWorkItem(&kvdoFlush->workItem, kvdoCompleteFlushWork, NULL, - BIO_Q_ACTION_FLUSH); - enqueueBioWorkItem(kvdoFlush->layer->ioSubmitter, - &kvdoFlush->workItem); - *kfp = NULL; - } -} - -/**********************************************************************/ -int synchronousFlush(KernelLayer *layer) -{ - BIO bio; -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) - bio_init(&bio, 0, 0); -#else - bio_init(&bio); -#endif - int result = 0; - - prepareFlushBIO(&bio, layer, getKernelLayerBdev(layer), NULL); - result = submitBioAndWait(&bio); - atomic64_inc(&layer->flushOut); - if (result != 0) { - logErrorWithStringError(result, "synchronous flush failed"); - result = -EIO; - } - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,12,0) - bio_uninit(&bio); -#endif - return result; -} diff --git a/vdo/kernel/kvdoFlush.h b/vdo/kernel/kvdoFlush.h deleted file mode 100644 index 2d90953..0000000 --- a/vdo/kernel/kvdoFlush.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kvdoFlush.h#1 $ - */ - -#ifndef KVDO_FLUSH_H -#define KVDO_FLUSH_H - -#include "flush.h" - -#include "kernelLayer.h" - -/** - * Create a KVDOFlush. - * - * @param flushPtr A pointer to hold the new flush - **/ -int makeKVDOFlush(KVDOFlush **flushPtr); - -/** - * Answer the question as to whether VDO should be processing REQ_FLUSH - * requests or not. - * - * @param layer The layer - * - * @return true if VDO should process empty flush requests, or false if - * they should just be forwarded to our storage device. - **/ -bool shouldProcessFlush(KernelLayer *layer); - -/** - * Function called to start processing a flush request. It is called when we - * receive an empty flush bio from the block layer, and before acknowledging a - * non-empty bio with the FUA flag set. - * - * @param layer The physical layer - * @param bio The bio containing an empty flush request - **/ -void launchKVDOFlush(KernelLayer *layer, BIO *bio); - -/** - * Function called from base VDO to complete and free a flush request. - * - * @param kfp Pointer to the flush request - **/ -void kvdoCompleteFlush(VDOFlush **kfp); - -/** - * Issue a flush request and wait for it to complete. - * - * @param layer The kernel layer - * - * @return VDO_SUCCESS or an error - */ -int synchronousFlush(KernelLayer *layer); - -#endif /* KVDO_FLUSH_H */ diff --git a/vdo/kernel/kvio.c b/vdo/kernel/kvio.c deleted file mode 100644 index 336f86e..0000000 --- a/vdo/kernel/kvio.c +++ /dev/null @@ -1,415 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kvio.c#7 $ - */ - -#include "kvio.h" - - -#include "logger.h" -#include "memoryAlloc.h" - -#include "numUtils.h" -#include "vdo.h" -#include "waitQueue.h" - -#include "bio.h" -#include "ioSubmitter.h" -#include "kvdoFlush.h" - -/** - * A function to tell vdo that we have completed the requested async - * operation for a vio - * - * @param item The work item of the VIO to complete - **/ -static void kvdoHandleVIOCallback(KvdoWorkItem *item) -{ - KVIO *kvio = workItemAsKVIO(item); - runCallback(vioAsCompletion(kvio->vio)); -} - -/**********************************************************************/ -void kvdoEnqueueVIOCallback(KVIO *kvio) -{ - enqueueKVIO(kvio, kvdoHandleVIOCallback, - (KvdoWorkFunction) vioAsCompletion(kvio->vio)->callback, - REQ_Q_ACTION_VIO_CALLBACK); -} - -/**********************************************************************/ -void kvdoContinueKvio(KVIO *kvio, int error) -{ - if (unlikely(error != VDO_SUCCESS)) { - setCompletionResult(vioAsCompletion(kvio->vio), error); - } - kvdoEnqueueVIOCallback(kvio); -} - -/**********************************************************************/ -// noinline ensures systemtap can hook in here -static noinline void maybeLogKvioTrace(KVIO *kvio) -{ - if (kvio->layer->traceLogging) { - logKvioTrace(kvio); - } -} - -/**********************************************************************/ -static void freeKVIO(KVIO **kvioPtr) -{ - KVIO *kvio = *kvioPtr; - if (kvio == NULL) { - return; - } - - if (unlikely(kvio->vio->trace != NULL)) { - maybeLogKvioTrace(kvio); - FREE(kvio->vio->trace); - } - - freeBio(kvio->bio, kvio->layer); - FREE(kvio); - *kvioPtr = NULL; -} - -/**********************************************************************/ -void freeMetadataKVIO(MetadataKVIO **metadataKVIOPtr) -{ - freeKVIO((KVIO **) metadataKVIOPtr); -} - -/**********************************************************************/ -void freeCompressedWriteKVIO(CompressedWriteKVIO **compressedWriteKVIOPtr) -{ - freeKVIO((KVIO **) compressedWriteKVIOPtr); -} - -/**********************************************************************/ -void kvdoWriteCompressedBlock(AllocatingVIO *allocatingVIO) -{ - // This method assumes that compressed writes never set the flush or FUA - // bits. - CompressedWriteKVIO *compressedWriteKVIO - = allocatingVIOAsCompressedWriteKVIO(allocatingVIO); - KVIO *kvio = compressedWriteKVIOAsKVIO(compressedWriteKVIO); - BIO *bio = kvio->bio; - resetBio(bio, kvio->layer); - setBioOperationWrite(bio); - setBioSector(bio, blockToSector(kvio->layer, kvio->vio->physical)); - submitBio(bio, BIO_Q_ACTION_COMPRESSED_DATA); -} - -/** - * Get the BioQueue action for a metadata VIO based on that VIO's priority. - * - * @param vio The VIO - * - * @return The action with which to submit the VIO's BIO. - **/ -static inline BioQAction getMetadataAction(VIO *vio) -{ - return ((vio->priority == VIO_PRIORITY_HIGH) - ? BIO_Q_ACTION_HIGH : BIO_Q_ACTION_METADATA); -} - -/**********************************************************************/ -void kvdoSubmitMetadataVIO(VIO *vio) -{ - KVIO *kvio = metadataKVIOAsKVIO(vioAsMetadataKVIO(vio)); - BIO *bio = kvio->bio; - resetBio(bio, kvio->layer); - - setBioSector(bio, blockToSector(kvio->layer, vio->physical)); - - // Metadata I/Os bypass the read cache. 
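  // Summary of the request-flag mapping applied just below (descriptive
  // only; the branches that follow are the authoritative logic):
  //   read VIO                         -> read operation, never preflushed
  //   write VIO needing flush-before   -> write operation + preflush flag
  //   any other write VIO              -> plain write operation
  //   any VIO needing flush-after      -> FUA flag set before submitBio()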
- if (isReadVIO(vio)) { - ASSERT_LOG_ONLY(!vioRequiresFlushBefore(vio), - "read VIO does not require flush before"); - vioAddTraceRecord(vio, THIS_LOCATION("$F;io=readMeta")); - setBioOperationRead(bio); - } else { - KernelLayerState state = getKernelLayerState(kvio->layer); - ASSERT_LOG_ONLY(((state == LAYER_RUNNING) - || (state == LAYER_RESUMING) - || (state = LAYER_STARTING)), - "write metadata in allowed state %d", state); - if (vioRequiresFlushBefore(vio)) { - setBioOperationWrite(bio); - setBioOperationFlagPreflush(bio); - vioAddTraceRecord(vio, THIS_LOCATION("$F;io=flushWriteMeta")); - } else { - setBioOperationWrite(bio); - vioAddTraceRecord(vio, THIS_LOCATION("$F;io=writeMeta")); - } - } - - if (vioRequiresFlushAfter(vio)) { - setBioOperationFlagFua(bio); - } - submitBio(bio, getMetadataAction(vio)); -} - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) -/** - * Handle the completion of a base-code initiated flush by continuing the flush - * VIO. - * - * @param bio The bio to complete - **/ -static void completeFlushBio(BIO *bio) -#else -/** - * Handle the completion of a base-code initiated flush by continuing the flush - * VIO. - * - * @param bio The bio to complete - * @param error Possible error from underlying block device - **/ -static void completeFlushBio(BIO *bio, int error) -#endif -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0) - int error = getBioResult(bio); -#endif - KVIO *kvio = (KVIO *) bio->bi_private; - // XXX This assumes a VDO-created bio around a buffer contains exactly 1 - // page, which we believe is true, but do not assert. - bio->bi_vcnt = 1; - // Restore the bio's notion of its own data. - resetBio(bio, kvio->layer); - kvdoContinueKvio(kvio, error); -} - -/**********************************************************************/ -void kvdoFlushVIO(VIO *vio) -{ - KVIO *kvio = metadataKVIOAsKVIO(vioAsMetadataKVIO(vio)); - BIO *bio = kvio->bio; - KernelLayer *layer = kvio->layer; - resetBio(bio, layer); - prepareFlushBIO(bio, kvio, getKernelLayerBdev(layer), completeFlushBio); - submitBio(bio, getMetadataAction(vio)); -} - -/* - * Hook for a SystemTap probe to potentially restrict the choices - * of which VIOs should have their latencies tracked. - * - * Normally returns true. Even if true is returned, sampleThisOne may - * cut down the monitored VIOs by some fraction so as to reduce the - * impact on system performance. - * - * Must be "noinline" so that SystemTap can find the return - * instruction and modify the return value. - * - * @param kvio The KVIO being initialized - * @param layer The kernel layer - * @param bio The incoming I/O request - * - * @return whether it's useful to track latency for VIOs looking like - * this one - */ -static noinline bool -sampleThisVIO(KVIO *kvio, KernelLayer *layer, BIO *bio) -{ - bool result = true; - // Ensure the arguments and result exist at the same time, for SystemTap. - __asm__ __volatile__("" - : "=g" (result) - : "0" (result), - "g" (kvio), - "g" (layer), - "g" (bio) - : "memory"); - return result; -} - -/**********************************************************************/ -void initializeKVIO(KVIO *kvio, - KernelLayer *layer, - VIOType vioType, - VIOPriority priority, - void *parent, - BIO *bio) -{ - if (layer->vioTraceRecording - && sampleThisVIO(kvio, layer, bio) - && sampleThisOne(&layer->traceSampleCounter)) { - int result = (isDataVIOType(vioType) - ? 
allocTraceFromPool(layer, &kvio->vio->trace) - : ALLOCATE(1, Trace, "trace", &kvio->vio->trace)); - if (result != VDO_SUCCESS) { - logError("trace record allocation failure %d", result); - } - } - - kvio->bio = bio; - kvio->layer = layer; - if (bio != NULL) { - bio->bi_private = kvio; - } - - initializeVIO(kvio->vio, vioType, priority, parent, getVDO(&layer->kvdo), - &layer->common); - - // XXX: The "init" label should be replaced depending on the - // write/read/flush path followed. - kvioAddTraceRecord(kvio, THIS_LOCATION("$F;io=?init;j=normal")); - - VDOCompletion *completion = vioAsCompletion(kvio->vio); - kvio->enqueueable.enqueueable.completion = completion; - completion->enqueueable = &kvio->enqueueable.enqueueable; -} - -/** - * Construct a metadata KVIO. - * - * @param [in] layer The physical layer - * @param [in] vioType The type of VIO to create - * @param [in] priority The relative priority to assign to the - * MetadataKVIO - * @param [in] parent The parent of the MetadataKVIO completion - * @param [in] bio The bio to associate with this MetadataKVIO - * @param [out] metadataKVIOPtr A pointer to hold the new MetadataKVIO - * - * @return VDO_SUCCESS or an error - **/ -__attribute__((warn_unused_result)) -static int makeMetadataKVIO(KernelLayer *layer, - VIOType vioType, - VIOPriority priority, - void *parent, - BIO *bio, - MetadataKVIO **metadataKVIOPtr) -{ - // If MetadataKVIO grows past 256 bytes, we'll lose benefits of VDOSTORY-176. - STATIC_ASSERT(sizeof(MetadataKVIO) <= 256); - - // Metadata VIOs should use direct allocation and not use the buffer pool, - // which is reserved for submissions from the linux block layer. - MetadataKVIO *metadataKVIO; - int result = ALLOCATE(1, MetadataKVIO, __func__, &metadataKVIO); - if (result != VDO_SUCCESS) { - logError("metadata KVIO allocation failure %d", result); - return result; - } - - KVIO *kvio = &metadataKVIO->kvio; - kvio->vio = &metadataKVIO->vio; - initializeKVIO(kvio, layer, vioType, priority, parent, bio); - *metadataKVIOPtr = metadataKVIO; - return VDO_SUCCESS; -} - -/** - * Construct a CompressedWriteKVIO. - * - * @param [in] layer The physical layer - * @param [in] parent The parent of the CompressedWriteKVIO - * completion - * @param [in] bio The bio to associate with this - * CompressedWriteKVIO - * @param [out] compressedWriteKVIOPtr A pointer to hold the new - * CompressedWriteKVIO - * - * @return VDO_SUCCESS or an error - **/ -__attribute__((warn_unused_result)) -static int -makeCompressedWriteKVIO(KernelLayer *layer, - void *parent, - BIO *bio, - CompressedWriteKVIO **compressedWriteKVIOPtr) -{ - // Compressed write VIOs should use direct allocation and not use the buffer - // pool, which is reserved for submissions from the linux block layer. 
- CompressedWriteKVIO *compressedWriteKVIO; - int result = ALLOCATE(1, CompressedWriteKVIO, __func__, - &compressedWriteKVIO); - if (result != VDO_SUCCESS) { - logError("compressed write KVIO allocation failure %d", result); - return result; - } - - KVIO *kvio = &compressedWriteKVIO->kvio; - kvio->vio = allocatingVIOAsVIO(&compressedWriteKVIO->allocatingVIO); - initializeKVIO(kvio, layer, VIO_TYPE_COMPRESSED_BLOCK, - VIO_PRIORITY_COMPRESSED_DATA, parent, bio); - *compressedWriteKVIOPtr = compressedWriteKVIO; - return VDO_SUCCESS; -} - -/**********************************************************************/ -int kvdoCreateMetadataVIO(PhysicalLayer *layer, - VIOType vioType, - VIOPriority priority, - void *parent, - char *data, - VIO **vioPtr) -{ - int result = ASSERT(isMetadataVIOType(vioType), - "%d is a metadata type", vioType); - if (result != VDO_SUCCESS) { - return result; - } - - BIO *bio; - KernelLayer *kernelLayer = asKernelLayer(layer); - result = createBio(kernelLayer, data, &bio); - if (result != VDO_SUCCESS) { - return result; - } - - MetadataKVIO *metadataKVIO; - result = makeMetadataKVIO(kernelLayer, vioType, priority, parent, bio, - &metadataKVIO); - if (result != VDO_SUCCESS) { - freeBio(bio, kernelLayer); - return result; - } - - *vioPtr = &metadataKVIO->vio; - return VDO_SUCCESS; -} - -/**********************************************************************/ -int kvdoCreateCompressedWriteVIO(PhysicalLayer *layer, - void *parent, - char *data, - AllocatingVIO **allocatingVIOPtr) -{ - BIO *bio; - KernelLayer *kernelLayer = asKernelLayer(layer); - int result = createBio(kernelLayer, data, &bio); - if (result != VDO_SUCCESS) { - return result; - } - - CompressedWriteKVIO *compressedWriteKVIO; - result = makeCompressedWriteKVIO(kernelLayer, parent, bio, - &compressedWriteKVIO); - if (result != VDO_SUCCESS) { - freeBio(bio, kernelLayer); - return result; - } - - *allocatingVIOPtr = &compressedWriteKVIO->allocatingVIO; - return VDO_SUCCESS; -} diff --git a/vdo/kernel/kvio.h b/vdo/kernel/kvio.h deleted file mode 100644 index 64200cd..0000000 --- a/vdo/kernel/kvio.h +++ /dev/null @@ -1,340 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/kvio.h#3 $ - */ - -#ifndef KVIO_H -#define KVIO_H - -#include "allocatingVIO.h" -#include "vio.h" - -#include "kernelLayer.h" - -/** - * A specific (semi-opaque) encapsulation of a single block - **/ -struct kvio { - KvdoEnqueueable enqueueable; - VIO *vio; - KernelLayer *layer; - BIO *bio; - - /** - * A bio pointer used in enqueueBioMap (used via submitBio etc), to - * pass information -- which bio to submit to the storage device -- - * across a thread switch. This may match another bio pointer in - * this structure, or could point somewhere else. 
- **/ - BIO *bioToSubmit; - /** - * A list of enqueued bios with consecutive block numbers, stored by - * enqueueBioMap under the first-enqueued KVIO. The other KVIOs are - * found via their bio entries in this list, and are not added to - * the work queue as separate work items. - **/ - struct bio_list biosMerged; - /** A slot for an arbitrary bit of data, for use by systemtap. */ - long debugSlot; -}; - -typedef struct { - KVIO kvio; - VIO vio; -} MetadataKVIO; - -typedef struct { - KVIO kvio; - AllocatingVIO allocatingVIO; -} CompressedWriteKVIO; - -/** - * Determine whether a KVIO is a data VIO or not - * - * @param kvio The KVIO to check - * - * @return true if a data KVIO - */ -static inline bool isData(KVIO *kvio) -{ - return isDataVIO(kvio->vio); -} - -/** - * Determine whether a KVIO is a compressed block write VIO or not - * - * @param kvio The KVIO to check - * - * @return true if a compressed block writer - */ -static inline bool isCompressedWriter(KVIO *kvio) -{ - return isCompressedWriteVIO(kvio->vio); -} - -/** - * Determine whether a KVIO is a metadata VIO or not - * - * @param kvio The KVIO to check - * - * @return true if a metadata KVIO - */ -static inline bool isMetadata(KVIO *kvio) -{ - return isMetadataVIO(kvio->vio); -} - -/** - * Convert a VIO to a MetadataKVIO. - * - * @param vio The VIO to convert - * - * @return the VIO as a KVIO - **/ -static inline MetadataKVIO *vioAsMetadataKVIO(VIO *vio) -{ - ASSERT_LOG_ONLY(isMetadataVIO(vio), "VIO is a metadata VIO"); - return container_of(vio, MetadataKVIO, vio); -} - -/** - * Convert a MetadataKVIO to a KVIO. - * - * @param metadataKVIO The MetadataKVIO to convert - * - * @return The MetadataKVIO as a KVIO - **/ -static inline KVIO *metadataKVIOAsKVIO(MetadataKVIO *metadataKVIO) -{ - return &metadataKVIO->kvio; -} - -/** - * Returns a pointer to the CompressedWriteKVIO wrapping an AllocatingVIO. - * - * @param allocatingVIO The AllocatingVIO to convert - * - * @return the CompressedWriteKVIO - **/ -static inline CompressedWriteKVIO * -allocatingVIOAsCompressedWriteKVIO(AllocatingVIO *allocatingVIO) -{ - ASSERT_LOG_ONLY(isCompressedWriteAllocatingVIO(allocatingVIO), - "AllocatingVIO is a compressed write"); - return container_of(allocatingVIO, CompressedWriteKVIO, allocatingVIO); -} - -/** - * Convert a CompressedWriteKVIO to a KVIO. - * - * @param compressedWriteKVIO The CompressedWriteKVIO to convert - * - * @return The CompressedWriteKVIO as a KVIO - **/ -static inline -KVIO *compressedWriteKVIOAsKVIO(CompressedWriteKVIO *compressedWriteKVIO) -{ - return &compressedWriteKVIO->kvio; -} - -/** - * Returns a pointer to the KVIO wrapping a work item - * - * @param item the work item - * - * @return the KVIO - **/ -static inline KVIO *workItemAsKVIO(KvdoWorkItem *item) -{ - return container_of(item, KVIO, enqueueable.workItem); -} - -/** - * Enqueue a KVIO on a work queue. - * - * @param queue The queue - * @param kvio The KVIO - **/ -static inline void enqueueKVIOWork(KvdoWorkQueue *queue, KVIO *kvio) -{ - enqueueWorkQueue(queue, &kvio->enqueueable.workItem); -} - -/** - * Add a trace record for the current source location. - * - * @param kvio The KVIO structure to be updated - * @param location The source-location descriptor to be recorded - **/ -static inline void kvioAddTraceRecord(KVIO *kvio, TraceLocation location) -{ - vioAddTraceRecord(kvio->vio, location); -} - -/** - * Set up the work item for a KVIO. 
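 *
 * A minimal usage sketch (the work function name is illustrative, not taken
 * from this file; the action code is the one used for VIO callbacks):
 *
 *   setupKVIOWork(kvio, handleKvioWork, NULL, REQ_Q_ACTION_VIO_CALLBACK);
 *   enqueueKVIOWork(queue, kvio);
 *
 * launchKVIO(), defined just below, combines these two calls.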
- * - * @param kvio The KVIO to set up - * @param work The function pointer to execute - * @param statsFunction A function pointer to record for stats, or NULL - * @param action Action code, mapping to a relative priority - **/ -static inline void setupKVIOWork(KVIO *kvio, - KvdoWorkFunction work, - void *statsFunction, - unsigned int action) -{ - setupWorkItem(&kvio->enqueueable.workItem, work, statsFunction, action); -} - -/** - * Set up and enqueue a KVIO. - * - * @param kvio The KVIO to set up - * @param work The function pointer to execute - * @param statsFunction A function pointer to record for stats, or NULL - * @param action Action code, mapping to a relative priority - * @param queue The queue on which to enqueue the KVIO - **/ -static inline void launchKVIO(KVIO *kvio, - KvdoWorkFunction work, - void *statsFunction, - unsigned int action, - KvdoWorkQueue *queue) -{ - setupKVIOWork(kvio, work, statsFunction, action); - enqueueKVIOWork(queue, kvio); -} - -/** - * Move a KVIO back to the base threads. - * - * @param kvio The KVIO to enqueue - **/ -void kvdoEnqueueVIOCallback(KVIO *kvio); - -/** - * Handles kvio-related I/O post-processing. - * - * @param kvio The kvio to finalize - * @param error Possible error - **/ -void kvdoContinueKvio(KVIO *kvio, int error); - -/** - * Initialize a KVIO. - * - * @param kvio The KVIO to initialize - * @param layer The physical layer - * @param vioType The type of VIO to create - * @param priority The relative priority to assign to the KVIO - * @param parent The parent of the KVIO completion - * @param bio The bio to associate with this KVIO - **/ -void initializeKVIO(KVIO *kvio, - KernelLayer *layer, - VIOType vioType, - VIOPriority priority, - void *parent, - BIO *bio); - -/** - * Destroy a MetadataKVIO and NULL out the pointer to it. - * - * @param metadataKVIOPtr A pointer to the MetadataKVIO to destroy - **/ -void freeMetadataKVIO(MetadataKVIO **metadataKVIOPtr); - -/** - * Destroy a CompressedWriteKVIO and NULL out the pointer to it. - * - * @param compressedWriteKVIOPtr A pointer to the CompressedWriteKVIO to - * destroy - **/ -void freeCompressedWriteKVIO(CompressedWriteKVIO **compressedWriteKVIOPtr); - -/** - * Create a new VIO (and its enclosing KVIO) for metadata operations. - * - *
Implements MetadataVIOCreator. - * - * @param [in] layer The physical layer - * @param [in] vioType The type of VIO to create - * @param [in] priority The relative priority to assign to the VIO - * @param [in] parent The parent to assign to the VIO's completion - * @param [in] data The buffer - * @param [out] vioPtr A pointer to hold new VIO - * - * @return VDO_SUCCESS or an error - **/ -int kvdoCreateMetadataVIO(PhysicalLayer *layer, - VIOType vioType, - VIOPriority priority, - void *parent, - char *data, - VIO **vioPtr) - __attribute__((warn_unused_result)); - -/** - * Create a new AllocatingVIO (and its enclosing KVIO) for compressed writes. - * - *
Implements CompressedWriteVIOCreator. - * - * @param [in] layer The physical layer - * @param [in] parent The parent to assign to the AllocatingVIO's - * completion - * @param [in] data The buffer - * @param [out] allocatingVIOPtr A pointer to hold new AllocatingVIO - * - * @return VDO_SUCCESS or an error - **/ -int kvdoCreateCompressedWriteVIO(PhysicalLayer *layer, - void *parent, - char *data, - AllocatingVIO **allocatingVIOPtr) - __attribute__((warn_unused_result)); - -/** - * Submit a compressed block write. - * - *
Implements CompressedWriter. - * - * @param allocatingVIO The AllocatingVIO for the compressed write - **/ -void kvdoWriteCompressedBlock(AllocatingVIO *allocatingVIO); - -/** - * Read or write a single metadata VIO. - * - *
Implements MetadataReader and MetadataWriter. - * - * @param vio The VIO to read or write - **/ -void kvdoSubmitMetadataVIO(VIO *vio); - -/** - * Issue an empty flush to the lower layer using the BIO in a metadata VIO. - * - *
Implements MetadataWriter. - * - * @param vio The VIO to flush - **/ -void kvdoFlushVIO(VIO *vio); - -#endif /* KVIO_H */ diff --git a/vdo/kernel/limiter.c b/vdo/kernel/limiter.c deleted file mode 100644 index 72a4bb5..0000000 --- a/vdo/kernel/limiter.c +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/limiter.c#2 $ - */ - -#include "limiter.h" - -#include - -/**********************************************************************/ -void getLimiterValuesAtomically(Limiter *limiter, - uint32_t *active, - uint32_t *maximum) -{ - spin_lock(&limiter->lock); - *active = limiter->active; - *maximum = limiter->maximum; - spin_unlock(&limiter->lock); -} - -/**********************************************************************/ -void initializeLimiter(Limiter *limiter, uint32_t limit) -{ - limiter->active = 0; - limiter->limit = limit; - limiter->maximum = 0; - init_waitqueue_head(&limiter->waiterQueue); - spin_lock_init(&limiter->lock); -} - -/**********************************************************************/ -bool limiterIsIdle(Limiter *limiter) -{ - spin_lock(&limiter->lock); - bool idle = limiter->active == 0; - spin_unlock(&limiter->lock); - return idle; -} - -/**********************************************************************/ -void limiterReleaseMany(Limiter *limiter, uint32_t count) -{ - spin_lock(&limiter->lock); - limiter->active -= count; - spin_unlock(&limiter->lock); - if (waitqueue_active(&limiter->waiterQueue)) { - wake_up_nr(&limiter->waiterQueue, count); - } -} - -/**********************************************************************/ -void limiterWaitForIdle(Limiter *limiter) -{ - spin_lock(&limiter->lock); - while (limiter->active > 0) { - DEFINE_WAIT(wait); - prepare_to_wait_exclusive(&limiter->waiterQueue, &wait, - TASK_UNINTERRUPTIBLE); - spin_unlock(&limiter->lock); - io_schedule(); - spin_lock(&limiter->lock); - finish_wait(&limiter->waiterQueue, &wait); - }; - spin_unlock(&limiter->lock); -} - -/** - * Take one permit from the limiter, if one is available, and update - * the maximum active count if appropriate. - * - * The limiter's lock must already be locked. 
- * - * @param limiter The limiter to update - * - * @return true iff the permit was acquired - **/ -static bool takePermitLocked(Limiter *limiter) -{ - if (limiter->active >= limiter->limit) { - return false; - } - limiter->active += 1; - if (limiter->active > limiter->maximum) { - limiter->maximum = limiter->active; - } - return true; -} - -/**********************************************************************/ -void limiterWaitForOneFree(Limiter *limiter) -{ - spin_lock(&limiter->lock); - while (!takePermitLocked(limiter)) { - DEFINE_WAIT(wait); - prepare_to_wait_exclusive(&limiter->waiterQueue, &wait, - TASK_UNINTERRUPTIBLE); - spin_unlock(&limiter->lock); - io_schedule(); - spin_lock(&limiter->lock); - finish_wait(&limiter->waiterQueue, &wait); - }; - spin_unlock(&limiter->lock); -} - -/**********************************************************************/ -bool limiterPoll(Limiter *limiter) -{ - spin_lock(&limiter->lock); - bool acquired = takePermitLocked(limiter); - spin_unlock(&limiter->lock); - return acquired; -} diff --git a/vdo/kernel/limiter.h b/vdo/kernel/limiter.h deleted file mode 100644 index a9ee8fc..0000000 --- a/vdo/kernel/limiter.h +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/limiter.h#2 $ - */ - -#ifndef LIMITER_H -#define LIMITER_H - -#include - -/* - * A Limiter is a fancy counter used to limit resource usage. We have a - * limit to number of resources that we are willing to use, and a Limiter - * holds us to that limit. 
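 *
 * A minimal usage sketch (the limit value is illustrative; only calls
 * declared in this header appear):
 *
 *   Limiter limiter;
 *   initializeLimiter(&limiter, 2000);   // allow at most 2000 active users
 *   limiterWaitForOneFree(&limiter);     // take a permit, sleeping if needed
 *   // ... use the resource ...
 *   limiterRelease(&limiter);            // give the permit back
 *   limiterWaitForIdle(&limiter);        // e.g. during an orderly shutdown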
- */ - -typedef struct limiter { - // A spinlock controlling access to the contents of this struct - spinlock_t lock; - // The queue of threads waiting for a resource to become available - wait_queue_head_t waiterQueue; - // The number of resources in use - uint32_t active; - // The maximum number number of resources that have ever been in use - uint32_t maximum; - // The limit to the number of resources that are allowed to be used - uint32_t limit; -} Limiter; - -/** - * Get the Limiter variable values (atomically under the lock) - * - * @param limiter The limiter - * @param active The number of requests in progress - * @param maximum The maximum number of requests that have ever been active - **/ -void getLimiterValuesAtomically(Limiter *limiter, - uint32_t *active, - uint32_t *maximum); - -/** - * Initialize a Limiter - * - * @param limiter The limiter - * @param limit The limit to the number of active resources - **/ -void initializeLimiter(Limiter *limiter, uint32_t limit); - -/** - * Determine whether there are any active resources - * - * @param limiter The limiter - * - * @return true if there are no active resources - **/ -bool limiterIsIdle(Limiter *limiter); - -/** - * Release resources, making them available for other uses - * - * @param limiter The limiter - * @param count The number of resources to release - **/ -void limiterReleaseMany(Limiter *limiter, uint32_t count); - -/** - * Release one resource, making it available for another use - * - * @param limiter The limiter - **/ -static inline void limiterRelease(Limiter *limiter) -{ - limiterReleaseMany(limiter, 1); -} - -/** - * Wait until there are no active resources - * - * @param limiter The limiter - **/ -void limiterWaitForIdle(Limiter *limiter); - -/** - * Prepare to start using one resource, waiting if there are too many resources - * already in use. After returning from this routine, the caller may use the - * resource, and must call limiterRelease after freeing the resource. - * - * @param limiter The limiter - **/ -void limiterWaitForOneFree(Limiter *limiter); - -/** - * Attempt to reserve one resource, without waiting. After returning from this - * routine, if allocation was successful, the caller may use the resource, and - * must call limiterRelease after freeing the resource. - * - * @param limiter The limiter - * - * @return true iff the resource was allocated - **/ -bool limiterPoll(Limiter *limiter); - -#endif /* LIMITER_H */ diff --git a/vdo/kernel/logger.c b/vdo/kernel/logger.c deleted file mode 100644 index d18f5ea..0000000 --- a/vdo/kernel/logger.c +++ /dev/null @@ -1,520 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/logger.c#4 $ - */ - -#include "logger.h" - -#include -#include -#include - -#include "errors.h" -#include "threadDevice.h" - -static const int DEFAULT_PRIORITY = LOG_INFO; - -typedef struct { - const char *name; - const int priority; -} PRIORITY_NAMES; - -static const PRIORITY_NAMES PRIORITIES[] = { - { "ALERT", LOG_ALERT }, - { "CRIT", LOG_CRIT }, - { "CRITICAL", LOG_CRIT }, - { "DEBUG", LOG_DEBUG }, - { "EMERG", LOG_EMERG }, - { "EMERGENCY", LOG_EMERG }, - { "ERR", LOG_ERR }, - { "ERROR", LOG_ERR }, - { "INFO", LOG_INFO }, - { "NOTICE", LOG_NOTICE }, - { "PANIC", LOG_EMERG }, - { "WARN", LOG_WARNING }, - { "WARNING", LOG_WARNING }, - { NULL, -1 }, -}; - -enum { - PRIORITY_COUNT = 8 -}; - -static const char *PRIORITY_STRINGS[] = { - "EMERGENCY", - "ALERT", - "CRITICAL", - "ERROR", - "WARN", - "NOTICE", - "INFO", - "DEBUG", -}; - -static int logLevel = LOG_INFO; - -/**********************************************************************/ -int stringToPriority(const char *string) -{ - for (int i = 0; PRIORITIES[i].name != NULL; i++) { - if (strcasecmp(string, PRIORITIES[i].name) == 0) { - return PRIORITIES[i].priority; - } - } - return DEFAULT_PRIORITY; -} - -/**********************************************************************/ -int getLogLevel(void) -{ - return logLevel; -} - -/**********************************************************************/ -void setLogLevel(int newLogLevel) -{ - logLevel = newLogLevel; -} - -/**********************************************************************/ -const char *priorityToString(int priority) -{ - if ((priority < 0) || (priority >= PRIORITY_COUNT)) { - return "unknown"; - } - return PRIORITY_STRINGS[priority]; -} - -/**********************************************************************/ -static const char *priorityToLogLevel(int priority) -{ - switch (priority) { - case LOG_EMERG: - case LOG_ALERT: - case LOG_CRIT: - return KERN_CRIT; - case LOG_ERR: - return KERN_ERR; - case LOG_WARNING: - return KERN_WARNING; - case LOG_NOTICE: - return KERN_NOTICE; - case LOG_INFO: - return KERN_INFO; - case LOG_DEBUG: - return KERN_DEBUG; - default: - return ""; - } -} - -/**********************************************************************/ -static const char *getCurrentInterruptType(void) -{ - if (in_nmi()) { - return "NMI"; - } - if (in_irq()) { - return "HI"; - } - if (in_softirq()) { - return "SI"; - } - return "INTR"; -} - -/** - * Emit a log message to the kernel log in a format suited to the current - * thread context. Context info formats: - * - * interrupt: kvdo[NMI]: blah - * thread w/dev id: kvdo12:myprog: blah - * kvdo thread: kvdo12:foobarQ: blah - * other thread: kvdo: myprog: blah - * - * Fields: module name, interrupt level, process name, device ID. - * - * @param level A string describing the logging level - * @param moduleName The name of the module doing the logging - * @param prefix The prefix of the log message - * @param vaf1 The first message format descriptor - * @param vaf2 The second message format descriptor - **/ -static void emitLogMessage(const char *level, - const char *moduleName, - const char *prefix, - const struct va_format *vaf1, - const struct va_format *vaf2) -{ - if (in_interrupt()) { - printk("%s%s[%s]: %s%pV%pV\n", - level, moduleName, getCurrentInterruptType(), - prefix, vaf1, vaf2); - return; - } - - // Not at interrupt level; we have a process we can look at, and - // might have a device ID. 
- int deviceInstance = getThreadDeviceID(); - if (deviceInstance != -1) { - printk("%s%s%u:%s: %s%pV%pV\n", - level, moduleName, deviceInstance, current->comm, - prefix, vaf1, vaf2); - return; - } - - if (((current->flags & PF_KTHREAD) != 0) - && (strncmp(moduleName, current->comm, strlen(moduleName)) == 0)) { - /* - * It's a kernel thread starting with "kvdo" (or whatever). Assume it's - * ours and that its name is sufficient. - */ - printk("%s%s: %s%pV%pV\n", - level, current->comm, - prefix, vaf1, vaf2); - return; - } - - // Identify the module and the process. - printk("%s%s: %s: %s%pV%pV\n", - level, moduleName, current->comm, - prefix, vaf1, vaf2); -} - -/**********************************************************************/ -void logMessagePack(int priority, - const char *prefix, - const char *fmt1, - va_list args1, - const char *fmt2, - va_list args2) -{ - if (priority > getLogLevel()) { - return; - } - - /* - * The kernel's printk has some magic for indirection to a secondary - * va_list. It wants us to supply a pointer to the va_list. - * - * However, va_list varies across platforms and can be an array - * type, which makes passing it around as an argument kind of - * tricky, due to the automatic conversion to a pointer. This makes - * taking the address of the argument a dicey thing; if we use "&a" - * it works fine for non-array types, but for array types we get the - * address of a pointer. Functions like va_copy and sprintf don't - * care as they get "va_list" values passed and are written to do - * the right thing, but printk explicitly wants the address of the - * va_list. - * - * So, we copy the va_list values to ensure that "&" consistently - * works the way we want. - */ - va_list args1Copy; - va_copy(args1Copy, args1); - va_list args2Copy; - va_copy(args2Copy, args2); - struct va_format vaf1 = { - .fmt = (fmt1 != NULL) ? fmt1 : "", - .va = &args1Copy, - }; - struct va_format vaf2 = { - .fmt = (fmt2 != NULL) ? fmt2 : "", - .va = &args2Copy, - }; - - if (prefix == NULL) { - prefix = ""; - } - - emitLogMessage(priorityToLogLevel(priority), THIS_MODULE->name, - prefix, &vaf1, &vaf2); - - va_end(args1Copy); - va_end(args2Copy); -} - -/**********************************************************************/ -void logEmbeddedMessage(int priority, - const char *prefix, - const char *fmt1, - va_list args1, - const char *fmt2, - ...) -{ - va_list ap; - va_start(ap, fmt2); - logMessagePack(priority, prefix, fmt1, args1, fmt2, ap); - va_end(ap); -} - -#pragma GCC diagnostic push -/* - * GCC (version 8.1.1 20180502 (Red Hat 8.1.1-1)) on Fedora 28 seems - * to think that this function should get a printf format - * attribute. But we have no second format string, and no additional - * arguments at the call site, and GCC also gets unhappy trying to - * analyze the format and values when there are none. So we'll just - * shut it up. - */ -#pragma GCC diagnostic ignored "-Wsuggest-attribute=format" -/** - * Log a message. - * - * This helper function exists solely to create a valid va_list with - * no useful info. It does the real work of vLogMessage, which wants a - * second va_list object to pass down. - * - * @param priority The syslog priority value for the message. - * @param format The format of the message (a printf style format) - * @param args The variadic argument list of format parameters. - **/ -static void vLogMessageHelper(int priority, - const char *format, - va_list args, - ...) 
-{ - va_list dummy; - va_start(dummy, args); - logMessagePack(priority, NULL, format, args, NULL, dummy); - va_end(dummy); -} -#pragma GCC diagnostic pop - -/*****************************************************************************/ -void vLogMessage(int priority, const char *format, va_list args) -{ - vLogMessageHelper(priority, format, args); -} - -/**********************************************************************/ -void logMessage(int priority, const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogMessage(priority, format, args); - va_end(args); -} - -/**********************************************************************/ -__attribute__((format(printf, 2, 3))) -static void logAtLevel(int priority, const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogMessage(priority, format, args); - va_end(args); -} - -/**********************************************************************/ -void logDebug(const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogMessage(LOG_DEBUG, format, args); - va_end(args); -} - -/**********************************************************************/ -void logInfo(const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogMessage(LOG_INFO, format, args); - va_end(args); -} - -/**********************************************************************/ -void logNotice(const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogMessage(LOG_NOTICE, format, args); - va_end(args); -} - -/**********************************************************************/ -void logWarning(const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogMessage(LOG_WARNING, format, args); - va_end(args); -} - -/**********************************************************************/ -void logError(const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogMessage(LOG_ERR, format, args); - va_end(args); -} - -/**********************************************************************/ -void vLogError(const char *format, va_list args) -{ - vLogMessage(LOG_ERR, format, args); -} - -/**********************************************************************/ -void logBacktrace(int priority) -{ - logAtLevel(priority, "[backtrace]"); - if (priority > logLevel) { - return; - } - dump_stack(); -} - -/**********************************************************************/ -int vLogWithStringError(int priority, - int errnum, - const char *format, - va_list args) -{ - char errbuf[ERRBUF_SIZE] = ""; - logEmbeddedMessage(priority, NULL, format, args, ": %s (%d)", - stringError(errnum, errbuf, sizeof(errbuf)), - errnum); - return errnum; -} - -/**********************************************************************/ -int logWithStringError(int priority, int errnum, const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogWithStringError(priority, errnum, format, args); - va_end(args); - return errnum; -} - -/**********************************************************************/ -int logErrorWithStringError(int errnum, const char *format, ...) 
-{ - va_list args; - - va_start(args, format); - vLogWithStringError(LOG_ERR, errnum, format, args); - va_end(args); - return errnum; -} - -/**********************************************************************/ -int vLogErrorWithStringError(int errnum, const char *format, va_list args) -{ - vLogWithStringError(LOG_ERR, errnum, format, args); - return errnum; -} - -/**********************************************************************/ -int logWarningWithStringError(int errnum, const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogWithStringError(LOG_WARNING, errnum, format, args); - va_end(args); - return errnum; -} - -/**********************************************************************/ -int logDebugWithStringError(int errnum, const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogWithStringError(LOG_DEBUG, errnum, format, args); - va_end(args); - return errnum; -} - -/**********************************************************************/ -int logInfoWithStringError(int errnum, const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogWithStringError(LOG_INFO, errnum, format, args); - va_end(args); - return errnum; -} - -/**********************************************************************/ -int logNoticeWithStringError(int errnum, const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogWithStringError(LOG_NOTICE, errnum, format, args); - va_end(args); - return errnum; -} - -/**********************************************************************/ -int logFatalWithStringError(int errnum, const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogWithStringError(LOG_CRIT, errnum, format, args); - va_end(args); - return errnum; -} - -/**********************************************************************/ -int logUnrecoverable(int errnum, const char *format, ...) -{ - if ((errnum == UDS_SUCCESS || errnum == UDS_QUEUED) || (errnum == 0)) { - return errnum; - } - - va_list args; - va_start(args, format); - vLogWithStringError(LOG_CRIT, errnum, format, args); - va_end(args); - return makeUnrecoverable(errnum); -} - -/**********************************************************************/ -void logFatal(const char *format, ...) -{ - va_list args; - - va_start(args, format); - vLogMessage(LOG_CRIT, format, args); - va_end(args); -} - -/**********************************************************************/ -void pauseForLogger(void) -{ - // Hopefully, a few milliseconds of sleep will be large enough - // for the kernel log buffer to be flushed. - msleep(4); -} diff --git a/vdo/kernel/logger.h b/vdo/kernel/logger.h deleted file mode 100644 index 6e8088e..0000000 --- a/vdo/kernel/logger.h +++ /dev/null @@ -1,289 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/logger.h#2 $ - */ - -#ifndef LOGGER_H -#define LOGGER_H 1 - -#include -#include -#include -#include - -#define LOG_EMERG 0 /* system is unusable */ -#define LOG_ALERT 1 /* action must be taken immediately */ -#define LOG_CRIT 2 /* critical conditions */ -#define LOG_ERR 3 /* error conditions */ -#define LOG_WARNING 4 /* warning conditions */ -#define LOG_NOTICE 5 /* normal but significant condition */ -#define LOG_INFO 6 /* informational */ -#define LOG_DEBUG 7 /* debug-level messages */ - -// Make it easy to log real pointer values using %px when in development. -#define PRIptr "pK" - -/** - * @file - * - * The functions in this file are not thread safe in the sense that nothing - * prevents multiple threads from opening or closing loggers out from under - * other threads. In reality this isn't a problem since the only calls in - * production code to openLogger() and closeLogger() are made in uds.c while - * uds mutex is held, and uds does not make any logging calls before it calls - * openLogger or after it calls closeLogger(). - * - * All of the log() functions will preserve the callers value of errno. - **/ - -/** - * Get the current logging level. - * - * @return the current logging priority level. - **/ -int getLogLevel(void); - -/** - * Set the current logging level. - * - * @param newLogLevel the new value for the logging priority level. - **/ -void setLogLevel(int newLogLevel); - -/** - * Return the integer logging priority represented by a name. - * - * @param string the name of the logging priority (case insensitive). - * - * @return the integer priority named by string, or DEFAULT_PRIORITY - * if not recognized. - **/ -int stringToPriority(const char *string); - -/** - * Return the printable name of a logging priority. - * - * @return the priority name - **/ -const char *priorityToString(int priority); - -/** - * Log a debug message. - * - * @param format The format of the message (a printf style format) - **/ -void logDebug(const char *format, ...) __attribute__((format(printf, 1, 2))); - -/** - * Log an informational message. - * - * @param format The format of the message (a printf style format) - **/ -void logInfo(const char *format, ...) __attribute__((format(printf, 1, 2))); - -/** - * Log a normal (but notable) condition. - * - * @param format The format of the message (a printf style format) - **/ -void logNotice(const char *format, ...) __attribute__((format(printf, 1, 2))); - -/** - * Log a warning. - * - * @param format The format of the message (a printf style format) - **/ -void logWarning(const char *format, ...) __attribute__((format(printf, 1, 2))); - -/** - * Log an error. - * - * @param format The format of the message (a printf style format) - **/ -void logError(const char *format, ...) __attribute__((format(printf, 1, 2))); - -/** - * Log an error. - * - * @param format The format of the message (a printf style format) - * @param args args for format. - **/ - -void vLogError(const char *format, va_list args) - __attribute__((format(printf, 1, 0))); - -/** - * Log a message embedded within another message. 
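 *
 * For example, vLogWithStringError() in logger.c uses the second format to
 * append an error-string suffix to the caller's message:
 *
 *   logEmbeddedMessage(priority, NULL, format, args, ": %s (%d)",
 *                      stringError(errnum, errbuf, sizeof(errbuf)), errnum);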
- * - * @param priority the priority at which to log the message - * @param prefix optional string prefix to message, may be NULL - * @param fmt1 format of message first part, may be NULL - * @param args1 arguments for message first part - * @param fmt2 format of message second part - **/ -void logEmbeddedMessage(int priority, - const char *prefix, - const char *fmt1, - va_list args1, - const char *fmt2, - ...) - __attribute__((format(printf, 3, 0), format(printf, 5, 6))); - -/** - * Log a message pack consisting of multiple variable sections. - * - * @param priority the priority at which to log the message - * @param prefix optional string prefix to message, may be NULL - * @param fmt1 format of message first part, may be NULL - * @param args1 arguments for message first part - * @param fmt2 format of message second part, may be NULL - * @param args2 arguments for message second part - **/ -void logMessagePack(int priority, - const char *prefix, - const char *fmt1, - va_list args1, - const char *fmt2, - va_list args2) - __attribute__((format(printf, 3, 0))); - -/** - * Log a stack backtrace. - * - * @param priority The priority at which to log the backtrace - **/ -void logBacktrace(int priority); - -/** - * Log a message with an error from an error code. - * - * @param priority The priority of the logging entry - * @param errnum Int value of errno or a UDS_* value. - * @param format The format of the message (a printf style format) - * - * @return errnum - **/ -int logWithStringError(int priority, int errnum, const char *format, ...) - __attribute__((format(printf, 3, 4))); - -/** - * Log a message with an error from an error code. - * - * @param priority The priority of the logging entry - * @param errnum Int value of errno or a UDS_* value. - * @param format The format of the message (a printf style format) - * @param args The list of arguments with format. - * - * @return errnum - **/ -int vLogWithStringError(int priority, - int errnum, - const char *format, - va_list args) - __attribute__((format(printf, 3, 0))); - -/** - * Log an error prefixed with the string associated with the errnum. - * - * @param errnum Int value of errno or a UDS_* value. - * @param format The format of the message (a printf style format) - * - * @return errnum - **/ -int logErrorWithStringError(int errnum, const char *format, ...) - __attribute__((format(printf, 2, 3))); - -/**********************************************************************/ -int logDebugWithStringError(int errnum, const char *format, ...) - __attribute__((format(printf, 2, 3))); - -/**********************************************************************/ -int logInfoWithStringError(int errnum, const char *format, ...) - __attribute__((format(printf, 2, 3))); - -/**********************************************************************/ -int logNoticeWithStringError(int errnum, const char *format, ...) - __attribute__((format(printf, 2, 3))); - -/**********************************************************************/ -int logWarningWithStringError(int errnum, const char *format, ...) - __attribute__((format(printf, 2, 3))); - -/**********************************************************************/ -int logFatalWithStringError(int errnum, const char *format, ...) - __attribute__((format(printf, 2, 3))); - -/** - * Log an error prefixed with the string associated with the errnum. - * - * @param errnum Int value of errno or a UDS_* value. 
- * @param format The format of the message (a printf style format) - * @param args a va_list of args for the format. - * @return errnum - **/ -int vLogErrorWithStringError(int errnum, const char *format, va_list args) - __attribute__((format(printf, 2, 0))); - -/** - * Log an ERROR level message and return makeUnrecoverable(errnum) - * UDS_SUCCESS is ignored and returned. - * - * @param errnum Int value of errno or a UDS_* value. - * @param format The format of the message (a printf style format) - * @return makeUnrecoverable(errnum) or UDS_SUCCESS. - **/ -int logUnrecoverable(int errnum, const char *format, ...) - __attribute__((format(printf, 2, 3))); - -/** - * Log a fatal error. - * - * @param format The format of the message (a printf style format) - **/ -void logFatal(const char *format, ...) __attribute__((format(printf, 1, 2))); - -/** - * Log a message -- for internal use only. - * - * @param priority The syslog priority value for the message. - * @param format The format of the message (a printf style format) - * @param args The variadic argument list of format parameters. - **/ -void vLogMessage(int priority, const char *format, va_list args) - __attribute__((format(printf, 2, 0))); - -/** - * Log a message. - * - * @param priority The syslog priority value for the message. - * @param format The format of the message (a printf style format) - **/ -void logMessage(int priority, const char *format, ...) - __attribute__((format(printf, 2, 3))); - -/** - * Sleep or delay a short time (likely a few milliseconds) in an attempt allow - * the log buffers to be written out in case they might be overrun. This is - * unnecessary in user-space (and is a no-op there), but is needed when - * quickly issuing a lot of log output in the Linux kernel, as when dumping a - * large number of data structures. - **/ -void pauseForLogger(void); - -#endif /* LOGGER_H */ diff --git a/vdo/kernel/memoryUsage.c b/vdo/kernel/memoryUsage.c deleted file mode 100644 index 86521a4..0000000 --- a/vdo/kernel/memoryUsage.c +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/memoryUsage.c#3 $ - */ - -#include "memoryUsage.h" - -#include "memoryAlloc.h" - -#include "kernelStatistics.h" - -/**********************************************************************/ -MemoryUsage getMemoryUsage() -{ - MemoryUsage memoryUsage; - getMemoryStats(&memoryUsage.bytesUsed, &memoryUsage.peakBytesUsed); - return memoryUsage; -} - diff --git a/vdo/kernel/memoryUsage.h b/vdo/kernel/memoryUsage.h deleted file mode 100644 index 336ab0a..0000000 --- a/vdo/kernel/memoryUsage.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/memoryUsage.h#1 $ - */ - -#ifndef MEMORY_USAGE_H -#define MEMORY_USAGE_H 1 - -#include "memoryAlloc.h" - -#include "kernelStatistics.h" - -/** - * Get the memory usage for statistics reporting. - * - * @return The memory usage - **/ -MemoryUsage getMemoryUsage(void) - __attribute__((warn_unused_result)); - -#endif /* MEMORY_USAGE_H */ diff --git a/vdo/kernel/poolSysfs.c b/vdo/kernel/poolSysfs.c deleted file mode 100644 index 7f37480..0000000 --- a/vdo/kernel/poolSysfs.c +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/poolSysfs.c#1 $ - */ - -#include "poolSysfs.h" - -#include "memoryAlloc.h" - -#include "vdo.h" - -#include "dedupeIndex.h" - -typedef struct poolAttribute { - struct attribute attr; - ssize_t (*show)(KernelLayer *layer, char *buf); - ssize_t (*store)(KernelLayer *layer, const char *value, size_t count); -} PoolAttribute; - -/**********************************************************************/ -static ssize_t vdoPoolAttrShow(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - PoolAttribute *poolAttr = container_of(attr, PoolAttribute, attr); - if (poolAttr->show == NULL) { - return -EINVAL; - } - KernelLayer *layer = container_of(kobj, KernelLayer, kobj); - return poolAttr->show(layer, buf); -} - -/**********************************************************************/ -static ssize_t vdoPoolAttrStore(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t length) -{ - PoolAttribute *poolAttr = container_of(attr, PoolAttribute, attr); - if (poolAttr->store == NULL) { - return -EINVAL; - } - KernelLayer *layer = container_of(kobj, KernelLayer, kobj); - return poolAttr->store(layer, buf, length); -} - -static struct sysfs_ops vdoPoolSysfsOps = { - .show = vdoPoolAttrShow, - .store = vdoPoolAttrStore, -}; - -/**********************************************************************/ -static ssize_t poolCompressingShow(KernelLayer *layer, char *buf) -{ - return sprintf(buf, "%s\n", (getKVDOCompressing(&layer->kvdo) ? "1" : "0")); -} - -/**********************************************************************/ -static ssize_t poolDiscardsActiveShow(KernelLayer *layer, char *buf) -{ - return sprintf(buf, "%" PRIu32 "\n", layer->discardLimiter.active); -} - -/**********************************************************************/ -static ssize_t poolDiscardsLimitShow(KernelLayer *layer, char *buf) -{ - return sprintf(buf, "%" PRIu32 "\n", layer->discardLimiter.limit); -} - -/**********************************************************************/ -static ssize_t poolDiscardsLimitStore(KernelLayer *layer, - const char *buf, - size_t length) -{ - unsigned int value; - if ((length > 12) || (sscanf(buf, "%u", &value) != 1) || (value < 1)) { - return -EINVAL; - } - layer->discardLimiter.limit = value; - return length; -} - -/**********************************************************************/ -static ssize_t poolDiscardsMaximumShow(KernelLayer *layer, char *buf) -{ - return sprintf(buf, "%" PRIu32 "\n", layer->discardLimiter.maximum); -} - -/**********************************************************************/ -static ssize_t poolInstanceShow(KernelLayer *layer, char *buf) -{ - return sprintf(buf, "%u\n", layer->instance); -} - -/**********************************************************************/ -static ssize_t poolRequestsActiveShow(KernelLayer *layer, char *buf) -{ - return sprintf(buf, "%" PRIu32 "\n", layer->requestLimiter.active); -} - -/**********************************************************************/ -static ssize_t poolRequestsLimitShow(KernelLayer *layer, char *buf) -{ - return sprintf(buf, "%" PRIu32 "\n", layer->requestLimiter.limit); -} - -/**********************************************************************/ -static ssize_t poolRequestsMaximumShow(KernelLayer *layer, char *buf) -{ - return sprintf(buf, "%" PRIu32 "\n", layer->requestLimiter.maximum); -} - -/**********************************************************************/ -static void vdoPoolRelease(struct kobject *kobj) -{ - 
KernelLayer *layer = container_of(kobj, KernelLayer, kobj); - freeVDO(&layer->kvdo.vdo); - FREE(layer); -} - -static PoolAttribute vdoPoolCompressingAttr = { - .attr = { .name = "compressing", .mode = 0444, }, - .show = poolCompressingShow, -}; - -static PoolAttribute vdoPoolDiscardsActiveAttr = { - .attr = { .name = "discards_active", .mode = 0444, }, - .show = poolDiscardsActiveShow, -}; - -static PoolAttribute vdoPoolDiscardsLimitAttr = { - .attr = { .name = "discards_limit", .mode = 0644, }, - .show = poolDiscardsLimitShow, - .store = poolDiscardsLimitStore, -}; - -static PoolAttribute vdoPoolDiscardsMaximumAttr = { - .attr = { .name = "discards_maximum", .mode = 0444, }, - .show = poolDiscardsMaximumShow, -}; - -static PoolAttribute vdoPoolInstanceAttr = { - .attr = { .name = "instance", .mode = 0444, }, - .show = poolInstanceShow, -}; - -static PoolAttribute vdoPoolRequestsActiveAttr = { - .attr = { .name = "requests_active", .mode = 0444, }, - .show = poolRequestsActiveShow, -}; - -static PoolAttribute vdoPoolRequestsLimitAttr = { - .attr = { .name = "requests_limit", .mode = 0444, }, - .show = poolRequestsLimitShow, -}; - -static PoolAttribute vdoPoolRequestsMaximumAttr = { - .attr = { .name = "requests_maximum", .mode = 0444, }, - .show = poolRequestsMaximumShow, -}; - -static struct attribute *poolAttrs[] = { - &vdoPoolCompressingAttr.attr, - &vdoPoolDiscardsActiveAttr.attr, - &vdoPoolDiscardsLimitAttr.attr, - &vdoPoolDiscardsMaximumAttr.attr, - &vdoPoolInstanceAttr.attr, - &vdoPoolRequestsActiveAttr.attr, - &vdoPoolRequestsLimitAttr.attr, - &vdoPoolRequestsMaximumAttr.attr, - NULL, -}; - -struct kobj_type kernelLayerKobjType = { - .release = vdoPoolRelease, - .sysfs_ops = &vdoPoolSysfsOps, - .default_attrs = poolAttrs, -}; - -/**********************************************************************/ -static void workQueueDirectoryRelease(struct kobject *kobj) -{ - /* - * The workQueueDirectory holds an implicit reference to its parent, - * the kernelLayer object (->kobj), so even if there are some - * external references held to the workQueueDirectory when work - * queue shutdown calls kobject_put on the kernelLayer object, the - * kernelLayer object won't actually be released and won't free the - * KernelLayer storage until the workQueueDirectory object is - * released first. - * - * So, we don't need to do any additional explicit management here. - * - * (But we aren't allowed to use a NULL function pointer to indicate - * a no-op.) - */ -} - -/**********************************************************************/ -static struct attribute *noAttrs[] = { - NULL, -}; - -static struct sysfs_ops noSysfsOps = { - // These should never be reachable since there are no attributes. - .show = NULL, - .store = NULL, -}; - -struct kobj_type workQueueDirectoryKobjType = { - .release = workQueueDirectoryRelease, - .sysfs_ops = &noSysfsOps, - .default_attrs = noAttrs, -}; diff --git a/vdo/kernel/poolSysfs.h b/vdo/kernel/poolSysfs.h deleted file mode 100644 index 85fe11c..0000000 --- a/vdo/kernel/poolSysfs.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/poolSysfs.h#1 $ - */ - -#ifndef POOL_SYSFS_H -#define POOL_SYSFS_H - -#include - -// The kobj_type used for setting up the kernel layer kobject. -extern struct kobj_type kernelLayerKobjType; -// The kobj_type used for the "work_queues" subdirectory. -extern struct kobj_type workQueueDirectoryKobjType; - -// The sysfs_ops used for the "statistics" subdirectory. -extern struct sysfs_ops poolStatsSysfsOps; -// The attribute used for the "statistics" subdirectory. -extern struct attribute *poolStatsAttrs[]; - -#endif /* POOL_SYSFS_H */ diff --git a/vdo/kernel/poolSysfsStats.c b/vdo/kernel/poolSysfsStats.c deleted file mode 100644 index daa0cf0..0000000 --- a/vdo/kernel/poolSysfsStats.c +++ /dev/null @@ -1,2628 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- */ - -#include "dedupeIndex.h" -#include "logger.h" -#include "poolSysfs.h" -#include "statistics.h" -#include "statusProcfs.h" -#include "threadDevice.h" -#include "vdo.h" - -typedef struct poolStatsAttribute { - struct attribute attr; - ssize_t (*show)(KernelLayer *layer, char *buf); -} PoolStatsAttribute; - -static ssize_t poolStatsAttrShow(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - PoolStatsAttribute *poolStatsAttr = container_of(attr, PoolStatsAttribute, - attr); - - if (poolStatsAttr->show == NULL) { - return -EINVAL; - } - KernelLayer *layer = container_of(kobj, KernelLayer, statsDirectory); - return poolStatsAttr->show(layer, buf); -} - -struct sysfs_ops poolStatsSysfsOps = { - .show = poolStatsAttrShow, - .store = NULL, -}; - -/**********************************************************************/ -/** Number of blocks used for data */ -static ssize_t poolStatsDataBlocksUsedShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.dataBlocksUsed); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsDataBlocksUsedAttr = { - .attr = { .name = "data_blocks_used", .mode = 0444, }, - .show = poolStatsDataBlocksUsedShow, -}; - -/**********************************************************************/ -/** Number of blocks used for VDO metadata */ -static ssize_t poolStatsOverheadBlocksUsedShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.overheadBlocksUsed); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsOverheadBlocksUsedAttr = { - .attr = { .name = "overhead_blocks_used", .mode = 0444, }, - .show = poolStatsOverheadBlocksUsedShow, -}; - -/**********************************************************************/ -/** Number of logical blocks that are currently mapped to physical blocks */ -static ssize_t poolStatsLogicalBlocksUsedShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.logicalBlocksUsed); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsLogicalBlocksUsedAttr = { - .attr = { .name = "logical_blocks_used", .mode = 0444, }, - .show = poolStatsLogicalBlocksUsedShow, -}; - -/**********************************************************************/ -/** number of physical blocks */ -static ssize_t poolStatsPhysicalBlocksShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.physicalBlocks); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsPhysicalBlocksAttr = { - .attr = { .name = "physical_blocks", .mode = 0444, }, - .show = poolStatsPhysicalBlocksShow, -}; - -/**********************************************************************/ -/** number of logical blocks */ -static ssize_t poolStatsLogicalBlocksShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, 
"%llu\n", layer->vdoStatsStorage.logicalBlocks); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsLogicalBlocksAttr = { - .attr = { .name = "logical_blocks", .mode = 0444, }, - .show = poolStatsLogicalBlocksShow, -}; - -/**********************************************************************/ -/** Size of the block map page cache, in bytes */ -static ssize_t poolStatsBlockMapCacheSizeShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMapCacheSize); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBlockMapCacheSizeAttr = { - .attr = { .name = "block_map_cache_size", .mode = 0444, }, - .show = poolStatsBlockMapCacheSizeShow, -}; - -/**********************************************************************/ -/** String describing the active write policy of the VDO */ -static ssize_t poolStatsWritePolicyShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%s\n", layer->vdoStatsStorage.writePolicy); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsWritePolicyAttr = { - .attr = { .name = "write_policy", .mode = 0444, }, - .show = poolStatsWritePolicyShow, -}; - -/**********************************************************************/ -/** The physical block size */ -static ssize_t poolStatsBlockSizeShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockSize); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBlockSizeAttr = { - .attr = { .name = "block_size", .mode = 0444, }, - .show = poolStatsBlockSizeShow, -}; - -/**********************************************************************/ -/** Number of times the VDO has successfully recovered */ -static ssize_t poolStatsCompleteRecoveriesShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.completeRecoveries); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsCompleteRecoveriesAttr = { - .attr = { .name = "complete_recoveries", .mode = 0444, }, - .show = poolStatsCompleteRecoveriesShow, -}; - -/**********************************************************************/ -/** Number of times the VDO has recovered from read-only mode */ -static ssize_t poolStatsReadOnlyRecoveriesShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.readOnlyRecoveries); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsReadOnlyRecoveriesAttr = { - .attr = { .name = "read_only_recoveries", .mode = 0444, }, - .show = poolStatsReadOnlyRecoveriesShow, -}; - -/**********************************************************************/ -/** String describing the operating mode of the VDO */ -static ssize_t poolStatsModeShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - 
mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%s\n", layer->vdoStatsStorage.mode); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsModeAttr = { - .attr = { .name = "mode", .mode = 0444, }, - .show = poolStatsModeShow, -}; - -/**********************************************************************/ -/** Whether the VDO is in recovery mode */ -static ssize_t poolStatsInRecoveryModeShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%d\n", layer->vdoStatsStorage.inRecoveryMode); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsInRecoveryModeAttr = { - .attr = { .name = "in_recovery_mode", .mode = 0444, }, - .show = poolStatsInRecoveryModeShow, -}; - -/**********************************************************************/ -/** What percentage of recovery mode work has been completed */ -static ssize_t poolStatsRecoveryPercentageShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%u\n", layer->vdoStatsStorage.recoveryPercentage); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsRecoveryPercentageAttr = { - .attr = { .name = "recovery_percentage", .mode = 0444, }, - .show = poolStatsRecoveryPercentageShow, -}; - -/**********************************************************************/ -/** Number of compressed data items written since startup */ -static ssize_t poolStatsPackerCompressedFragmentsWrittenShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.packer.compressedFragmentsWritten); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsPackerCompressedFragmentsWrittenAttr = { - .attr = { .name = "packer_compressed_fragments_written", .mode = 0444, }, - .show = poolStatsPackerCompressedFragmentsWrittenShow, -}; - -/**********************************************************************/ -/** Number of blocks containing compressed items written since startup */ -static ssize_t poolStatsPackerCompressedBlocksWrittenShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.packer.compressedBlocksWritten); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsPackerCompressedBlocksWrittenAttr = { - .attr = { .name = "packer_compressed_blocks_written", .mode = 0444, }, - .show = poolStatsPackerCompressedBlocksWrittenShow, -}; - -/**********************************************************************/ -/** Number of VIOs that are pending in the packer */ -static ssize_t poolStatsPackerCompressedFragmentsInPackerShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.packer.compressedFragmentsInPacker); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute 
poolStatsPackerCompressedFragmentsInPackerAttr = { - .attr = { .name = "packer_compressed_fragments_in_packer", .mode = 0444, }, - .show = poolStatsPackerCompressedFragmentsInPackerShow, -}; - -/**********************************************************************/ -/** The total number of slabs from which blocks may be allocated */ -static ssize_t poolStatsAllocatorSlabCountShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.allocator.slabCount); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsAllocatorSlabCountAttr = { - .attr = { .name = "allocator_slab_count", .mode = 0444, }, - .show = poolStatsAllocatorSlabCountShow, -}; - -/**********************************************************************/ -/** The total number of slabs from which blocks have ever been allocated */ -static ssize_t poolStatsAllocatorSlabsOpenedShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.allocator.slabsOpened); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsAllocatorSlabsOpenedAttr = { - .attr = { .name = "allocator_slabs_opened", .mode = 0444, }, - .show = poolStatsAllocatorSlabsOpenedShow, -}; - -/**********************************************************************/ -/** The number of times since loading that a slab has been re-opened */ -static ssize_t poolStatsAllocatorSlabsReopenedShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.allocator.slabsReopened); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsAllocatorSlabsReopenedAttr = { - .attr = { .name = "allocator_slabs_reopened", .mode = 0444, }, - .show = poolStatsAllocatorSlabsReopenedShow, -}; - -/**********************************************************************/ -/** Number of times the on-disk journal was full */ -static ssize_t poolStatsJournalDiskFullShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.diskFull); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsJournalDiskFullAttr = { - .attr = { .name = "journal_disk_full", .mode = 0444, }, - .show = poolStatsJournalDiskFullShow, -}; - -/**********************************************************************/ -/** Number of times the recovery journal requested slab journal commits. 
*/ -static ssize_t poolStatsJournalSlabJournalCommitsRequestedShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.slabJournalCommitsRequested); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsJournalSlabJournalCommitsRequestedAttr = { - .attr = { .name = "journal_slab_journal_commits_requested", .mode = 0444, }, - .show = poolStatsJournalSlabJournalCommitsRequestedShow, -}; - -/**********************************************************************/ -/** The total number of items on which processing has started */ -static ssize_t poolStatsJournalEntriesStartedShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.entries.started); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsJournalEntriesStartedAttr = { - .attr = { .name = "journal_entries_started", .mode = 0444, }, - .show = poolStatsJournalEntriesStartedShow, -}; - -/**********************************************************************/ -/** The total number of items for which a write operation has been issued */ -static ssize_t poolStatsJournalEntriesWrittenShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.entries.written); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsJournalEntriesWrittenAttr = { - .attr = { .name = "journal_entries_written", .mode = 0444, }, - .show = poolStatsJournalEntriesWrittenShow, -}; - -/**********************************************************************/ -/** The total number of items for which a write operation has completed */ -static ssize_t poolStatsJournalEntriesCommittedShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.entries.committed); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsJournalEntriesCommittedAttr = { - .attr = { .name = "journal_entries_committed", .mode = 0444, }, - .show = poolStatsJournalEntriesCommittedShow, -}; - -/**********************************************************************/ -/** The total number of items on which processing has started */ -static ssize_t poolStatsJournalBlocksStartedShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.blocks.started); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsJournalBlocksStartedAttr = { - .attr = { .name = "journal_blocks_started", .mode = 0444, }, - .show = poolStatsJournalBlocksStartedShow, -}; - -/**********************************************************************/ -/** The total number of items for which a write operation has been issued */ -static ssize_t poolStatsJournalBlocksWrittenShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - 
getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.blocks.written); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsJournalBlocksWrittenAttr = { - .attr = { .name = "journal_blocks_written", .mode = 0444, }, - .show = poolStatsJournalBlocksWrittenShow, -}; - -/**********************************************************************/ -/** The total number of items for which a write operation has completed */ -static ssize_t poolStatsJournalBlocksCommittedShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.journal.blocks.committed); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsJournalBlocksCommittedAttr = { - .attr = { .name = "journal_blocks_committed", .mode = 0444, }, - .show = poolStatsJournalBlocksCommittedShow, -}; - -/**********************************************************************/ -/** Number of times the on-disk journal was full */ -static ssize_t poolStatsSlabJournalDiskFullCountShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.slabJournal.diskFullCount); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsSlabJournalDiskFullCountAttr = { - .attr = { .name = "slab_journal_disk_full_count", .mode = 0444, }, - .show = poolStatsSlabJournalDiskFullCountShow, -}; - -/**********************************************************************/ -/** Number of times an entry was added over the flush threshold */ -static ssize_t poolStatsSlabJournalFlushCountShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.slabJournal.flushCount); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsSlabJournalFlushCountAttr = { - .attr = { .name = "slab_journal_flush_count", .mode = 0444, }, - .show = poolStatsSlabJournalFlushCountShow, -}; - -/**********************************************************************/ -/** Number of times an entry was added over the block threshold */ -static ssize_t poolStatsSlabJournalBlockedCountShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.slabJournal.blockedCount); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsSlabJournalBlockedCountAttr = { - .attr = { .name = "slab_journal_blocked_count", .mode = 0444, }, - .show = poolStatsSlabJournalBlockedCountShow, -}; - -/**********************************************************************/ -/** Number of times a tail block was written */ -static ssize_t poolStatsSlabJournalBlocksWrittenShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.slabJournal.blocksWritten); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute 
poolStatsSlabJournalBlocksWrittenAttr = { - .attr = { .name = "slab_journal_blocks_written", .mode = 0444, }, - .show = poolStatsSlabJournalBlocksWrittenShow, -}; - -/**********************************************************************/ -/** Number of times we had to wait for the tail to write */ -static ssize_t poolStatsSlabJournalTailBusyCountShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.slabJournal.tailBusyCount); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsSlabJournalTailBusyCountAttr = { - .attr = { .name = "slab_journal_tail_busy_count", .mode = 0444, }, - .show = poolStatsSlabJournalTailBusyCountShow, -}; - -/**********************************************************************/ -/** Number of blocks written */ -static ssize_t poolStatsSlabSummaryBlocksWrittenShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.slabSummary.blocksWritten); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsSlabSummaryBlocksWrittenAttr = { - .attr = { .name = "slab_summary_blocks_written", .mode = 0444, }, - .show = poolStatsSlabSummaryBlocksWrittenShow, -}; - -/**********************************************************************/ -/** Number of reference blocks written */ -static ssize_t poolStatsRefCountsBlocksWrittenShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.refCounts.blocksWritten); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsRefCountsBlocksWrittenAttr = { - .attr = { .name = "ref_counts_blocks_written", .mode = 0444, }, - .show = poolStatsRefCountsBlocksWrittenShow, -}; - -/**********************************************************************/ -/** number of dirty (resident) pages */ -static ssize_t poolStatsBlockMapDirtyPagesShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.dirtyPages); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBlockMapDirtyPagesAttr = { - .attr = { .name = "block_map_dirty_pages", .mode = 0444, }, - .show = poolStatsBlockMapDirtyPagesShow, -}; - -/**********************************************************************/ -/** number of clean (resident) pages */ -static ssize_t poolStatsBlockMapCleanPagesShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.cleanPages); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBlockMapCleanPagesAttr = { - .attr = { .name = "block_map_clean_pages", .mode = 0444, }, - .show = poolStatsBlockMapCleanPagesShow, -}; - -/**********************************************************************/ -/** number of free pages */ -static ssize_t poolStatsBlockMapFreePagesShow(KernelLayer *layer, char *buf) 
-{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.freePages); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBlockMapFreePagesAttr = { - .attr = { .name = "block_map_free_pages", .mode = 0444, }, - .show = poolStatsBlockMapFreePagesShow, -}; - -/**********************************************************************/ -/** number of pages in failed state */ -static ssize_t poolStatsBlockMapFailedPagesShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.failedPages); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBlockMapFailedPagesAttr = { - .attr = { .name = "block_map_failed_pages", .mode = 0444, }, - .show = poolStatsBlockMapFailedPagesShow, -}; - -/**********************************************************************/ -/** number of pages incoming */ -static ssize_t poolStatsBlockMapIncomingPagesShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.incomingPages); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBlockMapIncomingPagesAttr = { - .attr = { .name = "block_map_incoming_pages", .mode = 0444, }, - .show = poolStatsBlockMapIncomingPagesShow, -}; - -/**********************************************************************/ -/** number of pages outgoing */ -static ssize_t poolStatsBlockMapOutgoingPagesShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.outgoingPages); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBlockMapOutgoingPagesAttr = { - .attr = { .name = "block_map_outgoing_pages", .mode = 0444, }, - .show = poolStatsBlockMapOutgoingPagesShow, -}; - -/**********************************************************************/ -/** how many times free page not avail */ -static ssize_t poolStatsBlockMapCachePressureShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%" PRIu32 "\n", layer->vdoStatsStorage.blockMap.cachePressure); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBlockMapCachePressureAttr = { - .attr = { .name = "block_map_cache_pressure", .mode = 0444, }, - .show = poolStatsBlockMapCachePressureShow, -}; - -/**********************************************************************/ -/** number of getVDOPageAsync() for read */ -static ssize_t poolStatsBlockMapReadCountShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.readCount); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBlockMapReadCountAttr = { - .attr = { .name = "block_map_read_count", .mode = 0444, }, - .show = 
poolStatsBlockMapReadCountShow, -}; - -/**********************************************************************/ -/** number or getVDOPageAsync() for write */ -static ssize_t poolStatsBlockMapWriteCountShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.writeCount); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBlockMapWriteCountAttr = { - .attr = { .name = "block_map_write_count", .mode = 0444, }, - .show = poolStatsBlockMapWriteCountShow, -}; - -/**********************************************************************/ -/** number of times pages failed to read */ -static ssize_t poolStatsBlockMapFailedReadsShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.failedReads); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBlockMapFailedReadsAttr = { - .attr = { .name = "block_map_failed_reads", .mode = 0444, }, - .show = poolStatsBlockMapFailedReadsShow, -}; - -/**********************************************************************/ -/** number of times pages failed to write */ -static ssize_t poolStatsBlockMapFailedWritesShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.failedWrites); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBlockMapFailedWritesAttr = { - .attr = { .name = "block_map_failed_writes", .mode = 0444, }, - .show = poolStatsBlockMapFailedWritesShow, -}; - -/**********************************************************************/ -/** number of gets that are reclaimed */ -static ssize_t poolStatsBlockMapReclaimedShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.reclaimed); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBlockMapReclaimedAttr = { - .attr = { .name = "block_map_reclaimed", .mode = 0444, }, - .show = poolStatsBlockMapReclaimedShow, -}; - -/**********************************************************************/ -/** number of gets for outgoing pages */ -static ssize_t poolStatsBlockMapReadOutgoingShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.readOutgoing); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBlockMapReadOutgoingAttr = { - .attr = { .name = "block_map_read_outgoing", .mode = 0444, }, - .show = poolStatsBlockMapReadOutgoingShow, -}; - -/**********************************************************************/ -/** number of gets that were already there */ -static ssize_t poolStatsBlockMapFoundInCacheShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", 
layer->vdoStatsStorage.blockMap.foundInCache); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBlockMapFoundInCacheAttr = { - .attr = { .name = "block_map_found_in_cache", .mode = 0444, }, - .show = poolStatsBlockMapFoundInCacheShow, -}; - -/**********************************************************************/ -/** number of gets requiring discard */ -static ssize_t poolStatsBlockMapDiscardRequiredShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.discardRequired); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBlockMapDiscardRequiredAttr = { - .attr = { .name = "block_map_discard_required", .mode = 0444, }, - .show = poolStatsBlockMapDiscardRequiredShow, -}; - -/**********************************************************************/ -/** number of gets enqueued for their page */ -static ssize_t poolStatsBlockMapWaitForPageShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.waitForPage); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBlockMapWaitForPageAttr = { - .attr = { .name = "block_map_wait_for_page", .mode = 0444, }, - .show = poolStatsBlockMapWaitForPageShow, -}; - -/**********************************************************************/ -/** number of gets that have to fetch */ -static ssize_t poolStatsBlockMapFetchRequiredShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.fetchRequired); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBlockMapFetchRequiredAttr = { - .attr = { .name = "block_map_fetch_required", .mode = 0444, }, - .show = poolStatsBlockMapFetchRequiredShow, -}; - -/**********************************************************************/ -/** number of page fetches */ -static ssize_t poolStatsBlockMapPagesLoadedShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.pagesLoaded); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBlockMapPagesLoadedAttr = { - .attr = { .name = "block_map_pages_loaded", .mode = 0444, }, - .show = poolStatsBlockMapPagesLoadedShow, -}; - -/**********************************************************************/ -/** number of page saves */ -static ssize_t poolStatsBlockMapPagesSavedShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.pagesSaved); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBlockMapPagesSavedAttr = { - .attr = { .name = "block_map_pages_saved", .mode = 0444, }, - .show = poolStatsBlockMapPagesSavedShow, -}; - -/**********************************************************************/ -/** the number of flushes issued */ -static ssize_t 
poolStatsBlockMapFlushCountShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.blockMap.flushCount); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBlockMapFlushCountAttr = { - .attr = { .name = "block_map_flush_count", .mode = 0444, }, - .show = poolStatsBlockMapFlushCountShow, -}; - -/**********************************************************************/ -/** Number of times the UDS advice proved correct */ -static ssize_t poolStatsHashLockDedupeAdviceValidShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.hashLock.dedupeAdviceValid); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsHashLockDedupeAdviceValidAttr = { - .attr = { .name = "hash_lock_dedupe_advice_valid", .mode = 0444, }, - .show = poolStatsHashLockDedupeAdviceValidShow, -}; - -/**********************************************************************/ -/** Number of times the UDS advice proved incorrect */ -static ssize_t poolStatsHashLockDedupeAdviceStaleShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.hashLock.dedupeAdviceStale); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsHashLockDedupeAdviceStaleAttr = { - .attr = { .name = "hash_lock_dedupe_advice_stale", .mode = 0444, }, - .show = poolStatsHashLockDedupeAdviceStaleShow, -}; - -/**********************************************************************/ -/** Number of writes with the same data as another in-flight write */ -static ssize_t poolStatsHashLockConcurrentDataMatchesShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.hashLock.concurrentDataMatches); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsHashLockConcurrentDataMatchesAttr = { - .attr = { .name = "hash_lock_concurrent_data_matches", .mode = 0444, }, - .show = poolStatsHashLockConcurrentDataMatchesShow, -}; - -/**********************************************************************/ -/** Number of writes whose hash collided with an in-flight write */ -static ssize_t poolStatsHashLockConcurrentHashCollisionsShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.hashLock.concurrentHashCollisions); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsHashLockConcurrentHashCollisionsAttr = { - .attr = { .name = "hash_lock_concurrent_hash_collisions", .mode = 0444, }, - .show = poolStatsHashLockConcurrentHashCollisionsShow, -}; - -/**********************************************************************/ -/** number of times VDO got an invalid dedupe advice PBN from UDS */ -static ssize_t poolStatsErrorsInvalidAdvicePBNCountShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - 
getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.errors.invalidAdvicePBNCount); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsErrorsInvalidAdvicePBNCountAttr = { - .attr = { .name = "errors_invalid_advicePBNCount", .mode = 0444, }, - .show = poolStatsErrorsInvalidAdvicePBNCountShow, -}; - -/**********************************************************************/ -/** number of times a VIO completed with a VDO_NO_SPACE error */ -static ssize_t poolStatsErrorsNoSpaceErrorCountShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.errors.noSpaceErrorCount); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsErrorsNoSpaceErrorCountAttr = { - .attr = { .name = "errors_no_space_error_count", .mode = 0444, }, - .show = poolStatsErrorsNoSpaceErrorCountShow, -}; - -/**********************************************************************/ -/** number of times a VIO completed with a VDO_READ_ONLY error */ -static ssize_t poolStatsErrorsReadOnlyErrorCountShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKVDOStatistics(&layer->kvdo, &layer->vdoStatsStorage); - retval = sprintf(buf, "%llu\n", layer->vdoStatsStorage.errors.readOnlyErrorCount); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsErrorsReadOnlyErrorCountAttr = { - .attr = { .name = "errors_read_only_error_count", .mode = 0444, }, - .show = poolStatsErrorsReadOnlyErrorCountShow, -}; - -/**********************************************************************/ -/** The VDO instance */ -static ssize_t poolStatsInstanceShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%" PRIu32 "\n", layer->kernelStatsStorage.instance); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsInstanceAttr = { - .attr = { .name = "instance", .mode = 0444, }, - .show = poolStatsInstanceShow, -}; - -/**********************************************************************/ -/** Current number of active VIOs */ -static ssize_t poolStatsCurrentVIOsInProgressShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%" PRIu32 "\n", layer->kernelStatsStorage.currentVIOsInProgress); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsCurrentVIOsInProgressAttr = { - .attr = { .name = "currentVIOs_in_progress", .mode = 0444, }, - .show = poolStatsCurrentVIOsInProgressShow, -}; - -/**********************************************************************/ -/** Maximum number of active VIOs */ -static ssize_t poolStatsMaxVIOsShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%" PRIu32 "\n", layer->kernelStatsStorage.maxVIOs); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsMaxVIOsAttr = { - .attr = { .name = "maxVIOs", .mode = 0444, }, - .show = poolStatsMaxVIOsShow, -}; - 
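Every statistics attribute in this file follows the same pattern: take statsMutex, refresh the cached statistics snapshot (getKVDOStatistics() for VDO-core fields, getKernelStats() for kernel-layer fields), format a single field into the sysfs buffer, and drop the lock. From user space each field therefore appears as one read-only file. The sketch below is a minimal, hypothetical reader; the exact sysfs location depends on how the kernel-layer kobject is registered elsewhere (a path such as /sys/kvdo/vdo0/statistics/ and the pool name "vdo0" are assumptions, not taken from this patch).

/* Hypothetical user-space reader for one pool statistics attribute.
 * The path and pool name are assumptions; each attribute prints exactly
 * one value followed by a newline, as the show functions above do. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
  const char *path = "/sys/kvdo/vdo0/statistics/data_blocks_used";
  FILE *f = fopen(path, "r");
  if (f == NULL) {
    perror(path);
    return EXIT_FAILURE;
  }

  unsigned long long dataBlocksUsed = 0;
  if (fscanf(f, "%llu", &dataBlocksUsed) != 1) {
    fprintf(stderr, "unexpected contents in %s\n", path);
    fclose(f);
    return EXIT_FAILURE;
  }
  fclose(f);

  printf("data_blocks_used = %llu\n", dataBlocksUsed);
  return EXIT_SUCCESS;
}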
-/**********************************************************************/ -/** Number of times the UDS index was too slow in responding */ -static ssize_t poolStatsDedupeAdviceTimeoutsShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.dedupeAdviceTimeouts); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsDedupeAdviceTimeoutsAttr = { - .attr = { .name = "dedupe_advice_timeouts", .mode = 0444, }, - .show = poolStatsDedupeAdviceTimeoutsShow, -}; - -/**********************************************************************/ -/** Number of flush requests submitted to the storage device */ -static ssize_t poolStatsFlushOutShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.flushOut); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsFlushOutAttr = { - .attr = { .name = "flush_out", .mode = 0444, }, - .show = poolStatsFlushOutShow, -}; - -/**********************************************************************/ -/** Logical block size */ -static ssize_t poolStatsLogicalBlockSizeShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.logicalBlockSize); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsLogicalBlockSizeAttr = { - .attr = { .name = "logical_block_size", .mode = 0444, }, - .show = poolStatsLogicalBlockSizeShow, -}; - -/**********************************************************************/ -/** Number of not REQ_WRITE bios */ -static ssize_t poolStatsBiosInReadShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosIn.read); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosInReadAttr = { - .attr = { .name = "bios_in_read", .mode = 0444, }, - .show = poolStatsBiosInReadShow, -}; - -/**********************************************************************/ -/** Number of REQ_WRITE bios */ -static ssize_t poolStatsBiosInWriteShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosIn.write); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosInWriteAttr = { - .attr = { .name = "bios_in_write", .mode = 0444, }, - .show = poolStatsBiosInWriteShow, -}; - -/**********************************************************************/ -/** Number of REQ_DISCARD bios */ -static ssize_t poolStatsBiosInDiscardShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosIn.discard); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosInDiscardAttr = { - .attr = { .name = "bios_in_discard", .mode = 0444, }, - .show = poolStatsBiosInDiscardShow, -}; - 
-/**********************************************************************/ -/** Number of REQ_FLUSH bios */ -static ssize_t poolStatsBiosInFlushShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosIn.flush); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosInFlushAttr = { - .attr = { .name = "bios_in_flush", .mode = 0444, }, - .show = poolStatsBiosInFlushShow, -}; - -/**********************************************************************/ -/** Number of REQ_FUA bios */ -static ssize_t poolStatsBiosInFuaShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosIn.fua); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosInFuaAttr = { - .attr = { .name = "bios_in_fua", .mode = 0444, }, - .show = poolStatsBiosInFuaShow, -}; - -/**********************************************************************/ -/** Number of not REQ_WRITE bios */ -static ssize_t poolStatsBiosInPartialReadShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInPartial.read); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosInPartialReadAttr = { - .attr = { .name = "bios_in_partial_read", .mode = 0444, }, - .show = poolStatsBiosInPartialReadShow, -}; - -/**********************************************************************/ -/** Number of REQ_WRITE bios */ -static ssize_t poolStatsBiosInPartialWriteShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInPartial.write); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosInPartialWriteAttr = { - .attr = { .name = "bios_in_partial_write", .mode = 0444, }, - .show = poolStatsBiosInPartialWriteShow, -}; - -/**********************************************************************/ -/** Number of REQ_DISCARD bios */ -static ssize_t poolStatsBiosInPartialDiscardShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInPartial.discard); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosInPartialDiscardAttr = { - .attr = { .name = "bios_in_partial_discard", .mode = 0444, }, - .show = poolStatsBiosInPartialDiscardShow, -}; - -/**********************************************************************/ -/** Number of REQ_FLUSH bios */ -static ssize_t poolStatsBiosInPartialFlushShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInPartial.flush); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosInPartialFlushAttr = { - .attr = { .name = "bios_in_partial_flush", .mode = 0444, }, - .show = 
poolStatsBiosInPartialFlushShow, -}; - -/**********************************************************************/ -/** Number of REQ_FUA bios */ -static ssize_t poolStatsBiosInPartialFuaShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInPartial.fua); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosInPartialFuaAttr = { - .attr = { .name = "bios_in_partial_fua", .mode = 0444, }, - .show = poolStatsBiosInPartialFuaShow, -}; - -/**********************************************************************/ -/** Number of not REQ_WRITE bios */ -static ssize_t poolStatsBiosOutReadShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOut.read); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosOutReadAttr = { - .attr = { .name = "bios_out_read", .mode = 0444, }, - .show = poolStatsBiosOutReadShow, -}; - -/**********************************************************************/ -/** Number of REQ_WRITE bios */ -static ssize_t poolStatsBiosOutWriteShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOut.write); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosOutWriteAttr = { - .attr = { .name = "bios_out_write", .mode = 0444, }, - .show = poolStatsBiosOutWriteShow, -}; - -/**********************************************************************/ -/** Number of REQ_DISCARD bios */ -static ssize_t poolStatsBiosOutDiscardShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOut.discard); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosOutDiscardAttr = { - .attr = { .name = "bios_out_discard", .mode = 0444, }, - .show = poolStatsBiosOutDiscardShow, -}; - -/**********************************************************************/ -/** Number of REQ_FLUSH bios */ -static ssize_t poolStatsBiosOutFlushShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOut.flush); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosOutFlushAttr = { - .attr = { .name = "bios_out_flush", .mode = 0444, }, - .show = poolStatsBiosOutFlushShow, -}; - -/**********************************************************************/ -/** Number of REQ_FUA bios */ -static ssize_t poolStatsBiosOutFuaShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOut.fua); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosOutFuaAttr = { - .attr = { .name = "bios_out_fua", .mode = 0444, }, - .show = poolStatsBiosOutFuaShow, -}; - 
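[Editor's note] Because each bios_* value is exposed as its own monotonically increasing counter, rate figures are normally derived by sampling a counter twice and dividing by the interval. The sketch below does this for one of the bios_out counters just defined; the sysfs path is again an assumed example, not something specified by this patch.

#include <inttypes.h>
#include <stdio.h>
#include <unistd.h>

/* Read one 64-bit counter from a sysfs file; returns 0 on success. */
static int read_counter(const char *path, uint64_t *value)
{
  FILE *file = fopen(path, "r");
  if (file == NULL) {
    return -1;
  }
  int ok = (fscanf(file, "%" SCNu64, value) == 1) ? 0 : -1;
  fclose(file);
  return ok;
}

int main(void)
{
  /* Assumed path; any of the monotonically increasing counters works. */
  const char *path = "/sys/kvdo/vdo0/statistics/bios_out_write";
  const unsigned int interval = 10;  /* seconds between samples */

  uint64_t before, after;
  if (read_counter(path, &before) != 0) {
    perror(path);
    return 1;
  }
  sleep(interval);
  if (read_counter(path, &after) != 0) {
    perror(path);
    return 1;
  }
  printf("%s: %.1f bios/s\n", path, (double) (after - before) / interval);
  return 0;
}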
-/**********************************************************************/ -/** Number of not REQ_WRITE bios */ -static ssize_t poolStatsBiosMetaReadShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMeta.read); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosMetaReadAttr = { - .attr = { .name = "bios_meta_read", .mode = 0444, }, - .show = poolStatsBiosMetaReadShow, -}; - -/**********************************************************************/ -/** Number of REQ_WRITE bios */ -static ssize_t poolStatsBiosMetaWriteShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMeta.write); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosMetaWriteAttr = { - .attr = { .name = "bios_meta_write", .mode = 0444, }, - .show = poolStatsBiosMetaWriteShow, -}; - -/**********************************************************************/ -/** Number of REQ_DISCARD bios */ -static ssize_t poolStatsBiosMetaDiscardShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMeta.discard); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosMetaDiscardAttr = { - .attr = { .name = "bios_meta_discard", .mode = 0444, }, - .show = poolStatsBiosMetaDiscardShow, -}; - -/**********************************************************************/ -/** Number of REQ_FLUSH bios */ -static ssize_t poolStatsBiosMetaFlushShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMeta.flush); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosMetaFlushAttr = { - .attr = { .name = "bios_meta_flush", .mode = 0444, }, - .show = poolStatsBiosMetaFlushShow, -}; - -/**********************************************************************/ -/** Number of REQ_FUA bios */ -static ssize_t poolStatsBiosMetaFuaShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMeta.fua); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosMetaFuaAttr = { - .attr = { .name = "bios_meta_fua", .mode = 0444, }, - .show = poolStatsBiosMetaFuaShow, -}; - -/**********************************************************************/ -/** Number of not REQ_WRITE bios */ -static ssize_t poolStatsBiosJournalReadShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournal.read); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosJournalReadAttr = { - .attr = { .name = "bios_journal_read", .mode = 0444, }, - .show = poolStatsBiosJournalReadShow, -}; - 
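[Editor's note] The read/write/discard/flush/fua families shown here are copied out of atomic 64-bit counters whenever a statistics snapshot is taken; copyBioStat() further down in this patch does exactly that with atomic64_read() on each field. A small userspace analogue of that counter/snapshot split, using C11 atomics in place of the kernel's atomic64_t, might look like the sketch below; the type and function names are illustrative only.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for the kernel's per-category atomic bio counters. */
typedef struct {
  atomic_uint_fast64_t read, write, discard, flush, fua;
} AtomicBioCounts;

/* Plain snapshot used when formatting statistics for a reader. */
typedef struct {
  uint64_t read, write, discard, flush, fua;
} BioCountsSnapshot;

/* Hot path: bump one counter without taking any lock. */
static void countRead(AtomicBioCounts *counts)
{
  atomic_fetch_add_explicit(&counts->read, 1, memory_order_relaxed);
}

/* Stats path: copy every counter into the plain structure, the same
 * shape of operation copyBioStat() performs with atomic64_read(). */
static void snapshotBioCounts(AtomicBioCounts *counts, BioCountsSnapshot *snap)
{
  snap->read    = atomic_load_explicit(&counts->read,    memory_order_relaxed);
  snap->write   = atomic_load_explicit(&counts->write,   memory_order_relaxed);
  snap->discard = atomic_load_explicit(&counts->discard, memory_order_relaxed);
  snap->flush   = atomic_load_explicit(&counts->flush,   memory_order_relaxed);
  snap->fua     = atomic_load_explicit(&counts->fua,     memory_order_relaxed);
}

int main(void)
{
  AtomicBioCounts counts = {0};
  countRead(&counts);
  countRead(&counts);

  BioCountsSnapshot snap;
  snapshotBioCounts(&counts, &snap);
  printf("reads counted: %llu\n", (unsigned long long) snap.read);
  return 0;
}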
-/**********************************************************************/ -/** Number of REQ_WRITE bios */ -static ssize_t poolStatsBiosJournalWriteShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournal.write); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosJournalWriteAttr = { - .attr = { .name = "bios_journal_write", .mode = 0444, }, - .show = poolStatsBiosJournalWriteShow, -}; - -/**********************************************************************/ -/** Number of REQ_DISCARD bios */ -static ssize_t poolStatsBiosJournalDiscardShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournal.discard); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosJournalDiscardAttr = { - .attr = { .name = "bios_journal_discard", .mode = 0444, }, - .show = poolStatsBiosJournalDiscardShow, -}; - -/**********************************************************************/ -/** Number of REQ_FLUSH bios */ -static ssize_t poolStatsBiosJournalFlushShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournal.flush); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosJournalFlushAttr = { - .attr = { .name = "bios_journal_flush", .mode = 0444, }, - .show = poolStatsBiosJournalFlushShow, -}; - -/**********************************************************************/ -/** Number of REQ_FUA bios */ -static ssize_t poolStatsBiosJournalFuaShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournal.fua); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosJournalFuaAttr = { - .attr = { .name = "bios_journal_fua", .mode = 0444, }, - .show = poolStatsBiosJournalFuaShow, -}; - -/**********************************************************************/ -/** Number of not REQ_WRITE bios */ -static ssize_t poolStatsBiosPageCacheReadShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCache.read); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosPageCacheReadAttr = { - .attr = { .name = "bios_page_cache_read", .mode = 0444, }, - .show = poolStatsBiosPageCacheReadShow, -}; - -/**********************************************************************/ -/** Number of REQ_WRITE bios */ -static ssize_t poolStatsBiosPageCacheWriteShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCache.write); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosPageCacheWriteAttr = { - .attr = { .name = "bios_page_cache_write", .mode = 0444, }, 
- .show = poolStatsBiosPageCacheWriteShow, -}; - -/**********************************************************************/ -/** Number of REQ_DISCARD bios */ -static ssize_t poolStatsBiosPageCacheDiscardShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCache.discard); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosPageCacheDiscardAttr = { - .attr = { .name = "bios_page_cache_discard", .mode = 0444, }, - .show = poolStatsBiosPageCacheDiscardShow, -}; - -/**********************************************************************/ -/** Number of REQ_FLUSH bios */ -static ssize_t poolStatsBiosPageCacheFlushShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCache.flush); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosPageCacheFlushAttr = { - .attr = { .name = "bios_page_cache_flush", .mode = 0444, }, - .show = poolStatsBiosPageCacheFlushShow, -}; - -/**********************************************************************/ -/** Number of REQ_FUA bios */ -static ssize_t poolStatsBiosPageCacheFuaShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCache.fua); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosPageCacheFuaAttr = { - .attr = { .name = "bios_page_cache_fua", .mode = 0444, }, - .show = poolStatsBiosPageCacheFuaShow, -}; - -/**********************************************************************/ -/** Number of not REQ_WRITE bios */ -static ssize_t poolStatsBiosOutCompletedReadShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOutCompleted.read); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosOutCompletedReadAttr = { - .attr = { .name = "bios_out_completed_read", .mode = 0444, }, - .show = poolStatsBiosOutCompletedReadShow, -}; - -/**********************************************************************/ -/** Number of REQ_WRITE bios */ -static ssize_t poolStatsBiosOutCompletedWriteShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOutCompleted.write); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosOutCompletedWriteAttr = { - .attr = { .name = "bios_out_completed_write", .mode = 0444, }, - .show = poolStatsBiosOutCompletedWriteShow, -}; - -/**********************************************************************/ -/** Number of REQ_DISCARD bios */ -static ssize_t poolStatsBiosOutCompletedDiscardShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOutCompleted.discard); - mutex_unlock(&layer->statsMutex); - return 
retval; -} - -static PoolStatsAttribute poolStatsBiosOutCompletedDiscardAttr = { - .attr = { .name = "bios_out_completed_discard", .mode = 0444, }, - .show = poolStatsBiosOutCompletedDiscardShow, -}; - -/**********************************************************************/ -/** Number of REQ_FLUSH bios */ -static ssize_t poolStatsBiosOutCompletedFlushShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOutCompleted.flush); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosOutCompletedFlushAttr = { - .attr = { .name = "bios_out_completed_flush", .mode = 0444, }, - .show = poolStatsBiosOutCompletedFlushShow, -}; - -/**********************************************************************/ -/** Number of REQ_FUA bios */ -static ssize_t poolStatsBiosOutCompletedFuaShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosOutCompleted.fua); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosOutCompletedFuaAttr = { - .attr = { .name = "bios_out_completed_fua", .mode = 0444, }, - .show = poolStatsBiosOutCompletedFuaShow, -}; - -/**********************************************************************/ -/** Number of not REQ_WRITE bios */ -static ssize_t poolStatsBiosMetaCompletedReadShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMetaCompleted.read); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosMetaCompletedReadAttr = { - .attr = { .name = "bios_meta_completed_read", .mode = 0444, }, - .show = poolStatsBiosMetaCompletedReadShow, -}; - -/**********************************************************************/ -/** Number of REQ_WRITE bios */ -static ssize_t poolStatsBiosMetaCompletedWriteShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMetaCompleted.write); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosMetaCompletedWriteAttr = { - .attr = { .name = "bios_meta_completed_write", .mode = 0444, }, - .show = poolStatsBiosMetaCompletedWriteShow, -}; - -/**********************************************************************/ -/** Number of REQ_DISCARD bios */ -static ssize_t poolStatsBiosMetaCompletedDiscardShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMetaCompleted.discard); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosMetaCompletedDiscardAttr = { - .attr = { .name = "bios_meta_completed_discard", .mode = 0444, }, - .show = poolStatsBiosMetaCompletedDiscardShow, -}; - -/**********************************************************************/ -/** Number of REQ_FLUSH bios */ -static ssize_t poolStatsBiosMetaCompletedFlushShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - 
mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMetaCompleted.flush); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosMetaCompletedFlushAttr = { - .attr = { .name = "bios_meta_completed_flush", .mode = 0444, }, - .show = poolStatsBiosMetaCompletedFlushShow, -}; - -/**********************************************************************/ -/** Number of REQ_FUA bios */ -static ssize_t poolStatsBiosMetaCompletedFuaShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosMetaCompleted.fua); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosMetaCompletedFuaAttr = { - .attr = { .name = "bios_meta_completed_fua", .mode = 0444, }, - .show = poolStatsBiosMetaCompletedFuaShow, -}; - -/**********************************************************************/ -/** Number of not REQ_WRITE bios */ -static ssize_t poolStatsBiosJournalCompletedReadShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournalCompleted.read); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosJournalCompletedReadAttr = { - .attr = { .name = "bios_journal_completed_read", .mode = 0444, }, - .show = poolStatsBiosJournalCompletedReadShow, -}; - -/**********************************************************************/ -/** Number of REQ_WRITE bios */ -static ssize_t poolStatsBiosJournalCompletedWriteShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournalCompleted.write); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosJournalCompletedWriteAttr = { - .attr = { .name = "bios_journal_completed_write", .mode = 0444, }, - .show = poolStatsBiosJournalCompletedWriteShow, -}; - -/**********************************************************************/ -/** Number of REQ_DISCARD bios */ -static ssize_t poolStatsBiosJournalCompletedDiscardShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournalCompleted.discard); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosJournalCompletedDiscardAttr = { - .attr = { .name = "bios_journal_completed_discard", .mode = 0444, }, - .show = poolStatsBiosJournalCompletedDiscardShow, -}; - -/**********************************************************************/ -/** Number of REQ_FLUSH bios */ -static ssize_t poolStatsBiosJournalCompletedFlushShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournalCompleted.flush); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosJournalCompletedFlushAttr = { - .attr = { .name = "bios_journal_completed_flush", .mode = 0444, 
}, - .show = poolStatsBiosJournalCompletedFlushShow, -}; - -/**********************************************************************/ -/** Number of REQ_FUA bios */ -static ssize_t poolStatsBiosJournalCompletedFuaShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosJournalCompleted.fua); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosJournalCompletedFuaAttr = { - .attr = { .name = "bios_journal_completed_fua", .mode = 0444, }, - .show = poolStatsBiosJournalCompletedFuaShow, -}; - -/**********************************************************************/ -/** Number of not REQ_WRITE bios */ -static ssize_t poolStatsBiosPageCacheCompletedReadShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCacheCompleted.read); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosPageCacheCompletedReadAttr = { - .attr = { .name = "bios_page_cache_completed_read", .mode = 0444, }, - .show = poolStatsBiosPageCacheCompletedReadShow, -}; - -/**********************************************************************/ -/** Number of REQ_WRITE bios */ -static ssize_t poolStatsBiosPageCacheCompletedWriteShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCacheCompleted.write); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosPageCacheCompletedWriteAttr = { - .attr = { .name = "bios_page_cache_completed_write", .mode = 0444, }, - .show = poolStatsBiosPageCacheCompletedWriteShow, -}; - -/**********************************************************************/ -/** Number of REQ_DISCARD bios */ -static ssize_t poolStatsBiosPageCacheCompletedDiscardShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCacheCompleted.discard); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosPageCacheCompletedDiscardAttr = { - .attr = { .name = "bios_page_cache_completed_discard", .mode = 0444, }, - .show = poolStatsBiosPageCacheCompletedDiscardShow, -}; - -/**********************************************************************/ -/** Number of REQ_FLUSH bios */ -static ssize_t poolStatsBiosPageCacheCompletedFlushShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCacheCompleted.flush); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosPageCacheCompletedFlushAttr = { - .attr = { .name = "bios_page_cache_completed_flush", .mode = 0444, }, - .show = poolStatsBiosPageCacheCompletedFlushShow, -}; - -/**********************************************************************/ -/** Number of REQ_FUA bios */ -static ssize_t poolStatsBiosPageCacheCompletedFuaShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - 
mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosPageCacheCompleted.fua); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosPageCacheCompletedFuaAttr = { - .attr = { .name = "bios_page_cache_completed_fua", .mode = 0444, }, - .show = poolStatsBiosPageCacheCompletedFuaShow, -}; - -/**********************************************************************/ -/** Number of not REQ_WRITE bios */ -static ssize_t poolStatsBiosAcknowledgedReadShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledged.read); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosAcknowledgedReadAttr = { - .attr = { .name = "bios_acknowledged_read", .mode = 0444, }, - .show = poolStatsBiosAcknowledgedReadShow, -}; - -/**********************************************************************/ -/** Number of REQ_WRITE bios */ -static ssize_t poolStatsBiosAcknowledgedWriteShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledged.write); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosAcknowledgedWriteAttr = { - .attr = { .name = "bios_acknowledged_write", .mode = 0444, }, - .show = poolStatsBiosAcknowledgedWriteShow, -}; - -/**********************************************************************/ -/** Number of REQ_DISCARD bios */ -static ssize_t poolStatsBiosAcknowledgedDiscardShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledged.discard); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosAcknowledgedDiscardAttr = { - .attr = { .name = "bios_acknowledged_discard", .mode = 0444, }, - .show = poolStatsBiosAcknowledgedDiscardShow, -}; - -/**********************************************************************/ -/** Number of REQ_FLUSH bios */ -static ssize_t poolStatsBiosAcknowledgedFlushShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledged.flush); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosAcknowledgedFlushAttr = { - .attr = { .name = "bios_acknowledged_flush", .mode = 0444, }, - .show = poolStatsBiosAcknowledgedFlushShow, -}; - -/**********************************************************************/ -/** Number of REQ_FUA bios */ -static ssize_t poolStatsBiosAcknowledgedFuaShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledged.fua); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosAcknowledgedFuaAttr = { - .attr = { .name = "bios_acknowledged_fua", .mode = 0444, }, - .show = poolStatsBiosAcknowledgedFuaShow, -}; - 
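[Editor's note] The bios_in_progress family that follows is not an independently maintained set of counters: getKernelStats() later in this patch computes it as biosIn minus biosAcknowledged via subtractBioStats(). A stand-alone sketch of that field-by-field subtraction, with hypothetical names and sample numbers, is given below.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical mirror of the five bio counters kept per category. */
typedef struct {
  uint64_t read, write, discard, flush, fua;
} BioCounts;

/* In-flight bios per category: bios accepted from the layer above minus
 * bios already acknowledged back to it, subtracted field by field. */
static BioCounts biosInProgress(BioCounts in, BioCounts acknowledged)
{
  return (BioCounts) {
    .read    = in.read    - acknowledged.read,
    .write   = in.write   - acknowledged.write,
    .discard = in.discard - acknowledged.discard,
    .flush   = in.flush   - acknowledged.flush,
    .fua     = in.fua     - acknowledged.fua,
  };
}

int main(void)
{
  /* Sample numbers only, chosen to show a small in-flight backlog. */
  BioCounts in           = { .read = 1200, .write = 3400, .discard = 16, .flush = 9, .fua = 2 };
  BioCounts acknowledged = { .read = 1195, .write = 3370, .discard = 16, .flush = 9, .fua = 2 };
  BioCounts live = biosInProgress(in, acknowledged);

  printf("reads in progress: %llu, writes in progress: %llu\n",
         (unsigned long long) live.read, (unsigned long long) live.write);
  return 0;
}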
-/**********************************************************************/ -/** Number of not REQ_WRITE bios */ -static ssize_t poolStatsBiosAcknowledgedPartialReadShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledgedPartial.read); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosAcknowledgedPartialReadAttr = { - .attr = { .name = "bios_acknowledged_partial_read", .mode = 0444, }, - .show = poolStatsBiosAcknowledgedPartialReadShow, -}; - -/**********************************************************************/ -/** Number of REQ_WRITE bios */ -static ssize_t poolStatsBiosAcknowledgedPartialWriteShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledgedPartial.write); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosAcknowledgedPartialWriteAttr = { - .attr = { .name = "bios_acknowledged_partial_write", .mode = 0444, }, - .show = poolStatsBiosAcknowledgedPartialWriteShow, -}; - -/**********************************************************************/ -/** Number of REQ_DISCARD bios */ -static ssize_t poolStatsBiosAcknowledgedPartialDiscardShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledgedPartial.discard); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosAcknowledgedPartialDiscardAttr = { - .attr = { .name = "bios_acknowledged_partial_discard", .mode = 0444, }, - .show = poolStatsBiosAcknowledgedPartialDiscardShow, -}; - -/**********************************************************************/ -/** Number of REQ_FLUSH bios */ -static ssize_t poolStatsBiosAcknowledgedPartialFlushShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledgedPartial.flush); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosAcknowledgedPartialFlushAttr = { - .attr = { .name = "bios_acknowledged_partial_flush", .mode = 0444, }, - .show = poolStatsBiosAcknowledgedPartialFlushShow, -}; - -/**********************************************************************/ -/** Number of REQ_FUA bios */ -static ssize_t poolStatsBiosAcknowledgedPartialFuaShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosAcknowledgedPartial.fua); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosAcknowledgedPartialFuaAttr = { - .attr = { .name = "bios_acknowledged_partial_fua", .mode = 0444, }, - .show = poolStatsBiosAcknowledgedPartialFuaShow, -}; - -/**********************************************************************/ -/** Number of not REQ_WRITE bios */ -static ssize_t poolStatsBiosInProgressReadShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - 
getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInProgress.read); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosInProgressReadAttr = { - .attr = { .name = "bios_in_progress_read", .mode = 0444, }, - .show = poolStatsBiosInProgressReadShow, -}; - -/**********************************************************************/ -/** Number of REQ_WRITE bios */ -static ssize_t poolStatsBiosInProgressWriteShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInProgress.write); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosInProgressWriteAttr = { - .attr = { .name = "bios_in_progress_write", .mode = 0444, }, - .show = poolStatsBiosInProgressWriteShow, -}; - -/**********************************************************************/ -/** Number of REQ_DISCARD bios */ -static ssize_t poolStatsBiosInProgressDiscardShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInProgress.discard); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosInProgressDiscardAttr = { - .attr = { .name = "bios_in_progress_discard", .mode = 0444, }, - .show = poolStatsBiosInProgressDiscardShow, -}; - -/**********************************************************************/ -/** Number of REQ_FLUSH bios */ -static ssize_t poolStatsBiosInProgressFlushShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInProgress.flush); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosInProgressFlushAttr = { - .attr = { .name = "bios_in_progress_flush", .mode = 0444, }, - .show = poolStatsBiosInProgressFlushShow, -}; - -/**********************************************************************/ -/** Number of REQ_FUA bios */ -static ssize_t poolStatsBiosInProgressFuaShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.biosInProgress.fua); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsBiosInProgressFuaAttr = { - .attr = { .name = "bios_in_progress_fua", .mode = 0444, }, - .show = poolStatsBiosInProgressFuaShow, -}; - -/**********************************************************************/ -/** Tracked bytes currently allocated. 
*/ -static ssize_t poolStatsMemoryUsageBytesUsedShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.memoryUsage.bytesUsed); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsMemoryUsageBytesUsedAttr = { - .attr = { .name = "memory_usage_bytes_used", .mode = 0444, }, - .show = poolStatsMemoryUsageBytesUsedShow, -}; - -/**********************************************************************/ -/** Maximum tracked bytes allocated. */ -static ssize_t poolStatsMemoryUsagePeakBytesUsedShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.memoryUsage.peakBytesUsed); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsMemoryUsagePeakBytesUsedAttr = { - .attr = { .name = "memory_usage_peak_bytes_used", .mode = 0444, }, - .show = poolStatsMemoryUsagePeakBytesUsedShow, -}; - -/**********************************************************************/ -/** Number of chunk names stored in the index */ -static ssize_t poolStatsIndexEntriesIndexedShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.entriesIndexed); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsIndexEntriesIndexedAttr = { - .attr = { .name = "index_entries_indexed", .mode = 0444, }, - .show = poolStatsIndexEntriesIndexedShow, -}; - -/**********************************************************************/ -/** Number of post calls that found an existing entry */ -static ssize_t poolStatsIndexPostsFoundShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.postsFound); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsIndexPostsFoundAttr = { - .attr = { .name = "index_posts_found", .mode = 0444, }, - .show = poolStatsIndexPostsFoundShow, -}; - -/**********************************************************************/ -/** Number of post calls that added a new entry */ -static ssize_t poolStatsIndexPostsNotFoundShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.postsNotFound); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsIndexPostsNotFoundAttr = { - .attr = { .name = "index_posts_not_found", .mode = 0444, }, - .show = poolStatsIndexPostsNotFoundShow, -}; - -/**********************************************************************/ -/** Number of query calls that found an existing entry */ -static ssize_t poolStatsIndexQueriesFoundShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.queriesFound); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsIndexQueriesFoundAttr = { 
- .attr = { .name = "index_queries_found", .mode = 0444, }, - .show = poolStatsIndexQueriesFoundShow, -}; - -/**********************************************************************/ -/** Number of query calls that added a new entry */ -static ssize_t poolStatsIndexQueriesNotFoundShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.queriesNotFound); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsIndexQueriesNotFoundAttr = { - .attr = { .name = "index_queries_not_found", .mode = 0444, }, - .show = poolStatsIndexQueriesNotFoundShow, -}; - -/**********************************************************************/ -/** Number of update calls that found an existing entry */ -static ssize_t poolStatsIndexUpdatesFoundShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.updatesFound); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsIndexUpdatesFoundAttr = { - .attr = { .name = "index_updates_found", .mode = 0444, }, - .show = poolStatsIndexUpdatesFoundShow, -}; - -/**********************************************************************/ -/** Number of update calls that added a new entry */ -static ssize_t poolStatsIndexUpdatesNotFoundShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%llu\n", layer->kernelStatsStorage.index.updatesNotFound); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsIndexUpdatesNotFoundAttr = { - .attr = { .name = "index_updates_not_found", .mode = 0444, }, - .show = poolStatsIndexUpdatesNotFoundShow, -}; - -/**********************************************************************/ -/** Current number of dedupe queries that are in flight */ -static ssize_t poolStatsIndexCurrDedupeQueriesShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%" PRIu32 "\n", layer->kernelStatsStorage.index.currDedupeQueries); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsIndexCurrDedupeQueriesAttr = { - .attr = { .name = "index_curr_dedupe_queries", .mode = 0444, }, - .show = poolStatsIndexCurrDedupeQueriesShow, -}; - -/**********************************************************************/ -/** Maximum number of dedupe queries that have been in flight */ -static ssize_t poolStatsIndexMaxDedupeQueriesShow(KernelLayer *layer, char *buf) -{ - ssize_t retval; - mutex_lock(&layer->statsMutex); - getKernelStats(layer, &layer->kernelStatsStorage); - retval = sprintf(buf, "%" PRIu32 "\n", layer->kernelStatsStorage.index.maxDedupeQueries); - mutex_unlock(&layer->statsMutex); - return retval; -} - -static PoolStatsAttribute poolStatsIndexMaxDedupeQueriesAttr = { - .attr = { .name = "index_max_dedupe_queries", .mode = 0444, }, - .show = poolStatsIndexMaxDedupeQueriesShow, -}; - -struct attribute *poolStatsAttrs[] = { - &poolStatsDataBlocksUsedAttr.attr, - &poolStatsOverheadBlocksUsedAttr.attr, - &poolStatsLogicalBlocksUsedAttr.attr, - &poolStatsPhysicalBlocksAttr.attr, - 
&poolStatsLogicalBlocksAttr.attr, - &poolStatsBlockMapCacheSizeAttr.attr, - &poolStatsWritePolicyAttr.attr, - &poolStatsBlockSizeAttr.attr, - &poolStatsCompleteRecoveriesAttr.attr, - &poolStatsReadOnlyRecoveriesAttr.attr, - &poolStatsModeAttr.attr, - &poolStatsInRecoveryModeAttr.attr, - &poolStatsRecoveryPercentageAttr.attr, - &poolStatsPackerCompressedFragmentsWrittenAttr.attr, - &poolStatsPackerCompressedBlocksWrittenAttr.attr, - &poolStatsPackerCompressedFragmentsInPackerAttr.attr, - &poolStatsAllocatorSlabCountAttr.attr, - &poolStatsAllocatorSlabsOpenedAttr.attr, - &poolStatsAllocatorSlabsReopenedAttr.attr, - &poolStatsJournalDiskFullAttr.attr, - &poolStatsJournalSlabJournalCommitsRequestedAttr.attr, - &poolStatsJournalEntriesStartedAttr.attr, - &poolStatsJournalEntriesWrittenAttr.attr, - &poolStatsJournalEntriesCommittedAttr.attr, - &poolStatsJournalBlocksStartedAttr.attr, - &poolStatsJournalBlocksWrittenAttr.attr, - &poolStatsJournalBlocksCommittedAttr.attr, - &poolStatsSlabJournalDiskFullCountAttr.attr, - &poolStatsSlabJournalFlushCountAttr.attr, - &poolStatsSlabJournalBlockedCountAttr.attr, - &poolStatsSlabJournalBlocksWrittenAttr.attr, - &poolStatsSlabJournalTailBusyCountAttr.attr, - &poolStatsSlabSummaryBlocksWrittenAttr.attr, - &poolStatsRefCountsBlocksWrittenAttr.attr, - &poolStatsBlockMapDirtyPagesAttr.attr, - &poolStatsBlockMapCleanPagesAttr.attr, - &poolStatsBlockMapFreePagesAttr.attr, - &poolStatsBlockMapFailedPagesAttr.attr, - &poolStatsBlockMapIncomingPagesAttr.attr, - &poolStatsBlockMapOutgoingPagesAttr.attr, - &poolStatsBlockMapCachePressureAttr.attr, - &poolStatsBlockMapReadCountAttr.attr, - &poolStatsBlockMapWriteCountAttr.attr, - &poolStatsBlockMapFailedReadsAttr.attr, - &poolStatsBlockMapFailedWritesAttr.attr, - &poolStatsBlockMapReclaimedAttr.attr, - &poolStatsBlockMapReadOutgoingAttr.attr, - &poolStatsBlockMapFoundInCacheAttr.attr, - &poolStatsBlockMapDiscardRequiredAttr.attr, - &poolStatsBlockMapWaitForPageAttr.attr, - &poolStatsBlockMapFetchRequiredAttr.attr, - &poolStatsBlockMapPagesLoadedAttr.attr, - &poolStatsBlockMapPagesSavedAttr.attr, - &poolStatsBlockMapFlushCountAttr.attr, - &poolStatsHashLockDedupeAdviceValidAttr.attr, - &poolStatsHashLockDedupeAdviceStaleAttr.attr, - &poolStatsHashLockConcurrentDataMatchesAttr.attr, - &poolStatsHashLockConcurrentHashCollisionsAttr.attr, - &poolStatsErrorsInvalidAdvicePBNCountAttr.attr, - &poolStatsErrorsNoSpaceErrorCountAttr.attr, - &poolStatsErrorsReadOnlyErrorCountAttr.attr, - &poolStatsInstanceAttr.attr, - &poolStatsCurrentVIOsInProgressAttr.attr, - &poolStatsMaxVIOsAttr.attr, - &poolStatsDedupeAdviceTimeoutsAttr.attr, - &poolStatsFlushOutAttr.attr, - &poolStatsLogicalBlockSizeAttr.attr, - &poolStatsBiosInReadAttr.attr, - &poolStatsBiosInWriteAttr.attr, - &poolStatsBiosInDiscardAttr.attr, - &poolStatsBiosInFlushAttr.attr, - &poolStatsBiosInFuaAttr.attr, - &poolStatsBiosInPartialReadAttr.attr, - &poolStatsBiosInPartialWriteAttr.attr, - &poolStatsBiosInPartialDiscardAttr.attr, - &poolStatsBiosInPartialFlushAttr.attr, - &poolStatsBiosInPartialFuaAttr.attr, - &poolStatsBiosOutReadAttr.attr, - &poolStatsBiosOutWriteAttr.attr, - &poolStatsBiosOutDiscardAttr.attr, - &poolStatsBiosOutFlushAttr.attr, - &poolStatsBiosOutFuaAttr.attr, - &poolStatsBiosMetaReadAttr.attr, - &poolStatsBiosMetaWriteAttr.attr, - &poolStatsBiosMetaDiscardAttr.attr, - &poolStatsBiosMetaFlushAttr.attr, - &poolStatsBiosMetaFuaAttr.attr, - &poolStatsBiosJournalReadAttr.attr, - &poolStatsBiosJournalWriteAttr.attr, - &poolStatsBiosJournalDiscardAttr.attr, 
- &poolStatsBiosJournalFlushAttr.attr, - &poolStatsBiosJournalFuaAttr.attr, - &poolStatsBiosPageCacheReadAttr.attr, - &poolStatsBiosPageCacheWriteAttr.attr, - &poolStatsBiosPageCacheDiscardAttr.attr, - &poolStatsBiosPageCacheFlushAttr.attr, - &poolStatsBiosPageCacheFuaAttr.attr, - &poolStatsBiosOutCompletedReadAttr.attr, - &poolStatsBiosOutCompletedWriteAttr.attr, - &poolStatsBiosOutCompletedDiscardAttr.attr, - &poolStatsBiosOutCompletedFlushAttr.attr, - &poolStatsBiosOutCompletedFuaAttr.attr, - &poolStatsBiosMetaCompletedReadAttr.attr, - &poolStatsBiosMetaCompletedWriteAttr.attr, - &poolStatsBiosMetaCompletedDiscardAttr.attr, - &poolStatsBiosMetaCompletedFlushAttr.attr, - &poolStatsBiosMetaCompletedFuaAttr.attr, - &poolStatsBiosJournalCompletedReadAttr.attr, - &poolStatsBiosJournalCompletedWriteAttr.attr, - &poolStatsBiosJournalCompletedDiscardAttr.attr, - &poolStatsBiosJournalCompletedFlushAttr.attr, - &poolStatsBiosJournalCompletedFuaAttr.attr, - &poolStatsBiosPageCacheCompletedReadAttr.attr, - &poolStatsBiosPageCacheCompletedWriteAttr.attr, - &poolStatsBiosPageCacheCompletedDiscardAttr.attr, - &poolStatsBiosPageCacheCompletedFlushAttr.attr, - &poolStatsBiosPageCacheCompletedFuaAttr.attr, - &poolStatsBiosAcknowledgedReadAttr.attr, - &poolStatsBiosAcknowledgedWriteAttr.attr, - &poolStatsBiosAcknowledgedDiscardAttr.attr, - &poolStatsBiosAcknowledgedFlushAttr.attr, - &poolStatsBiosAcknowledgedFuaAttr.attr, - &poolStatsBiosAcknowledgedPartialReadAttr.attr, - &poolStatsBiosAcknowledgedPartialWriteAttr.attr, - &poolStatsBiosAcknowledgedPartialDiscardAttr.attr, - &poolStatsBiosAcknowledgedPartialFlushAttr.attr, - &poolStatsBiosAcknowledgedPartialFuaAttr.attr, - &poolStatsBiosInProgressReadAttr.attr, - &poolStatsBiosInProgressWriteAttr.attr, - &poolStatsBiosInProgressDiscardAttr.attr, - &poolStatsBiosInProgressFlushAttr.attr, - &poolStatsBiosInProgressFuaAttr.attr, - &poolStatsMemoryUsageBytesUsedAttr.attr, - &poolStatsMemoryUsagePeakBytesUsedAttr.attr, - &poolStatsIndexEntriesIndexedAttr.attr, - &poolStatsIndexPostsFoundAttr.attr, - &poolStatsIndexPostsNotFoundAttr.attr, - &poolStatsIndexQueriesFoundAttr.attr, - &poolStatsIndexQueriesNotFoundAttr.attr, - &poolStatsIndexUpdatesFoundAttr.attr, - &poolStatsIndexUpdatesNotFoundAttr.attr, - &poolStatsIndexCurrDedupeQueriesAttr.attr, - &poolStatsIndexMaxDedupeQueriesAttr.attr, - NULL, -}; diff --git a/vdo/kernel/statusCodeBlocks.h b/vdo/kernel/statusCodeBlocks.h deleted file mode 100644 index bca19c5..0000000 --- a/vdo/kernel/statusCodeBlocks.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/statusCodeBlocks.h#1 $ - */ - -#ifndef STATUS_CODE_BLOCKS_H -#define STATUS_CODE_BLOCKS_H - -enum { - UDS_BLOCK_SIZE = UDS_ERROR_CODE_BLOCK_END - UDS_ERROR_CODE_BASE, - VDO_BLOCK_START = UDS_ERROR_CODE_BLOCK_END, - VDO_BLOCK_END = VDO_BLOCK_START + UDS_BLOCK_SIZE, - PRP_BLOCK_START = VDO_BLOCK_END, - PRP_BLOCK_END = PRP_BLOCK_START + UDS_BLOCK_SIZE, -}; - -#endif // STATUS_CODE_BLOCKS_H diff --git a/vdo/kernel/statusProcfs.c b/vdo/kernel/statusProcfs.c deleted file mode 100644 index 70e8c9b..0000000 --- a/vdo/kernel/statusProcfs.c +++ /dev/null @@ -1,247 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/statusProcfs.c#4 $ - * - * Proc filesystem interface to the old GET_DEDUPE_STATS and - * GET_KERNEL_STATS ioctls, which can no longer be supported in 4.4 - * and later kernels. These files return the same data as the old - * ioctls do, in order to require minimal changes to our (and - * customers') utilties and test code. 
- * - * +--+----- /proc/vdo procfsRoot - * | - * +-+----- vdo config->poolName - * | - * +------- dedupe_stats GET_DEDUPE_STATS ioctl - * +------- kernel_stats GET_KERNEL_STATS ioctl - * - */ -#include "statusProcfs.h" - -#include - -#include "memoryAlloc.h" - -#include "releaseVersions.h" -#include "statistics.h" -#include "vdo.h" - -#include "dedupeIndex.h" -#include "ioSubmitter.h" -#include "kernelStatistics.h" -#include "logger.h" -#include "memoryUsage.h" -#include "threadDevice.h" -#include "vdoCommon.h" - -static struct proc_dir_entry *procfsRoot = NULL; - -/**********************************************************************/ -static int statusDedupeShow(struct seq_file *m, void *v) -{ - KernelLayer *layer = (KernelLayer *) m->private; - VDOStatistics *stats; - size_t len = sizeof(VDOStatistics); - RegisteredThread allocatingThread, instanceThread; - registerAllocatingThread(&allocatingThread, NULL); - registerThreadDevice(&instanceThread, layer); - int result = ALLOCATE(1, VDOStatistics, __func__, &stats); - if (result == VDO_SUCCESS) { - getKVDOStatistics(&layer->kvdo, stats); - seq_write(m, stats, len); - FREE(stats); - } - unregisterThreadDeviceID(); - unregisterAllocatingThread(); - return result; -} - -/**********************************************************************/ -static int statusDedupeOpen(struct inode *inode, struct file *file) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) - return single_open(file, statusDedupeShow, PDE_DATA(inode)); -#else - return single_open(file, statusDedupeShow, PDE(inode)->data); -#endif -} - -static const struct file_operations vdoProcfsDedupeOps = { - .open = statusDedupeOpen, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -/**********************************************************************/ -static void copyBioStat(BioStats *b, const AtomicBioStats *a) -{ - b->read = atomic64_read(&a->read); - b->write = atomic64_read(&a->write); - b->discard = atomic64_read(&a->discard); - b->flush = atomic64_read(&a->flush); - b->fua = atomic64_read(&a->fua); -} - -/**********************************************************************/ -static BioStats subtractBioStats(BioStats minuend, BioStats subtrahend) -{ - return (BioStats) { - .read = minuend.read - subtrahend.read, - .write = minuend.write - subtrahend.write, - .discard = minuend.discard - subtrahend.discard, - .flush = minuend.flush - subtrahend.flush, - .fua = minuend.fua - subtrahend.fua, - }; -} - -/**********************************************************************/ -void getKernelStats(KernelLayer *layer, KernelStatistics *stats) -{ - stats->version = STATISTICS_VERSION; - stats->releaseVersion = CURRENT_RELEASE_VERSION_NUMBER; - stats->instance = layer->instance; - getLimiterValuesAtomically(&layer->requestLimiter, - &stats->currentVIOsInProgress, &stats->maxVIOs); - // albireoTimeoutReport gives the number of timeouts, and dedupeContextBusy - // gives the number of queries not made because of earlier timeouts. 
- stats->dedupeAdviceTimeouts = (getEventCount(&layer->albireoTimeoutReporter) - + atomic64_read(&layer->dedupeContextBusy)); - stats->flushOut = atomic64_read(&layer->flushOut); - stats->logicalBlockSize = layer->deviceConfig->logicalBlockSize; - copyBioStat(&stats->biosIn, &layer->biosIn); - copyBioStat(&stats->biosInPartial, &layer->biosInPartial); - copyBioStat(&stats->biosOut, &layer->biosOut); - copyBioStat(&stats->biosMeta, &layer->biosMeta); - copyBioStat(&stats->biosJournal, &layer->biosJournal); - copyBioStat(&stats->biosPageCache, &layer->biosPageCache); - copyBioStat(&stats->biosOutCompleted, &layer->biosOutCompleted); - copyBioStat(&stats->biosMetaCompleted, &layer->biosMetaCompleted); - copyBioStat(&stats->biosJournalCompleted, &layer->biosJournalCompleted); - copyBioStat(&stats->biosPageCacheCompleted, - &layer->biosPageCacheCompleted); - copyBioStat(&stats->biosAcknowledged, &layer->biosAcknowledged); - copyBioStat(&stats->biosAcknowledgedPartial, - &layer->biosAcknowledgedPartial); - stats->biosInProgress = subtractBioStats(stats->biosIn, - stats->biosAcknowledged); - stats->memoryUsage = getMemoryUsage(); - getIndexStatistics(layer->dedupeIndex, &stats->index); -} - -/**********************************************************************/ -static int statusKernelShow(struct seq_file *m, void *v) -{ - KernelLayer *layer = (KernelLayer *) m->private; - KernelStatistics *stats; - size_t len = sizeof(KernelStatistics); - RegisteredThread allocatingThread, instanceThread; - registerAllocatingThread(&allocatingThread, NULL); - registerThreadDevice(&instanceThread, layer); - int result = ALLOCATE(1, KernelStatistics, __func__, &stats); - if (result == VDO_SUCCESS) { - getKernelStats(layer, stats); - seq_write(m, stats, len); - FREE(stats); - } - unregisterThreadDeviceID(); - unregisterAllocatingThread(); - return result; -} - -/**********************************************************************/ -static int statusKernelOpen(struct inode *inode, struct file *file) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) - return single_open(file, statusKernelShow, PDE_DATA(inode)); -#else - return single_open(file, statusKernelShow, PDE(inode)->data); -#endif -} - -static const struct file_operations vdoProcfsKernelOps = { - .open = statusKernelOpen, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -/**********************************************************************/ -int vdoInitProcfs() -{ - const char *procfsName = getProcRoot(); - procfsRoot = proc_mkdir(procfsName, NULL); - if (procfsRoot == NULL) { - logWarning("Could not create proc filesystem root %s\n", procfsName); - return -ENOMEM; - } - return VDO_SUCCESS; -} - -/**********************************************************************/ -void vdoDestroyProcfs() -{ - remove_proc_entry(getProcRoot(), NULL); - procfsRoot = NULL; -} - -/**********************************************************************/ -int vdoCreateProcfsEntry(KernelLayer *layer, const char *name, void **private) -{ - int result = VDO_SUCCESS; - - if (procfsRoot != NULL) { - struct proc_dir_entry *fsDir; - fsDir = proc_mkdir(name, procfsRoot); - if (fsDir == NULL) { - result = -ENOMEM; - } else { - if (proc_create_data(getVDOStatisticsProcFile(), 0644, fsDir, - &vdoProcfsDedupeOps, layer) == NULL) { - result = -ENOMEM; - } else if (proc_create_data(getKernelStatisticsProcFile(), 0644, fsDir, - &vdoProcfsKernelOps, layer) == NULL) { - result = -ENOMEM; - } - } - if (result < 0) { - vdoDestroyProcfsEntry(name, fsDir); - } 
else { - *private = fsDir; - } - } else { - logWarning("No proc filesystem root set, skipping %s\n", name); - } - return result; -} - -/**********************************************************************/ -void vdoDestroyProcfsEntry(const char *name, void *private) -{ - if (procfsRoot != NULL) { -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) - remove_proc_subtree(name, procfsRoot); -#else - struct proc_dir_entry *fsDir = (struct proc_dir_entry *) private; - remove_proc_entry(getVDOStatisticsProcFile(), fsDir); - remove_proc_entry(getKernelStatisticsProcFile(), fsDir); - remove_proc_entry(name, procfsRoot); -#endif - } -} diff --git a/vdo/kernel/statusProcfs.h b/vdo/kernel/statusProcfs.h deleted file mode 100644 index a884c8e..0000000 --- a/vdo/kernel/statusProcfs.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/statusProcfs.h#1 $ - * - */ - -#ifndef STATUS_PROC_H -#define STATUS_PROC_H - -#include -#include -#include "kernelLayer.h" - -/** - * Initializes the /proc/vdo directory. Should be called once when the - * module is loaded. - * - * @return 0 on success, nonzero on failure - */ -int vdoInitProcfs(void); - -/** - * Destroys the /proc/vdo directory. Should be called once when the - * module is unloaded. - */ -void vdoDestroyProcfs(void); - -/** - * Creates a subdirectory in the /proc/vdo filesystem for a particular - * vdo. - * - * @param layer the kernel layer - * @param name the subdirectory name - * @param private pointer to private storage for procfs data - * - * @return 0 on success, nonzero on failure - */ -int vdoCreateProcfsEntry(KernelLayer *layer, const char *name, void **private); - -/** - * Destroys a subdirectory in the /proc/vdo filesystem for a - * particular vdo. - * - * @param name the subdirectory name - * @param private private storage for procfs data - */ -void vdoDestroyProcfsEntry(const char *name, void *private); - -/** - * Retrieves the current kernel statistics. - * - * @param layer the kernel layer - * @param stats pointer to the structure to fill in - */ -void getKernelStats(KernelLayer *layer, KernelStatistics *stats); - -#endif /* STATUS_PROC_H */ diff --git a/vdo/kernel/sysfs.c b/vdo/kernel/sysfs.c deleted file mode 100644 index 9244bf1..0000000 --- a/vdo/kernel/sysfs.c +++ /dev/null @@ -1,343 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/sysfs.c#5 $ - */ - -#include "sysfs.h" - -#include -#include - -#include "dedupeIndex.h" -#include "dmvdo.h" -#include "logger.h" - -extern int defaultMaxRequestsActive; - -typedef struct vdoAttribute { - struct attribute attr; - ssize_t (*show)(struct kvdoDevice *d, struct attribute *attr, char *buf); - ssize_t (*store)(struct kvdoDevice *d, const char *value, size_t count); - // Location of value, if .show == showInt or showUInt or showBool. - void *valuePtr; -} VDOAttribute; - -static char *statusStrings[] = { - "UNINITIALIZED", - "READY", - "SHUTTING DOWN", -}; - -/**********************************************************************/ -static ssize_t vdoStatusShow(struct kvdoDevice *device, - struct attribute *attr, - char *buf) -{ - return sprintf(buf, "%s\n", statusStrings[device->status]); -} - -/**********************************************************************/ -static ssize_t vdoLogLevelShow(struct kvdoDevice *device, - struct attribute *attr, - char *buf) -{ - return sprintf(buf, "%s\n", priorityToString(getLogLevel())); -} - -/**********************************************************************/ -static ssize_t vdoLogLevelStore(struct kvdoDevice *device, - const char *buf, size_t n) -{ - static char internalBuf[11]; - - if (n > 10) { - return -EINVAL; - } - - memset(internalBuf, '\000', sizeof(internalBuf)); - memcpy(internalBuf, buf, n); - if (internalBuf[n - 1] == '\n') { - internalBuf[n - 1] = '\000'; - } - setLogLevel(stringToPriority(internalBuf)); - return n; -} - -/**********************************************************************/ -static ssize_t scanInt(const char *buf, - size_t n, - int *valuePtr, - int minimum, - int maximum) -{ - if (n > 12) { - return -EINVAL; - } - unsigned int value; - if (sscanf(buf, "%d", &value) != 1) { - return -EINVAL; - } - if (value < minimum) { - value = minimum; - } else if (value > maximum) { - value = maximum; - } - *valuePtr = value; - return n; -} - -/**********************************************************************/ -static ssize_t showInt(struct kvdoDevice *device, - struct attribute *attr, - char *buf) -{ - VDOAttribute *vdoAttr = container_of(attr, VDOAttribute, attr); - - return sprintf(buf, "%d\n", *(int *)vdoAttr->valuePtr); -} - -/**********************************************************************/ -static ssize_t scanUInt(const char *buf, - size_t n, - unsigned int *valuePtr, - unsigned int minimum, - unsigned int maximum) -{ - if (n > 12) { - return -EINVAL; - } - unsigned int value; - if (sscanf(buf, "%u", &value) != 1) { - return -EINVAL; - } - if (value < minimum) { - value = minimum; - } else if (value > maximum) { - value = maximum; - } - *valuePtr = value; - return n; -} - -/**********************************************************************/ -static ssize_t showUInt(struct kvdoDevice *device, - struct attribute *attr, - char *buf) -{ - VDOAttribute *vdoAttr = container_of(attr, VDOAttribute, attr); - - return sprintf(buf, "%u\n", *(unsigned int *)vdoAttr->valuePtr); 
-} - -/**********************************************************************/ -static ssize_t scanBool(const char *buf, size_t n, bool *valuePtr) -{ - unsigned int intValue = 0; - n = scanUInt(buf, n, &intValue, 0, 1); - if (n > 0) { - *valuePtr = (intValue != 0); - } - return n; -} - -/**********************************************************************/ -static ssize_t showBool(struct kvdoDevice *device, - struct attribute *attr, - char *buf) -{ - VDOAttribute *vdoAttr = container_of(attr, VDOAttribute, attr); - - return sprintf(buf, "%u\n", *(bool *)vdoAttr->valuePtr ? 1 : 0); -} - -/**********************************************************************/ -static ssize_t vdoTraceRecordingStore(struct kvdoDevice *device, - const char *buf, - size_t n) -{ - return scanBool(buf, n, &traceRecording); -} - -/**********************************************************************/ -static ssize_t vdoMaxReqActiveStore(struct kvdoDevice *device, - const char *buf, - size_t n) -{ - /* - * The base code has some hardcoded assumptions about the maximum - * number of requests that can be in progress. Maybe someday we'll - * do calculations with the actual number; for now, just make sure - * the assumption holds. - */ - return scanInt(buf, n, &defaultMaxRequestsActive, 1, MAXIMUM_USER_VIOS); -} - -/**********************************************************************/ -static ssize_t vdoAlbireoTimeoutIntervalStore(struct kvdoDevice *device, - const char *buf, - size_t n) -{ - unsigned int value; - ssize_t result = scanUInt(buf, n, &value, 0, UINT_MAX); - if (result > 0) { - setAlbireoTimeoutInterval(value); - } - return result; -} - -/**********************************************************************/ -static ssize_t vdoMinAlbireoTimerIntervalStore(struct kvdoDevice *device, - const char *buf, - size_t n) -{ - unsigned int value; - ssize_t result = scanUInt(buf, n, &value, 0, UINT_MAX); - if (result > 0) { - setMinAlbireoTimerInterval(value); - } - return result; -} - -/**********************************************************************/ -static ssize_t vdoVersionShow(struct kvdoDevice *device, - struct attribute *attr, - char *buf) -{ - return sprintf(buf, "%s\n", CURRENT_VERSION); -} - -/**********************************************************************/ -static ssize_t vdoAttrShow(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - VDOAttribute *vdoAttr = container_of(attr, VDOAttribute, attr); - if (vdoAttr->show == NULL) { - return -EINVAL; - } - - struct kvdoDevice *device = container_of(kobj, struct kvdoDevice, kobj); - return (*vdoAttr->show)(device, attr, buf); -} - -/**********************************************************************/ -static ssize_t vdoAttrStore(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t length) -{ - VDOAttribute *vdoAttr = container_of(attr, VDOAttribute, attr); - if (vdoAttr->store == NULL) { - return -EINVAL; - } - - struct kvdoDevice *device = container_of(kobj, struct kvdoDevice, kobj); - return (*vdoAttr->store)(device, buf, length); -} - -static VDOAttribute vdoStatusAttr = { - .attr = { .name = "status", .mode = 0444, }, - .show = vdoStatusShow, -}; - -static VDOAttribute vdoLogLevelAttr = { - .attr = {.name = "log_level", .mode = 0644, }, - .show = vdoLogLevelShow, - .store = vdoLogLevelStore, -}; - -static VDOAttribute vdoMaxReqActiveAttr = { - .attr = {.name = "max_requests_active", .mode = 0644, }, - .show = showInt, - .store = vdoMaxReqActiveStore, - .valuePtr = &defaultMaxRequestsActive, -}; - 
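
Each of these VDOAttribute entries is reached through vdoAttrShow() and vdoAttrStore(), which recover the wrapper with container_of() and dispatch to the per-attribute handlers defined above. A minimal user-space sketch of exercising the log_level attribute follows; the /sys/kvdo path and the "info" level name are assumptions here, since the actual directory comes from THIS_MODULE->name in vdoInitSysfs() and the accepted strings from stringToPriority().

  #include <stdio.h>

  /* Assumed attribute path; the real directory is whatever name
   * vdoInitSysfs() registers via kobject_add(..., THIS_MODULE->name). */
  #define LOG_LEVEL_ATTR "/sys/kvdo/log_level"

  int main(void)
  {
    char level[64];

    /* Reading goes through vdoAttrShow() -> vdoLogLevelShow(). */
    FILE *f = fopen(LOG_LEVEL_ATTR, "r");
    if (f == NULL) {
      perror("open " LOG_LEVEL_ATTR);
      return 1;
    }
    if (fgets(level, sizeof(level), f) != NULL) {
      printf("current log level: %s", level);
    }
    fclose(f);

    /* Writing goes through vdoAttrStore() -> vdoLogLevelStore(), which
     * strips the trailing newline before calling setLogLevel().  "info"
     * is assumed to be one of the names stringToPriority() accepts;
     * writing normally requires root. */
    f = fopen(LOG_LEVEL_ATTR, "w");
    if (f == NULL) {
      perror("open " LOG_LEVEL_ATTR " for write");
      return 1;
    }
    fputs("info\n", f);
    return fclose(f) == 0 ? 0 : 1;
  }

The numeric attributes work the same way; for example, a value written to max_requests_active is clamped by scanInt() before it lands in defaultMaxRequestsActive.
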
-static VDOAttribute vdoAlbireoTimeoutInterval = { - .attr = {.name = "deduplication_timeout_interval", .mode = 0644, }, - .show = showUInt, - .store = vdoAlbireoTimeoutIntervalStore, - .valuePtr = &albireoTimeoutInterval, -}; - -static VDOAttribute vdoMinAlbireoTimerInterval = { - .attr = {.name = "min_deduplication_timer_interval", .mode = 0644, }, - .show = showUInt, - .store = vdoMinAlbireoTimerIntervalStore, - .valuePtr = &minAlbireoTimerInterval, -}; - -static VDOAttribute vdoTraceRecording = { - .attr = {.name = "trace_recording", .mode = 0644, }, - .show = showBool, - .store = vdoTraceRecordingStore, - .valuePtr = &traceRecording, -}; - -static VDOAttribute vdoVersionAttr = { - .attr = { .name = "version", .mode = 0444, }, - .show = vdoVersionShow, -}; - -static struct attribute *defaultAttrs[] = { - &vdoStatusAttr.attr, - &vdoLogLevelAttr.attr, - &vdoMaxReqActiveAttr.attr, - &vdoAlbireoTimeoutInterval.attr, - &vdoMinAlbireoTimerInterval.attr, - &vdoTraceRecording.attr, - &vdoVersionAttr.attr, - NULL -}; - -static struct sysfs_ops vdoSysfsOps = { - .show = vdoAttrShow, - .store = vdoAttrStore, -}; - -/**********************************************************************/ -static void vdoRelease(struct kobject *kobj) -{ - return; -} - -struct kobj_type vdo_ktype = { - .release = vdoRelease, - .sysfs_ops = &vdoSysfsOps, - .default_attrs = defaultAttrs, -}; - -/**********************************************************************/ -int vdoInitSysfs(struct kobject *deviceObject) -{ - kobject_init(deviceObject, &vdo_ktype); - int result = kobject_add(deviceObject, NULL, THIS_MODULE->name); - if (result < 0) { - logError("kobject_add failed with status %d", -result); - kobject_put(deviceObject); - } - logDebug("added sysfs objects"); - return result; -}; - -/**********************************************************************/ -void vdoPutSysfs(struct kobject *deviceObject) -{ - kobject_put(deviceObject); -} diff --git a/vdo/kernel/sysfs.h b/vdo/kernel/sysfs.h deleted file mode 100644 index 3dbac04..0000000 --- a/vdo/kernel/sysfs.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/sysfs.h#2 $ - */ - -#ifndef ALBIREO_SYSFS_H -#define ALBIREO_SYSFS_H - -#include "kernelLayer.h" - -struct kvdoDevice; - -/** -* Initializes the sysfs objects global to all vdo devices. -* -* @param deviceObject the kobject of the kvdoDevice to initialize. -*/ -int vdoInitSysfs(struct kobject *deviceObject); - -/** - * Releases the global sysfs objects. - * - * @param deviceObject the kobject of the kvdoDevice to release. 
- */ -void vdoPutSysfs(struct kobject *deviceObject); - -#endif /* ALBIREO_SYSFS_H */ diff --git a/vdo/kernel/threadDevice.c b/vdo/kernel/threadDevice.c deleted file mode 100644 index 49fb909..0000000 --- a/vdo/kernel/threadDevice.c +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/threadDevice.c#1 $ - */ - -#include "threadDevice.h" - -#include "threadRegistry.h" - -/* - * A registry of all threads temporarily associated with particular - * VDO devices. - */ -static ThreadRegistry deviceIDThreadRegistry; - -/**********************************************************************/ -void registerThreadDeviceID(RegisteredThread *newThread, unsigned int *idPtr) -{ - registerThread(&deviceIDThreadRegistry, newThread, idPtr); -} - -/**********************************************************************/ -void unregisterThreadDeviceID(void) -{ - unregisterThread(&deviceIDThreadRegistry); -} - -/**********************************************************************/ -int getThreadDeviceID(void) -{ - const unsigned int *pointer = lookupThread(&deviceIDThreadRegistry); - return pointer ? *pointer : -1; -} - -/**********************************************************************/ -void initializeThreadDeviceRegistry(void) -{ - initializeThreadRegistry(&deviceIDThreadRegistry); -} diff --git a/vdo/kernel/threadDevice.h b/vdo/kernel/threadDevice.h deleted file mode 100644 index 61b4ce6..0000000 --- a/vdo/kernel/threadDevice.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/threadDevice.h#1 $ - */ - -#include "kernelLayer.h" - -/** - * Temporarily register the current thread as being associated with a - * VDO device id number, for logging purposes. - * - * Any such registered thread must later be unregistered via - * unregisterThreadDeviceID. - * - * The pointed-to ID number should be nonzero. 
- * - * @param newThread RegisteredThread structure to use for the current thread - * @param idPtr Location where the ID number is stored - **/ -void registerThreadDeviceID(RegisteredThread *newThread, unsigned int *idPtr); - -/** - * Temporarily register the current thread as being associated with an - * existing VDO device, for logging purposes. - * - * Any such registered thread must later be unregistered via - * unregisterThreadDeviceID. - * - * @param newThread RegisteredThread structure to use for the current thread - * @param layer The KernelLayer object for the VDO device - **/ -static inline void registerThreadDevice(RegisteredThread *newThread, - KernelLayer *layer) -{ - registerThreadDeviceID(newThread, &layer->instance); -} - -/** - * Cancel registration of the current thread as being associated with - * a VDO device or device ID number. - **/ -void unregisterThreadDeviceID(void); - -/** - * Get the VDO device ID number temporarily associated with the - * current thread, if any. - * - * @return the device ID number, if any, or -1 - **/ -int getThreadDeviceID(void); - -/** - * Initialize the thread device-ID registry. - **/ -void initializeThreadDeviceRegistry(void); diff --git a/vdo/kernel/threadRegistry.c b/vdo/kernel/threadRegistry.c deleted file mode 100644 index 6184d3c..0000000 --- a/vdo/kernel/threadRegistry.c +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/threadRegistry.c#1 $ - */ - -#include "threadRegistry.h" - -#include -#include - -#include "permassert.h" - -/* - * We need to be careful when using other facilities that may use - * threadRegistry functions in their normal operation. For example, - * we do not want to invoke the logger while holding a lock. - */ - -/*****************************************************************************/ -void registerThread(ThreadRegistry *registry, - RegisteredThread *newThread, - const void *pointer) -{ - INIT_LIST_HEAD(&newThread->links); - newThread->pointer = pointer; - newThread->task = current; - - bool foundIt = false; - RegisteredThread *thread; - write_lock(®istry->lock); - list_for_each_entry(thread, ®istry->links, links) { - if (thread->task == current) { - // This should not have been there. - // We'll complain after releasing the lock. 
- list_del_init(&thread->links); - foundIt = true; - break; - } - } - list_add_tail(&newThread->links, ®istry->links); - write_unlock(®istry->lock); - ASSERT_LOG_ONLY(!foundIt, "new thread not already in registry"); -} - -/*****************************************************************************/ -void unregisterThread(ThreadRegistry *registry) -{ - bool foundIt = false; - RegisteredThread *thread; - write_lock(®istry->lock); - list_for_each_entry(thread, ®istry->links, links) { - if (thread->task == current) { - list_del_init(&thread->links); - foundIt = true; - break; - } - } - write_unlock(®istry->lock); - ASSERT_LOG_ONLY(foundIt, "thread found in registry"); -} - -/*****************************************************************************/ -void initializeThreadRegistry(ThreadRegistry *registry) -{ - INIT_LIST_HEAD(®istry->links); - rwlock_init(®istry->lock); -} - -/*****************************************************************************/ -const void *lookupThread(ThreadRegistry *registry) -{ - const void *result = NULL; - read_lock(®istry->lock); - RegisteredThread *thread; - list_for_each_entry(thread, ®istry->links, links) { - if (thread->task == current) { - result = thread->pointer; - break; - } - } - read_unlock(®istry->lock); - return result; -} diff --git a/vdo/kernel/threadRegistry.h b/vdo/kernel/threadRegistry.h deleted file mode 100644 index f32325e..0000000 --- a/vdo/kernel/threadRegistry.h +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/threadRegistry.h#1 $ - */ - -#ifndef THREAD_REGISTRY_H -#define THREAD_REGISTRY_H 1 - -#include -#include - -/* - * We don't expect this set to ever get really large, so a linked list - * is adequate. - */ - -typedef struct threadRegistry { - struct list_head links; - rwlock_t lock; -} ThreadRegistry; - -typedef struct registeredThread { - struct list_head links; - const void *pointer; - struct task_struct *task; -} RegisteredThread; - -/*****************************************************************************/ - -/** - * Initialize a registry of threads and associated data pointers. - * - * @param registry The registry to initialize - **/ -void initializeThreadRegistry(ThreadRegistry *registry); - -/** - * Register the current thread and associate it with a data pointer. - * - * This call will log messages if the thread is already registered. - * - * @param registry The thread registry - * @param newThread RegisteredThread structure to use for the current thread - * @param pointer The value to associated with the current thread - **/ -void registerThread(ThreadRegistry *registry, - RegisteredThread *newThread, - const void *pointer); - -/** - * Remove the registration for the current thread. 
- * - * A message may be logged if the thread was not registered. - * - * @param registry The thread registry - **/ -void unregisterThread(ThreadRegistry *registry); - -/** - * Fetch a pointer that may have been registered for the current - * thread. If the thread is not registered, a null pointer is - * returned. - * - * @param registry The thread registry - * - * @return the registered pointer, if any, or NULL - **/ -const void *lookupThread(ThreadRegistry *registry); - -#endif /* THREAD_REGISTRY_H */ diff --git a/vdo/kernel/threads.c b/vdo/kernel/threads.c deleted file mode 100644 index 2f905ed..0000000 --- a/vdo/kernel/threads.c +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/threads.c#1 $ - */ - -#include "threads.h" - -#include -#include - -/**********************************************************************/ -pid_t getThreadId(void) -{ - return in_interrupt() ? -1 : current->pid; -} diff --git a/vdo/kernel/threads.h b/vdo/kernel/threads.h deleted file mode 100644 index 25f8b47..0000000 --- a/vdo/kernel/threads.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/threads.h#1 $ - */ - -#ifndef THREADS_H -#define THREADS_H - -#include - -/** - * Return the id of the current thread. - * In kernel interrupt context, returns -1. - * - * @return the thread id - **/ -pid_t getThreadId(void) - __attribute__((warn_unused_result)); - -#endif /* THREADS_H */ diff --git a/vdo/kernel/udsIndex.c b/vdo/kernel/udsIndex.c deleted file mode 100644 index a202446..0000000 --- a/vdo/kernel/udsIndex.c +++ /dev/null @@ -1,835 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/udsIndex.c#16 $ - */ - -#include "udsIndex.h" - -#include "logger.h" -#include "memoryAlloc.h" -#include "murmur/MurmurHash3.h" -#include "numeric.h" -#include "stringUtils.h" -#include "uds-block.h" - -/*****************************************************************************/ - -typedef struct udsAttribute { - struct attribute attr; - const char *(*showString)(DedupeIndex *); -} UDSAttribute; - -/*****************************************************************************/ - -enum { UDS_Q_ACTION }; - -/*****************************************************************************/ - -// These are the values in the atomic dedupeContext.requestState field -enum { - // The UdsRequest object is not in use. - UR_IDLE = 0, - // The UdsRequest object is in use, and VDO is waiting for the result. - UR_BUSY = 1, - // The UdsRequest object is in use, but has timed out. - UR_TIMED_OUT = 2, -}; - -/*****************************************************************************/ - -typedef enum { - // The UDS index is closed - IS_CLOSED = 0, - // The UDS index session is opening or closing - IS_CHANGING = 1, - // The UDS index is open. There is a UDS index session. - IS_OPENED = 2, -} IndexState; - -/*****************************************************************************/ - -typedef struct udsIndex { - DedupeIndex common; - struct kobject dedupeObject; - RegisteredThread allocatingThread; - char *indexName; - UdsConfiguration configuration; - struct uds_parameters udsParams; - struct uds_index_session *indexSession; - atomic_t active; - // This spinlock protects the state fields and the starting of dedupe - // requests. 
- spinlock_t stateLock; - KvdoWorkItem workItem; // protected by stateLock - KvdoWorkQueue *udsQueue; // protected by stateLock - unsigned int maximum; // protected by stateLock - IndexState indexState; // protected by stateLock - IndexState indexTarget; // protected by stateLock - bool changing; // protected by stateLock - bool createFlag; // protected by stateLock - bool dedupeFlag; // protected by stateLock - bool deduping; // protected by stateLock - bool errorFlag; // protected by stateLock - bool suspended; // protected by stateLock - // This spinlock protects the pending list, the pending flag in each KVIO, - // and the timeout list. - spinlock_t pendingLock; - struct list_head pendingHead; // protected by pendingLock - struct timer_list pendingTimer; // protected by pendingLock - bool startedTimer; // protected by pendingLock -} UDSIndex; - -/*****************************************************************************/ - -// Version 1: user space albireo index (limited to 32 bytes) -// Version 2: kernel space albireo index (limited to 16 bytes) -enum { - UDS_ADVICE_VERSION = 2, - // version byte + state byte + 64-bit little-endian PBN - UDS_ADVICE_SIZE = 1 + 1 + sizeof(uint64_t), -}; - -/*****************************************************************************/ - - // We want to ensure that there is only one copy of the following constants. -static const char *CLOSED = "closed"; -static const char *CLOSING = "closing"; -static const char *ERROR = "error"; -static const char *OFFLINE = "offline"; -static const char *ONLINE = "online"; -static const char *OPENING = "opening"; -static const char *SUSPENDED = "suspended"; -static const char *UNKNOWN = "unknown"; - -/*****************************************************************************/ -static const char *indexStateToString(UDSIndex *index, IndexState state) -{ - if (index->suspended) { - return SUSPENDED; - } - - switch (state) { - case IS_CLOSED: - // Closed. The errorFlag tells if it is because of an error. - return index->errorFlag ? ERROR : CLOSED; - case IS_CHANGING: - // The indexTarget tells if we are opening or closing the index. - return index->indexTarget == IS_OPENED ? OPENING : CLOSING; - case IS_OPENED: - // Opened. The dedupeFlag tells if we are online or offline. - return index->dedupeFlag ? ONLINE : OFFLINE; - default: - return UNKNOWN; - } -} - -/** - * Encode VDO duplicate advice into the newMetadata field of a UDS request. - * - * @param request The UDS request to receive the encoding - * @param advice The advice to encode - **/ -static void encodeUDSAdvice(UdsRequest *request, DataLocation advice) -{ - size_t offset = 0; - struct udsChunkData *encoding = &request->newMetadata; - encoding->data[offset++] = UDS_ADVICE_VERSION; - encoding->data[offset++] = advice.state; - encodeUInt64LE(encoding->data, &offset, advice.pbn); - BUG_ON(offset != UDS_ADVICE_SIZE); -} - -/** - * Decode VDO duplicate advice from the oldMetadata field of a UDS request. 
- * - * @param request The UDS request containing the encoding - * @param advice The DataLocation to receive the decoded advice - * - * @return true if valid advice was found and decoded - **/ -static bool decodeUDSAdvice(const UdsRequest *request, DataLocation *advice) -{ - if ((request->status != UDS_SUCCESS) || !request->found) { - return false; - } - - size_t offset = 0; - const struct udsChunkData *encoding = &request->oldMetadata; - byte version = encoding->data[offset++]; - if (version != UDS_ADVICE_VERSION) { - logError("invalid UDS advice version code %u", version); - return false; - } - - advice->state = encoding->data[offset++]; - decodeUInt64LE(encoding->data, &offset, &advice->pbn); - BUG_ON(offset != UDS_ADVICE_SIZE); - return true; -} - -/*****************************************************************************/ -static void finishIndexOperation(UdsRequest *udsRequest) -{ - DataKVIO *dataKVIO = container_of(udsRequest, DataKVIO, - dedupeContext.udsRequest); - DedupeContext *dedupeContext = &dataKVIO->dedupeContext; - if (compareAndSwap32(&dedupeContext->requestState, UR_BUSY, UR_IDLE)) { - KVIO *kvio = dataKVIOAsKVIO(dataKVIO); - UDSIndex *index = container_of(kvio->layer->dedupeIndex, UDSIndex, common); - - spin_lock_bh(&index->pendingLock); - if (dedupeContext->isPending) { - list_del(&dedupeContext->pendingList); - dedupeContext->isPending = false; - } - spin_unlock_bh(&index->pendingLock); - - dedupeContext->status = udsRequest->status; - if ((udsRequest->type == UDS_POST) || (udsRequest->type == UDS_QUERY)) { - DataLocation advice; - if (decodeUDSAdvice(udsRequest, &advice)) { - setDedupeAdvice(dedupeContext, &advice); - } else { - setDedupeAdvice(dedupeContext, NULL); - } - } - invokeDedupeCallback(dataKVIO); - atomic_dec(&index->active); - } else { - compareAndSwap32(&dedupeContext->requestState, UR_TIMED_OUT, UR_IDLE); - } -} - -/*****************************************************************************/ -static void startExpirationTimer(UDSIndex *index, DataKVIO *dataKVIO) -{ - if (!index->startedTimer) { - index->startedTimer = true; - mod_timer(&index->pendingTimer, - getAlbireoTimeout(dataKVIO->dedupeContext.submissionTime)); - } -} - -/*****************************************************************************/ -static void startIndexOperation(KvdoWorkItem *item) -{ - KVIO *kvio = workItemAsKVIO(item); - DataKVIO *dataKVIO = kvioAsDataKVIO(kvio); - UDSIndex *index = container_of(kvio->layer->dedupeIndex, UDSIndex, common); - DedupeContext *dedupeContext = &dataKVIO->dedupeContext; - - spin_lock_bh(&index->pendingLock); - list_add_tail(&dedupeContext->pendingList, &index->pendingHead); - dedupeContext->isPending = true; - startExpirationTimer(index, dataKVIO); - spin_unlock_bh(&index->pendingLock); - - UdsRequest *udsRequest = &dedupeContext->udsRequest; - int status = udsStartChunkOperation(udsRequest); - if (status != UDS_SUCCESS) { - udsRequest->status = status; - finishIndexOperation(udsRequest); - } -} - -/*****************************************************************************/ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) -static void timeoutIndexOperations(struct timer_list *t) -#else -static void timeoutIndexOperations(unsigned long arg) -#endif -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) - UDSIndex *index = from_timer(index, t, pendingTimer); -#else - UDSIndex *index = (UDSIndex *) arg; -#endif - LIST_HEAD(expiredHead); - uint64_t timeoutJiffies = msecs_to_jiffies(albireoTimeoutInterval); - unsigned long 
earliestSubmissionAllowed = jiffies - timeoutJiffies; - spin_lock_bh(&index->pendingLock); - index->startedTimer = false; - while (!list_empty(&index->pendingHead)) { - DataKVIO *dataKVIO = list_first_entry(&index->pendingHead, DataKVIO, - dedupeContext.pendingList); - DedupeContext *dedupeContext = &dataKVIO->dedupeContext; - if (earliestSubmissionAllowed <= dedupeContext->submissionTime) { - startExpirationTimer(index, dataKVIO); - break; - } - list_del(&dedupeContext->pendingList); - dedupeContext->isPending = false; - list_add_tail(&dedupeContext->pendingList, &expiredHead); - } - spin_unlock_bh(&index->pendingLock); - while (!list_empty(&expiredHead)) { - DataKVIO *dataKVIO = list_first_entry(&expiredHead, DataKVIO, - dedupeContext.pendingList); - DedupeContext *dedupeContext = &dataKVIO->dedupeContext; - list_del(&dedupeContext->pendingList); - if (compareAndSwap32(&dedupeContext->requestState, - UR_BUSY, UR_TIMED_OUT)) { - dedupeContext->status = ETIMEDOUT; - invokeDedupeCallback(dataKVIO); - atomic_dec(&index->active); - kvdoReportDedupeTimeout(dataKVIOAsKVIO(dataKVIO)->layer, 1); - } - } -} - -/*****************************************************************************/ -static void enqueueIndexOperation(DataKVIO *dataKVIO, - UdsCallbackType operation) -{ - KVIO *kvio = dataKVIOAsKVIO(dataKVIO); - DedupeContext *dedupeContext = &dataKVIO->dedupeContext; - UDSIndex *index = container_of(kvio->layer->dedupeIndex, UDSIndex, common); - dedupeContext->status = UDS_SUCCESS; - dedupeContext->submissionTime = jiffies; - if (compareAndSwap32(&dedupeContext->requestState, UR_IDLE, UR_BUSY)) { - UdsRequest *udsRequest = &dataKVIO->dedupeContext.udsRequest; - udsRequest->chunkName = *dedupeContext->chunkName; - udsRequest->callback = finishIndexOperation; - udsRequest->session = index->indexSession; - udsRequest->type = operation; - udsRequest->update = true; - if ((operation == UDS_POST) || (operation == UDS_UPDATE)) { - encodeUDSAdvice(udsRequest, getDedupeAdvice(dedupeContext)); - } - - setupWorkItem(&kvio->enqueueable.workItem, startIndexOperation, NULL, - UDS_Q_ACTION); - - spin_lock(&index->stateLock); - if (index->deduping) { - enqueueWorkQueue(index->udsQueue, &kvio->enqueueable.workItem); - unsigned int active = atomic_inc_return(&index->active); - if (active > index->maximum) { - index->maximum = active; - } - kvio = NULL; - } else { - atomicStore32(&dedupeContext->requestState, UR_IDLE); - } - spin_unlock(&index->stateLock); - } else { - // A previous user of the KVIO had a dedupe timeout - // and its request is still outstanding. - atomic64_inc(&kvio->layer->dedupeContextBusy); - } - if (kvio != NULL) { - invokeDedupeCallback(dataKVIO); - } -} - -/*****************************************************************************/ -static void closeIndex(UDSIndex *index) -{ - // Change the index state so that getIndexStatistics will not try to - // use the index session we are closing. - index->indexState = IS_CHANGING; - spin_unlock(&index->stateLock); - int result = udsCloseIndex(index->indexSession); - if (result != UDS_SUCCESS) { - logErrorWithStringError(result, "Error closing index %s", - index->indexName); - } - spin_lock(&index->stateLock); - index->indexState = IS_CLOSED; - index->errorFlag |= result != UDS_SUCCESS; - // ASSERTION: We leave in IS_CLOSED state. -} - -/*****************************************************************************/ -static void openIndex(UDSIndex *index) -{ - // ASSERTION: We enter in IS_CLOSED state. 
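  // Like closeIndex(), this runs on the dedupe work queue from
  // changeDedupeState() and is entered with stateLock held.  The lock is
  // dropped around the (potentially slow) udsOpenIndex() call and
  // reacquired before the state fields are updated: IS_OPENED on success,
  // IS_CLOSED otherwise.  A failed load that reports UDS_CORRUPT_COMPONENT
  // or UDS_NO_INDEX sets createFlag again so that the next pass of the
  // changeDedupeState() loop retries with UDS_CREATE.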
- bool createFlag = index->createFlag; - index->createFlag = false; - // Change the index state so that the it will be reported to the outside - // world as "opening". - index->indexState = IS_CHANGING; - index->errorFlag = false; - // Open the index session, while not holding the stateLock - spin_unlock(&index->stateLock); - - int result = udsOpenIndex(createFlag ? UDS_CREATE : UDS_LOAD, - index->indexName, &index->udsParams, - index->configuration, index->indexSession); - if (result != UDS_SUCCESS) { - logErrorWithStringError(result, "Error opening index %s", - index->indexName); - } - spin_lock(&index->stateLock); - if (!createFlag) { - switch (result) { - case UDS_CORRUPT_COMPONENT: - case UDS_NO_INDEX: - // Either there is no index, or there is no way we can recover the index. - // We will be called again and try to create a new index. - index->indexState = IS_CLOSED; - index->createFlag = true; - return; - default: - break; - } - } - if (result == UDS_SUCCESS) { - index->indexState = IS_OPENED; - } else { - index->indexState = IS_CLOSED; - index->indexTarget = IS_CLOSED; - index->errorFlag = true; - spin_unlock(&index->stateLock); - logInfo("Setting UDS index target state to error"); - spin_lock(&index->stateLock); - } - // ASSERTION: On success, we leave in IS_OPEN state. - // ASSERTION: On failure, we leave in IS_CLOSED state. -} - -/*****************************************************************************/ -static void changeDedupeState(KvdoWorkItem *item) -{ - UDSIndex *index = container_of(item, UDSIndex, workItem); - spin_lock(&index->stateLock); - // Loop until the index is in the target state and the create flag is - // clear. - while (!index->suspended && - ((index->indexState != index->indexTarget) || - index->createFlag)) { - if (index->indexState == IS_OPENED) { - closeIndex(index); - } else { - openIndex(index); - } - } - index->changing = false; - index->deduping = index->dedupeFlag && (index->indexState == IS_OPENED); - spin_unlock(&index->stateLock); -} - - -/*****************************************************************************/ -static void launchDedupeStateChange(UDSIndex *index) -{ - // ASSERTION: We enter with the state_lock held. - if (index->changing || index->suspended) { - // Either a change is already in progress, or changes are - // not allowed. - return; - } - - if (index->createFlag || - (index->indexState != index->indexTarget)) { - index->changing = true; - index->deduping = false; - setupWorkItem(&index->workItem, - changeDedupeState, - NULL, - UDS_Q_ACTION); - enqueueWorkQueue(index->udsQueue, &index->workItem); - return; - } - - // Online vs. offline changes happen immediately - index->deduping = (index->dedupeFlag && !index->suspended && - (index->indexState == IS_OPENED)); - - // ASSERTION: We exit with the state_lock held. 
-} - -/*****************************************************************************/ -static void setTargetState(UDSIndex *index, - IndexState target, - bool changeDedupe, - bool dedupe, - bool setCreate) -{ - spin_lock(&index->stateLock); - const char *oldState = indexStateToString(index, index->indexTarget); - if (changeDedupe) { - index->dedupeFlag = dedupe; - } - if (setCreate) { - index->createFlag = true; - } - index->indexTarget = target; - launchDedupeStateChange(index); - const char *newState = indexStateToString(index, index->indexTarget); - spin_unlock(&index->stateLock); - if (oldState != newState) { - logInfo("Setting UDS index target state to %s", newState); - } -} - -/*****************************************************************************/ -static void suspendUDSIndex(DedupeIndex *dedupeIndex, bool saveFlag) -{ - UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); - spin_lock(&index->stateLock); - index->suspended = true; - IndexState indexState = index->indexState; - spin_unlock(&index->stateLock); - if (indexState != IS_CLOSED) { - int result = udsSuspendIndexSession(index->indexSession, saveFlag); - if (result != UDS_SUCCESS) { - logErrorWithStringError(result, "Error suspending dedupe index"); - } - } -} - -/*****************************************************************************/ -static void resumeUDSIndex(DedupeIndex *dedupeIndex) -{ - UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); - int result = udsResumeIndexSession(index->indexSession); - if (result != UDS_SUCCESS) { - logErrorWithStringError(result, "Error resuming dedupe index"); - } - spin_lock(&index->stateLock); - index->suspended = false; - launchDedupeStateChange(index); - spin_unlock(&index->stateLock); -} - -/*****************************************************************************/ - -/*****************************************************************************/ -static void dumpUDSIndex(DedupeIndex *dedupeIndex, bool showQueue) -{ - UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); - spin_lock(&index->stateLock); - const char *state = indexStateToString(index, index->indexState); - const char *target = (index->changing - ? 
indexStateToString(index, index->indexTarget) - : NULL); - spin_unlock(&index->stateLock); - logInfo("UDS index: state: %s", state); - if (target != NULL) { - logInfo("UDS index: changing to state: %s", target); - } - if (showQueue) { - dumpWorkQueue(index->udsQueue); - } -} - -/*****************************************************************************/ -static void finishUDSIndex(DedupeIndex *dedupeIndex) -{ - UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); - setTargetState(index, IS_CLOSED, false, false, false); - udsDestroyIndexSession(index->indexSession); - finishWorkQueue(index->udsQueue); -} - -/*****************************************************************************/ -static void freeUDSIndex(DedupeIndex *dedupeIndex) -{ - UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); - freeWorkQueue(&index->udsQueue); - spin_lock_bh(&index->pendingLock); - if (index->startedTimer) { - del_timer_sync(&index->pendingTimer); - } - spin_unlock_bh(&index->pendingLock); - kobject_put(&index->dedupeObject); -} - -/*****************************************************************************/ -static const char *getUDSStateName(DedupeIndex *dedupeIndex) -{ - UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); - spin_lock(&index->stateLock); - const char *state = indexStateToString(index, index->indexState); - spin_unlock(&index->stateLock); - return state; -} - -/*****************************************************************************/ -static void getUDSStatistics(DedupeIndex *dedupeIndex, IndexStatistics *stats) -{ - UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); - spin_lock(&index->stateLock); - IndexState indexState = index->indexState; - stats->maxDedupeQueries = index->maximum; - spin_unlock(&index->stateLock); - stats->currDedupeQueries = atomic_read(&index->active); - if (indexState == IS_OPENED) { - UdsIndexStats indexStats; - int result = udsGetIndexStats(index->indexSession, &indexStats); - if (result == UDS_SUCCESS) { - stats->entriesIndexed = indexStats.entriesIndexed; - } else { - logErrorWithStringError(result, "Error reading index stats"); - } - UdsContextStats contextStats; - result = udsGetIndexSessionStats(index->indexSession, &contextStats); - if (result == UDS_SUCCESS) { - stats->postsFound = contextStats.postsFound; - stats->postsNotFound = contextStats.postsNotFound; - stats->queriesFound = contextStats.queriesFound; - stats->queriesNotFound = contextStats.queriesNotFound; - stats->updatesFound = contextStats.updatesFound; - stats->updatesNotFound = contextStats.updatesNotFound; - } else { - logErrorWithStringError(result, "Error reading context stats"); - } - } -} - - -/*****************************************************************************/ -static int processMessage(DedupeIndex *dedupeIndex, const char *name) -{ - UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); - if (strcasecmp(name, "index-close") == 0) { - setTargetState(index, IS_CLOSED, false, false, false); - return 0; - } else if (strcasecmp(name, "index-create") == 0) { - setTargetState(index, IS_OPENED, false, false, true); - return 0; - } else if (strcasecmp(name, "index-disable") == 0) { - setTargetState(index, IS_OPENED, true, false, false); - return 0; - } else if (strcasecmp(name, "index-enable") == 0) { - setTargetState(index, IS_OPENED, true, true, false); - return 0; - } - return -EINVAL; -} - -/*****************************************************************************/ -static void udsPost(DataKVIO *dataKVIO) -{ - 
enqueueIndexOperation(dataKVIO, UDS_POST); -} - -/*****************************************************************************/ -static void udsQuery(DataKVIO *dataKVIO) -{ - enqueueIndexOperation(dataKVIO, UDS_QUERY); -} - -/*****************************************************************************/ -static void startUDSIndex(DedupeIndex *dedupeIndex, bool createFlag) -{ - UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); - setTargetState(index, IS_OPENED, true, true, createFlag); -} - -/*****************************************************************************/ -static void stopUDSIndex(DedupeIndex *dedupeIndex) -{ - UDSIndex *index = container_of(dedupeIndex, UDSIndex, common); - setTargetState(index, IS_CLOSED, false, false, false); -} - -/*****************************************************************************/ -static void udsUpdate(DataKVIO *dataKVIO) -{ - enqueueIndexOperation(dataKVIO, UDS_UPDATE); -} - -/*****************************************************************************/ -static void dedupeKobjRelease(struct kobject *kobj) -{ - UDSIndex *index = container_of(kobj, UDSIndex, dedupeObject); - udsFreeConfiguration(index->configuration); - FREE(index->indexName); - FREE(index); -} - -/*****************************************************************************/ -static ssize_t dedupeStatusShow(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - UDSAttribute *ua = container_of(attr, UDSAttribute, attr); - UDSIndex *index = container_of(kobj, UDSIndex, dedupeObject); - if (ua->showString != NULL) { - return sprintf(buf, "%s\n", ua->showString(&index->common)); - } else { - return -EINVAL; - } -} - -/*****************************************************************************/ -static ssize_t dedupeStatusStore(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t length) -{ - return -EINVAL; -} - -/*****************************************************************************/ - -static struct sysfs_ops dedupeSysfsOps = { - .show = dedupeStatusShow, - .store = dedupeStatusStore, -}; - -static UDSAttribute dedupeStatusAttribute = { - .attr = {.name = "status", .mode = 0444, }, - .showString = getUDSStateName, -}; - -static struct attribute *dedupeAttributes[] = { - &dedupeStatusAttribute.attr, - NULL, -}; - -static struct kobj_type dedupeKobjType = { - .release = dedupeKobjRelease, - .sysfs_ops = &dedupeSysfsOps, - .default_attrs = dedupeAttributes, -}; - -/*****************************************************************************/ -static void startUDSQueue(void *ptr) -{ - /* - * Allow the UDS dedupe worker thread to do memory allocations. It will - * only do allocations during the UDS calls that open or close an index, - * but those allocations can safely sleep while reserving a large amount - * of memory. We could use an allocationsAllowed boolean (like the base - * threads do), but it would be an unnecessary embellishment. 
- */ - UDSIndex *index = ptr; - registerAllocatingThread(&index->allocatingThread, NULL); -} - -/*****************************************************************************/ -static void finishUDSQueue(void *ptr) -{ - unregisterAllocatingThread(); -} - -/*****************************************************************************/ -int makeUDSIndex(KernelLayer *layer, DedupeIndex **indexPtr) -{ - UDSIndex *index; - int result = ALLOCATE(1, UDSIndex, "UDS index data", &index); - if (result != UDS_SUCCESS) { - return result; - } - - result = allocSprintf("index name", &index->indexName, - "dev=%s offset=4096 size=%llu", - layer->deviceConfig->parentDeviceName, - getIndexRegionSize(layer->geometry) * VDO_BLOCK_SIZE); - if (result != UDS_SUCCESS) { - logError("Creating index name failed (%d)", result); - FREE(index); - return result; - } - - index->udsParams = (struct uds_parameters) UDS_PARAMETERS_INITIALIZER; - indexConfigToUdsParameters(&layer->geometry.indexConfig, &index->udsParams); - result = indexConfigToUdsConfiguration(&layer->geometry.indexConfig, - &index->configuration); - if (result != VDO_SUCCESS) { - FREE(index->indexName); - FREE(index); - return result; - } - udsConfigurationSetNonce(index->configuration, - (UdsNonce) layer->geometry.nonce); - - result = udsCreateIndexSession(&index->indexSession); - if (result != UDS_SUCCESS) { - udsFreeConfiguration(index->configuration); - FREE(index->indexName); - FREE(index); - return result; - } - - static const KvdoWorkQueueType udsQueueType = { - .start = startUDSQueue, - .finish = finishUDSQueue, - .actionTable = { - { .name = "uds_action", .code = UDS_Q_ACTION, .priority = 0 }, - }, - }; - result = makeWorkQueue(layer->threadNamePrefix, "dedupeQ", - &layer->wqDirectory, layer, index, &udsQueueType, 1, - &index->udsQueue); - if (result != VDO_SUCCESS) { - logError("UDS index queue initialization failed (%d)", result); - udsDestroyIndexSession(index->indexSession); - udsFreeConfiguration(index->configuration); - FREE(index->indexName); - FREE(index); - return result; - } - - kobject_init(&index->dedupeObject, &dedupeKobjType); - result = kobject_add(&index->dedupeObject, &layer->kobj, "dedupe"); - if (result != VDO_SUCCESS) { - freeWorkQueue(&index->udsQueue); - udsDestroyIndexSession(index->indexSession); - udsFreeConfiguration(index->configuration); - FREE(index->indexName); - FREE(index); - return result; - } - - index->common.dump = dumpUDSIndex; - index->common.free = freeUDSIndex; - index->common.getDedupeStateName = getUDSStateName; - index->common.getStatistics = getUDSStatistics; - index->common.message = processMessage; - index->common.post = udsPost; - index->common.query = udsQuery; - index->common.resume = resumeUDSIndex; - index->common.start = startUDSIndex; - index->common.stop = stopUDSIndex; - index->common.suspend = suspendUDSIndex; - index->common.finish = finishUDSIndex; - index->common.update = udsUpdate; - - INIT_LIST_HEAD(&index->pendingHead); - spin_lock_init(&index->pendingLock); - spin_lock_init(&index->stateLock); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) - timer_setup(&index->pendingTimer, timeoutIndexOperations, 0); -#else - setup_timer(&index->pendingTimer, timeoutIndexOperations, - (unsigned long) index); -#endif - - *indexPtr = &index->common; - return VDO_SUCCESS; -} diff --git a/vdo/kernel/udsIndex.h b/vdo/kernel/udsIndex.h deleted file mode 100644 index 19a7470..0000000 --- a/vdo/kernel/udsIndex.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/udsIndex.h#1 $ - */ - -#ifndef UDS_INDEX_H -#define UDS_INDEX_H - -#include "dedupeIndex.h" - -/** - * Make a UDS index - * - * @param layer the kernel layer - * @param indexPtr dedupe index returned here - * - * @return VDO_SUCCESS or an error code - **/ -int makeUDSIndex(KernelLayer *layer, DedupeIndex **indexPtr) - __attribute__ ((__warn_unused_result__)); - -#endif /* UDS_INDEX_H */ diff --git a/vdo/kernel/vdoCommon.h b/vdo/kernel/vdoCommon.h deleted file mode 100644 index c83e066..0000000 --- a/vdo/kernel/vdoCommon.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/vdoCommon.h#1 $ - */ - -#ifndef VDO_COMMON_H -#define VDO_COMMON_H - -enum { - // Whether the bio acknowledgement queue is used for acks of reads. - USE_BIO_ACK_QUEUE_FOR_READ = 0, -}; - -#endif /* VDO_COMMON_H */ diff --git a/vdo/kernel/vdoStringUtils.c b/vdo/kernel/vdoStringUtils.c deleted file mode 100644 index d12580c..0000000 --- a/vdo/kernel/vdoStringUtils.c +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/vdoStringUtils.c#1 $ - */ - -#include "vdoStringUtils.h" - -#include "errors.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "stringUtils.h" - -#include "statusCodes.h" - -/**********************************************************************/ -char *vAppendToBuffer(char *buffer, - char *bufEnd, - const char *fmt, - va_list args) -{ - size_t n = vsnprintf(buffer, bufEnd - buffer, fmt, args); - if (n >= (size_t) (bufEnd - buffer)) { - buffer = bufEnd; - } else { - buffer += n; - } - return buffer; -} - -/**********************************************************************/ -char *appendToBuffer(char *buffer, char *bufEnd, const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - char *pos = vAppendToBuffer(buffer, bufEnd, fmt, ap); - va_end(ap); - return pos; -} - -/**********************************************************************/ -void freeStringArray(char **stringArray) -{ - for (unsigned int offset = 0; stringArray[offset] != NULL; offset++) { - FREE(stringArray[offset]); - } - FREE(stringArray); -} - -/**********************************************************************/ -int splitString(const char *string, char separator, char ***substringArrayPtr) -{ - unsigned int substringCount = 1; - for (const char *s = string; *s != 0; s++) { - if (*s == separator) { - substringCount++; - } - } - - char **substrings; - int result = ALLOCATE(substringCount + 1, char *, "string-splitting array", - &substrings); - if (result != UDS_SUCCESS) { - return result; - } - unsigned int currentSubstring = 0; - for (const char *s = string; *s != 0; s++) { - if (*s == separator) { - ptrdiff_t length = s - string; - result = ALLOCATE(length + 1, char, "split string", - &substrings[currentSubstring]); - if (result != UDS_SUCCESS) { - freeStringArray(substrings); - return result; - } - // Trailing NUL is already in place after allocation; deal with - // the zero or more non-NUL bytes in the string. - if (length > 0) { - memcpy(substrings[currentSubstring], string, length); - } - string = s + 1; - currentSubstring++; - BUG_ON(currentSubstring >= substringCount); - } - } - // Process final string, with no trailing separator. 
- BUG_ON(currentSubstring != (substringCount - 1)); - ptrdiff_t length = strlen(string); - result = ALLOCATE(length + 1, char, "split string", - &substrings[currentSubstring]); - if (result != UDS_SUCCESS) { - freeStringArray(substrings); - return result; - } - memcpy(substrings[currentSubstring], string, length); - currentSubstring++; - // substrings[currentSubstring] is NULL already - *substringArrayPtr = substrings; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int joinStrings(char **substringArray, - size_t arrayLength, - char separator, - char **stringPtr) -{ - size_t stringLength = 0; - for (size_t i = 0; (i < arrayLength) && (substringArray[i] != NULL); i++) { - stringLength += strlen(substringArray[i]) + 1; - } - - char *output; - int result = ALLOCATE(stringLength, char, __func__, &output); - if (result != VDO_SUCCESS) { - return result; - } - - char *currentPosition = &output[0]; - for (size_t i = 0; (i < arrayLength) && (substringArray[i] != NULL); i++) { - currentPosition = appendToBuffer(currentPosition, output + stringLength, - "%s", substringArray[i]); - *currentPosition = separator; - currentPosition++; - } - - // We output one too many separators; replace the last with a zero byte. - if (currentPosition != output) { - *(currentPosition - 1) = '\0'; - } - - *stringPtr = output; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int stringToUInt(const char *input, unsigned int *valuePtr) -{ - unsigned long longValue; - int result = kstrtoul(input, 10, &longValue); - if (result != 0) { - return result; - } - - if (longValue > UINT_MAX) { - return -ERANGE; - } - - *valuePtr = longValue; - return UDS_SUCCESS; -} diff --git a/vdo/kernel/vdoStringUtils.h b/vdo/kernel/vdoStringUtils.h deleted file mode 100644 index 067ed9e..0000000 --- a/vdo/kernel/vdoStringUtils.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/vdoStringUtils.h#1 $ - */ - -#ifndef VDO_STRING_UTILS_H -#define VDO_STRING_UTILS_H - -#include -#include - -/** - * Helper to append a string to a buffer. - * - * @param buffer the place at which to append the string - * @param bufEnd pointer to the end of the buffer - * @param fmt a printf format string - * - * @return the updated buffer position after the append - * - * if insufficient space is available, the contents are silently truncated - **/ -char *appendToBuffer(char *buffer, char *bufEnd, const char *fmt, ...); - -/** - * Variable-arglist helper to append a string to a buffer. - * If insufficient space is available, the contents are silently truncated. 
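// vAppendToBuffer() above advances a cursor by however much vsnprintf()
// reported, clamping the cursor at the end of the buffer when the output is
// truncated.  A standalone user-space sketch of the same cursor-and-clamp
// technique; the helper and variable names are invented for illustration:
#include <stdarg.h>
#include <stdio.h>

static char *append(char *pos, char *end, const char *fmt, ...)
{
  va_list args;
  va_start(args, fmt);
  int n = vsnprintf(pos, end - pos, fmt, args);
  va_end(args);
  // vsnprintf() returns the length it wanted to write; clamp the cursor at
  // 'end' so that later calls quietly become no-ops instead of overflowing.
  return (n < 0 || n >= end - pos) ? end : pos + n;
}

int main(void)
{
  char buf[16];
  char *end = buf + sizeof(buf);
  char *pos = buf;
  pos = append(pos, end, "%s=", "key");
  pos = append(pos, end, "%d", 1234);
  printf("%s (used %zu bytes)\n", buf, (size_t) (pos - buf));
  return 0;
}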
- * - * @param buffer the place at which to append the string - * @param bufEnd pointer to the end of the buffer - * @param fmt a printf format string - * @param args printf arguments - * - * @return the updated buffer position after the append - **/ -char *vAppendToBuffer(char *buffer, - char *bufEnd, - const char *fmt, - va_list args); - -/** - * Split the input string into substrings, separated at occurrences of - * the indicated character, returning a null-terminated list of string - * pointers. - * - * The string pointers and the pointer array itself should both be - * freed with FREE() when no longer needed. This can be done with - * freeStringArray (below) if the pointers in the array are not - * changed. Since the array and copied strings are allocated by this - * function, it may only be used in contexts where allocation is - * permitted. - * - * Empty substrings are not ignored; that is, returned substrings may - * be empty strings if the separator occurs twice in a row. - * - * @param [in] string The input string to be broken apart - * @param [in] separator The separator character - * @param [out] substringArrayPtr The NULL-terminated substring array - * - * @return UDS_SUCCESS or -ENOMEM - **/ -int splitString(const char *string, char separator, char ***substringArrayPtr) - __attribute__((warn_unused_result)); - -/** - * Join the input substrings into one string, joined with the indicated - * character, returning a string. - * - * @param [in] substringArray The NULL-terminated substring array - * @param [in] arrayLength A bound on the number of valid elements - * in substringArray, in case it is not - * NULL-terminated. - * @param [in] separator The separator character - * @param [out] stringPtr A pointer to hold the joined string - * - * @return VDO_SUCCESS or an error - **/ -int joinStrings(char **substringArray, - size_t arrayLength, - char separator, - char **stringPtr) - __attribute__((warn_unused_result)); - -/** - * Free a list of non-NULL string pointers, and then the list itself. - * - * @param stringArray The string list - **/ -void freeStringArray(char **stringArray); - -/** - * Parse a string as an "unsigned int" value, yielding the value. - * On overflow, -ERANGE is returned. On invalid number, -EINVAL is - * returned. - * - * @param [in] input The string to be processed - * @param [out] valuePtr The value of the number read - * - * @return UDS_SUCCESS or -EINVAL or -ERANGE. - **/ -int stringToUInt(const char *input, unsigned int *valuePtr) - __attribute__((warn_unused_result)); - -#endif /* VDO_STRING_UTILS_H */ diff --git a/vdo/kernel/verify.c b/vdo/kernel/verify.c deleted file mode 100644 index 672ac91..0000000 --- a/vdo/kernel/verify.c +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/verify.c#3 $ - */ - -#include "verify.h" - -#include "logger.h" - -#include "dataKVIO.h" -#include "numeric.h" - -/** - * Compare blocks of memory for equality. - * - * This assumes the blocks are likely to be large; it's not well - * optimized for comparing just a few bytes. This is desirable - * because the Linux kernel memcmp() routine on x86 is not well - * optimized for large blocks, and the performance penalty turns out - * to be significant if you're doing lots of 4KB comparisons. - * - * @param pointerArgument1 first data block - * @param pointerArgument2 second data block - * @param length length of the data block - * - * @return true iff the two blocks are equal - **/ -__attribute__((warn_unused_result)) -static bool memoryEqual(void *pointerArgument1, - void *pointerArgument2, - size_t length) -{ - byte *pointer1 = pointerArgument1; - byte *pointer2 = pointerArgument2; - while (length >= sizeof(uint64_t)) { - /* - * GET_UNALIGNED is just for paranoia. (1) On x86_64 it is - * treated the same as an aligned access. (2) In this use case, - * one or both of the inputs will almost(?) always be aligned. - */ - if (GET_UNALIGNED(uint64_t, pointer1) - != GET_UNALIGNED(uint64_t, pointer2)) { - return false; - } - pointer1 += sizeof(uint64_t); - pointer2 += sizeof(uint64_t); - length -= sizeof(uint64_t); - } - while (length > 0) { - if (*pointer1 != *pointer2) { - return false; - } - pointer1++; - pointer2++; - length--; - } - return true; -} - -/** - * Verify the Albireo-provided deduplication advice, and invoke a - * callback once the answer is available. - * - * After we've compared the stored data with the data to be written, - * or after we've failed to be able to do so, the stored VIO callback - * is queued to be run in the main (kvdoReqQ) thread. - * - * If the advice turns out to be stale and the deduplication session - * is still active, submit a correction. (Currently the correction - * must be sent before the callback can be invoked, if the dedupe - * session is still live.) - * - * @param item The workitem from the queue - **/ -static void verifyDuplicationWork(KvdoWorkItem *item) -{ - DataKVIO *dataKVIO = workItemAsDataKVIO(item); - dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION("$F;j=dedupe;cb=verify")); - - if (likely(memoryEqual(dataKVIO->dataBlock, dataKVIO->readBlock.data, - VDO_BLOCK_SIZE))) { - // Leave dataKVIO->dataVIO.isDuplicate set to true. - } else { - dataKVIO->dataVIO.isDuplicate = false; - } - - kvdoEnqueueDataVIOCallback(dataKVIO); -} - -/** - * Verify the Albireo-provided deduplication advice, and invoke a - * callback once the answer is available. - * - * @param dataKVIO The DataKVIO that we are looking to dedupe. 
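// memoryEqual() above compares blocks a uint64_t at a time rather than byte
// by byte, which matters when comparing many 4KB blocks.  A portable
// user-space sketch of the same idea, using memcpy() for the possibly
// unaligned word loads; the function name is invented for illustration:
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static bool blocks_equal(const void *a, const void *b, size_t length)
{
  const unsigned char *p1 = a;
  const unsigned char *p2 = b;
  while (length >= sizeof(uint64_t)) {
    uint64_t w1, w2;
    memcpy(&w1, p1, sizeof(w1));   // memcpy() is safe for unaligned reads
    memcpy(&w2, p2, sizeof(w2));
    if (w1 != w2) {
      return false;
    }
    p1 += sizeof(uint64_t);
    p2 += sizeof(uint64_t);
    length -= sizeof(uint64_t);
  }
  while (length-- > 0) {
    if (*p1++ != *p2++) {
      return false;
    }
  }
  return true;
}

int main(void)
{
  char x[4096] = { 0 }, y[4096] = { 0 };
  y[4095] = 1;
  printf("%d %d\n", blocks_equal(x, y, 4095), blocks_equal(x, y, 4096));
  return 0;   // prints "1 0"
}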
- **/ -static void verifyReadBlockCallback(DataKVIO *dataKVIO) -{ - dataKVIOAddTraceRecord(dataKVIO, THIS_LOCATION(NULL)); - int err = dataKVIO->readBlock.status; - if (unlikely(err != 0)) { - logDebug("%s: err %d", __func__, err); - dataKVIO->dataVIO.isDuplicate = false; - kvdoEnqueueDataVIOCallback(dataKVIO); - return; - } - - launchDataKVIOOnCPUQueue(dataKVIO, verifyDuplicationWork, NULL, - CPU_Q_ACTION_COMPRESS_BLOCK); -} - -/**********************************************************************/ -void kvdoVerifyDuplication(DataVIO *dataVIO) -{ - ASSERT_LOG_ONLY(dataVIO->isDuplicate, "advice to verify must be valid"); - ASSERT_LOG_ONLY(dataVIO->duplicate.state != MAPPING_STATE_UNMAPPED, - "advice to verify must not be a discard"); - ASSERT_LOG_ONLY(dataVIO->duplicate.pbn != ZERO_BLOCK, - "advice to verify must not point to the zero block"); - ASSERT_LOG_ONLY(!dataVIO->isZeroBlock, - "zeroed block should not have advice to verify"); - - TraceLocation location - = THIS_LOCATION("verifyDuplication;dup=update(verify);io=verify"); - dataVIOAddTraceRecord(dataVIO, location); - kvdoReadBlock(dataVIO, dataVIO->duplicate.pbn, dataVIO->duplicate.state, - BIO_Q_ACTION_VERIFY, verifyReadBlockCallback); -} - -/**********************************************************************/ -bool kvdoCompareDataVIOs(DataVIO *first, DataVIO *second) -{ - dataVIOAddTraceRecord(second, THIS_LOCATION(NULL)); - DataKVIO *a = dataVIOAsDataKVIO(first); - DataKVIO *b = dataVIOAsDataKVIO(second); - return memoryEqual(a->dataBlock, b->dataBlock, VDO_BLOCK_SIZE); -} diff --git a/vdo/kernel/verify.h b/vdo/kernel/verify.h deleted file mode 100644 index 5b03dd7..0000000 --- a/vdo/kernel/verify.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/verify.h#1 $ - */ - -#include "kernelLayer.h" - -/** - * Verify the Albireo-provided deduplication advice, and invoke a callback once - * the answer is available. This is done through a call to kvdoReadBlock() - * which will eventually call back to verifyDuplication() once the block is - * read and possibly uncompressed. - * - * @param dataVIO The DataVIO with advice filled in. - **/ -void kvdoVerifyDuplication(DataVIO *dataVIO); - -/** - * Implements DataVIOComparator. - * - * @param first The first DataVIO to compare - * @param second The second DataVIO to compare - * - * @return true if the contents of the two DataVIOs are the same - **/ -bool kvdoCompareDataVIOs(DataVIO *first, DataVIO *second) - __attribute__((warn_unused_result)); diff --git a/vdo/kernel/workItemStats.c b/vdo/kernel/workItemStats.c deleted file mode 100644 index 2027cd8..0000000 --- a/vdo/kernel/workItemStats.c +++ /dev/null @@ -1,357 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workItemStats.c#4 $ - */ - -#include "workItemStats.h" - -#include "atomic.h" -#include "logger.h" - -/** - * Scan the work queue stats table for the provided work function and - * priority value. If it's not found, see if an empty slot is - * available. - * - * @param table The work queue's function table - * @param work The function we want to record stats for - * @param priority The priority of the work item - * - * @return The index of the slot to use (matching or empty), or - * NUM_WORK_QUEUE_ITEM_STATS if the table is full of - * non-matching entries. - **/ -static inline unsigned int scanStatTable(const KvdoWorkFunctionTable *table, - KvdoWorkFunction work, - unsigned int priority) -{ - unsigned int i; - /* - * See comments in getStatTableIndex regarding order of memory - * accesses. Work function first, then a barrier, then priority. - */ - for (i = 0; i < NUM_WORK_QUEUE_ITEM_STATS; i++) { - if (table->functions[i] == NULL) { - return i; - } else if (table->functions[i] == work) { - smp_rmb(); - if (table->priorities[i] == priority) { - return i; - } - } - } - return NUM_WORK_QUEUE_ITEM_STATS; -} - -/** - * Scan the work queue stats table for the provided work function and - * priority value. Assign an empty slot if necessary. - * - * @param stats The stats structure - * @param work The function we want to record stats for - * @param priority The priority of the work item - * - * @return The index of the matching slot, or NUM_WORK_QUEUE_ITEM_STATS - * if the table is full of non-matching entries. - **/ -static unsigned int getStatTableIndex(KvdoWorkItemStats *stats, - KvdoWorkFunction work, - unsigned int priority) -{ - KvdoWorkFunctionTable *functionTable = &stats->functionTable; - - unsigned int index = scanStatTable(functionTable, work, priority); - if (unlikely(index == NUM_WORK_QUEUE_ITEM_STATS) - || likely(functionTable->functions[index] != NULL)) { - return index; - } - - unsigned long flags = 0; - // The delayed-work-item processing uses queue->lock in some cases, - // and one case may call into this function, so we can't reuse - // queue->lock here. - spin_lock_irqsave(&functionTable->lock, flags); - // Recheck now that we've got the lock... - index = scanStatTable(functionTable, work, priority); - if ((index == NUM_WORK_QUEUE_ITEM_STATS) - || (functionTable->functions[index] != NULL)) { - spin_unlock_irqrestore(&functionTable->lock, flags); - return index; - } - - /* - * An uninitialized priority is indistinguishable from a zero - * priority. So store the priority first, and enforce the ordering, - * so that a non-null work function pointer indicates we've finished - * filling in the value. 
(And, to make this work, we have to read - * the work function first and priority second, when comparing.) - */ - functionTable->priorities[index] = priority; - smp_wmb(); - functionTable->functions[index] = work; - spin_unlock_irqrestore(&functionTable->lock, flags); - return index; -} - -/** - * Get counters on work items, identified by index into the internal - * array. - * - * @param [in] stats The collected statistics - * @param [in] index The index - * @param [out] enqueuedPtr The total work items enqueued - * @param [out] processedPtr The number of work items processed - * @param [out] pendingPtr The number of work items still pending - **/ -static void getWorkItemCountsByItem(const KvdoWorkItemStats *stats, - unsigned int index, - uint64_t *enqueuedPtr, - uint64_t *processedPtr, - unsigned int *pendingPtr) -{ - uint64_t enqueued = atomic64_read(&stats->enqueued[index]); - uint64_t processed = stats->times[index].count; - unsigned int pending; - if (enqueued < processed) { - // Probably just out of sync. - pending = 1; - } else { - pending = enqueued - processed; - // Pedantic paranoia: Check for overflow of the 32-bit "pending". - if ((pending + processed) < enqueued) { - pending = UINT_MAX; - } - } - *enqueuedPtr = enqueued; - *processedPtr = processed; - *pendingPtr = pending; -} - -/** - * Get counters on work items not covered by any index value. - * - * @param [in] stats The collected statistics - * @param [out] enqueuedPtr The total work items enqueued - * @param [out] processedPtr The number of work items processed - **/ -static void getOtherWorkItemCounts(const KvdoWorkItemStats *stats, - uint64_t *enqueuedPtr, - uint64_t *processedPtr) -{ - unsigned int pending; - getWorkItemCountsByItem(stats, NUM_WORK_QUEUE_ITEM_STATS, - enqueuedPtr, processedPtr, &pending); -} - -/** - * Get timing stats on work items, identified by index into the - * internal array. - * - * @param [in] stats The collected statistics - * @param [in] index The index into the array - * @param [out] min The minimum execution time - * @param [out] mean The mean execution time - * @param [out] max The maximum execution time - **/ -static void getWorkItemTimesByItem(const KvdoWorkItemStats *stats, - unsigned int index, - uint64_t *min, - uint64_t *mean, - uint64_t *max) -{ - *min = stats->times[index].min; - *mean = getSampleAverage(&stats->times[index]); - *max = stats->times[index].max; -} - -/**********************************************************************/ -void updateWorkItemStatsForEnqueue(KvdoWorkItemStats *stats, - KvdoWorkItem *item, - int priority) -{ - item->statTableIndex = getStatTableIndex(stats, item->statsFunction, - priority); - atomic64_add(1, &stats->enqueued[item->statTableIndex]); -} - -/**********************************************************************/ -char *getFunctionName(void *pointer, char *buffer, size_t bufferLength) -{ - if (pointer == NULL) { - /* - * Format "%ps" logs a null pointer as "(null)" with a bunch of - * leading spaces. We sometimes use this when logging lots of - * data; don't be so verbose. - */ - strncpy(buffer, "-", bufferLength); - } else { - /* - * Use a non-const array instead of a string literal below to - * defeat gcc's format checking, which doesn't understand that - * "%ps" actually does support a precision spec in Linux kernel - * code. 
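// getStatTableIndex() above publishes each (priority, work-function) pair so
// that readers need no lock: the priority is stored first, then a write
// barrier, then the function pointer, so any reader that observes a non-NULL
// function is guaranteed to observe the matching priority.  The same ordering
// expressed with C11 release/acquire atomics, as a standalone sketch with
// invented names:
#include <stdatomic.h>
#include <stddef.h>

struct slot {
  _Atomic(void (*)(void)) fn;   // written last, with release ordering
  unsigned int priority;        // written first, before the release store
};

static void publish(struct slot *s, void (*fn)(void), unsigned int priority)
{
  s->priority = priority;                                   // plain store
  atomic_store_explicit(&s->fn, fn, memory_order_release);  // publishes it
}

// Returns 1 and fills *priority only if the slot has been fully published.
static int observe(struct slot *s, unsigned int *priority)
{
  void (*fn)(void) = atomic_load_explicit(&s->fn, memory_order_acquire);
  if (fn == NULL) {
    return 0;
  }
  *priority = s->priority;   // ordered after the acquire load of fn
  return 1;
}

static struct slot slot_instance;   // zero-initialized: fn starts out NULL

static void example_work(void) {}

int main(void)
{
  unsigned int priority = 0;
  int before = observe(&slot_instance, &priority);   // 0: not yet published
  publish(&slot_instance, example_work, 3);
  int after = observe(&slot_instance, &priority);    // 1, priority == 3
  return (!before && after && priority == 3) ? 0 : 1;
}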
- */ - static char truncatedFunctionNameFormatString[] = "%.*ps"; - snprintf(buffer, bufferLength, - truncatedFunctionNameFormatString, - bufferLength - 1, - pointer); - - char *space = strchr(buffer, ' '); - if (space != NULL) { - *space = '\0'; - } - } - - return buffer; -} - -/**********************************************************************/ -size_t formatWorkItemStats(const KvdoWorkItemStats *stats, - char *buffer, - size_t length) -{ - const KvdoWorkFunctionTable *functionIDs = &stats->functionTable; - size_t currentOffset = 0; - - uint64_t enqueued, processed; - int i; - for (i = 0; i < NUM_WORK_QUEUE_ITEM_STATS; i++) { - if (functionIDs->functions[i] == NULL) { - break; - } - if (atomic64_read(&stats->enqueued[i]) == 0) { - continue; - } - /* - * The reporting of all of "pending", "enqueued" and "processed" - * here seems redundant, but "pending" is limited to 0 in the case - * where "processed" exceeds "enqueued", either through current - * activity and a lack of synchronization when fetching stats, or - * a coding bug. This report is intended largely for debugging, so - * we'll go ahead and print the not-necessarily-redundant values. - */ - unsigned int pending; - getWorkItemCountsByItem(stats, i, &enqueued, &processed, &pending); - - // Format: fn prio enq proc timeo [ min max mean ] - if (ENABLE_PER_FUNCTION_TIMING_STATS) { - uint64_t min, mean, max; - getWorkItemTimesByItem(stats, i, &min, &mean, &max); - currentOffset += snprintf(buffer + currentOffset, - length - currentOffset, - "%-36ps %d %10llu %10" PRIu64 - " %10llu %10llu %10" PRIu64 - "\n", - functionIDs->functions[i], - functionIDs->priorities[i], - enqueued, processed, - min, max, mean); - } else { - currentOffset += snprintf(buffer + currentOffset, - length - currentOffset, - "%-36ps %d %10llu %10" PRIu64 - "\n", - functionIDs->functions[i], - functionIDs->priorities[i], - enqueued, processed); - } - if (currentOffset >= length) { - break; - } - } - if ((i == NUM_WORK_QUEUE_ITEM_STATS) && (currentOffset < length)) { - uint64_t enqueued, processed; - getOtherWorkItemCounts(stats, &enqueued, &processed); - if (enqueued > 0) { - currentOffset += snprintf(buffer + currentOffset, - length - currentOffset, - "%-36s %d %10llu %10" PRIu64 - "\n", - "OTHER", 0, - enqueued, processed); - } - } - return currentOffset; -} - -/**********************************************************************/ -void logWorkItemStats(const KvdoWorkItemStats *stats) -{ - uint64_t totalEnqueued = 0; - uint64_t totalProcessed = 0; - - const KvdoWorkFunctionTable *functionIDs = &stats->functionTable; - - int i; - for (i = 0; i < NUM_WORK_QUEUE_ITEM_STATS; i++) { - if (functionIDs->functions[i] == NULL) { - break; - } - if (atomic64_read(&stats->enqueued[i]) == 0) { - continue; - } - /* - * The reporting of all of "pending", "enqueued" and "processed" - * here seems redundant, but "pending" is limited to 0 in the case - * where "processed" exceeds "enqueued", either through current - * activity and a lack of synchronization when fetching stats, or - * a coding bug. This report is intended largely for debugging, so - * we'll go ahead and print the not-necessarily-redundant values. 
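// formatWorkItemStats() above builds its report by calling snprintf() at an
// advancing offset and stopping once the offset reaches the buffer length.
// A standalone sketch of that accumulation pattern; the report fields and
// names are invented for illustration:
#include <stdio.h>

static size_t report(char *buffer, size_t length)
{
  size_t offset = 0;
  for (int i = 0; i < 3 && offset < length; i++) {
    // snprintf() never writes past 'length - offset' bytes but returns the
    // length the line wanted, so 'offset' can overshoot 'length'; the loop
    // condition treats that as "buffer full".
    offset += snprintf(buffer + offset, length - offset,
                       "row %d: enqueued %d processed %d\n",
                       i, 10 * i, 9 * i);
  }
  return (offset < length) ? offset : length - 1;   // bytes actually stored
}

int main(void)
{
  char buf[128];
  size_t used = report(buf, sizeof(buf));
  printf("%zu bytes:\n%s", used, buf);
  return 0;
}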
- */ - uint64_t enqueued, processed; - unsigned int pending; - getWorkItemCountsByItem(stats, i, &enqueued, &processed, &pending); - totalEnqueued += enqueued; - totalProcessed += processed; - - static char work[256]; // arbitrary size - getFunctionName(functionIDs->functions[i], work, sizeof(work)); - - if (ENABLE_PER_FUNCTION_TIMING_STATS) { - uint64_t min, mean, max; - getWorkItemTimesByItem(stats, i, &min, &mean, &max); - logInfo(" priority %d: %u pending" - " %llu enqueued %llu processed" - " %s" - " times %llu/%llu/%lluns", - functionIDs->priorities[i], - pending, enqueued, processed, work, - min, mean, max); - } else { - logInfo(" priority %d: %u pending" - " %llu enqueued %llu processed" - " %s", - functionIDs->priorities[i], - pending, enqueued, processed, work); - } - } - if (i == NUM_WORK_QUEUE_ITEM_STATS) { - uint64_t enqueued, processed; - getOtherWorkItemCounts(stats, &enqueued, &processed); - if (enqueued > 0) { - totalEnqueued += enqueued; - totalProcessed += processed; - logInfo(" ... others: %llu enqueued %llu processed", - enqueued, processed); - } - } - logInfo(" total: %llu enqueued %llu processed", - totalEnqueued, totalProcessed); -} diff --git a/vdo/kernel/workItemStats.h b/vdo/kernel/workItemStats.h deleted file mode 100644 index 0898f3b..0000000 --- a/vdo/kernel/workItemStats.h +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workItemStats.h#2 $ - */ - -#ifndef WORK_ITEM_STATS_H -#define WORK_ITEM_STATS_H - -#include "timeUtils.h" - -#include "workQueue.h" - -enum { - // Whether to enable tracking of per-work-function run-time stats. - ENABLE_PER_FUNCTION_TIMING_STATS = 0, - // How many work function/priority pairs to track call stats for - NUM_WORK_QUEUE_ITEM_STATS = 18, -}; - -typedef struct simpleStats { - uint64_t count; - uint64_t sum; - uint64_t min; - uint64_t max; -} SimpleStats; - -/* - * We track numbers of work items handled (and optionally the - * wall-clock time to run the work functions), broken down by - * individual work functions (or alternate functions that the caller - * wants recorded, like the VIO completion callback function if we're - * just enqueueing a work function that invokes that indirectly) and - * priority. - * - * The first part of this structure manages the function/priority - * pairs, and is read frequently but updated rarely (once for each - * pair, plus possibly spin lock contention). - * - * The second part holds counters, and is updated often; different - * parts are updated by various threads as described below. The last - * element of each array, index NUM_WORK_QUEUE_ITEM_STATS, is updated - * only if we have filled the arrays and can't add the current work - * function/priority. 
See how the statTableIndex field is set in - * workItemStats.c. - * - * All fields may additionally be read when reporting statistics - * (including optionally reporting stats when the worker thread shuts - * down), but that's rare and shouldn't significantly affect cache - * contention issues. - * - * There is no "pending" count per work function here. For reporting - * statistics, it can be approximated by looking at the other fields. - * Do not rely on them being precise and synchronized, though. - */ -typedef struct kvdoWorkItemStatsFunctionTable { - /* - * The spin lock is used to protect .functions and .priorities - * during updates. All three are modified by producers (enqueueing - * threads) but only rarely. The .functions and .priorities arrays - * are read by producers very frequently. - */ - spinlock_t lock; - KvdoWorkFunction functions[NUM_WORK_QUEUE_ITEM_STATS]; - uint8_t priorities[NUM_WORK_QUEUE_ITEM_STATS]; -} KvdoWorkFunctionTable; - -typedef struct kvdoWorkItemStats { - /* - * Table of functions and priorities, for determining the index to - * use into the counter arrays below. - * - * This table is read by producers (usually multiple entries) for - * every work item enqueued, and when reporting stats. It is updated - * by producers, and only the first time a new (work-function, - * priority) combination is seen. - */ - KvdoWorkFunctionTable functionTable; - // Skip to (somewhere on) the next cache line - char pad[CACHE_LINE_BYTES - sizeof(atomic64_t)]; - /* - * The .enqueued field is updated by producers only, once per work - * item processed; __sync operations are used to update these - * values. - */ - atomic64_t enqueued[NUM_WORK_QUEUE_ITEM_STATS + 1]; - // Skip to (somewhere on) the next cache line - char pad2[CACHE_LINE_BYTES - sizeof(atomic64_t)]; - /* - * These values are updated only by the consumer (worker thread). We - * overload the .times[].count field as a count of items processed, - * so if we're not doing the optional processing-time tracking - * (controlled via an option in workQueue.c), we need to explicitly - * update the count. - * - * Since only one thread can ever update these values, no - * synchronization is used. - */ - SimpleStats times[NUM_WORK_QUEUE_ITEM_STATS + 1]; -} KvdoWorkItemStats; - -/** - * Initialize a statistics structure for tracking sample - * values. Assumes the storage was already zeroed out at allocation - * time. - * - * @param stats The statistics structure - **/ -static inline void initSimpleStats(SimpleStats *stats) -{ - // Assume other fields are initialized to zero at allocation. - stats->min = UINT64_MAX; -} - -/** - * Update the statistics being tracked for a new sample value. - * - * @param stats The statistics structure - * @param value The new value to be folded in - **/ -static inline void addSample(SimpleStats *stats, uint64_t value) -{ - stats->count++; - stats->sum += value; - if (stats->min > value) { - stats->min = value; - } - if (stats->max < value) { - stats->max = value; - } -} - -/** - * Return the average of the samples collected. - * - * @param stats The statistics structure - * - * @return The average sample value - **/ -static inline uint64_t getSampleAverage(const SimpleStats *stats) -{ - uint64_t slop = stats->count / 2; - return (stats->sum + slop) / stats->count; -} - -/** - * Update all work queue statistics (work-item and otherwise) after - * enqueueing a work item. 
- * - * @param stats The statistics structure - * @param item The work item enqueued - * @param priority The work item's priority - **/ -void updateWorkItemStatsForEnqueue(KvdoWorkItemStats *stats, - KvdoWorkItem *item, - int priority); - -/** - * Update all work queue statistics (work-item and otherwise) after enqueueing - * a work item. - * - * This is a very lightweight function (after optimizing away conditionals and - * no-ops) and is called for every work item processed, hence the inline - * definition. - * - * This function requires that recordStartTime and - * updateWorkItemStatsForWorkTime below both get called as well; in some cases - * counters may be updated in updateWorkItemStatsForWorkTime rather than here. - * - * @param stats The statistics structure - * @param item The work item enqueued - **/ -static inline void updateWorkItemStatsForDequeue(KvdoWorkItemStats *stats, - KvdoWorkItem *item) -{ - // The times[].count field is overloaded as a count of items - // processed. - if (!ENABLE_PER_FUNCTION_TIMING_STATS) { - stats->times[item->statTableIndex].count++; - } else { - // In this case, updateWorkItemStatsForWorkTime will bump the counter. - } -} - -/** - * Record the starting time for processing a work item, if timing - * stats are enabled and if we haven't run out of room for recording - * stats in the table. - * - * @param index The work item's index into the internal array - * - * @return The current time, or zero - **/ -static inline uint64_t recordStartTime(unsigned int index) -{ - return (ENABLE_PER_FUNCTION_TIMING_STATS ? currentTime(CLOCK_MONOTONIC) : 0); -} - -/** - * Update the work queue statistics with the wall-clock time for - * processing a work item, if timing stats are enabled and if we - * haven't run out of room for recording stats in the table. - * - * @param stats The statistics structure - * @param index The work item's index into the internal array - * @param startTime The start time as reported by recordStartTime - **/ -static inline void updateWorkItemStatsForWorkTime(KvdoWorkItemStats *stats, - unsigned int index, - uint64_t startTime) -{ - if (ENABLE_PER_FUNCTION_TIMING_STATS) { - uint64_t endTime = currentTime(CLOCK_MONOTONIC); - addSample(&stats->times[index], endTime - startTime); - } -} - -/** - * Convert the pointer into a string representation, using a function - * name if available. - * - * @param pointer The pointer to be converted - * @param buffer The output buffer - * @param bufferLength The size of the output buffer - **/ -char *getFunctionName(void *pointer, char *buffer, size_t bufferLength); - -/** - * Dump statistics broken down by work function and priority into the - * kernel log. - * - * @param stats The statistics structure - **/ -void logWorkItemStats(const KvdoWorkItemStats *stats); - -/** - * Format counters for per-work-function stats for reporting via /sys. - * - * @param [in] stats The statistics structure - * @param [out] buffer The output buffer - * @param [in] length The size of the output buffer - * - * @return The size of the string actually written - **/ -size_t formatWorkItemStats(const KvdoWorkItemStats *stats, - char *buffer, - size_t length); - -#endif // WORK_ITEM_STATS_H diff --git a/vdo/kernel/workQueue.c b/vdo/kernel/workQueue.c deleted file mode 100644 index 8be3285..0000000 --- a/vdo/kernel/workQueue.c +++ /dev/null @@ -1,1152 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueue.c#11 $ - */ - -#include "workQueue.h" - -#include -#include -#include - -#include "atomic.h" -#include "logger.h" -#include "memoryAlloc.h" -#include "permassert.h" -#include "stringUtils.h" - -#include "numeric.h" -#include "workItemStats.h" -#include "workQueueHandle.h" -#include "workQueueInternals.h" -#include "workQueueStats.h" -#include "workQueueSysfs.h" - -enum { - // Time between work queue heartbeats in usec. The default kernel - // configurations generally have 1ms or 4ms tick rates, so let's make this a - // multiple for accuracy. - FUNNEL_HEARTBEAT_INTERVAL = 4000, - - // Time to wait for a work queue to flush remaining items during shutdown. - // Specified in milliseconds. - FUNNEL_FINISH_SLEEP = 5000, -}; - -static struct mutex queueDataLock; -static SimpleWorkQueue queueData; - -static void freeSimpleWorkQueue(SimpleWorkQueue *queue); -static void finishSimpleWorkQueue(SimpleWorkQueue *queue); - -// work item lists (used for delayed work items) - -/**********************************************************************/ -static void initializeWorkItemList(KvdoWorkItemList *list) -{ - list->tail = NULL; -} - -/**********************************************************************/ -static void addToWorkItemList(KvdoWorkItemList *list, KvdoWorkItem *item) -{ - if (list->tail == NULL) { - item->next = item; - } else { - KvdoWorkItem *head = list->tail->next; - list->tail->next = item; - item->next = head; - } - list->tail = item; -} - -/**********************************************************************/ -static bool isWorkItemListEmpty(KvdoWorkItemList *list) -{ - return list->tail == NULL; -} - -/**********************************************************************/ -static KvdoWorkItem *workItemListPoll(KvdoWorkItemList *list) -{ - KvdoWorkItem *tail = list->tail; - if (tail == NULL) { - return NULL; - } - // Extract and return head of list. - KvdoWorkItem *head = tail->next; - // Only one entry? - if (head == tail) { - list->tail = NULL; - } else { - tail->next = head->next; - } - head->next = NULL; - return head; -} - -/**********************************************************************/ -static KvdoWorkItem *workItemListPeek(KvdoWorkItemList *list) -{ - KvdoWorkItem *tail = list->tail; - return tail ? tail->next : NULL; -} - -// Finding the SimpleWorkQueue to actually operate on. - -/** - * Pick the next subordinate service queue in rotation. - * - * This doesn't need to be 100% precise in distributing work items around, so - * playing loose with concurrent field modifications isn't going to hurt us. - * (Avoiding the atomic ops may help us a bit in performance, but we'll still - * have contention over the fields.) 
- * - * @param queue The round-robin-type work queue - * - * @return A subordinate work queue - **/ -static inline SimpleWorkQueue *nextServiceQueue(RoundRobinWorkQueue *queue) -{ - unsigned int index = (queue->serviceQueueRotor++ % queue->numServiceQueues); - return queue->serviceQueues[index]; -} - -/** - * Find a simple work queue on which to operate. - * - * If the argument is already a simple work queue, use it. If it's a - * round-robin work queue, pick the next subordinate service queue and use it. - * - * @param queue a work queue (round-robin or simple) - * - * @return a simple work queue - **/ -static inline SimpleWorkQueue *pickSimpleQueue(KvdoWorkQueue *queue) -{ - return (queue->roundRobinMode - ? nextServiceQueue(asRoundRobinWorkQueue(queue)) - : asSimpleWorkQueue(queue)); -} - -// Processing normal work items. - -/** - * Scan the work queue's work item lists, and dequeue and return the next - * waiting work item, if any. - * - * We scan the funnel queues from highest priority to lowest, once; there is - * therefore a race condition where a high-priority work item can be enqueued - * followed by a lower-priority one, and we'll grab the latter (but we'll catch - * the high-priority item on the next call). If strict enforcement of - * priorities becomes necessary, this function will need fixing. - * - * @param queue the work queue - * - * @return a work item pointer, or NULL - **/ -static KvdoWorkItem *pollForWorkItem(SimpleWorkQueue *queue) -{ - KvdoWorkItem *item = NULL; - for (int i = READ_ONCE(queue->numPriorityLists) - 1; i >= 0; i--) { - FunnelQueueEntry *link = funnelQueuePoll(queue->priorityLists[i]); - if (link != NULL) { - item = container_of(link, KvdoWorkItem, workQueueEntryLink); - break; - } - } - - return item; -} - -/** - * Add a work item into the queue, and inform the caller of any additional - * processing necessary. - * - * If the worker thread may not be awake, true is returned, and the caller - * should attempt a wakeup. - * - * @param queue The work queue - * @param item The work item to add - * - * @return true iff the caller should wake the worker thread - **/ -__attribute__((warn_unused_result)) -static bool enqueueWorkQueueItem(SimpleWorkQueue *queue, KvdoWorkItem *item) -{ - ASSERT_LOG_ONLY(item->myQueue == NULL, - "item %" PRIptr " (fn %" PRIptr "/%" PRIptr - ") to enqueue (%" PRIptr - ") is not already queued (%" PRIptr ")", - item, item->work, item->statsFunction, queue, - item->myQueue); - if (ASSERT(item->action < WORK_QUEUE_ACTION_COUNT, - "action is in range for queue") != VDO_SUCCESS) { - item->action = 0; - } - unsigned int priority = READ_ONCE(queue->priorityMap[item->action]); - - // Update statistics. - updateStatsForEnqueue(&queue->stats, item, priority); - - item->myQueue = &queue->common; - - // Funnel queue handles the synchronization for the put. - funnelQueuePut(queue->priorityLists[priority], &item->workQueueEntryLink); - - /* - * Due to how funnel-queue synchronization is handled (just atomic - * operations), the simplest safe implementation here would be to wake-up any - * waiting threads after enqueueing each item. Even if the funnel queue is - * not empty at the time of adding an item to the queue, the consumer thread - * may not see this since it is not guaranteed to have the same view of the - * queue as a producer thread. - * - * However, the above is wasteful so instead we attempt to minimize the - * number of thread wakeups. This is normally unsafe due to the above - * consumer-producer synchronization constraints. 
To correct this a timeout - * mechanism is used to wake the thread periodically to handle the occasional - * race condition that triggers and results in this thread not being woken - * properly. - * - * In most cases, the above timeout will not occur prior to some other work - * item being added after the queue is set to idle state, so thread wakeups - * will generally be triggered much faster than this interval. The timeout - * provides protection against the cases where more work items are either not - * added or are added too infrequently. - * - * This is also why we can get away with the normally-unsafe optimization for - * the common case by checking queue->idle first without synchronization. The - * race condition exists, but another work item getting enqueued can wake us - * up, and if we don't get that either, we still have the timeout to fall - * back on. - * - * Developed and tuned for some x86 boxes; untested whether this is any - * better or worse for other platforms, with or without the explicit memory - * barrier. - */ - smp_mb(); - return ((atomic_read(&queue->idle) == 1) - && (atomic_cmpxchg(&queue->idle, 1, 0) == 1)); -} - -/** - * Compute an approximate indication of the number of pending work items. - * - * No synchronization is used, so it's guaranteed to be correct only if there - * is no activity. - * - * @param queue The work queue to examine - * - * @return the estimate of the number of pending work items - **/ -static unsigned int getPendingCount(SimpleWorkQueue *queue) -{ - KvdoWorkItemStats *stats = &queue->stats.workItemStats; - long long pending = 0; - for (int i = 0; i < NUM_WORK_QUEUE_ITEM_STATS + 1; i++) { - pending += atomic64_read(&stats->enqueued[i]); - pending -= stats->times[i].count; - } - if (pending < 0) { - /* - * If we fetched numbers that were changing, we can get negative results. - * Just return an indication that there's some activity. - */ - pending = 1; - } - return pending; -} - -/** - * Run any start hook that may be defined for the work queue. - * - * @param queue The work queue - **/ -static void runStartHook(SimpleWorkQueue *queue) -{ - if (queue->type->start != NULL) { - queue->type->start(queue->private); - } -} - -/** - * Run any finish hook that may be defined for the work queue. - * - * @param queue The work queue - **/ -static void runFinishHook(SimpleWorkQueue *queue) -{ - if (queue->type->finish != NULL) { - queue->type->finish(queue->private); - } -} - -/** - * If the work queue has a suspend hook, invoke it, and when it finishes, check - * again for any pending work items. - * - * We assume a check for pending work items has just been done and turned up - * empty; so, if no suspend hook exists, we can just return NULL without doing - * another check. - * - * @param [in] queue The work queue preparing to suspend - * - * @return the newly found work item, if any - **/ -static KvdoWorkItem *runSuspendHook(SimpleWorkQueue *queue) -{ - if (queue->type->suspend == NULL) { - return NULL; - } - - queue->type->suspend(queue->private); - return pollForWorkItem(queue); -} - -/** - * Check whether a work queue has delayed work items pending. 
- * - * @param queue The work queue - * - * @return true iff delayed work items are pending - **/ -static bool hasDelayedWorkItems(SimpleWorkQueue *queue) -{ - bool result; - unsigned long flags; - spin_lock_irqsave(&queue->lock, flags); - result = !isWorkItemListEmpty(&queue->delayedItems); - spin_unlock_irqrestore(&queue->lock, flags); - return result; -} - -/** - * Wait for the next work item to process, or until kthread_should_stop - * indicates that it's time for us to shut down. - * - * If kthread_should_stop says it's time to stop but we have pending work - * items, return a work item. - * - * Update statistics relating to scheduler interactions. - * - * @param [in] queue The work queue to wait on - * @param [in] timeoutInterval How long to wait each iteration - * - * @return the next work item, or NULL to indicate shutdown is requested - **/ -static KvdoWorkItem *waitForNextWorkItem(SimpleWorkQueue *queue, - TimeoutJiffies timeoutInterval) -{ - KvdoWorkItem *item = runSuspendHook(queue); - if (item != NULL) { - return item; - } - - DEFINE_WAIT(wait); - while (true) { - atomic64_set(&queue->firstWakeup, 0); - prepare_to_wait(&queue->waitingWorkerThreads, &wait, TASK_INTERRUPTIBLE); - /* - * Don't set the idle flag until a wakeup will not be lost. - * - * Force synchronization between setting the idle flag and checking the - * funnel queue; the producer side will do them in the reverse order. - * (There's still a race condition we've chosen to allow, because we've got - * a timeout below that unwedges us if we hit it, but this may narrow the - * window a little.) - */ - atomic_set(&queue->idle, 1); - memoryFence(); // store-load barrier between "idle" and funnel queue - - item = pollForWorkItem(queue); - if (item != NULL) { - break; - } - - /* - * We need to check for thread-stop after setting TASK_INTERRUPTIBLE state - * up above. Otherwise, schedule() will put the thread to sleep and might - * miss a wakeup from kthread_stop() call in finishWorkQueue(). - * - * If there are delayed work items, we need to wait for them to - * get run. Then, when we check kthread_should_stop again, we'll - * finally exit. - */ - if (kthread_should_stop() && !hasDelayedWorkItems(queue)) { - /* - * Recheck once again in case we *just* converted a delayed work item to - * a regular enqueued work item. - * - * It's important that processDelayedWorkItems holds the spin lock until - * it finishes enqueueing the work item to run. - * - * Funnel queues aren't synchronized between producers and consumer. - * Normally a producer interrupted mid-update can hide a later producer's - * entry until the first completes. This would be a problem, except that - * when kthread_stop is called, we should already have ceased adding new - * work items and have waited for all the regular work items to finish; - * (recurring) delayed work items should be the only exception. - * - * Worker thread shutdown would be simpler if even the delayed work items - * were required to be completed and not re-queued before shutting down a - * work queue. - */ - item = pollForWorkItem(queue); - break; - } - - /* - * We don't need to update the wait count atomically since this is the only - * place it is modified and there is only one thread involved. - */ - queue->stats.waits++; - uint64_t timeBeforeSchedule = currentTime(CLOCK_MONOTONIC); - atomic64_add(timeBeforeSchedule - queue->mostRecentWakeup, - &queue->stats.runTime); - // Wake up often, to address the missed-wakeup race. 
- schedule_timeout(timeoutInterval); - queue->mostRecentWakeup = currentTime(CLOCK_MONOTONIC); - uint64_t callDurationNS = queue->mostRecentWakeup - timeBeforeSchedule; - enterHistogramSample(queue->stats.scheduleTimeHistogram, - callDurationNS / 1000); - - /* - * Check again before resetting firstWakeup for more accurate - * stats. (It's still racy, which can't be fixed without requiring - * tighter synchronization between producer and consumer sides.) - */ - item = pollForWorkItem(queue); - if (item != NULL) { - break; - } - } - - if (item != NULL) { - uint64_t firstWakeup = atomic64_read(&queue->firstWakeup); - /* - * We sometimes register negative wakeup latencies without this fencing. - * Whether it's forcing full serialization between the read of firstWakeup - * and the "rdtsc" that might be used depending on the clock source that - * helps, or some extra nanoseconds of delay covering for high-resolution - * clocks not being quite in sync between CPUs, is not yet clear. - */ - loadFence(); - if (firstWakeup != 0) { - enterHistogramSample(queue->stats.wakeupLatencyHistogram, - (currentTime(CLOCK_MONOTONIC) - firstWakeup) / 1000); - enterHistogramSample(queue->stats.wakeupQueueLengthHistogram, - getPendingCount(queue)); - } - } - finish_wait(&queue->waitingWorkerThreads, &wait); - atomic_set(&queue->idle, 0); - - return item; -} - -/** - * Get the next work item to process, possibly waiting for one, unless - * kthread_should_stop indicates that it's time for us to shut down. - * - * If kthread_should_stop says it's time to stop but we have pending work - * items, return a work item. - * - * @param [in] queue The work queue to wait on - * @param [in] timeoutInterval How long to wait each iteration - * - * @return the next work item, or NULL to indicate shutdown is requested - **/ -static KvdoWorkItem *getNextWorkItem(SimpleWorkQueue *queue, - TimeoutJiffies timeoutInterval) -{ - KvdoWorkItem *item = pollForWorkItem(queue); - if (item != NULL) { - return item; - } - return waitForNextWorkItem(queue, timeoutInterval); -} - -/** - * Execute a work item from a work queue, and do associated bookkeeping. - * - * @param [in] queue the work queue the item is from - * @param [in] item the work item to run - **/ -static void processWorkItem(SimpleWorkQueue *queue, - KvdoWorkItem *item) -{ - if (ASSERT(item->myQueue == &queue->common, - "item %" PRIptr " from queue %" PRIptr - " marked as being in this queue (%" PRIptr ")", - item, queue, item->myQueue) == UDS_SUCCESS) { - updateStatsForDequeue(&queue->stats, item); - item->myQueue = NULL; - } - - // Save the index, so we can use it after the work function. - unsigned int index = item->statTableIndex; - uint64_t workStartTime = recordStartTime(index); - item->work(item); - // We just surrendered control of the work item; no more access. - item = NULL; - updateWorkItemStatsForWorkTime(&queue->stats.workItemStats, index, - workStartTime); - - /* - * Be friendly to a CPU that has other work to do, if the kernel has told us - * to. This speeds up some performance tests; that "other work" might include - * other VDO threads. - * - * N.B.: We compute the pending count info here without any synchronization, - * but it's for stats reporting only, so being imprecise isn't too big a - * deal, as long as reads and writes are atomic operations. - */ - if (need_resched()) { - uint64_t timeBeforeReschedule = currentTime(CLOCK_MONOTONIC); - // Record the queue length we have *before* rescheduling. 
- unsigned int queueLen = getPendingCount(queue); - cond_resched(); - uint64_t timeAfterReschedule = currentTime(CLOCK_MONOTONIC); - - enterHistogramSample(queue->stats.rescheduleQueueLengthHistogram, - queueLen); - uint64_t runTimeNS = timeBeforeReschedule - queue->mostRecentWakeup; - enterHistogramSample(queue->stats.runTimeBeforeRescheduleHistogram, - runTimeNS / 1000); - atomic64_add(runTimeNS, &queue->stats.runTime); - uint64_t callTimeNS = timeAfterReschedule - timeBeforeReschedule; - enterHistogramSample(queue->stats.rescheduleTimeHistogram, - callTimeNS / 1000); - atomic64_add(callTimeNS, &queue->stats.rescheduleTime); - queue->mostRecentWakeup = timeAfterReschedule; - } -} - -/** - * Main loop of the work queue worker thread. - * - * Waits for work items and runs them, until told to stop. - * - * @param queue The work queue to run - **/ -static void serviceWorkQueue(SimpleWorkQueue *queue) -{ - TimeoutJiffies timeoutInterval = - maxLong(2, usecs_to_jiffies(FUNNEL_HEARTBEAT_INTERVAL + 1) - 1); - - runStartHook(queue); - - while (true) { - KvdoWorkItem *item = getNextWorkItem(queue, timeoutInterval); - if (item == NULL) { - // No work items but kthread_should_stop was triggered. - break; - } - // Process the work item - processWorkItem(queue, item); - } - - runFinishHook(queue); -} - -/** - * Initialize per-thread data for a new worker thread and run the work queue. - * Called in a new thread created by kthread_run(). - * - * @param ptr A pointer to the KvdoWorkQueue to run. - * - * @return 0 (indicating success to kthread_run()) - **/ -static int workQueueRunner(void *ptr) -{ - SimpleWorkQueue *queue = ptr; - kobject_get(&queue->common.kobj); - - WorkQueueStackHandle queueHandle; - initializeWorkQueueStackHandle(&queueHandle, queue); - queue->stats.startTime = queue->mostRecentWakeup = currentTime(CLOCK_MONOTONIC); - unsigned long flags; - spin_lock_irqsave(&queue->lock, flags); - queue->started = true; - spin_unlock_irqrestore(&queue->lock, flags); - wake_up(&queue->startWaiters); - serviceWorkQueue(queue); - - // Zero out handle structure for safety. - memset(&queueHandle, 0, sizeof(queueHandle)); - - kobject_put(&queue->common.kobj); - return 0; -} - -// Preparing work items - -/**********************************************************************/ -void setupWorkItem(KvdoWorkItem *item, - KvdoWorkFunction work, - void *statsFunction, - unsigned int action) -{ - ASSERT_LOG_ONLY(item->myQueue == NULL, - "setupWorkItem not called on enqueued work item"); - item->work = work; - item->statsFunction = ((statsFunction == NULL) ? work : statsFunction); - item->statTableIndex = 0; - item->action = action; - item->myQueue = NULL; - item->executionTime = 0; - item->next = NULL; -} - -// Thread management - -/**********************************************************************/ -static inline void wakeWorkerThread(SimpleWorkQueue *queue) -{ - smp_mb(); - atomic64_cmpxchg(&queue->firstWakeup, 0, currentTime(CLOCK_MONOTONIC)); - // Despite the name, there's a maximum of one thread in this list. - wake_up(&queue->waitingWorkerThreads); -} - -// Delayed work items - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) -/** - * Timer function invoked when a delayed work item is ready to run. - * - * @param timer The timer which has just finished - **/ -static void processDelayedWorkItems(struct timer_list *timer) -#else -/** - * Timer function invoked when a delayed work item is ready to run. 
- * - * @param data The queue pointer, as an unsigned long - **/ -static void processDelayedWorkItems(unsigned long data) -#endif -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) - SimpleWorkQueue *queue = from_timer(queue, timer, delayedItemsTimer); -#else - SimpleWorkQueue *queue = (SimpleWorkQueue *) data; -#endif - Jiffies nextExecutionTime = 0; - bool reschedule = false; - bool needsWakeup = false; - - unsigned long flags; - spin_lock_irqsave(&queue->lock, flags); - while (!isWorkItemListEmpty(&queue->delayedItems)) { - KvdoWorkItem *item = workItemListPeek(&queue->delayedItems); - if (item->executionTime > jiffies) { - nextExecutionTime = item->executionTime; - reschedule = true; - break; - } - workItemListPoll(&queue->delayedItems); - item->executionTime = 0; // not actually looked at... - item->myQueue = NULL; - needsWakeup |= enqueueWorkQueueItem(queue, item); - } - spin_unlock_irqrestore(&queue->lock, flags); - if (reschedule) { - mod_timer(&queue->delayedItemsTimer, nextExecutionTime); - } - if (needsWakeup) { - wakeWorkerThread(queue); - } -} - -// Creation & teardown - -/**********************************************************************/ -static bool queueStarted(SimpleWorkQueue *queue) -{ - unsigned long flags; - spin_lock_irqsave(&queue->lock, flags); - bool started = queue->started; - spin_unlock_irqrestore(&queue->lock, flags); - return started; -} - -/** - * Create a simple work queue with a worker thread. - * - * @param [in] threadNamePrefix The per-device prefix to use in thread names - * @param [in] name The queue name - * @param [in] parentKobject The parent sysfs node - * @param [in] owner The kernel layer owning the work queue - * @param [in] private Private data of the queue for use by work - * items or other queue-specific functions - * @param [in] type The work queue type defining the lifecycle - * functions, queue actions, priorities, and - * timeout behavior - * @param [out] queuePtr Where to store the queue handle - * - * @return VDO_SUCCESS or an error code - **/ -static int makeSimpleWorkQueue(const char *threadNamePrefix, - const char *name, - struct kobject *parentKobject, - KernelLayer *owner, - void *private, - const KvdoWorkQueueType *type, - SimpleWorkQueue **queuePtr) -{ - SimpleWorkQueue *queue; - int result = ALLOCATE(1, SimpleWorkQueue, "simple work queue", &queue); - if (result != UDS_SUCCESS) { - return result; - } - - queue->type = type; - queue->private = private; - queue->common.owner = owner; - - unsigned int numPriorityLists = 1; - for (int i = 0; i < WORK_QUEUE_ACTION_COUNT; i++) { - const KvdoWorkQueueAction *action = &queue->type->actionTable[i]; - if (action->name == NULL) { - break; - } - unsigned int code = action->code; - unsigned int priority = action->priority; - - result = ASSERT(code < WORK_QUEUE_ACTION_COUNT, - "invalid action code %u in work queue initialization", - code); - if (result != VDO_SUCCESS) { - FREE(queue); - return result; - } - result = ASSERT(priority < WORK_QUEUE_PRIORITY_COUNT, - "invalid action priority %u in work queue initialization", - priority); - if (result != VDO_SUCCESS) { - FREE(queue); - return result; - } - queue->priorityMap[code] = priority; - if (numPriorityLists <= priority) { - numPriorityLists = priority + 1; - } - } - - result = duplicateString(name, "queue name", &queue->common.name); - if (result != VDO_SUCCESS) { - FREE(queue); - return -ENOMEM; - } - - init_waitqueue_head(&queue->waitingWorkerThreads); - init_waitqueue_head(&queue->startWaiters); - spin_lock_init(&queue->lock); - - 
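/*
 * A minimal sketch of the kind of KvdoWorkQueueType a caller might pass in,
 * with invented names and codes, to illustrate how the action-table loop
 * above is consumed: each entry's priority lands in queue->priorityMap[code],
 * and numPriorityLists grows to cover the highest priority used. An entry
 * with a NULL name (here, the zeroed tail of the array) ends the table.
 */
enum {
  EXAMPLE_ACTION_CALLBACK = 0,
  EXAMPLE_ACTION_CLEANUP  = 1,
};

static const KvdoWorkQueueType exampleQueueType = {
  .actionTable = {
    { .name = "callback", .code = EXAMPLE_ACTION_CALLBACK, .priority = 1 },
    { .name = "cleanup",  .code = EXAMPLE_ACTION_CLEANUP,  .priority = 0 },
  },
};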
initializeWorkItemList(&queue->delayedItems); -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,15,0) - timer_setup(&queue->delayedItemsTimer, processDelayedWorkItems, 0); -#else - setup_timer(&queue->delayedItemsTimer, processDelayedWorkItems, - (unsigned long) queue); -#endif - - kobject_init(&queue->common.kobj, &simpleWorkQueueKobjType); - result = kobject_add(&queue->common.kobj, parentKobject, queue->common.name); - if (result != 0) { - logError("Cannot add sysfs node: %d", result); - freeSimpleWorkQueue(queue); - return result; - } - queue->numPriorityLists = numPriorityLists; - for (int i = 0; i < WORK_QUEUE_PRIORITY_COUNT; i++) { - result = makeFunnelQueue(&queue->priorityLists[i]); - if (result != UDS_SUCCESS) { - freeSimpleWorkQueue(queue); - return result; - } - } - result = initializeWorkQueueStats(&queue->stats, &queue->common.kobj); - if (result != 0) { - logError("Cannot initialize statistics tracking: %d", result); - freeSimpleWorkQueue(queue); - return result; - } - - queue->started = false; - struct task_struct *thread = NULL; - thread = kthread_run(workQueueRunner, queue, "%s:%s", threadNamePrefix, - queue->common.name); - - if (IS_ERR(thread)) { - freeSimpleWorkQueue(queue); - return (int) PTR_ERR(thread); - } - queue->thread = thread; - atomic_set(&queue->threadID, thread->pid); - /* - * If we don't wait to ensure the thread is running VDO code, a - * quick kthread_stop (due to errors elsewhere) could cause it to - * never get as far as running VDO, skipping the cleanup code. - * - * Eventually we should just make that path safe too, and then we - * won't need this synchronization. - */ - wait_event(queue->startWaiters, queueStarted(queue) == true); - *queuePtr = queue; - return UDS_SUCCESS; -} - -/**********************************************************************/ -int makeWorkQueue(const char *threadNamePrefix, - const char *name, - struct kobject *parentKobject, - KernelLayer *owner, - void *private, - const KvdoWorkQueueType *type, - unsigned int threadCount, - KvdoWorkQueue **queuePtr) -{ - if (threadCount == 1) { - SimpleWorkQueue *simpleQueue; - int result = makeSimpleWorkQueue(threadNamePrefix, name, parentKobject, - owner, private, type, &simpleQueue); - if (result == VDO_SUCCESS) { - *queuePtr = &simpleQueue->common; - } - return result; - } - - RoundRobinWorkQueue *queue; - int result = ALLOCATE(1, RoundRobinWorkQueue, "round-robin work queue", - &queue); - if (result != UDS_SUCCESS) { - return result; - } - - result = ALLOCATE(threadCount, SimpleWorkQueue *, "subordinate work queues", - &queue->serviceQueues); - if (result != UDS_SUCCESS) { - FREE(queue); - return result; - } - - queue->numServiceQueues = threadCount; - queue->common.roundRobinMode = true; - queue->common.owner = owner; - - result = duplicateString(name, "queue name", &queue->common.name); - if (result != VDO_SUCCESS) { - FREE(queue->serviceQueues); - FREE(queue); - return -ENOMEM; - } - - kobject_init(&queue->common.kobj, &roundRobinWorkQueueKobjType); - result = kobject_add(&queue->common.kobj, parentKobject, queue->common.name); - if (result != 0) { - logError("Cannot add sysfs node: %d", result); - finishWorkQueue(&queue->common); - kobject_put(&queue->common.kobj); - return result; - } - - *queuePtr = &queue->common; - - char threadName[TASK_COMM_LEN]; - for (unsigned int i = 0; i < threadCount; i++) { - snprintf(threadName, sizeof(threadName), "%s%u", name, i); - result = makeSimpleWorkQueue(threadNamePrefix, threadName, - &queue->common.kobj, owner, private, type, - 
&queue->serviceQueues[i]); - if (result != VDO_SUCCESS) { - queue->numServiceQueues = i; - // Destroy previously created subordinates. - finishWorkQueue(*queuePtr); - freeWorkQueue(queuePtr); - return result; - } - queue->serviceQueues[i]->parentQueue = *queuePtr; - } - - return VDO_SUCCESS; -} - -/** - * Shut down a simple work queue's worker thread. - * - * @param queue The work queue to shut down - **/ -static void finishSimpleWorkQueue(SimpleWorkQueue *queue) -{ - // Tell the worker thread to shut down. - if (queue->thread != NULL) { - atomic_set(&queue->threadID, 0); - // Waits for thread to exit. - kthread_stop(queue->thread); - } - - queue->thread = NULL; -} - -/** - * Shut down a round-robin work queue's service queues. - * - * @param queue The work queue to shut down - **/ -static void finishRoundRobinWorkQueue(RoundRobinWorkQueue *queue) -{ - SimpleWorkQueue **queueTable = queue->serviceQueues; - unsigned int count = queue->numServiceQueues; - - for (unsigned int i = 0; i < count; i++) { - finishSimpleWorkQueue(queueTable[i]); - } -} - -/**********************************************************************/ -void finishWorkQueue(KvdoWorkQueue *queue) -{ - if (queue->roundRobinMode) { - finishRoundRobinWorkQueue(asRoundRobinWorkQueue(queue)); - } else { - finishSimpleWorkQueue(asSimpleWorkQueue(queue)); - } -} - -/** - * Tear down a simple work queue, and decrement the kobject reference - * count on it. - * - * @param queue The work queue - **/ -static void freeSimpleWorkQueue(SimpleWorkQueue *queue) -{ - for (unsigned int i = 0; i < WORK_QUEUE_PRIORITY_COUNT; i++) { - freeFunnelQueue(queue->priorityLists[i]); - } - cleanupWorkQueueStats(&queue->stats); - kobject_put(&queue->common.kobj); -} - -/** - * Tear down a round-robin work queue and its service queues, and - * decrement the kobject reference count on it. - * - * @param queue The work queue - **/ -static void freeRoundRobinWorkQueue(RoundRobinWorkQueue *queue) -{ - SimpleWorkQueue **queueTable = queue->serviceQueues; - unsigned int count = queue->numServiceQueues; - - queue->serviceQueues = NULL; - for (unsigned int i = 0; i < count; i++) { - freeSimpleWorkQueue(queueTable[i]); - } - FREE(queueTable); - kobject_put(&queue->common.kobj); -} - -/**********************************************************************/ -void freeWorkQueue(KvdoWorkQueue **queuePtr) -{ - KvdoWorkQueue *queue = *queuePtr; - if (queue == NULL) { - return; - } - *queuePtr = NULL; - - finishWorkQueue(queue); - - if (queue->roundRobinMode) { - freeRoundRobinWorkQueue(asRoundRobinWorkQueue(queue)); - } else { - freeSimpleWorkQueue(asSimpleWorkQueue(queue)); - } -} - -// Debugging dumps - -/**********************************************************************/ -static void dumpSimpleWorkQueue(SimpleWorkQueue *queue) -{ - mutex_lock(&queueDataLock); - // Take a snapshot to reduce inconsistency in logged numbers. 
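  // queueDataLock is held across the whole copy-and-log sequence, so
  // concurrent dump requests serialize rather than interleaving their
  // output in the shared snapshot.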
- queueData = *queue; - const char *threadStatus; - - char taskStateReport = '-'; - if (queueData.thread != NULL) { -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,14,0) - taskStateReport = task_state_to_char(queue->thread); -#else - unsigned int taskState = queue->thread->state & TASK_REPORT; - taskState &= 0x1ff; - unsigned int taskStateIndex; - if (taskState != 0) { - taskStateIndex = __ffs(taskState)+1; - BUG_ON(taskStateIndex >= sizeof(TASK_STATE_TO_CHAR_STR)); - } else { - taskStateIndex = 0; - } - taskStateReport = TASK_STATE_TO_CHAR_STR[taskStateIndex]; -#endif - } - - if (queueData.thread == NULL) { - threadStatus = "no threads"; - } else if (atomic_read(&queueData.idle)) { - threadStatus = "idle"; - } else { - threadStatus = "running"; - } - logInfo("workQ %" PRIptr " (%s) %u entries %llu waits, %s (%c)", - &queue->common, - queueData.common.name, - getPendingCount(&queueData), - queueData.stats.waits, - threadStatus, - taskStateReport); - - logWorkItemStats(&queueData.stats.workItemStats); - logWorkQueueStats(queue); - - mutex_unlock(&queueDataLock); - - // ->lock spin lock status? - // ->waitingWorkerThreads wait queue status? anyone waiting? -} - -/**********************************************************************/ -void dumpWorkQueue(KvdoWorkQueue *queue) -{ - if (queue->roundRobinMode) { - RoundRobinWorkQueue *roundRobinQueue = asRoundRobinWorkQueue(queue); - for (unsigned int i = 0; i < roundRobinQueue->numServiceQueues; i++) { - dumpSimpleWorkQueue(roundRobinQueue->serviceQueues[i]); - } - } else { - dumpSimpleWorkQueue(asSimpleWorkQueue(queue)); - } -} - -/**********************************************************************/ -void dumpWorkItemToBuffer(KvdoWorkItem *item, char *buffer, size_t length) -{ - size_t currentLength - = snprintf(buffer, length, "%.*s/", TASK_COMM_LEN, - item->myQueue == NULL ? "-" : item->myQueue->name); - if (currentLength < length) { - getFunctionName(item->statsFunction, buffer + currentLength, - length - currentLength); - } -} - -// Work submission - -/**********************************************************************/ -void enqueueWorkQueue(KvdoWorkQueue *kvdoWorkQueue, KvdoWorkItem *item) -{ - SimpleWorkQueue *queue = pickSimpleQueue(kvdoWorkQueue); - - item->executionTime = 0; - - if (enqueueWorkQueueItem(queue, item)) { - wakeWorkerThread(queue); - } -} - -/**********************************************************************/ -void enqueueWorkQueueDelayed(KvdoWorkQueue *kvdoWorkQueue, - KvdoWorkItem *item, - Jiffies executionTime) -{ - if (executionTime <= jiffies) { - enqueueWorkQueue(kvdoWorkQueue, item); - return; - } - - SimpleWorkQueue *queue = pickSimpleQueue(kvdoWorkQueue); - bool rescheduleTimer = false; - unsigned long flags; - - item->executionTime = executionTime; - - // Lock if the work item is delayed. All delayed items are handled via a - // single linked list. - spin_lock_irqsave(&queue->lock, flags); - - if (isWorkItemListEmpty(&queue->delayedItems)) { - rescheduleTimer = true; - } - /* - * XXX We should keep the list sorted, but at the moment the list won't - * grow above a single entry anyway. 
- */ - item->myQueue = &queue->common; - addToWorkItemList(&queue->delayedItems, item); - - spin_unlock_irqrestore(&queue->lock, flags); - - if (rescheduleTimer) { - mod_timer(&queue->delayedItemsTimer, executionTime); - } -} - -// Misc - - -/**********************************************************************/ -KvdoWorkQueue *getCurrentWorkQueue(void) -{ - SimpleWorkQueue *queue = getCurrentThreadWorkQueue(); - return (queue == NULL) ? NULL : &queue->common; -} - -/**********************************************************************/ -KernelLayer *getWorkQueueOwner(KvdoWorkQueue *queue) -{ - return queue->owner; -} - -/**********************************************************************/ -void *getWorkQueuePrivateData(void) -{ - SimpleWorkQueue *queue = getCurrentThreadWorkQueue(); - return (queue != NULL) ? queue->private : NULL; -} - -/**********************************************************************/ -void setWorkQueuePrivateData(void *newData) -{ - SimpleWorkQueue *queue = getCurrentThreadWorkQueue(); - BUG_ON(queue == NULL); - queue->private = newData; -} - -/**********************************************************************/ -void initWorkQueueOnce(void) -{ - // We can't use DEFINE_MUTEX because it's not compatible with c99 mode. - mutex_init(&queueDataLock); - initWorkQueueStackHandleOnce(); -} diff --git a/vdo/kernel/workQueue.h b/vdo/kernel/workQueue.h deleted file mode 100644 index 4043295..0000000 --- a/vdo/kernel/workQueue.h +++ /dev/null @@ -1,301 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueue.h#2 $ - */ - -#ifndef ALBIREO_WORK_QUEUE_H -#define ALBIREO_WORK_QUEUE_H - -#include -#include /* for TASK_COMM_LEN */ - -#include "kernelTypes.h" -#include "util/funnelQueue.h" - -enum { - MAX_QUEUE_NAME_LEN = TASK_COMM_LEN, - /** Maximum number of action definitions per work queue type */ - WORK_QUEUE_ACTION_COUNT = 8, - /** Number of priority values available */ - WORK_QUEUE_PRIORITY_COUNT = 4, -}; - -struct kvdoWorkItem { - /** Entry link for lock-free work queue */ - FunnelQueueEntry workQueueEntryLink; - /** Function to be called */ - KvdoWorkFunction work; - /** Optional alternate function for display in queue stats */ - void *statsFunction; - /** An index into the statistics table; filled in by workQueueStats code */ - unsigned int statTableIndex; - /** - * The action code given to setupWorkItem, from which a priority will be - * determined. - **/ - unsigned int action; - /** The work queue in which the item is enqueued, or NULL if not enqueued. */ - KvdoWorkQueue *myQueue; - /** - * Time at which to execute in jiffies for a delayed work item, or zero to - * queue for execution ASAP. 
- **/ - Jiffies executionTime; - /** List management for delayed or expired work items */ - KvdoWorkItem *next; - /** Time of enqueueing, in ns, for recording queue (waiting) time stats */ - uint64_t enqueueTime; -}; - -/** - * Table entries defining an action. - * - * Actions are intended to distinguish general classes of activity for - * prioritization purposes, but not necessarily to indicate specific work - * functions. They are indicated to setupWorkItem numerically, using an - * enumerator defined per kind of work queue -- bio submission work queue - * actions use BioQAction, cpu actions use CPUQAction, etc. For example, for - * the CPU work queues, data compression can be prioritized separately from - * final cleanup processing of a KVIO or from dedupe verification; base code - * threads prioritize all VIO callback invocation the same, but separate from - * sync or heartbeat operations. The bio acknowledgement work queue, on the - * other hand, only does one thing, so it only defines one action code. - * - * Action codes values must be small integers, 0 through - * WORK_QUEUE_ACTION_COUNT-1, and should not be duplicated for a queue type. - * - * A table of KvdoWorkQueueAction entries embedded in KvdoWorkQueueType - * specifies the name, code, and priority for each type of action in the work - * queue. The table can have at most WORK_QUEUE_ACTION_COUNT entries, but a - * NULL name indicates an earlier end to the table. - * - * Priorities may be specified as values from 0 through - * WORK_QUEUE_PRIORITY_COUNT-1, higher values indicating higher priority. - * Priorities are just strong suggestions; it's possible for a lower-priority - * work item scheduled right after a high-priority one to be run first, if the - * worker thread happens to be scanning its queues at just the wrong moment, - * but the high-priority item will be picked up next. - * - * Internally, the priorities in this table are used to initialize another - * table in the constructed work queue object, and in internal builds, - * device-mapper messages can be sent to change the priority for an action, - * identified by name, in a running VDO device. Doing so does not affect the - * priorities for other devices, or for future VDO device creation. - **/ -typedef struct kvdoWorkQueueAction { - /** Name of the action */ - char *name; - - /** The action code (per-type enum) */ - unsigned int code; - - /** The initial priority for this action */ - unsigned int priority; -} KvdoWorkQueueAction; - -typedef void (*KvdoWorkQueueFunction)(void *); - -/** - * Static attributes of a work queue that are fixed at compile time - * for a given call site. (Attributes that may be computed at run time - * are passed as separate arguments.) - **/ -typedef struct kvdoWorkQueueType { - /** A function to call in the new thread before servicing requests */ - KvdoWorkQueueFunction start; - - /** A function to call in the new thread when shutting down */ - KvdoWorkQueueFunction finish; - - /** A function to call in the new thread after running out of work */ - KvdoWorkQueueFunction suspend; - - /** Table of actions for this work queue */ - KvdoWorkQueueAction actionTable[WORK_QUEUE_ACTION_COUNT]; -} KvdoWorkQueueType; - -/** - * Create a work queue. - * - * If multiple threads are requested, work items will be distributed to them in - * round-robin fashion. 
- * - * @param [in] threadNamePrefix The per-device prefix to use in thread names - * @param [in] name The queue name - * @param [in] parentKobject The parent sysfs node - * @param [in] owner The kernel layer owning the work queue - * @param [in] private Private data of the queue for use by work - * items or other queue-specific functions - * @param [in] type The work queue type defining the lifecycle - * functions, queue actions, priorities, and - * timeout behavior - * @param [in] threadCount Number of service threads to set up - * @param [out] queuePtr Where to store the queue handle - * - * @return VDO_SUCCESS or an error code - **/ -int makeWorkQueue(const char *threadNamePrefix, - const char *name, - struct kobject *parentKobject, - KernelLayer *owner, - void *private, - const KvdoWorkQueueType *type, - unsigned int threadCount, - KvdoWorkQueue **queuePtr); - -/** - * Set up the fields of a work queue item. - * - * Before the first setup call (setupWorkItem or setupWorkItemWithTimeout), the - * work item must have been initialized to all-zero. Resetting a - * previously-used work item does not require another memset. - * - * The action code is typically defined in a work-queue-type-specific - * enumeration; see the description of KvdoWorkQueueAction. - * - * @param item The work item to initialize - * @param work The function pointer to execute - * @param statsFunction A function pointer to record for stats, or NULL - * @param action Action code, for determination of priority - **/ -void setupWorkItem(KvdoWorkItem *item, - KvdoWorkFunction work, - void *statsFunction, - unsigned int action); - -/** - * Add a work item to a work queue. - * - * If the work item has a timeout that has already passed, the timeout - * handler function may be invoked at this time. - * - * @param queue The queue handle - * @param item The work item to be processed - **/ -void enqueueWorkQueue(KvdoWorkQueue *queue, KvdoWorkItem *item); - -/** - * Add a work item to a work queue, to be run at a later point in time. - * - * Currently delayed work items are used only in a very limited fashion -- at - * most one at a time for any of the work queue types that use them -- and some - * shortcuts have been taken that assume that that's the case. Multiple delayed - * work items should work, but they will execute in the order they were - * enqueued. - * - * @param queue The queue handle - * @param item The work item to be processed - * @param executionTime When to run the work item (jiffies) - **/ -void enqueueWorkQueueDelayed(KvdoWorkQueue *queue, - KvdoWorkItem *item, - Jiffies executionTime); - -/** - * Shut down a work queue's worker thread. - * - * Alerts the worker thread that it should shut down, and then waits - * for it to do so. - * - * There should not be any new enqueueing of work items done once this - * function is called. Any pending delayed work items will be - * processed, as scheduled, before the worker thread shuts down, but - * they must not re-queue themselves to run again. - * - * @param queue The work queue to shut down - **/ -void finishWorkQueue(KvdoWorkQueue *queue); - -/** - * Free a work queue and null out the reference to it. - * - * @param queuePtr Where the queue handle is found - **/ -void freeWorkQueue(KvdoWorkQueue **queuePtr); - -/** - * Print work queue state and statistics to the kernel log. - * - * @param queue The work queue to examine - **/ -void dumpWorkQueue(KvdoWorkQueue *queue); - -/** - * Write to the buffer some info about the work item, for logging. 
- * Since the common use case is dumping info about a lot of work items - * to syslog all at once, the format favors brevity over readability. - * - * @param item The work item - * @param buffer The message buffer to fill in - * @param length The length of the message buffer - **/ -void dumpWorkItemToBuffer(KvdoWorkItem *item, char *buffer, size_t length); - - -/** - * Initialize work queue internals at module load time. - **/ -void initWorkQueueOnce(void); - -/** - * Checks whether two work items have the same action codes - * - * @param item1 The first item - * @param item2 The second item - * - * @return TRUE if the actions are the same, FALSE otherwise - */ -static inline bool areWorkItemActionsEqual(KvdoWorkItem *item1, - KvdoWorkItem *item2) -{ - return item1->action == item2->action; -} - -/** - * Returns the private data for the current thread's work queue. - * - * @return The private data pointer, or NULL if none or if the current - * thread is not a work queue thread. - **/ -void *getWorkQueuePrivateData(void); - -/** - * Updates the private data pointer for the current thread's work queue. - * - * @param newData The new private data pointer - **/ -void setWorkQueuePrivateData(void *newData); - -/** - * Returns the work queue pointer for the current thread, if any. - * - * @return The work queue pointer or NULL - **/ -KvdoWorkQueue *getCurrentWorkQueue(void); - -/** - * Returns the kernel layer that owns the work queue. - * - * @param queue The work queue - * - * @return The owner pointer supplied at work queue creation - **/ -KernelLayer *getWorkQueueOwner(KvdoWorkQueue *queue); - -#endif /* ALBIREO_WORK_QUEUE_H */ diff --git a/vdo/kernel/workQueueHandle.c b/vdo/kernel/workQueueHandle.c deleted file mode 100644 index 65b3e02..0000000 --- a/vdo/kernel/workQueueHandle.c +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueHandle.c#2 $ - */ - -#include "workQueueHandle.h" - -WorkQueueStackHandleGlobals workQueueStackHandleGlobals; - -/**********************************************************************/ -void initializeWorkQueueStackHandle(WorkQueueStackHandle *handle, - SimpleWorkQueue *queue) -{ - handle->nonce = workQueueStackHandleGlobals.nonce; - handle->queue = queue; - - long offset = (char *) handle - (char *) task_stack_page(current); - spin_lock(&workQueueStackHandleGlobals.offsetLock); - if (workQueueStackHandleGlobals.offset == 0) { - workQueueStackHandleGlobals.offset = offset; - spin_unlock(&workQueueStackHandleGlobals.offsetLock); - } else { - long foundOffset = workQueueStackHandleGlobals.offset; - spin_unlock(&workQueueStackHandleGlobals.offsetLock); - BUG_ON(foundOffset != offset); - } -} - -/**********************************************************************/ -void initWorkQueueStackHandleOnce(void) -{ - spin_lock_init(&workQueueStackHandleGlobals.offsetLock); - workQueueStackHandleGlobals.nonce = currentTime(CLOCK_MONOTONIC); -} diff --git a/vdo/kernel/workQueueHandle.h b/vdo/kernel/workQueueHandle.h deleted file mode 100644 index e72ce42..0000000 --- a/vdo/kernel/workQueueHandle.h +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueHandle.h#1 $ - */ - -#ifndef WORK_QUEUE_HANDLE_H -#define WORK_QUEUE_HANDLE_H - -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,11,0) -#include -#else -#include -#endif - -#include "workQueueInternals.h" - -/* - * Layout of a special structure stored at a consistent place on the - * stack in work queue threads. - */ -typedef struct workQueueStackHandle { - unsigned long nonce; - SimpleWorkQueue *queue; -} WorkQueueStackHandle; - -typedef struct workQueueStackHandleGlobals { - /* - * Location in the stack, relative to the task structure which is - * contained in the same memory allocation. - */ - long offset; - /* - * A lock is used to guard against multiple updaters, but once an - * update is done, the offset variable will be read-only. - */ - spinlock_t offsetLock; - /* - * A nonce chosen differently each time the module is loaded, used - * as a marker so we can check that the current thread really is a - * work queue thread. Set at module initialization time, before any - * work queues are created. - */ - unsigned long nonce; -} WorkQueueStackHandleGlobals; - -extern WorkQueueStackHandleGlobals workQueueStackHandleGlobals; - -/** - * Initialize a stack handle associated with a work queue. 
- * - * @param [out] handle The handle to be initialized - * @param [in] queue The work queue pointer - **/ -void initializeWorkQueueStackHandle(WorkQueueStackHandle *handle, - SimpleWorkQueue *queue); - -/** - * Return the work queue pointer recorded at initialization time in - * the work-queue stack handle initialized on the stack of the current - * thread, if any. - * - * @return the work queue pointer, or NULL - **/ -static inline SimpleWorkQueue *getCurrentThreadWorkQueue(void) -{ - WorkQueueStackHandle *handle - = (WorkQueueStackHandle *)(task_stack_page(current) - + workQueueStackHandleGlobals.offset); - if (likely(handle->nonce == workQueueStackHandleGlobals.nonce)) { - return handle->queue; - } else { - return NULL; - } -} - -/** - * Initialize the global state used by the work-queue stack-handle - * code. - **/ -void initWorkQueueStackHandleOnce(void); - -#endif // WORK_QUEUE_HANDLE_H diff --git a/vdo/kernel/workQueueInternals.h b/vdo/kernel/workQueueInternals.h deleted file mode 100644 index fc7a2a3..0000000 --- a/vdo/kernel/workQueueInternals.h +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueInternals.h#4 $ - */ - -#ifndef WORK_QUEUE_INTERNALS_H -#define WORK_QUEUE_INTERNALS_H - -#include -#include -#include -#include -#include - -#include "workItemStats.h" -#include "workQueueStats.h" - -typedef struct kvdoWorkItemList { - KvdoWorkItem *tail; -} KvdoWorkItemList; - -/** - * Work queue definition. - * - * There are two types of work queues: simple, with one worker thread, and - * round-robin, which uses a group of the former to do the work, and assigns - * work to them in -- you guessed it -- round-robin fashion. Externally, both - * are represented via the same common sub-structure, though there's actually - * not a great deal of overlap between the two types internally. - **/ -struct kvdoWorkQueue { - /** Name of just the work queue (e.g., "cpuQ12") */ - char *name; - /** - * Whether this is a round-robin work queue or a simple (one-thread) - * work queue. - **/ - bool roundRobinMode; - /** A handle to a sysfs tree for reporting stats and other info */ - struct kobject kobj; - /** The kernel layer owning this work queue */ - KernelLayer *owner; -}; - -typedef struct simpleWorkQueue SimpleWorkQueue; -typedef struct roundRobinWorkQueue RoundRobinWorkQueue; - -struct simpleWorkQueue { - /** Common work queue bits */ - KvdoWorkQueue common; - /** A copy of .thread->pid, for safety in the sysfs support */ - atomic_t threadID; - /** - * Number of priorities actually used, so we don't keep re-checking unused - * funnel queues. - **/ - unsigned int numPriorityLists; - /** - * Map from action codes to priorities. 
- * - * This mapping can be changed at run time in internal builds, for tuning - * purposes. - **/ - uint8_t priorityMap[WORK_QUEUE_ACTION_COUNT]; - /** The funnel queues */ - FunnelQueue *priorityLists[WORK_QUEUE_PRIORITY_COUNT]; - /** The kernel thread */ - struct task_struct *thread; - /** Life cycle functions, etc */ - const KvdoWorkQueueType *type; - /** Opaque private data pointer, defined by higher level code */ - void *private; - /** In a subordinate work queue, a link back to the round-robin parent */ - KvdoWorkQueue *parentQueue; - /** Padding for cache line separation */ - char pad[CACHE_LINE_BYTES - sizeof(KvdoWorkQueue *)]; - /** Lock protecting delayedItems, priorityMap, numPriorityLists, started */ - spinlock_t lock; - /** Any worker threads (zero or one) waiting for new work to do */ - wait_queue_head_t waitingWorkerThreads; - /** - * Hack to reduce wakeup calls if the worker thread is running. See comments - * in workQueue.c. - * - * There is a lot of redundancy with "firstWakeup", though, and the pair - * should be re-examined. - **/ - atomic_t idle; - /** Wait list for synchronization during worker thread startup */ - wait_queue_head_t startWaiters; - /** Worker thread status (boolean) */ - bool started; - - /** List of delayed work items; usually only one, if any */ - KvdoWorkItemList delayedItems; - /** - * Timer for pulling delayed work items off their list and submitting them to - * run. - * - * If the spinlock "lock" above is not held, this timer is scheduled (or - * currently firing and the callback about to acquire the lock) iff - * delayedItems is nonempty. - **/ - struct timer_list delayedItemsTimer; - - /** - * Timestamp (ns) from the submitting thread that decided to wake us up; also - * used as a flag to indicate whether a wakeup is needed. - * - * Written by submitting threads with atomic64_cmpxchg, and by the worker - * thread setting to 0. - * - * If the value is 0, the worker is probably asleep; the submitting thread - * stores a non-zero value and becomes responsible for calling wake_up on the - * worker thread. If the value is non-zero, either the worker is running or - * another thread has the responsibility for issuing the wakeup. - * - * The "sleep" mode has periodic wakeups and the worker thread may happen to - * wake up while a work item is being enqueued. If that happens, the wakeup - * may be unneeded but will be attempted anyway. - * - * So the return value from cmpxchg(firstWakeup,0,nonzero) can always be - * done, and will tell the submitting thread whether to issue the wakeup or - * not; cmpxchg is atomic, so no other synchronization is needed. - * - * A timestamp is used rather than, say, 1, so that the worker thread can - * record stats on how long it takes to actually get the worker thread - * running. - * - * There is some redundancy between this and "idle" above. 
- **/ - atomic64_t firstWakeup; - /** Padding for cache line separation */ - char pad2[CACHE_LINE_BYTES - sizeof(atomic64_t)]; - /** Scheduling and work-function statistics */ - KvdoWorkQueueStats stats; - /** Last time (ns) the scheduler actually woke us up */ - uint64_t mostRecentWakeup; -}; - -struct roundRobinWorkQueue { - /** Common work queue bits */ - KvdoWorkQueue common; - /** Simple work queues, for actually getting stuff done */ - SimpleWorkQueue **serviceQueues; - /** Number of subordinate work queues */ - unsigned int numServiceQueues; - /** Padding for cache line separation */ - char pad[CACHE_LINE_BYTES - sizeof(unsigned int)]; - /** - * Rotor used for dispatching across subordinate service queues. - * - * Used and updated by submitting threads. (Not atomically or with locking, - * because we don't really care about it being precise, only about getting a - * roughly even spread; if an increment is missed here and there, it's not a - * problem.) - **/ - unsigned int serviceQueueRotor; -}; - -static inline SimpleWorkQueue *asSimpleWorkQueue(KvdoWorkQueue *queue) -{ - return ((queue == NULL) - ? NULL - : container_of(queue, SimpleWorkQueue, common)); -} - -static inline const SimpleWorkQueue * -asConstSimpleWorkQueue(const KvdoWorkQueue *queue) -{ - return ((queue == NULL) - ? NULL - : container_of(queue, SimpleWorkQueue, common)); -} - -static inline RoundRobinWorkQueue *asRoundRobinWorkQueue(KvdoWorkQueue *queue) -{ - return ((queue == NULL) - ? NULL - : container_of(queue, RoundRobinWorkQueue, common)); -} - -#endif // WORK_QUEUE_INTERNALS_H diff --git a/vdo/kernel/workQueueStats.c b/vdo/kernel/workQueueStats.c deleted file mode 100644 index d5a38ae..0000000 --- a/vdo/kernel/workQueueStats.c +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. 
- * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueStats.c#6 $ - */ - -#include "workQueueStats.h" - -#include "atomic.h" -#include "logger.h" -#include "workItemStats.h" -#include "workQueueInternals.h" - -/**********************************************************************/ -int initializeWorkQueueStats(KvdoWorkQueueStats *stats, - struct kobject *queueKObject) -{ - spin_lock_init(&stats->workItemStats.functionTable.lock); - if (ENABLE_PER_FUNCTION_TIMING_STATS) { - for (int i = 0; i < NUM_WORK_QUEUE_ITEM_STATS + 1; i++) { - initSimpleStats(&stats->workItemStats.times[i]); - } - } - - stats->queueTimeHistogram - = makeLogarithmicHistogram(queueKObject, "queue_time", - "Queue Time", "work items", "wait time", - "microseconds", 9); - if (stats->queueTimeHistogram == NULL) { - return -ENOMEM; - } - - stats->rescheduleQueueLengthHistogram - = makeLogarithmicHistogram(queueKObject, "reschedule_queue_length", - "Reschedule Queue Length", "calls", - "queued work items", NULL, 4); - if (stats->rescheduleQueueLengthHistogram == NULL) { - return -ENOMEM; - } - - stats->rescheduleTimeHistogram - = makeLogarithmicHistogram(queueKObject, "reschedule_time", - "Reschedule Time", "calls", - "sleep interval", "microseconds", 9); - if (stats->rescheduleTimeHistogram == NULL) { - return -ENOMEM; - } - - stats->runTimeBeforeRescheduleHistogram - = makeLogarithmicHistogram(queueKObject, "run_time_before_reschedule", - "Run Time Before Reschedule", - "calls", "run time", "microseconds", 9); - if (stats->runTimeBeforeRescheduleHistogram == NULL) { - return -ENOMEM; - } - - stats->scheduleTimeHistogram - = makeLogarithmicHistogram(queueKObject, "schedule_time", - "Schedule Time", - "calls", "sleep interval", "microseconds", 9); - if (stats->scheduleTimeHistogram == NULL) { - return -ENOMEM; - } - - stats->wakeupLatencyHistogram - = makeLogarithmicHistogram(queueKObject, "wakeup_latency", - "Wakeup Latency", - "wakeups", "latency", "microseconds", 9); - if (stats->wakeupLatencyHistogram == NULL) { - return -ENOMEM; - } - - stats->wakeupQueueLengthHistogram - = makeLogarithmicHistogram(queueKObject, "wakeup_queue_length", - "Wakeup Queue Length", "wakeups", - "queued work items", NULL, 4); - if (stats->wakeupQueueLengthHistogram == NULL) { - return -ENOMEM; - } - - return 0; -} - -/**********************************************************************/ -void cleanupWorkQueueStats(KvdoWorkQueueStats *stats) -{ - freeHistogram(&stats->queueTimeHistogram); - freeHistogram(&stats->rescheduleQueueLengthHistogram); - freeHistogram(&stats->rescheduleTimeHistogram); - freeHistogram(&stats->runTimeBeforeRescheduleHistogram); - freeHistogram(&stats->scheduleTimeHistogram); - freeHistogram(&stats->wakeupLatencyHistogram); - freeHistogram(&stats->wakeupQueueLengthHistogram); -} - -/**********************************************************************/ -static uint64_t getTotalProcessed(const SimpleWorkQueue *queue) -{ - uint64_t totalProcessed = 0; - for (int i = 0; i < NUM_WORK_QUEUE_ITEM_STATS + 1; i++) { - totalProcessed += queue->stats.workItemStats.times[i].count; - } - return totalProcessed; -} - -/**********************************************************************/ -void logWorkQueueStats(const SimpleWorkQueue *queue) -{ - uint64_t runtimeNS = 0; - if (queue->thread != NULL) { - runtimeNS += queue->thread->se.sum_exec_runtime; - } - - unsigned long nsPerWorkItem = 0; - uint64_t totalProcessed = getTotalProcessed(queue); - if (totalProcessed > 0) { - nsPerWorkItem = runtimeNS / totalProcessed; 
- } - unsigned long runtimeMS = runtimeNS / 1000; - logInfo("workQ %" PRIptr " (%s) thread cpu usage %lu.%06lus, %" PRIu64 - " tasks, %lu.%03luus/task", - queue, - queue->common.name, - runtimeMS / 1000000, runtimeMS % 1000000, - totalProcessed, - nsPerWorkItem / 1000, nsPerWorkItem % 1000); -} - -/**********************************************************************/ -ssize_t formatRunTimeStats(const KvdoWorkQueueStats *stats, char *buffer) -{ - // Get snapshots of all three at approximately the same time. - uint64_t startTime = stats->startTime; - uint64_t runTime = atomic64_read(&stats->runTime); - uint64_t rescheduleTime = atomic64_read(&stats->rescheduleTime); - loadFence(); // rdtsc barrier - uint64_t now = currentTime(CLOCK_MONOTONIC); - uint64_t lifetime = now - startTime; - - return sprintf(buffer, - "%llu %llu %llu\n", - lifetime, runTime, rescheduleTime); -} diff --git a/vdo/kernel/workQueueStats.h b/vdo/kernel/workQueueStats.h deleted file mode 100644 index 914f5f4..0000000 --- a/vdo/kernel/workQueueStats.h +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueStats.h#2 $ - */ - -#ifndef WORK_QUEUE_STATS_H -#define WORK_QUEUE_STATS_H - -#include "workQueue.h" - -#include "timeUtils.h" - -#include "histogram.h" -#include "workItemStats.h" - -// Defined in workQueueInternals.h after inclusion of workQueueStats.h. -struct simpleWorkQueue; - -/* - * Tracking statistics. - * - * Cache line contention issues: - * - * In workItemStats, there are read-only fields accessed mostly by - * work submitters, then fields updated by the work submitters (for - * which there will be contention), then fields rarely if ever updated - * (more than two cache lines' worth), then fields updated only by the - * worker thread. The trailing fields here are updated only by the - * worker thread. - */ -typedef struct kvdoWorkQueueStats { - // Per-work-function counters and optional nanosecond timing data - KvdoWorkItemStats workItemStats; - // How often we go to sleep waiting for work - uint64_t waits; - - // Run time data, for monitoring utilization levels. - - // Thread start time, from which we can compute lifetime thus far. - uint64_t startTime; - /* - * Time the thread has not been blocked waiting for a new work item, - * nor in cond_resched(). This will include time the thread has been - * blocked by some kernel function invoked by the work functions - * (e.g., waiting for socket buffer space). - * - * This is not redundant with runTimeBeforeRescheduleHistogram, as - * the latter doesn't count run time not followed by a cond_resched - * call. - */ - atomic64_t runTime; - // Time the thread has been suspended via cond_resched(). - // (Duplicates data hidden within rescheduleTimeHistogram.) 
- atomic64_t rescheduleTime; - - // Histogram of the queue times of work items (microseconds) - Histogram *queueTimeHistogram; - // How busy we are when cond_resched is called - Histogram *rescheduleQueueLengthHistogram; - // Histogram of the time cond_resched makes us sleep for (microseconds) - Histogram *rescheduleTimeHistogram; - // Histogram of the run time between cond_resched calls (microseconds) - Histogram *runTimeBeforeRescheduleHistogram; - // Histogram of the time schedule_timeout lets us sleep for (microseconds) - Histogram *scheduleTimeHistogram; - // How long from thread wakeup call to thread actually running (microseconds) - Histogram *wakeupLatencyHistogram; - // How much work is pending by the time we start running - Histogram *wakeupQueueLengthHistogram; -} KvdoWorkQueueStats; - -/** - * Initialize the work queue's statistics tracking. - * - * @param stats The statistics structure - * @param queueKObject The sysfs directory kobject for the work queue - * - * @return 0 or a kernel error code - **/ -int initializeWorkQueueStats(KvdoWorkQueueStats *stats, - struct kobject *queueKObject) - __attribute__((warn_unused_result)); - -/** - * Tear down any allocated storage or objects for statistics tracking. - * - * @param stats The statistics structure - **/ -void cleanupWorkQueueStats(KvdoWorkQueueStats *stats); - -/** - * Update the work queue statistics tracking to note the enqueueing of - * a work item. - * - * @param stats The statistics structure - * @param item The work item being enqueued - * @param priority The priority of the work item - **/ -static inline void updateStatsForEnqueue(KvdoWorkQueueStats *stats, - KvdoWorkItem *item, - int priority) -{ - updateWorkItemStatsForEnqueue(&stats->workItemStats, item, priority); - item->enqueueTime = currentTime(CLOCK_MONOTONIC); -} - -/** - * Update the work queue statistics tracking to note the dequeueing of - * a work item. - * - * @param stats The statistics structure - * @param item The work item being enqueued - **/ -static inline void updateStatsForDequeue(KvdoWorkQueueStats *stats, - KvdoWorkItem *item) -{ - updateWorkItemStatsForDequeue(&stats->workItemStats, item); - enterHistogramSample(stats->queueTimeHistogram, - (currentTime(CLOCK_MONOTONIC) - item->enqueueTime) / 1000); - item->enqueueTime = 0; -} - -/** - * Write the work queue's accumulated statistics to the kernel log. - * - * The queue pointer is needed so that its address and name can be - * logged along with the statistics. - * - * @param queue The work queue - **/ -void logWorkQueueStats(const struct simpleWorkQueue *queue); - -/** - * Format the thread lifetime, run time, and suspend time into a - * supplied buffer for reporting via sysfs. - * - * @param [in] stats The stats structure containing the run-time info - * @param [out] buffer The buffer in which to report the info - **/ -ssize_t formatRunTimeStats(const KvdoWorkQueueStats *stats, char *buffer); - -#endif // WORK_QUEUE_STATS_H diff --git a/vdo/kernel/workQueueSysfs.c b/vdo/kernel/workQueueSysfs.c deleted file mode 100644 index f9dd9cb..0000000 --- a/vdo/kernel/workQueueSysfs.c +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueSysfs.c#1 $ - */ - -#include "workQueueSysfs.h" - -#include - -#include "logger.h" -#include "memoryAlloc.h" - -#include "workQueueInternals.h" - -typedef struct workQueueAttribute { - struct attribute attr; - ssize_t (*show)(const KvdoWorkQueue *queue, char *buf); - ssize_t (*store)(KvdoWorkQueue *queue, const char *buf, size_t length); -} WorkQueueAttribute; - -/**********************************************************************/ -static ssize_t nameShow(const KvdoWorkQueue *queue, char *buf) -{ - return sprintf(buf, "%s\n", queue->name); -} - -/**********************************************************************/ -static ssize_t pidShow(const KvdoWorkQueue *queue, char *buf) -{ - return sprintf(buf, "%ld\n", - (long) atomic_read(&asConstSimpleWorkQueue(queue)->threadID)); -} - -/**********************************************************************/ -static ssize_t timesShow(const KvdoWorkQueue *queue, char *buf) -{ - return formatRunTimeStats(&asConstSimpleWorkQueue(queue)->stats, buf); -} - -/**********************************************************************/ -static ssize_t typeShow(const KvdoWorkQueue *queue, char *buf) -{ - strcpy(buf, queue->roundRobinMode ? "round-robin\n" : "simple\n"); - return strlen(buf); -} - -/**********************************************************************/ -static ssize_t workFunctionsShow(const KvdoWorkQueue *queue, char *buf) -{ - const SimpleWorkQueue *simpleQueue = asConstSimpleWorkQueue(queue); - return formatWorkItemStats(&simpleQueue->stats.workItemStats, buf, - PAGE_SIZE); -} - -/**********************************************************************/ -static WorkQueueAttribute nameAttr = { - .attr = { .name = "name", .mode = 0444, }, - .show = nameShow, -}; - -/**********************************************************************/ -static WorkQueueAttribute pidAttr = { - .attr = { .name = "pid", .mode = 0444, }, - .show = pidShow, -}; - -/**********************************************************************/ -static WorkQueueAttribute timesAttr = { - .attr = { .name = "times", .mode = 0444 }, - .show = timesShow, -}; - -/**********************************************************************/ -static WorkQueueAttribute typeAttr = { - .attr = { .name = "type", .mode = 0444, }, - .show = typeShow, -}; - -/**********************************************************************/ -static WorkQueueAttribute workFunctionsAttr = { - .attr = { .name = "work_functions", .mode = 0444, }, - .show = workFunctionsShow, -}; - -/**********************************************************************/ -static struct attribute *simpleWorkQueueAttrs[] = { - &nameAttr.attr, - &pidAttr.attr, - ×Attr.attr, - &typeAttr.attr, - &workFunctionsAttr.attr, - NULL, -}; - -/**********************************************************************/ -static struct attribute *roundRobinWorkQueueAttrs[] = { - &nameAttr.attr, - &typeAttr.attr, - NULL, -}; - -/**********************************************************************/ 
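/*
 * A minimal sketch of a writable attribute, with an invented name and
 * handler, to illustrate the .store path: the attributes above are all
 * read-only (mode 0444) and leave .store unset, but workQueueAttrStore
 * below dispatches writes the same way workQueueAttrShow dispatches reads.
 * To be visible, the attribute would also need an entry in
 * simpleWorkQueueAttrs[].
 */
static ssize_t exampleStore(KvdoWorkQueue *queue, const char *buf, size_t length)
{
  // A real handler would parse buf and update state on the queue; this
  // sketch just accepts and discards the write.
  return length;
}

static WorkQueueAttribute exampleAttr = {
  .attr  = { .name = "example", .mode = 0644, },
  .store = exampleStore,
};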
-static ssize_t workQueueAttrShow(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - WorkQueueAttribute *wqAttr = container_of(attr, WorkQueueAttribute, attr); - if (wqAttr->show == NULL) { - return -EINVAL; - } - KvdoWorkQueue *queue = container_of(kobj, KvdoWorkQueue, kobj); - return wqAttr->show(queue, buf); -} - -/**********************************************************************/ -static ssize_t workQueueAttrStore(struct kobject *kobj, - struct attribute *attr, - const char *buf, - size_t length) -{ - WorkQueueAttribute *wqAttr = container_of(attr, WorkQueueAttribute, attr); - if (wqAttr->store == NULL) { - return -EINVAL; - } - KvdoWorkQueue *queue = container_of(kobj, KvdoWorkQueue, kobj); - return wqAttr->store(queue, buf, length); -} - -/**********************************************************************/ -static struct sysfs_ops workQueueSysfsOps = { - .show = workQueueAttrShow, - .store = workQueueAttrStore, -}; - -/**********************************************************************/ -static void workQueueRelease(struct kobject *kobj) -{ - KvdoWorkQueue *queue = container_of(kobj, KvdoWorkQueue, kobj); - FREE(queue->name); - if (queue->roundRobinMode) { - FREE(asRoundRobinWorkQueue(queue)); - } else { - FREE(asSimpleWorkQueue(queue)); - } -} - -/**********************************************************************/ -struct kobj_type simpleWorkQueueKobjType = { - .default_attrs = simpleWorkQueueAttrs, - .release = workQueueRelease, - .sysfs_ops = &workQueueSysfsOps, -}; - -/**********************************************************************/ -struct kobj_type roundRobinWorkQueueKobjType = { - .default_attrs = roundRobinWorkQueueAttrs, - .release = workQueueRelease, - .sysfs_ops = &workQueueSysfsOps, -}; diff --git a/vdo/kernel/workQueueSysfs.h b/vdo/kernel/workQueueSysfs.h deleted file mode 100644 index 41f6af5..0000000 --- a/vdo/kernel/workQueueSysfs.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) 2020 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA. - * - * $Id: //eng/vdo-releases/aluminum/src/c++/vdo/kernel/workQueueSysfs.h#1 $ - */ - -#ifndef WORK_QUEUE_SYSFS_H -#define WORK_QUEUE_SYSFS_H - -#include - -extern struct kobj_type roundRobinWorkQueueKobjType; -extern struct kobj_type simpleWorkQueueKobjType; - -#endif // WORK_QUEUE_SYSFS_H
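The work-queue interfaces above compose in a small, regular way: create a queue with makeWorkQueue, zero and set up each work item, enqueue it, and tear the queue down with finishWorkQueue followed by freeWorkQueue. The sketch below is a hypothetical caller, not code from the deleted files; it assumes a KvdoWorkFunction receives the work item pointer (matching the item->work(item) call in processWorkItem) and that action code 0 is valid for the queue type supplied.

static void exampleWork(KvdoWorkItem *item)
{
  // Real work functions recover their context from the enclosing structure
  // or from getWorkQueuePrivateData(); this sketch does nothing.
}

// Static storage is already all-zero, satisfying the setupWorkItem
// precondition for a first-time work item.
static KvdoWorkItem exampleItem;

static int exampleRunOneItem(const char *threadNamePrefix,
                             struct kobject *parent,
                             KernelLayer *layer,
                             const KvdoWorkQueueType *type)
{
  KvdoWorkQueue *queue;
  int result = makeWorkQueue(threadNamePrefix, "exampleQ", parent, layer,
                             NULL, type, 1, &queue);
  if (result != VDO_SUCCESS) {
    return result;
  }

  setupWorkItem(&exampleItem, exampleWork, NULL, 0);
  enqueueWorkQueue(queue, &exampleItem);

  // Shut down the worker thread, then release the queue and null out the
  // handle.
  finishWorkQueue(queue);
  freeWorkQueue(&queue);
  return VDO_SUCCESS;
}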